In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression

# Load the dataset
df = pd.read_csv(r'C:\Users\user\Downloads\student_enrollment_data.csv')

df.head()


Unnamed: 0,student_id,program_type,enrollment_status,admission_score,start_date,gpa,credit_load,failed_courses,attendance_rate,age,gender,ethnicity,financial_aid,parental_education_level,employment_hours,distance_to_campus_km,course_completion_rate,core_subject_performance,support_sessions_used
0,1,Certificate,graduated,67.9,2020-01-31,3.79,jobless,1,81.6,32,male,Other,Yes,PostGraduate,0,6.7,0.84,2.71,9
1,2,Bachelor,active,60.2,2020-04-30,2.64,part-time,1,95.5,20,female,Black,No,High School,12,3.8,0.79,3.79,8
2,3,Certificate,graduated,92.6,2020-07-31,2.22,jobless,2,79.5,25,female,Other,Yes,High School,0,47.0,0.77,2.26,4
3,4,Certificate,active,88.3,2020-10-31,2.46,jobless,4,75.1,33,female,Hispanic,Yes,Diploma,12,20.8,0.91,2.66,5
4,5,Bachelor,graduated,89.2,2021-01-31,2.85,jobless,0,86.7,30,male,Other,No,PostGraduate,12,29.1,0.86,2.64,3


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   student_id                50 non-null     int64  
 1   program_type              50 non-null     object 
 2   enrollment_status         50 non-null     object 
 3   admission_score           50 non-null     float64
 4   start_date                50 non-null     object 
 5   gpa                       50 non-null     float64
 6   credit_load               50 non-null     object 
 7   failed_courses            50 non-null     int64  
 8   attendance_rate           50 non-null     float64
 9   age                       50 non-null     int64  
 10  gender                    50 non-null     object 
 11  ethnicity                 50 non-null     object 
 12  financial_aid             50 non-null     object 
 13  parental_education_level  50 non-null     object 
 14  employment_h

In [19]:
df.describe()

Unnamed: 0,student_id,admission_score,gpa,failed_courses,attendance_rate,age,employment_hours,distance_to_campus_km,course_completion_rate,core_subject_performance,support_sessions_used
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,25.5,79.65,2.9276,2.18,85.43,25.96,8.4,23.908,0.8168,2.9386,4.76
std,14.57738,11.692056,0.573925,1.49407,8.577313,5.547421,8.369039,15.594882,0.096942,0.549968,2.462453
min,1.0,60.2,2.01,0.0,70.9,18.0,0.0,0.7,0.61,2.01,0.0
25%,13.25,69.4,2.48,1.0,78.975,21.25,0.0,10.325,0.76,2.47,3.0
50%,25.5,80.6,2.84,2.0,86.5,25.5,12.0,23.9,0.82,2.99,5.0
75%,37.75,90.35,3.355,3.0,92.875,31.75,12.0,37.875,0.88,3.3775,7.0
max,50.0,97.2,3.97,4.0,99.7,34.0,20.0,49.3,0.99,3.97,9.0


In [42]:
categorical_columns = ['program_type', 'credit_load', 'gender', 'ethnicity', 'financial_aid', 'parental_education_level']

# Filter the list to include only columns that exist in the DataFrame
existing_categorical_columns = [col for col in categorical_columns if col in df.columns]

# Apply one-hot encoding only to the existing columns
df = pd.get_dummies(df, columns=existing_categorical_columns, drop_first=True)




In [44]:
# Define features and target variable for enrollment prediction
enrollment_features = df.columns.difference(['enrollment_status', 'enrolled'])
X = df[enrollment_features]
y = df['enrolled']

In [45]:
# Ensure only numeric columns are present in X
X = X.select_dtypes(include=[np.number])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [46]:
# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [47]:

# Model Training - Random Forest Classifier for enrollment prediction
enrollment_model = RandomForestClassifier(random_state=42)
enrollment_model.fit(X_train, y_train)

In [48]:
# Predict and Evaluate
enrollment_pred = enrollment_model.predict(X_test)
print("Enrollment Prediction Model:")
print(classification_report(y_test, enrollment_pred))
print("Accuracy:", accuracy_score(y_test, enrollment_pred))


Enrollment Prediction Model:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.90      1.00      0.95         9

    accuracy                           0.90        10
   macro avg       0.45      0.50      0.47        10
weighted avg       0.81      0.90      0.85        10

Accuracy: 0.9


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [55]:
# Define features and target variable for support need prediction
# Drop 'start_date' and any other non-numeric columns from the feature set
df = df.drop(['start_date'], axis=1, errors='ignore')

In [56]:
# Encode categorical variables if not already done
categorical_columns = ['program_type', 'credit_load', 'gender', 'ethnicity', 'financial_aid', 'parental_education_level']
existing_categorical_columns = [col for col in categorical_columns if col in df.columns]
df = pd.get_dummies(df, columns=existing_categorical_columns, drop_first=True)


In [57]:
# Define features and target
support_features = df.columns.difference(['enrollment_status', 'needs_support'])
X = df[support_features]
y = df['needs_support']

In [58]:
# Ensure only numeric columns are present in X
X = X.select_dtypes(include=[np.number])

In [59]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [60]:
# Standardize features
scaler=StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [61]:
# Model Training - Logistic Regression for support need prediction
support_model = LogisticRegression(random_state=42)
support_model.fit(X_train, y_train)

In [62]:
# Predict and Evaluate
support_pred = support_model.predict(X_test)
print("\nSupport Need Prediction Model:")
print(classification_report(y_test, support_pred))
print("Accuracy:", accuracy_score(y_test, support_pred))


Support Need Prediction Model:
              precision    recall  f1-score   support

           0       1.00      0.78      0.88         9
           1       0.33      1.00      0.50         1

    accuracy                           0.80        10
   macro avg       0.67      0.89      0.69        10
weighted avg       0.93      0.80      0.84        10

Accuracy: 0.8
