In [2]:
import pandas as pd
import io
import requests
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report, roc_curve,
    roc_auc_score, precision_recall_curve, recall_score, precision_score, f1_score
)
import matplotlib.pyplot as plt
import seaborn as sns

# Specify file path and create target variable

In [3]:
# Load the data
file_id = '17eoOjbTriXdOnuUC2LSHDe-9lA-V_h1X'   # File ID from Google Drive points to the dataset
url = f'https://drive.google.com/uc?id={file_id}'

# Send GET request to download the file
response = requests.get(url)

# Use io.BytesIO to read the content into pandas directly
df = pd.read_csv(io.BytesIO(response.content))

# Create target variable column, "Graduated" based on Graduation_Rate
df['Graduated'] = (df['Graduation_Rate'] >= 0.6).astype(int)

In [4]:
# Setup columns that need to be scaled or encoded
numerical_columns = ["GPA", "SAT_Score", "ACT_Score", "Family_Size", "Support_Center_Utilization",
                     "Retention_Rate", "Graduation_Age", "Study_Hours_Per_Week", "Student_Loan_Amount", "Distance_From_Home", "Work_Hours_Per_Week"]
nominal_columns = ["Marital_Status", "Life_Event", "Major"]
ordinal_columns = ["Income_Level", "Institution_Type", "Campus_Engagement", "First_Gen_Student", "Enrollment_Status"]

Displaying the split between "Graduated" 1 and 0

In [6]:
df['Graduated'].value_counts()

Graduated
1    7159
0    2841
Name: count, dtype: int64

Display the dataset and information

In [7]:
# Show the first several rows of the dataframe
df.head(13)

Unnamed: 0,Student_ID,GPA,SAT_Score,ACT_Score,Family_Size,Income_Level,Marital_Status,Support_Center_Utilization,Retention_Rate,Graduation_Rate,...,Graduation_Age,Major,Study_Hours_Per_Week,Student_Loan_Amount,Campus_Engagement,First_Gen_Student,Enrollment_Status,Distance_From_Home,Work_Hours_Per_Week,Graduated
0,1,2.73,1174,26,1,High,Married,0.23,0.72,0.62,...,22.3,STEM,13.3,30968.51,Low,False,Full-Time,42.0,16.9,1
1,2,2.61,1079,24,4,High,Married,0.15,0.68,0.63,...,23.0,STEM,25.1,18679.95,Low,True,Full-Time,5.0,4.4,1
2,3,2.81,1197,26,4,Low,Married,0.47,0.61,0.58,...,22.4,Education,15.1,39004.41,Low,False,Full-Time,62.9,9.5,0
3,4,3.35,1328,29,1,High,Divorced,0.0,0.9,0.9,...,22.5,STEM,13.1,15563.23,Low,False,Full-Time,93.1,4.2,1
4,5,3.02,1064,23,1,Middle,Single,0.22,0.61,0.66,...,22.9,Arts,21.8,6533.81,Low,False,Full-Time,63.1,15.5,1
5,6,2.43,1064,23,1,Middle,Divorced,0.26,0.54,0.58,...,22.5,STEM,13.0,28718.06,Medium,False,Full-Time,64.8,14.7,0
6,7,3.64,1336,30,4,Low,Single,0.28,0.71,0.65,...,22.0,Arts,17.1,21571.52,Low,False,Full-Time,65.2,8.7,1
7,8,2.8,1215,27,1,Low,Single,0.36,0.86,0.84,...,23.0,Health Sciences,15.3,8249.42,High,False,Full-Time,95.8,8.9,1
8,9,2.32,1029,23,1,Middle,Single,0.56,0.56,0.56,...,23.1,Arts,15.7,19196.86,Low,False,Part-Time,65.8,8.3,0
9,10,3.2,1181,26,1,Middle,Married,0.37,1.0,0.99,...,22.0,Business,3.1,16445.08,Medium,False,Full-Time,90.8,14.4,1


In [8]:
# Check for missing values
print(df.isnull().sum())

Student_ID                       0
GPA                              0
SAT_Score                        0
ACT_Score                        0
Family_Size                      0
Income_Level                     0
Marital_Status                   0
Support_Center_Utilization       0
Retention_Rate                   0
Graduation_Rate                  0
Life_Event                    6022
Institution_Type                 0
Graduation_Age                   0
Major                            0
Study_Hours_Per_Week             0
Student_Loan_Amount              0
Campus_Engagement                0
First_Gen_Student                0
Enrollment_Status                0
Distance_From_Home               0
Work_Hours_Per_Week              0
Graduated                        0
dtype: int64


In [5]:
# Replacing empty values in Life_event to 'None'
df['Life_Event'] = df['Life_Event'].fillna('None')

In [6]:
# Prepare data
X = df.drop(columns=["Student_ID", "Graduation_Rate", "Graduated"])
y = df["Graduated"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(f"Number of training samples: {X_train.shape[0]}")
print(f"Number of testing samples: {X_test.shape[0]}")

Number of training samples: 6700
Number of testing samples: 3300


In [11]:
feature_names = X.columns.tolist()
print(feature_names)

['GPA', 'SAT_Score', 'ACT_Score', 'Family_Size', 'Income_Level', 'Marital_Status', 'Support_Center_Utilization', 'Retention_Rate', 'Life_Event', 'Institution_Type', 'Graduation_Age', 'Major', 'Study_Hours_Per_Week', 'Student_Loan_Amount', 'Campus_Engagement', 'First_Gen_Student', 'Enrollment_Status', 'Distance_From_Home', 'Work_Hours_Per_Week']


# Creating the Pipeline

GridSearch for K-Nearest Neighbors

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),  # Scale numeric columns
        ('cat', OneHotEncoder(), nominal_columns),  # One-hot encode categorical columns
        ('ord', OrdinalEncoder(), ordinal_columns)  # Ordinal encode ordinal columns
    ]
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', KNeighborsClassifier(n_neighbors=5))
])

# Define parameter grid for KNN
param_grid = {
    'model__n_neighbors': [3, 5, 7, 9, 11],
    'model__weights': ['uniform', 'distance'],
    'model__metric': ['euclidean', 'manhattan']
}

# Grid search with pipeline
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

# Fit grid search
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# Best model
best_knn_model = grid_search.best_estimator_

# Make predictions on training data
train_predictions = best_knn_model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
train_recall = recall_score(y_train, train_predictions)
train_precision = precision_score(y_train, train_predictions)
train_f1 = f1_score(y_train, train_predictions)

# Make predictions on testing data
test_predictions = best_knn_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
test_recall = recall_score(y_test, test_predictions)
test_precision = precision_score(y_test, test_predictions)
test_f1 = f1_score(y_test, test_predictions)
conf_matrix = confusion_matrix(y_test, test_predictions)
report = classification_report(y_test, test_predictions)

# Print the results
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}\n")
print(f"Training Recall: {train_recall:.4f}")
print(f"Test Recall: {test_recall:.4f}\n")
print(f"Training Precision: {train_precision:.4f}")
print(f"Test Precision: {test_precision:.4f}\n")
print(f"Training F1: {train_f1:.4f}")
print(f"Test F1: {test_f1:.4f}")
print("\nConfusion Matrix:\n", conf_matrix, "\n")
print("Classification Report:\n", report)

GridSearch for Logistic Regression

In [13]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),  # Scale numeric columns
        ('cat', OneHotEncoder(), nominal_columns),  # One-hot encode categorical columns
        ('ord', OrdinalEncoder(), ordinal_columns)  # Ordinal encode ordinal columns
    ]
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(max_iter=1000))
])

# Define parameter grid for Logistic Regression
param_grid = {
    'model__C': [0.1, 1.0, 10.0],
    'model__solver': ['liblinear', 'saga'],
    'model__penalty': ['l1', 'l2']
}

# Grid search with pipeline
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

# Fit grid search
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# Best model
best_logistic_regression_model = grid_search.best_estimator_

# Make predictions on training data
train_predictions = best_logistic_regression_model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
train_recall = recall_score(y_train, train_predictions)
train_precision = precision_score(y_train, train_predictions)
train_f1 = f1_score(y_train, train_predictions)

# Make predictions on testing data
test_predictions = best_logistic_regression_model.predict(X_test)
y_test_proba = best_logistic_regression_model.predict_proba(X_test)[:, 1]
test_accuracy = accuracy_score(y_test, test_predictions)
test_recall = recall_score(y_test, test_predictions)
test_precision = precision_score(y_test, test_predictions)
test_f1 = f1_score(y_test, test_predictions)
conf_matrix = confusion_matrix(y_test, test_predictions)
report = classification_report(y_test, test_predictions)

# Print the results
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}\n")
print(f"Training Recall: {train_recall:.4f}")
print(f"Test Recall: {test_recall:.4f}\n")
print(f"Training Precision: {train_precision:.4f}")
print(f"Test Precision: {test_precision:.4f}\n")
print(f"Training F1: {train_f1:.4f}")
print(f"Test F1: {test_f1:.4f}\n")
print("Confusion Matrix:\n", conf_matrix, "\n")
print("Classification Report:\n", report, "\n")
print(f"Predicted Class Distribution: {np.bincount(test_predictions)}")


Best Parameters: {'model__C': 0.1, 'model__penalty': 'l2', 'model__solver': 'saga'}
Training Accuracy: 0.9104
Testing Accuracy: 0.9064

Training Recall: 0.9441
Test Recall: 0.9399

Training Precision: 0.9321
Test Precision: 0.9292

Training F1: 0.9381
Test F1: 0.9345

Confusion Matrix:
 [[ 785  168]
 [ 141 2206]] 

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.82      0.84       953
           1       0.93      0.94      0.93      2347

    accuracy                           0.91      3300
   macro avg       0.89      0.88      0.89      3300
weighted avg       0.91      0.91      0.91      3300
 

Predicted Class Distribution: [ 926 2374]


In [None]:
# Visualization 1: Confusion Matrix Heatmap
plt.figure(figsize=(6, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=["Not Graduated", "Graduated"], yticklabels=["Not Graduated", "Graduated"])
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Visualization 2: ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)
roc_auc = roc_auc_score(y_test, y_test_proba)

plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})", color='blue')
plt.plot([0, 1], [0, 1], 'k--', label="Random Guess")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.show()

# Visualization 3: Precision-Recall Curve (with zero_division parameter)
precision, recall, thresholds = precision_recall_curve(y_test, y_test_proba)

plt.figure(figsize=(6, 5))
plt.plot(recall, precision, label="Precision-Recall Curve", color='green')
plt.title("Precision-Recall Curve")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend(loc="lower left")
plt.show()

# Code that iterates through different models

In [None]:
# Create the preprocessor (without scaling for models that don't need it)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), nominal_columns),      # One-hot encode categorical columns
        ('ord', OrdinalEncoder(), ordinal_columns)    # Ordinal encode ordinal columns
    ]
)

# List of models to iterate over
models = [
    ('Support Vector Machine', SVC()),
    ('Random Forest', RandomForestClassifier()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Naive Bayes', GaussianNB())
]

# Iterate through models and compare train and test accuracy
for model_name, model in models:
    # If the model requires scaling, add StandardScaler
    if isinstance(model, SVC):  # these models requires scaling
        preprocessor_with_scaling = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_columns),  # Scale numeric columns
                ('cat', OneHotEncoder(), nominal_columns),      # One-hot encode categorical columns
                ('ord', OrdinalEncoder(), ordinal_columns)    # Ordinal encode ordinal columns
            ]
        )
    else:
        preprocessor_with_scaling = preprocessor  # Use the preprocessor without scaling for other models

    # Create the pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor_with_scaling),  # Preprocessing steps
        ('model', model)  # Model
    ])

    # Fit the pipeline on the training data
    pipeline.fit(X_train, y_train)

    # Make predictions on the training and test data
    train_predictions = pipeline.predict(X_train)
    test_predictions = pipeline.predict(X_test)

    # Calculate accuracy on the training set
    train_accuracy = accuracy_score(y_train, train_predictions)
    train_recall = recall_score(y_train, train_predictions)
    train_precision = precision_score(y_train, train_predictions)
    train_f1 = f1_score(y_train, train_predictions)

    # Calculate accuracy on the test set
    test_accuracy = accuracy_score(y_test, test_predictions)
    test_recall = recall_score(y_test, test_predictions)
    test_precision = precision_score(y_test, test_predictions)
    test_f1 = f1_score(y_test, test_predictions)

    # Print the model and its accuracy results
    print(f"{model_name}:")
    print(f"  Training Accuracy: {train_accuracy:.4f}")
    print(f"  Test Accuracy: {test_accuracy:.4f}\n")
    print(f"  Training Recall: {train_recall:.4f}")
    print(f"  Test Recall: {test_recall:.4f}\n")
    print(f"  Training Precision: {train_precision:.4f}")
    print(f"  Test Precision: {test_precision:.4f}\n")
    print(f"  Training F1: {train_f1:.4f}")
    print(f"  Test F1: {test_f1:.4f}\n")

# Looking at Feature Importance

This is broken.

In [16]:
import shap
import pandas as pd
import requests
import io
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Load the data
file_id = '17eoOjbTriXdOnuUC2LSHDe-9lA-V_h1X'   # File ID from Google Drive points to the dataset
url = f'https://drive.google.com/uc?id={file_id}'

# Send GET request to download the file
response = requests.get(url)

# Use io.BytesIO to read the content into pandas directly
df = pd.read_csv(io.BytesIO(response.content))

# Create target variable column, "Graduated" based on Graduation_Rate
df['Graduated'] = (df['Graduation_Rate'] >= 0.6).astype(int)

# Prepare data
X = df.drop(columns=["Student_ID", "Graduation_Rate", "Graduated"])
y = df["Graduated"]

# Extract feature names
feature_names = X.columns.tolist()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(f"Number of training samples: {X_train.shape[0]}")
print(f"Number of testing samples: {X_test.shape[0]}")

# Setup columns that need to be scaled or encoded
numerical_columns = ["GPA", "SAT_Score", "ACT_Score", "Family_Size", "Support_Center_Utilization",
                     "Retention_Rate", "Graduation_Age", "Study_Hours_Per_Week", "Student_Loan_Amount", "Distance_From_Home", "Work_Hours_Per_Week"]
nominal_columns = ["Marital_Status", "Life_Event", "Major"]
ordinal_columns = ["Income_Level", "Institution_Type", "Campus_Engagement", "First_Gen_Student", "Enrollment_Status"]

# Define preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),  # Scale numeric columns
        ('cat', OneHotEncoder(), nominal_columns),  # One-hot encode categorical columns
        ('ord', OrdinalEncoder(), ordinal_columns)  # Ordinal encode ordinal columns
    ]
)

# Define pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(max_iter=1000, C=0.1, penalty='l2', solver='saga'))
])

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Get the transformed feature names
def get_feature_names(column_transformer):
    output_feature_names = []
    for name, transformer, columns in column_transformer.transformers:
        if name == 'remainder':
            continue
        if hasattr(transformer, 'get_feature_names_out'):
            output_feature_names.extend(transformer.get_feature_names_out())
        else:
            if isinstance(columns, list):
                output_feature_names.extend(columns)
            else:
                output_feature_names.extend([f"{name}__{col}" for col in columns])
    return output_feature_names

transformed_feature_names = get_feature_names(preprocessor)

# Create a SHAP explainer
explainer = shap.Explainer(pipeline.named_steps['model'], pipeline.named_steps['preprocessor'].transform(X_train))

# Get SHAP values for the test set
shap_values = explainer(pipeline.named_steps['preprocessor'].transform(X_test))

# Plot feature importance
shap.summary_plot(shap_values, pipeline.named_steps['preprocessor'].transform(X_test), feature_names=transformed_feature_names)

Number of training samples: 6700
Number of testing samples: 3300


NotFittedError: This StandardScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), nominal_columns),  # One-hot encode categorical columns
        ('ord', OrdinalEncoder(), ordinal_columns)  # Ordinal encode ordinal columns
    ]
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42))
])

# Define parameter grid for Logistic Regression
param_grid = {
    'model__n_estimators': [50, 100, 200],  # Number of trees
    'model__max_depth': [None, 10, 20, 30],  # Depth of the trees
    'model__min_samples_split': [2, 5, 10],  # Minimum samples to split
    'model__min_samples_leaf': [1, 2, 4],    # Minimum samples at leaf nodes
    'model__max_features': ['sqrt', 'log2'],  # Features to consider for each split
    'model__bootstrap': [True, False]        # Whether bootstrap sampling is used
}

# Grid search with pipeline
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit grid search
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# Best model
best_rf_model = grid_search.best_estimator_

# Make predictions on training data
train_predictions = best_rf_model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
train_recall = recall_score(y_train, train_predictions)
train_precision = precision_score(y_train, train_predictions)
train_f1 = f1_score(y_train, train_predictions)

# Make predictions on testing data
test_predictions = best_rf_model.predict(X_test)
# y_test_proba = best_rf_model.predict_proba(X_test)[:, 1]
test_accuracy = accuracy_score(y_test, test_predictions)
test_recall = recall_score(y_test, test_predictions)
test_precision = precision_score(y_test, test_predictions)
test_f1 = f1_score(y_test, test_predictions)
conf_matrix = confusion_matrix(y_test, test_predictions)
report = classification_report(y_test, test_predictions)

# Print the results
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}\n")
print(f"Training Recall: {train_recall:.4f}")
print(f"Test Recall: {test_recall:.4f}\n")
print(f"Training Precision: {train_precision:.4f}")
print(f"Test Precision: {test_precision:.4f}\n")
print(f"Training F1: {train_f1:.4f}")
print(f"Test F1: {test_f1:.4f}\n")
print("Confusion Matrix:\n", conf_matrix, "\n")
print("Classification Report:\n", report, "\n")
print(f"Predicted Class Distribution: {np.bincount(test_predictions)}")
