In [None]:
import pandas as pd
import io
import requests
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    roc_curve, roc_auc_score, precision_recall_curve
)
import matplotlib.pyplot as plt
import seaborn as sns

Specify file path and create target variable

In [None]:
# --- Data source -------------------------------------------------------------
# Three ways of obtaining the CSV are kept here; only the local-file option is
# active. The two Colab options are left as comments for reference.

# Option 1: Google Colab with a mounted Drive
# from google.colab import drive
# drive.mount('/content/drive')
# file_path = "/content/drive/MyDrive/Datasets for Google Colab/student_success_enhanced.csv"

# Option 2: shared Google Colab (download the CSV straight from Drive).
# When this option is used, skip the `pd.read_csv(file_path)` line below.
# file_id = '17eoOjbTriXdOnuUC2LSHDe-9lA-V_h1X'
# url = f'https://drive.google.com/uc?id={file_id}'
# response = requests.get(url)
# response.raise_for_status()  # fail loudly rather than parse an HTTP error page as CSV
# df = pd.read_csv(io.BytesIO(response.content))

# Option 3: local environment
file_path = "./student_success_enhanced.csv"

# A row is labelled "graduated" when its Graduation_Rate is at least this value.
GRADUATION_THRESHOLD = 0.6

# Load dataset
df = pd.read_csv(file_path)

# Binary target column derived from Graduation_Rate (1 = at/above threshold, 0 = below)
df['Graduated'] = (df['Graduation_Rate'] >= GRADUATION_THRESHOLD).astype(int)

In [None]:
# Feature groups used by the preprocessing cells further down.

# Numeric features: standardised before being fed to logistic regression.
numerical_columns = [
    "GPA",
    "SAT_Score",
    "ACT_Score",
    "Family_Size",
    "Support_Center_Utilization",
    "Retention_Rate",
    "Graduation_Age",
    "Study_Hours_Per_Week",
    "Student_Loan_Amount",
    "Distance_From_Home",
    "Work_Hours_Per_Week",
]

# Categorical features that get one-hot encoded.
nominal_columns = [
    "Marital_Status",
    "Life_Event",
    "Major",
]

# Categorical features that get integer-coded with LabelEncoder.
ordinal_columns = [
    "Income_Level",
    "Institution_Type",
    "Campus_Engagement",
    "First_Gen_Student",
    "Enrollment_Status",
]

Display the class balance of the "Graduated" target (1 = graduated, 0 = not graduated)

In [None]:
# Number of rows in each target class (Graduated = 1 vs 0)
df["Graduated"].value_counts()

Display the dataset and information

In [None]:
# Preview the first 13 rows of the dataframe
df.head(n=13)

In [None]:
# List the distinct values present in the Life_Event column
df["Life_Event"].unique()
# df["Income_Level"].value_counts()

In [None]:
# Summary statistics for the Family_Size column
df["Family_Size"].describe()

In [None]:
# Missing Life_Event entries are given the explicit label 'None' so they form
# their own category instead of staying NaN.
df["Life_Event"] = df["Life_Event"].fillna(value="None")

# Re-check the distinct values after filling
df["Life_Event"].unique()

In [None]:
# Features: every column except the identifier, the raw rate the target was
# derived from, and the target itself.
non_feature_columns = ["Student_ID", "Graduation_Rate", "Graduated"]
X = df.drop(columns=non_feature_columns)
y = df["Graduated"]

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
print(f"Number of training samples: {X_train.shape[0]}")
print(f"Number of testing samples: {X_test.shape[0]}")

In [None]:
# One-hot encode the nominal columns of the training split. drop_first=True
# removes one level per column to serve as the implicit baseline.
X_train = pd.get_dummies(X_train, columns=nominal_columns, drop_first=True)

# Encode the test split WITHOUT drop_first, then force it onto the training
# layout. Encoding the two splits independently with drop_first=True is unsafe:
# if a category is absent from one split, the frames end up with different
# columns, and drop_first may silently drop a different baseline level.
# reindex() keeps exactly the training columns (in training order), fills
# dummy columns the test split never produced with 0, and discards dummy
# columns for categories the training split never saw.
X_test = pd.get_dummies(X_test, columns=nominal_columns)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Integer-code the ordinal columns. The encoder is re-fitted on the training
# data for each column and then applied to the test data.
# NOTE(review): LabelEncoder numbers labels in sorted order, which is not
# necessarily the intended rank order of these categories - confirm.
le = LabelEncoder()
for col in ordinal_columns:
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

In [None]:
# (rows, columns) of the encoded training and test frames, in that order
for frame in (X_train, X_test):
    print(frame.shape)

In [None]:
# Preview the first five rows of the encoded training frame
X_train.head(n=5)

In [None]:
# Distinct values of the Life_Event_None dummy column created by one-hot encoding
X_train["Life_Event_None"].unique()
# df["Income_Level"].value_counts()

Converting boolean values into 1s and 0s

In [None]:
# Columns whose dtype is bool in the training frame
boolean_columns = X_train.select_dtypes(include=['bool']).columns

# Cast those columns from True/False to 1/0 in both splits
for frame in (X_train, X_test):
    frame[boolean_columns] = frame[boolean_columns].astype(int)

# LOGISTIC REGRESSION

In [None]:
# Standardise the numerical features. The scaler is fitted on the training
# split only and then applied unchanged to the test split.
# NOTE(review): only `numerical_columns` are passed on, so the encoded
# nominal/ordinal features are not seen by this model - confirm that is intended.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numerical_columns])
X_test_scaled = scaler.transform(X_test[numerical_columns])

# Train logistic regression model.
# 'saga' is a stochastic solver, so it is seeded (same seed as the other
# models in this notebook) to make the fitted coefficients reproducible.
model = LogisticRegression(max_iter=2000, solver='saga', random_state=42)
model.fit(X_train_scaled, y_train)

# Class predictions for both splits, plus the predicted probability of the
# positive class (column 1) on the test split for the ROC / PR curves.
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)
y_test_proba = model.predict_proba(X_test_scaled)[:, 1]

In [None]:
# Accuracy on the data the model was fitted on
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy:", train_accuracy)

# Accuracy on the held-out data
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

# Counts of actual vs predicted classes on the test split
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:\n", conf_matrix)

# Per-class precision / recall / F1 on the test split
report = classification_report(y_test, y_test_pred)
print("Classification Report:\n", report)

In [None]:
# How many test rows were predicted as each class (index = class label)
predicted_counts = np.bincount(y_test_pred)
print(f"Predicted Class Distribution: {predicted_counts}")

In [None]:
class_labels = ["Not Graduated", "Graduated"]

# Visualization 1: Confusion Matrix Heatmap
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

# Visualization 2: ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)
roc_auc = roc_auc_score(y_test, y_test_proba)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})", color='blue')
ax.plot([0, 1], [0, 1], 'k--', label="Random Guess")  # diagonal reference line
ax.set_title("Receiver Operating Characteristic (ROC) Curve")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend(loc="lower right")
plt.show()

# Visualization 3: Precision-Recall Curve
# (`thresholds` is overwritten here with the precision-recall thresholds.)
precision, recall, thresholds = precision_recall_curve(y_test, y_test_proba)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(recall, precision, label="Precision-Recall Curve", color='green')
ax.set_title("Precision-Recall Curve")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.legend(loc="lower left")
plt.show()

# DECISION TREE


In [None]:
# Single decision tree with its depth capped at 4 and a fixed seed.
# Decision trees do not need scaled inputs, so the unscaled encoded frame
# is used here rather than X_train_scaled.
model = DecisionTreeClassifier(max_depth=4, random_state=42)
model.fit(X_train, y_train)

In [None]:
from sklearn.tree import plot_tree

# Plot the trained decision tree.
# The tree was fitted on the encoded X_train, so the node labels must come
# from X_train's columns. `X` is the pre-encoding frame: it still holds the
# raw nominal columns and lacks the dummy columns, so its column names do not
# line up with the features the tree actually splits on.
plt.figure(figsize=(40, 30))
plot_tree(
    model,
    filled=True,
    feature_names=list(X_train.columns),  # plain list of str, in training column order
    class_names=["Not Graduated", "Graduated"],
    rounded=True,
    fontsize=12,
)
plt.title("Decision Tree Visualization")
plt.show()

In [None]:
# Predictions from the fitted tree on the unscaled frames; the positive-class
# probability (column 1) is kept for the ROC / PR curves.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
y_test_proba = model.predict_proba(X_test)[:, 1]

# Accuracy on the training and held-out splits
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy:", train_accuracy)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

# Actual-vs-predicted counts and per-class metrics on the test split
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:\n", conf_matrix)
report = classification_report(y_test, y_test_pred)
print("Classification Report:\n", report)

# Number of test rows predicted as each class
predicted_counts = np.bincount(y_test_pred)
print(f"Predicted Class Distribution: {predicted_counts}")


In [None]:
class_labels = ["Not Graduated", "Graduated"]

# Visualization 1: Confusion Matrix Heatmap
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

# Visualization 2: ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)
roc_auc = roc_auc_score(y_test, y_test_proba)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})", color='blue')
ax.plot([0, 1], [0, 1], 'k--', label="Random Guess")  # diagonal reference line
ax.set_title("Receiver Operating Characteristic (ROC) Curve")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend(loc="lower right")
plt.show()

# Visualization 3: Precision-Recall Curve
# (`thresholds` is overwritten here with the precision-recall thresholds.)
precision, recall, thresholds = precision_recall_curve(y_test, y_test_proba)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(recall, precision, label="Precision-Recall Curve", color='green')
ax.set_title("Precision-Recall Curve")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.legend(loc="lower left")
plt.show()

# RANDOM FOREST

In [None]:
# Random forest of 100 trees with a fixed seed, fitted on the unscaled
# encoded training frame (not X_train_scaled).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Class predictions for the held-out split
y_pred = rf_model.predict(X_test)

In [None]:
# Test-set accuracy, reported as a percentage
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Actual-vs-predicted counts, kept for the heatmap in the next cell
conf_matrix = confusion_matrix(y_test, y_pred)

# Per-class precision / recall / F1
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
# Heatmap of the current confusion matrix (rows = actual, columns = predicted)
class_labels = ["Not Graduated", "Graduated"]

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title("Confusion Matrix - Random Forest")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score, precision_recall_curve
import plotly.graph_objects as go
import plotly.figure_factory as ff
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# AdaBoost is trained on the existing X_train / X_test / y_train / y_test
# variables; nothing is re-loaded or re-split in this cell.
# (A previously embedded, never-executed snippet that rebuilt X/y with a 0.5
# threshold and a 0.3 test split was removed: it contradicted the target
# definition used in this notebook and would have left `Graduated` in X.)

# Weak learner for boosting: a depth-1 decision tree (a "stump")
base_model = DecisionTreeClassifier(max_depth=1)

# 50 boosting rounds with a fixed seed.
# `estimator=` is the keyword in scikit-learn >= 1.2 (formerly `base_estimator`).
ada_boost = AdaBoostClassifier(estimator=base_model, n_estimators=50, random_state=42)

# Fit the AdaBoost model
ada_boost.fit(X_train, y_train)

# Class predictions and positive-class probabilities (column 1) on the test split
y_pred = ada_boost.predict(X_test)
y_pred_proba = ada_boost.predict_proba(X_test)[:, 1]

# Model Evaluation

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}")

# Classification Report
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

# ROC curve points and the area under the curve (thresholds are not needed)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Model curve in tomato red
roc_trace = go.Scatter(
    x=fpr,
    y=tpr,
    mode='lines',
    name=f'ROC Curve (AUC = {roc_auc:.2f})',
    line=dict(color='#FF6347', width=3),
)

# Dashed light-gray diagonal as the random-guess reference
chance_trace = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name="Random Guess",
    line=dict(color='#D3D3D3', dash='dash', width=2),
)

# Assemble the figure: both traces, transparent backgrounds, no grid lines
fig_roc = go.Figure(data=[roc_trace, chance_trace])
fig_roc.update_layout(
    title="Receiver Operating Characteristic (ROC) Curve",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    showlegend=True,
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
)

fig_roc.show()

# Precision/recall pairs over all probability thresholds (thresholds discarded)
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)

# Single lime-green line: recall on x, precision on y
pr_trace = go.Scatter(
    x=recall,
    y=precision,
    mode='lines',
    name="Precision-Recall Curve",
    line=dict(color='#32CD32', width=3),
)

# Assemble the figure with transparent backgrounds and no grid lines
fig_prc = go.Figure(data=[pr_trace])
fig_prc.update_layout(
    title="Precision-Recall Curve",
    xaxis_title="Recall",
    yaxis_title="Precision",
    showlegend=True,
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
)

fig_prc.show()

# Annotated heatmap of the confusion matrix on a red-blue colour scale
class_labels = ["Not Graduated", "Graduated"]

fig_cm = ff.create_annotated_heatmap(
    z=conf_matrix,
    x=list(class_labels),
    y=list(class_labels),
    colorscale='RdBu',
    showscale=True,
)

# Axis titles plus explicit tick positions/labels for the two classes
fig_cm.update_layout(
    title="Confusion Matrix",
    xaxis_title="Predicted",
    yaxis_title="Actual",
    xaxis=dict(tickmode='array', tickvals=[0, 1], ticktext=list(class_labels)),
    yaxis=dict(tickmode='array', tickvals=[0, 1], ticktext=list(class_labels)),
)

fig_cm.show()
