In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier  
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, average_precision_score, precision_recall_curve
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_selection import SelectKBest, f_classif
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap

def prepare_data(set_dir, constant_dir=None):
    """Build the feature matrix and binary target for one data split.

    Reads ``student_info.csv``, ``student_assessment.csv``,
    ``student_reg.csv`` and ``student_vle.csv`` from *set_dir* and
    ``assessments.csv`` from *constant_dir*, aggregates assessment scores and
    VLE activity per (module, presentation, student), and left-joins the
    aggregates onto ``student_info``.

    Args:
        set_dir: Directory (``str`` or ``Path``) holding the per-split CSVs.
        constant_dir: Directory holding ``assessments.csv``. When ``None``
            the original hard-coded location is used
            (``~/OneDrive/Desktop/Logistic Regression Skewness Fixed/constant``).

    Returns:
        tuple: ``(X, y)`` where ``X`` is the merged frame without the key
        columns, ``final_result`` and the dropped score/unregistration
        columns, and ``y`` is 1 for ``'Fail'``/``'Withdrawn'`` and 0 otherwise.

    Side effects:
        Prints the number of merged rows containing at least one NaN
        (counted before any filling).
    """
    if constant_dir is None:
        constant_dir = (
            Path.home() / "OneDrive" / "Desktop"
            / "Logistic Regression Skewness Fixed" / "constant"
        )
    constant_dir = Path(constant_dir)
    set_dir = Path(set_dir)
    merge_keys = ['code_module', 'code_presentation', 'id_student']

    assessments = pd.read_csv(constant_dir / "assessments.csv")
    student_info = pd.read_csv(set_dir / "student_info.csv")
    student_assessment = pd.read_csv(set_dir / "student_assessment.csv")
    student_reg = pd.read_csv(set_dir / "student_reg.csv")
    student_vle = pd.read_csv(set_dir / "student_vle.csv")

    # Attach each submission's weight (and type) from the assessment catalogue.
    student_assessment = pd.merge(
        student_assessment,
        assessments[['id_assessment', 'weight', 'assessment_type']],
        on='id_assessment',
        how='left'
    )

    # Keep only assessments that contribute to the weighted score. Rows whose
    # weight is NaN (no catalogue match) are dropped by this comparison too.
    student_assessment = student_assessment[student_assessment['weight'] > 0].copy()
  # student_assessment = student_assessment[student_assessment['assessment_type'] != 'Exam']

    # Pre-compute score * weight once so the per-student weighted score is a
    # plain vectorised sum rather than a Python lambda doing a .loc lookup
    # back into the outer frame for every group.
    student_assessment['_score_x_weight'] = (
        student_assessment['score'] * student_assessment['weight']
    )

    # Aggregate assessments per student
    student_agg = student_assessment.groupby(merge_keys).agg(
        mean_score=('score', 'mean'),
        max_score=('score', 'max'),
        min_score=('score', 'min'),
        n_assessments=('score', 'count'),
        weighted_score=('_score_x_weight', 'sum')
    ).reset_index()
    # Weights are divided by 100 to turn the weighted sum into a score.
    student_agg['weighted_score'] = student_agg['weighted_score'] / 100

    # Aggregate VLE activity per student: total clicks and distinct sites.
    vle_agg = student_vle.groupby(merge_keys).agg(
        total_clicks=('sum_click', 'sum'),
        n_activities=('id_site', 'nunique')
    ).reset_index()

    # Left joins keep every student_info row, in its original order.
    df = student_info.merge(student_agg, on=merge_keys, how='left')
    df = df.merge(vle_agg, on=merge_keys, how='left')
    df = pd.merge(df, student_reg, on=merge_keys, how='left')

    nan_rows = df[df.isna().any(axis=1)]
    print(f"Found {len(nan_rows)} rows with NaN values")
    # print(nan_rows.head())

    assessment_cols = ['mean_score', 'max_score', 'min_score', 'weighted_score']
    df[assessment_cols] = df[assessment_cols].fillna(-1)  # -1 indicates no assessments
    df['n_assessments'] = df['n_assessments'].fillna(0)   # 0 assessments completed

    # Students without any VLE rows get zero activity.
    df['total_clicks'] = df['total_clicks'].fillna(0)
    df['n_activities'] = df['n_activities'].fillna(0)

    df = df.drop(columns=['date_unregistration', 'mean_score', 'max_score', 'min_score'], errors='ignore')

    df = df.drop(columns=['n_assessments']) # because of multicollinearity
    # Dropping equity related  features
  # df = df.drop(columns=['disability_Y', 'age_band', 'imd_band', 'highest_education', 'gender_M'])
    # Dropping regions
  # df = df.drop(columns=[reg for reg in df.columns if reg.startswith('region_')])

    y = df['final_result'].apply(lambda x: 1 if x in ['Fail', 'Withdrawn'] else 0)  # binary target

    X = df.drop(columns=['code_module', 'code_presentation', 'id_student', 'final_result'])

    return X, y

def evaluate_model(model, X_val, y_val, X_test, y_test):
    """Print classification metrics for a fitted model on two held-out sets.

    For the validation set and then the test set, prints accuracy and the
    full classification report based on ``model.predict``. Afterwards prints
    ROC AUC and PR AUC (average precision) for both sets, computed from the
    second column of ``model.predict_proba``.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_val, y_val: Validation features and labels.
        X_test, y_test: Test features and labels.

    Returns:
        None. All results are written to stdout.
    """
    splits = (
        ("Validation", X_val, y_val),
        ("Test", X_test, y_test),
    )

    print("\n=== Model Performance ===")
    for split_name, features, labels in splits:
        hard_preds = model.predict(features)
        print(f"{split_name} Accuracy:", accuracy_score(labels, hard_preds))
        print(classification_report(labels, hard_preds))

    # Probability assigned to the positive class (column 1) for each split.
    positive_probs = {
        split_name: model.predict_proba(features)[:, 1]
        for split_name, features, _ in splits
    }

    # Score everything before printing so a failing metric emits no partial
    # AUC output; PR AUC is computed ahead of ROC AUC.
    pr_auc = {
        split_name: average_precision_score(labels, positive_probs[split_name])
        for split_name, _, labels in splits
    }
    roc_auc = {
        split_name: roc_auc_score(labels, positive_probs[split_name])
        for split_name, _, labels in splits
    }

    for split_name, _, _ in splits:
        print(f"{split_name} ROC AUC: {roc_auc[split_name]:.4f}")
    for split_name, _, _ in splits:
        print(f"{split_name} PR AUC: {pr_auc[split_name]:.4f}")
    
# Train and evaluate one logistic-regression model per dataset variant.
# Each entry is a sub-folder of the data root containing train/val/test splits.
paths = ["constant", "median"]

for path in paths:
    print(f"MODE: {path.upper()}")
    # NOTE(review): the splits are read from "Logistic Regression" here, while
    # prepare_data reads assessments.csv from "Logistic Regression Skewness
    # Fixed" by default -- confirm the two roots are meant to differ.
    data_dir = Path.home() / "OneDrive" / "Desktop" / "Logistic Regression" / path

    X_train, y_train = prepare_data(data_dir / "train")
    X_val, y_val = prepare_data(data_dir / "val")
    X_test, y_test = prepare_data(data_dir / "test")

    scaler = StandardScaler()

    # Fitting only on training data, then transforming all sets, so no
    # statistics from the validation/test sets leak into the scaling.
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Training
    model = LogisticRegression(
        penalty='l2',                    # Ridge regularization
        C=1.0,                           # Inverse of regularization strength (smaller = stronger)
        class_weight='balanced',         # Reweights classes inversely to their frequency
        solver='lbfgs',                  # Default solver, good for small to medium datasets
        max_iter=100,                    # Iteration cap; hitting it is reported below
        random_state=42                  # Kept for reproducibility; presumably unused by lbfgs -- verify
    )

    model.fit(X_train_scaled, y_train)

    # n_iter_ is an ndarray, so formatting it directly prints "[12]" rather
    # than "12". Reduce it to a plain int, and do not claim convergence when
    # the solver stopped at the iteration cap.
    n_iter = int(model.n_iter_.max())
    if n_iter >= model.max_iter:
        print(f'Stopped at max_iter={model.max_iter}; the solver may not have converged')
    else:
        print(f'Converged in {n_iter} iterations')

    evaluate_model(model, X_val_scaled, y_val, X_test_scaled, y_test)
                
            
    


MODE: CONSTANT
Found 5428 rows with NaN values
Found 1857 rows with NaN values
Found 1848 rows with NaN values
Converged in [12] iterations

=== Model Performance ===
Validation Accuracy: 0.8569894123062759
              precision    recall  f1-score   support

           0       0.84      0.86      0.85      3076
           1       0.87      0.86      0.86      3441

    accuracy                           0.86      6517
   macro avg       0.86      0.86      0.86      6517
weighted avg       0.86      0.86      0.86      6517

Test Accuracy: 0.8526525605642441
              precision    recall  f1-score   support

           0       0.84      0.84      0.84      3079
           1       0.86      0.86      0.86      3443

    accuracy                           0.85      6522
   macro avg       0.85      0.85      0.85      6522
weighted avg       0.85      0.85      0.85      6522

Validation ROC AUC: 0.9320
Test ROC AUC: 0.9260
Validation PR AUC: 0.9422
Test PR AUC: 0.9388
MODE: MEDIA