In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier  
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, average_precision_score, precision_recall_curve
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_selection import SelectKBest, f_classif
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap

_STUDENT_KEYS = ['code_module', 'code_presentation', 'id_student']


def _aggregate_assessments(student_assessment, assessments):
    """Summarise weighted, non-exam assessment scores per student enrolment."""
    scored = pd.merge(
        student_assessment,
        assessments[['id_assessment', 'weight', 'assessment_type']],
        on='id_assessment',
        how='left',
    )
    # Keep only assessments that carry weight and are not the final exam.
    keep = (scored['weight'] > 0) & (scored['assessment_type'] != 'Exam')
    # Pre-multiplying lets the groupby use the built-in 'sum' instead of a
    # per-group Python lambda that looks weights up in the outer frame.
    scored = scored[keep].assign(
        weighted_points=lambda frame: frame['score'] * frame['weight']
    )

    summary = scored.groupby(_STUDENT_KEYS).agg(
        mean_score=('score', 'mean'),
        max_score=('score', 'max'),
        min_score=('score', 'min'),
        n_assessments=('score', 'count'),
        weighted_score=('weighted_points', 'sum'),
    ).reset_index()
    summary['weighted_score'] = summary['weighted_score'] / 100
    return summary


def _aggregate_vle(student_vle):
    """Summarise VLE click activity per student enrolment."""
    summary = student_vle.groupby(_STUDENT_KEYS).agg(
        total_clicks=('sum_click', 'sum'),
        n_activities=('id_site', 'nunique'),
        days_active=('date', 'nunique'),
    ).reset_index()
    # Every group has at least one row, so days_active is never 0 here.
    summary['clicks_per_day'] = summary['total_clicks'] / summary['days_active']
    return summary


def prepare_data(set_dir, week=None, constant_dir=None):
    """Build the feature matrix and binary target for one data split.

    Reads ``student_info.csv``, ``student_assessment.csv``, ``student_reg.csv``
    and ``student_vle.csv`` from ``set_dir`` and ``assessments.csv`` from
    ``constant_dir``, aggregates assessment and VLE activity per
    (code_module, code_presentation, id_student) and left-joins everything
    onto the student_info rows.

    Parameters
    ----------
    set_dir : str or pathlib.Path
        Directory holding the split-specific CSV files.
    week : int or None, default None
        Cutoff week. When given, only VLE rows with ``date // 7 <= week`` and
        assessments with ``date_submitted // 7 <= week`` are used, and a
        ``days_since_registration`` feature (``week * 7 - date_registration``,
        clipped at 0) is added. When ``None``, no cutoff is applied and
        ``days_since_registration`` is omitted because there is no reference
        week to measure from. (Previously ``None`` made the function return
        ``None``, which broke ``X, y = prepare_data(...)``.)
    constant_dir : str or pathlib.Path or None, default None
        Directory holding ``assessments.csv``. Defaults to
        ``~/Desktop/Logistic Regression Skewness Fixed/constant``.

    Returns
    -------
    X : pandas.DataFrame
        One row per student_info row; identifier, target and equity-related
        columns are removed.
    y : pandas.Series
        1 where ``final_result`` is 'Fail' or 'Withdrawn', otherwise 0.

    Raises
    ------
    KeyError
        If any of the equity-related columns that are dropped
        ('disability_Y', 'age_band', 'imd_band', 'highest_education',
        'gender_M') is missing from the merged frame.
    """
    if constant_dir is None:
        constant_dir = Path.home() / "Desktop" / "Logistic Regression Skewness Fixed" / "constant"
    constant_dir = Path(constant_dir)
    set_dir = Path(set_dir)

    assessments = pd.read_csv(constant_dir / "assessments.csv")
    student_info = pd.read_csv(set_dir / "student_info.csv")
    student_assessment = pd.read_csv(set_dir / "student_assessment.csv")
    student_reg = pd.read_csv(set_dir / "student_reg.csv")
    student_vle = pd.read_csv(set_dir / "student_vle.csv")

    if week is not None:
        # Only use information that is available up to and including `week`.
        student_vle = student_vle[(student_vle['date'] // 7) <= week]
        student_assessment = student_assessment[(student_assessment['date_submitted'] // 7) <= week]

    student_reg['week_registered'] = student_reg['date_registration'] // 7
    if week is not None:
        # Students who register after the cutoff get 0 rather than a negative value.
        student_reg['days_since_registration'] = (
            (week * 7) - student_reg['date_registration']
        ).clip(lower=0)

    student_agg = _aggregate_assessments(student_assessment, assessments)
    vle_agg = _aggregate_vle(student_vle)

    df = student_info.merge(student_agg, on=_STUDENT_KEYS, how='left')
    df = df.merge(vle_agg, on=_STUDENT_KEYS, how='left')
    df = pd.merge(df, student_reg, on=_STUDENT_KEYS, how='left')

    # Diagnostic only: count rows with at least one missing value before imputation.
    n_nan_rows = int(df.isna().any(axis=1).sum())
    print(f"Found {n_nan_rows} rows with NaN values")

    df = df.fillna(value={
        # -1 marks "no assessments" for score-based features.
        'mean_score': -1,
        'max_score': -1,
        'min_score': -1,
        'weighted_score': -1,
        # 0 marks "no assessments completed" / "no VLE activity".
        'n_assessments': 0,
        'total_clicks': 0,
        'n_activities': 0,
        'clicks_per_day': 0,
        'days_active': 0,
    })

    # date_unregistration may be absent from the split, hence errors='ignore'.
    df = df.drop(
        columns=['date_unregistration', 'mean_score', 'max_score', 'min_score'],
        errors='ignore',
    )
    df = df.drop(columns=[
        'n_assessments',  # dropped because of multicollinearity
        # Equity-related features are deliberately excluded from the model.
        'disability_Y', 'age_band', 'imd_band', 'highest_education', 'gender_M',
    ])
    # Any region_* columns are intentionally kept.

    # Binary target: 1 = at risk (Fail or Withdrawn), 0 = otherwise.
    y = df['final_result'].isin(['Fail', 'Withdrawn']).astype(int)
    X = df.drop(columns=_STUDENT_KEYS + ['final_result'])

    return X, y

def evaluate_model(model, X_val, y_val, X_test, y_test):
    """Print classification metrics for the validation and test splits.

    For each split this prints the accuracy and the full classification
    report of ``model.predict``, followed by ROC AUC and PR AUC (average
    precision) computed from column 1 of ``model.predict_proba``.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` and ``predict_proba``.
    X_val, y_val : validation features and true labels.
    X_test, y_test : test features and true labels.

    Returns
    -------
    None. All results are written to stdout.
    """
    print("\n=== Model Performance ===")

    # Hard-label metrics, validation first and then test.
    for split_label, features, truth in (
        ("Validation", X_val, y_val),
        ("Test", X_test, y_test),
    ):
        predicted = model.predict(features)
        print(f"{split_label} Accuracy:", accuracy_score(truth, predicted))
        print(classification_report(truth, predicted))

    # Probability assigned to the class in column 1 of predict_proba.
    val_scores = model.predict_proba(X_val)[:, 1]
    test_scores = model.predict_proba(X_test)[:, 1]

    # NOTE: evaluation at a recall-driven custom threshold was sketched here
    # previously but is not part of this function's behaviour.

    # Threshold-free ranking metrics; all four are computed before any is printed.
    pr_auc = {
        "Validation": average_precision_score(y_val, val_scores),
        "Test": average_precision_score(y_test, test_scores),
    }
    roc_auc = {
        "Validation": roc_auc_score(y_val, val_scores),
        "Test": roc_auc_score(y_test, test_scores),
    }

    for split_label, value in roc_auc.items():
        print(f"{split_label} ROC AUC: {value:.4f}")
    for split_label, value in pr_auc.items():
        print(f"{split_label} PR AUC: {value:.4f}")

# Root folder containing the train/, val/ and test/ split directories.
data_dir = Path.home() / "Desktop" / "Logistic Regression Skewness Fixed" / "constant"

# Train and evaluate one model per cutoff week (weeks 1 through 16).
for week in range(1, 17):
    print(f"Performance for the WEEK {week}")

    # Features for every split are rebuilt using only data up to `week`.
    X_train, y_train = prepare_data(data_dir / "train", week)
    X_val, y_val = prepare_data(data_dir / "val", week)
    X_test, y_test = prepare_data(data_dir / "test", week)

    # Scaling statistics come from the training split only and are then
    # reused for validation and test.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    model = LogisticRegression(
        solver='lbfgs',
        penalty='l2',             # ridge-style regularization
        C=1.0,                    # inverse of the regularization strength
        class_weight='balanced',  # reweights classes to counter imbalance
        max_iter=100,             # iteration cap for the solver
        random_state=42,          # fixed seed for reproducibility
    )
    model.fit(X_train_scaled, y_train)

    # n_iter_ is an array, so this prints e.g. "[15]".
    print(f'Converged in {model.n_iter_} iterations')

    evaluate_model(model, X_val_scaled, y_val, X_test_scaled, y_test)

Performance for the WEEK 1
Found 18167 rows with NaN values
Found 6091 rows with NaN values
Found 6100 rows with NaN values
Converged in [15] iterations

=== Model Performance ===
Validation Accuracy: 0.676538284486727
              precision    recall  f1-score   support

           0       0.65      0.68      0.67      3076
           1       0.70      0.67      0.69      3441

    accuracy                           0.68      6517
   macro avg       0.68      0.68      0.68      6517
weighted avg       0.68      0.68      0.68      6517

Test Accuracy: 0.675866298681386
              precision    recall  f1-score   support

           0       0.65      0.70      0.67      3079
           1       0.71      0.66      0.68      3443

    accuracy                           0.68      6522
   macro avg       0.68      0.68      0.68      6522
weighted avg       0.68      0.68      0.68      6522

Validation ROC AUC: 0.7437
Test ROC AUC: 0.7430
Validation PR AUC: 0.7835
Test PR AUC: 0.7834
