In [3]:
from xgboost import XGBClassifier  
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, average_precision_score, precision_recall_curve
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap

def prepare_data(set_dir, week=None, assessments_path=None):
    """Build the feature matrix and binary target for one data split.

    Reads ``student_info.csv``, ``student_assessment.csv``,
    ``student_reg.csv`` and ``student_vle.csv`` from ``set_dir`` together
    with the shared assessments table, keeps only VLE activity and
    submissions whose ``day // 7`` is at most ``week``, and aggregates them
    to one row per (code_module, code_presentation, id_student).

    Args:
        set_dir: Directory (``str`` or ``Path``) holding the split's CSVs.
        week: Inclusive cutoff week index (``day // 7``). When ``None`` no
            features are built and ``None`` is returned.
        assessments_path: Location of ``assessments.csv``. Defaults to the
            copy under ``~/Desktop/Logistic Regression Skewness Fixed/constant``.

    Returns:
        ``(X, y)`` where ``X`` is the feature DataFrame and ``y`` is a
        Series that is 1 for ``final_result`` in {"Fail", "Withdrawn"} and
        0 otherwise; or ``None`` when ``week`` is ``None``.
    """
    if week is None:
        # NOTE: a "no cutoff" mode was never implemented. Return early so the
        # (large) CSVs are not loaded just to be thrown away.
        return None

    if assessments_path is None:
        assessments_path = (
            Path.home() / "Desktop" / "Logistic Regression Skewness Fixed"
            / "constant" / "assessments.csv"
        )
    set_dir = Path(set_dir)
    merge_keys = ['code_module', 'code_presentation', 'id_student']

    assessments = pd.read_csv(assessments_path)
    student_info = pd.read_csv(set_dir / "student_info.csv")
    student_assessment = pd.read_csv(set_dir / "student_assessment.csv")
    student_reg = pd.read_csv(set_dir / "student_reg.csv")
    student_vle = pd.read_csv(set_dir / "student_vle.csv")

    # Discard everything recorded after the cutoff week.
    student_vle = student_vle[(student_vle['date'] // 7) <= week]
    student_assessment = student_assessment[
        (student_assessment['date_submitted'] // 7) <= week
    ]

    # Registration timing relative to the cutoff; students who register after
    # the cutoff get 0 days rather than a negative value.
    student_reg = student_reg.assign(
        week_registered=student_reg['date_registration'] // 7,
        days_since_registration=(
            (week * 7) - student_reg['date_registration']
        ).clip(lower=0),
    )

    student_assessment = pd.merge(
        student_assessment,
        assessments[['id_assessment', 'weight', 'assessment_type']],
        on='id_assessment',
        how='left',
    )
    # Only weighted, non-exam assessments feed the score features.
    student_assessment = student_assessment[student_assessment['weight'] > 0]
    student_assessment = student_assessment[
        student_assessment['assessment_type'] != 'Exam'
    ]

    # Pre-compute score * weight once so the weighted total is a plain grouped
    # sum instead of a Python-level lambda executed for every student.
    student_assessment = student_assessment.assign(
        weighted_points=student_assessment['score'] * student_assessment['weight']
    )
    student_agg = student_assessment.groupby(merge_keys).agg(
        mean_score=('score', 'mean'),
        max_score=('score', 'max'),
        min_score=('score', 'min'),
        n_assessments=('score', 'count'),
        weighted_score=('weighted_points', 'sum'),
    ).reset_index()
    student_agg['weighted_score'] = student_agg['weighted_score'] / 100

    vle_agg = student_vle.groupby(merge_keys).agg(
        total_clicks=('sum_click', 'sum'),
        n_activities=('id_site', 'nunique'),
        days_active=('date', 'nunique'),
    ).reset_index()
    vle_agg['clicks_per_day'] = vle_agg['total_clicks'] / vle_agg['days_active']

    df = student_info.merge(student_agg, on=merge_keys, how='left')
    df = df.merge(vle_agg, on=merge_keys, how='left')
    df = df.merge(student_reg, on=merge_keys, how='left')

    # Diagnostic only: how many students are missing at least one value
    # before imputation.
    nan_rows = df[df.isna().any(axis=1)]
    print(f"Found {len(nan_rows)} rows with NaN values")

    # -1 marks "no assessments yet"; activity counters default to 0.
    score_cols = ['mean_score', 'max_score', 'min_score', 'weighted_score']
    df[score_cols] = df[score_cols].fillna(-1)
    zero_fill_cols = [
        'n_assessments', 'total_clicks', 'n_activities',
        'clicks_per_day', 'days_active',
    ]
    df[zero_fill_cols] = df[zero_fill_cols].fillna(0)

    df = df.drop(
        columns=['date_unregistration', 'mean_score', 'max_score', 'min_score'],
        errors='ignore',
    )
    df = df.drop(columns=['n_assessments'])  # because of multicollinearity
    # Dropping equity related features
    df = df.drop(
        columns=['disability_Y', 'age_band', 'imd_band', 'highest_education', 'gender_M']
    )

    # Binary target: 1 = at risk (Fail / Withdrawn), 0 = otherwise.
    y = df['final_result'].isin(['Fail', 'Withdrawn']).astype(int)
    X = df.drop(columns=merge_keys + ['final_result'])

    return X, y

def evaluate_model(model, X_val, y_val, X_test, y_test):
    """Print classification metrics for the validation and test splits.

    For each split this reports accuracy and the full classification report
    from ``model.predict``, then ROC AUC and PR AUC (average precision)
    computed from the positive-class column of ``model.predict_proba``.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_val, y_val: Validation features and labels.
        X_test, y_test: Test features and labels.

    Returns:
        None. All results are written to stdout.
    """
    print("\n=== Model Performance ===")
    splits = (("Validation", X_val, y_val), ("Test", X_test, y_test))

    # Hard-label metrics at the model's default decision threshold.
    for label, features, target in splits:
        predicted = model.predict(features)
        print(f"{label} Accuracy:", accuracy_score(target, predicted))
        print(classification_report(target, predicted))

    # Column 1 of predict_proba is the probability of the positive class (1).
    val_scores, test_scores = [
        model.predict_proba(features)[:, 1] for _, features, _ in splits
    ]

    # Threshold-free ranking metrics; everything is computed before printing.
    pr_auc = {
        "Validation": average_precision_score(y_val, val_scores),
        "Test": average_precision_score(y_test, test_scores),
    }
    roc_auc = {
        "Validation": roc_auc_score(y_val, val_scores),
        "Test": roc_auc_score(y_test, test_scores),
    }

    for metric_name, values in (("ROC AUC", roc_auc), ("PR AUC", pr_auc)):
        for label in ("Validation", "Test"):
            print(f"{label} {metric_name}: {values[label]:.4f}")

# Root folder containing the "train", "val" and "test" split directories.
data_dir = Path.home() / "Desktop" / "Logistic Regression Skewness Fixed" / "constant"

# Fit and evaluate an independent model for every cutoff week, so the printed
# metrics show how performance changes as more weeks of data become available.
for week in range(1, 17):
    print(f"Performance for the WEEK {week}")
    X_train, y_train = prepare_data(data_dir / "train", week)
    X_val, y_val = prepare_data(data_dir / "val", week)
    X_test, y_test = prepare_data(data_dir / "test", week)

    # Negative/positive ratio used to re-weight the positive class; fall back
    # to 1.0 (no re-weighting) if the training split has no positives.
    n_negative = int(np.sum(y_train == 0))
    n_positive = int(np.sum(y_train == 1))
    scale_pos_weight = n_negative / n_positive if n_positive else 1.0

    # Training
    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # reports it as an unused parameter on every fit.
    model = XGBClassifier(
        n_estimators=100,  # number of trees
        scale_pos_weight=scale_pos_weight,  # fixes class imbalance
        learning_rate=0.1,  # step size shrinkage used in update to prevent overfitting
        max_depth=6,  # default
        subsample=0.8,  # row sampling: each tree sees 80% of the rows
        colsample_bytree=0.8,  # column sampling: each tree sees 80% of the features
        random_state=42,  # for reproducibility
        eval_metric='logloss',  # for binary classification
        n_jobs=-1,  # parallelizes training across all CPU cores
    )

    model.fit(X_train, y_train)

    evaluate_model(model, X_val, y_val, X_test, y_test)

Performance for the WEEK 1
Found 18167 rows with NaN values
Found 6091 rows with NaN values
Found 6100 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.6837501918060457
              precision    recall  f1-score   support

           0       0.64      0.75      0.69      3076
           1       0.73      0.63      0.68      3441

    accuracy                           0.68      6517
   macro avg       0.69      0.69      0.68      6517
weighted avg       0.69      0.68      0.68      6517

Test Accuracy: 0.6867525298988041
              precision    recall  f1-score   support

           0       0.64      0.75      0.69      3079
           1       0.74      0.63      0.68      3443

    accuracy                           0.69      6522
   macro avg       0.69      0.69      0.69      6522
weighted avg       0.69      0.69      0.69      6522

Validation ROC AUC: 0.7558
Test ROC AUC: 0.7549
Validation PR AUC: 0.7958
Test PR AUC: 0.7940
Performance for the WEEK 2
Found 12711 rows with NaN values
Found 4238 rows with NaN values
Found 4319 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.7043117999079331
              precision    recall  f1-score   support

           0       0.66      0.76      0.71      3076
           1       0.75      0.65      0.70      3441

    accuracy                           0.70      6517
   macro avg       0.71      0.71      0.70      6517
weighted avg       0.71      0.70      0.70      6517

Test Accuracy: 0.6962588163140141
              precision    recall  f1-score   support

           0       0.65      0.76      0.70      3079
           1       0.75      0.64      0.69      3443

    accuracy                           0.70      6522
   macro avg       0.70      0.70      0.70      6522
weighted avg       0.70      0.70      0.70      6522

Validation ROC AUC: 0.7796
Test ROC AUC: 0.7759
Validation PR AUC: 0.8216
Test PR AUC: 0.8196
Performance for the WEEK 3
Found 8291 rows with NaN values
Found 2778 rows with NaN values
Found 2780 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.719042504219733
              precision    recall  f1-score   support

           0       0.67      0.79      0.73      3076
           1       0.78      0.65      0.71      3441

    accuracy                           0.72      6517
   macro avg       0.73      0.72      0.72      6517
weighted avg       0.73      0.72      0.72      6517

Test Accuracy: 0.7102115915363385
              precision    recall  f1-score   support

           0       0.66      0.79      0.72      3079
           1       0.77      0.64      0.70      3443

    accuracy                           0.71      6522
   macro avg       0.72      0.71      0.71      6522
weighted avg       0.72      0.71      0.71      6522

Validation ROC AUC: 0.7985
Test ROC AUC: 0.7958
Validation PR AUC: 0.8393
Test PR AUC: 0.8372
Performance for the WEEK 4
Found 6800 rows with NaN values
Found 2303 rows with NaN values
Found 2295 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.7259475218658892
              precision    recall  f1-score   support

           0       0.68      0.81      0.74      3076
           1       0.79      0.65      0.72      3441

    accuracy                           0.73      6517
   macro avg       0.73      0.73      0.73      6517
weighted avg       0.74      0.73      0.73      6517

Test Accuracy: 0.7230910763569457
              precision    recall  f1-score   support

           0       0.67      0.81      0.73      3079
           1       0.79      0.65      0.71      3443

    accuracy                           0.72      6522
   macro avg       0.73      0.73      0.72      6522
weighted avg       0.73      0.72      0.72      6522

Validation ROC AUC: 0.8076
Test ROC AUC: 0.8068
Validation PR AUC: 0.8467
Test PR AUC: 0.8467
Performance for the WEEK 5
Found 6578 rows with NaN values
Found 2231 rows with NaN values
Found 2222 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.7308577566364892
              precision    recall  f1-score   support

           0       0.69      0.79      0.73      3076
           1       0.78      0.68      0.73      3441

    accuracy                           0.73      6517
   macro avg       0.73      0.73      0.73      6517
weighted avg       0.74      0.73      0.73      6517

Test Accuracy: 0.735510579576817
              precision    recall  f1-score   support

           0       0.69      0.80      0.74      3079
           1       0.79      0.68      0.73      3443

    accuracy                           0.74      6522
   macro avg       0.74      0.74      0.74      6522
weighted avg       0.74      0.74      0.74      6522

Validation ROC AUC: 0.8173
Test ROC AUC: 0.8145
Validation PR AUC: 0.8530
Test PR AUC: 0.8524
Performance for the WEEK 6
Found 6390 rows with NaN values
Found 2166 rows with NaN values
Found 2172 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.7489642473530765
              precision    recall  f1-score   support

           0       0.71      0.80      0.75      3076
           1       0.80      0.71      0.75      3441

    accuracy                           0.75      6517
   macro avg       0.75      0.75      0.75      6517
weighted avg       0.75      0.75      0.75      6517

Test Accuracy: 0.7557497700091996
              precision    recall  f1-score   support

           0       0.71      0.81      0.76      3079
           1       0.81      0.71      0.75      3443

    accuracy                           0.76      6522
   macro avg       0.76      0.76      0.76      6522
weighted avg       0.76      0.76      0.76      6522

Validation ROC AUC: 0.8294
Test ROC AUC: 0.8327
Validation PR AUC: 0.8650
Test PR AUC: 0.8667
Performance for the WEEK 7
Found 5617 rows with NaN values
Found 1914 rows with NaN values
Found 1909 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.7623139481356452
              precision    recall  f1-score   support

           0       0.71      0.83      0.77      3076
           1       0.82      0.71      0.76      3441

    accuracy                           0.76      6517
   macro avg       0.77      0.77      0.76      6517
weighted avg       0.77      0.76      0.76      6517

Test Accuracy: 0.7606562404170499
              precision    recall  f1-score   support

           0       0.71      0.83      0.77      3079
           1       0.82      0.70      0.76      3443

    accuracy                           0.76      6522
   macro avg       0.77      0.76      0.76      6522
weighted avg       0.77      0.76      0.76      6522

Validation ROC AUC: 0.8439
Test ROC AUC: 0.8416
Validation PR AUC: 0.8808
Test PR AUC: 0.8789
Performance for the WEEK 8
Found 5547 rows with NaN values
Found 1891 rows with NaN values
Found 1887 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.7647690655209453
              precision    recall  f1-score   support

           0       0.72      0.83      0.77      3076
           1       0.82      0.71      0.76      3441

    accuracy                           0.76      6517
   macro avg       0.77      0.77      0.76      6517
weighted avg       0.77      0.76      0.76      6517

Test Accuracy: 0.7631094756209752
              precision    recall  f1-score   support

           0       0.71      0.83      0.77      3079
           1       0.82      0.70      0.76      3443

    accuracy                           0.76      6522
   macro avg       0.77      0.77      0.76      6522
weighted avg       0.77      0.76      0.76      6522

Validation ROC AUC: 0.8490
Test ROC AUC: 0.8474
Validation PR AUC: 0.8847
Test PR AUC: 0.8836
Performance for the WEEK 9
Found 5500 rows with NaN values
Found 1878 rows with NaN values
Found 1874 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.7791928801595827
              precision    recall  f1-score   support

           0       0.74      0.83      0.78      3076
           1       0.83      0.73      0.78      3441

    accuracy                           0.78      6517
   macro avg       0.78      0.78      0.78      6517
weighted avg       0.78      0.78      0.78      6517

Test Accuracy: 0.7793621588469795
              precision    recall  f1-score   support

           0       0.73      0.83      0.78      3079
           1       0.83      0.73      0.78      3443

    accuracy                           0.78      6522
   macro avg       0.78      0.78      0.78      6522
weighted avg       0.79      0.78      0.78      6522

Validation ROC AUC: 0.8596
Test ROC AUC: 0.8622
Validation PR AUC: 0.8939
Test PR AUC: 0.8945
Performance for the WEEK 10
Found 5487 rows with NaN values
Found 1874 rows with NaN values
Found 1869 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.7804204388522327
              precision    recall  f1-score   support

           0       0.74      0.83      0.78      3076
           1       0.83      0.74      0.78      3441

    accuracy                           0.78      6517
   macro avg       0.78      0.78      0.78      6517
weighted avg       0.79      0.78      0.78      6517

Test Accuracy: 0.7779822140447715
              precision    recall  f1-score   support

           0       0.73      0.83      0.78      3079
           1       0.83      0.73      0.78      3443

    accuracy                           0.78      6522
   macro avg       0.78      0.78      0.78      6522
weighted avg       0.78      0.78      0.78      6522

Validation ROC AUC: 0.8635
Test ROC AUC: 0.8637
Validation PR AUC: 0.8970
Test PR AUC: 0.8958
Performance for the WEEK 11
Found 5483 rows with NaN values
Found 1874 rows with NaN values
Found 1869 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.7836427804204389
              precision    recall  f1-score   support

           0       0.74      0.83      0.78      3076
           1       0.83      0.74      0.78      3441

    accuracy                           0.78      6517
   macro avg       0.79      0.79      0.78      6517
weighted avg       0.79      0.78      0.78      6517

Test Accuracy: 0.7833486660533578
              precision    recall  f1-score   support

           0       0.74      0.84      0.78      3079
           1       0.83      0.74      0.78      3443

    accuracy                           0.78      6522
   macro avg       0.79      0.79      0.78      6522
weighted avg       0.79      0.78      0.78      6522

Validation ROC AUC: 0.8640
Test ROC AUC: 0.8644
Validation PR AUC: 0.8984
Test PR AUC: 0.8966
Performance for the WEEK 12
Found 5474 rows with NaN values
Found 1873 rows with NaN values
Found 1865 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.7925425809421512
              precision    recall  f1-score   support

           0       0.75      0.84      0.79      3076
           1       0.84      0.75      0.79      3441

    accuracy                           0.79      6517
   macro avg       0.80      0.80      0.79      6517
weighted avg       0.80      0.79      0.79      6517

Test Accuracy: 0.7902483900643974
              precision    recall  f1-score   support

           0       0.75      0.84      0.79      3079
           1       0.84      0.74      0.79      3443

    accuracy                           0.79      6522
   macro avg       0.79      0.79      0.79      6522
weighted avg       0.80      0.79      0.79      6522

Validation ROC AUC: 0.8703
Test ROC AUC: 0.8679
Validation PR AUC: 0.9020
Test PR AUC: 0.8996
Performance for the WEEK 13
Found 5464 rows with NaN values
Found 1870 rows with NaN values
Found 1863 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.8072732852539513
              precision    recall  f1-score   support

           0       0.78      0.83      0.80      3076
           1       0.84      0.79      0.81      3441

    accuracy                           0.81      6517
   macro avg       0.81      0.81      0.81      6517
weighted avg       0.81      0.81      0.81      6517

Test Accuracy: 0.8020545844832874
              precision    recall  f1-score   support

           0       0.77      0.83      0.80      3079
           1       0.84      0.77      0.80      3443

    accuracy                           0.80      6522
   macro avg       0.80      0.80      0.80      6522
weighted avg       0.80      0.80      0.80      6522

Validation ROC AUC: 0.8856
Test ROC AUC: 0.8827
Validation PR AUC: 0.9119
Test PR AUC: 0.9092
Performance for the WEEK 14
Found 5460 rows with NaN values
Found 1869 rows with NaN values
Found 1861 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.8170937547951511
              precision    recall  f1-score   support

           0       0.78      0.84      0.81      3076
           1       0.85      0.79      0.82      3441

    accuracy                           0.82      6517
   macro avg       0.82      0.82      0.82      6517
weighted avg       0.82      0.82      0.82      6517

Test Accuracy: 0.8066544004906471
              precision    recall  f1-score   support

           0       0.77      0.84      0.80      3079
           1       0.84      0.78      0.81      3443

    accuracy                           0.81      6522
   macro avg       0.81      0.81      0.81      6522
weighted avg       0.81      0.81      0.81      6522

Validation ROC AUC: 0.8914
Test ROC AUC: 0.8883
Validation PR AUC: 0.9167
Test PR AUC: 0.9140
Performance for the WEEK 15
Found 5449 rows with NaN values
Found 1867 rows with NaN values
Found 1857 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.8140248580635262
              precision    recall  f1-score   support

           0       0.77      0.86      0.81      3076
           1       0.86      0.77      0.81      3441

    accuracy                           0.81      6517
   macro avg       0.82      0.82      0.81      6517
weighted avg       0.82      0.81      0.81      6517

Test Accuracy: 0.8155473781048758
              precision    recall  f1-score   support

           0       0.77      0.86      0.82      3079
           1       0.86      0.77      0.82      3443

    accuracy                           0.82      6522
   macro avg       0.82      0.82      0.82      6522
weighted avg       0.82      0.82      0.82      6522

Validation ROC AUC: 0.8974
Test ROC AUC: 0.8972
Validation PR AUC: 0.9225
Test PR AUC: 0.9221
Performance for the WEEK 16
Found 5446 rows with NaN values
Found 1865 rows with NaN values
Found 1856 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.8223108792389137
              precision    recall  f1-score   support

           0       0.78      0.87      0.82      3076
           1       0.87      0.78      0.82      3441

    accuracy                           0.82      6517
   macro avg       0.82      0.82      0.82      6517
weighted avg       0.83      0.82      0.82      6517

Test Accuracy: 0.8206071757129715
              precision    recall  f1-score   support

           0       0.78      0.86      0.82      3079
           1       0.87      0.78      0.82      3443

    accuracy                           0.82      6522
   macro avg       0.82      0.82      0.82      6522
weighted avg       0.83      0.82      0.82      6522

Validation ROC AUC: 0.9010
Test ROC AUC: 0.9025
Validation PR AUC: 0.9257
Test PR AUC: 0.9263
