In [7]:
from xgboost import XGBClassifier  
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, average_precision_score, precision_recall_curve
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap

def prepare_data(set_dir, week=None):
    data_dir = Path.home() / "Desktop" / "Logistic Regression Skewness Fixed" / "constant"
    assessments = pd.read_csv(f"{data_dir}/assessments.csv")
    student_info = pd.read_csv(f"{set_dir}/student_info.csv")
    student_assessment = pd.read_csv(f"{set_dir}/student_assessment.csv")
    student_reg = pd.read_csv(f"{set_dir}/student_reg.csv")
    student_vle = pd.read_csv(f"{set_dir}/student_vle.csv")

    if week is not None:
        student_vle = student_vle[(student_vle['date'] // 7) <= week]
        student_assessment = student_assessment[(student_assessment['date_submitted'] // 7) <= week]

        student_reg['week_registered'] = student_reg['date_registration'] // 7
        student_reg['days_since_registration'] = (week * 7) - student_reg['date_registration']
        student_reg['days_since_registration'] = student_reg['days_since_registration'].clip(lower=0) # if any value in the days_since_registration column is less than 0, it is set to 0

        student_assessment = pd.merge(
            student_assessment, 
            assessments[['id_assessment', 'weight', 'assessment_type']], 
            on='id_assessment', 
            how='left'
        )
    
        student_assessment = student_assessment[student_assessment['weight'] > 0] 
        student_assessment = student_assessment[student_assessment['assessment_type'] != 'Exam']
    
        # Aggregate assessments per student
        student_agg = student_assessment.groupby(
            ['code_module', 'code_presentation', 'id_student']
        ).agg(
            mean_score=('score', 'mean'),
            max_score=('score', 'max'),
            min_score=('score', 'min'),
            n_assessments=('score', 'count'),
            weighted_score=('score', lambda x: (x * student_assessment.loc[x.index, 'weight']).sum() / 100)
        ).reset_index()
    
        vle_agg = student_vle.groupby(['code_module', 'code_presentation', 'id_student']).agg(
        total_clicks=('sum_click', 'sum'),
        n_activities=('id_site', 'nunique'),
        days_active=('date', 'nunique')
        ).reset_index()

        vle_agg['clicks_per_day'] = vle_agg['total_clicks'] / vle_agg['days_active']
        
        merge_keys = ['code_module', 'code_presentation', 'id_student']
        df = student_info.merge(student_agg, on=merge_keys, how='left')
        df = df.merge(vle_agg, on=merge_keys, how='left')
        df = pd.merge(df, student_reg, on=merge_keys, how='left')
    
        nan_rows = df[df.isna().any(axis=1)]
        print(f"Found {len(nan_rows)} rows with NaN values")
        # print(nan_rows.head())
    
        assessment_cols = ['mean_score', 'max_score', 'min_score', 'weighted_score']
        df[assessment_cols] = df[assessment_cols].fillna(-1)  # -1 indicates no assessments
        df['n_assessments'] = df['n_assessments'].fillna(0)   # 0 assessments completed
        
        df['total_clicks'] = df['total_clicks'].fillna(0)
        df['n_activities'] = df['n_activities'].fillna(0)
        df['clicks_per_day'] = df['clicks_per_day'].fillna(0)
        df['days_active'] = df['days_active'].fillna(0)
    
        df = df.drop(columns=['date_unregistration', 'mean_score', 'max_score', 'min_score'], errors='ignore')
    
        df = df.drop(columns=['n_assessments']) # because of multicollinearity
        # Dropping equity related  features
        df = df.drop(columns=['disability_Y', 'age_band', 'imd_band', 'highest_education', 'gender_M'])
        # Dropping regions
        df = df.drop(columns=[reg for reg in df.columns if reg.startswith('region_')])
        
        y = df['final_result'].apply(lambda x: 1 if x in ['Fail', 'Withdrawn'] else 0)  # binary target
    
        X = df.drop(columns=['code_module', 'code_presentation', 'id_student', 'final_result'])
        
        return X, y
    else:
        pass

def evaluate_model(model, X_val, y_val, X_test, y_test):
    print("\n=== Model Performance ===")
    y_val_pred = model.predict(X_val)
    print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
    print(classification_report(y_val, y_val_pred))
    
    y_test_pred = model.predict(X_test)
    print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
    print(classification_report(y_test, y_test_pred))
    
    # Gets predicted probabilities for the positive class (1)
    y_val_probs = model.predict_proba(X_val)[:, 1]
    y_test_probs = model.predict_proba(X_test)[:, 1]
    # Getting the best threshold from our function
    # threshold = plot_precision_recall_threshold_curve(y_val, y_val_probs, desired_recall=0.9)

    # if threshold is not None:
    #    # Converting the predicted probabilities on the validation set into binary predictions using the chosen threshold.
    #    y_val_pred_thresh = (y_val_probs >= threshold).astype(int)
    #    print(f"\n=== Evaluation at threshold {threshold:.3f} ===")
    #    print("Validation Accuracy:", accuracy_score(y_val, y_val_pred_thresh))
    #    print(classification_report(y_val, y_val_pred_thresh))
    
    #    y_test_pred_thresh = (y_test_probs >= threshold).astype(int)
    #    print("Test Accuracy:", accuracy_score(y_test, y_test_pred_thresh))
    #    print(classification_report(y_test, y_test_pred_thresh))
        
    # Calculates AUC
    val_pr_auc = average_precision_score(y_val, y_val_probs)
    test_pr_auc = average_precision_score(y_test, y_test_probs)
    val_roc_auc = roc_auc_score(y_val, y_val_probs)
    test_roc_auc = roc_auc_score(y_test, y_test_probs)
    
    print(f"Validation ROC AUC: {val_roc_auc:.4f}")
    print(f"Test ROC AUC: {test_roc_auc:.4f}")
    print(f"Validation PR AUC: {val_pr_auc:.4f}")
    print(f"Test PR AUC: {test_pr_auc:.4f}")

data_dir = Path.home() / "Desktop" / "Logistic Regression Skewness Fixed" / "constant"

for week in range(1, 17):   
    print(f"Performance for the WEEK {week}")
    X_train, y_train = prepare_data(data_dir / "train", week)
    X_val, y_val = prepare_data(data_dir / "val", week)
    X_test, y_test = prepare_data(data_dir / "test", week)
    
    # Training
    model = XGBClassifier(
        n_estimators=100, # number of trees
        scale_pos_weight=np.sum(y_train == 0) / np.sum(y_train == 1), # Fixes class imbalance
        learning_rate=0.1, # Step size shrinkage used in update to prevent overfitting
        max_depth=6,  # default
        subsample=0.8, # Prevents overfitting by using only 80% of data per tree (row sampling)
        colsample_bytree=0.8, # Uses 80% of features per tree (column sampling) — adds regularization
        random_state=42, # For reproducability
        use_label_encoder=False, # to suppress warning
        eval_metric='logloss', # for binary classification
        n_jobs=-1 # Parallelizes training across all CPU cores
    )
    
    model.fit(X_train, y_train)
    
    evaluate_model(model, X_val, y_val, X_test, y_test)

Performance for the WEEK 1
Found 18167 rows with NaN values
Found 6091 rows with NaN values
Found 6100 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.6823691882768145
              precision    recall  f1-score   support

           0       0.64      0.75      0.69      3076
           1       0.73      0.63      0.68      3441

    accuracy                           0.68      6517
   macro avg       0.69      0.69      0.68      6517
weighted avg       0.69      0.68      0.68      6517

Test Accuracy: 0.6850659306961054
              precision    recall  f1-score   support

           0       0.64      0.75      0.69      3079
           1       0.74      0.62      0.68      3443

    accuracy                           0.69      6522
   macro avg       0.69      0.69      0.68      6522
weighted avg       0.69      0.69      0.68      6522

Validation ROC AUC: 0.7544
Test ROC AUC: 0.7522
Validation PR AUC: 0.7932
Test PR AUC: 0.7937
Performance for the WEEK 2
Found 12711 rows with NaN values
Found 4238 rows with NaN values
Found 4319 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.7027773515421206
              precision    recall  f1-score   support

           0       0.66      0.76      0.71      3076
           1       0.75      0.65      0.70      3441

    accuracy                           0.70      6517
   macro avg       0.71      0.71      0.70      6517
weighted avg       0.71      0.70      0.70      6517

Test Accuracy: 0.6994786875191659
              precision    recall  f1-score   support

           0       0.66      0.77      0.71      3079
           1       0.75      0.64      0.69      3443

    accuracy                           0.70      6522
   macro avg       0.70      0.70      0.70      6522
weighted avg       0.71      0.70      0.70      6522

Validation ROC AUC: 0.7776
Test ROC AUC: 0.7731
Validation PR AUC: 0.8199
Test PR AUC: 0.8175
Performance for the WEEK 3
Found 8291 rows with NaN values
Found 2778 rows with NaN values
Found 2780 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.7173546110173392
              precision    recall  f1-score   support

           0       0.67      0.79      0.73      3076
           1       0.78      0.65      0.71      3441

    accuracy                           0.72      6517
   macro avg       0.72      0.72      0.72      6517
weighted avg       0.73      0.72      0.72      6517

Test Accuracy: 0.7126648267402638
              precision    recall  f1-score   support

           0       0.66      0.80      0.72      3079
           1       0.78      0.64      0.70      3443

    accuracy                           0.71      6522
   macro avg       0.72      0.72      0.71      6522
weighted avg       0.72      0.71      0.71      6522

Validation ROC AUC: 0.7983
Test ROC AUC: 0.7925
Validation PR AUC: 0.8392
Test PR AUC: 0.8359
Performance for the WEEK 4
Found 6800 rows with NaN values
Found 2303 rows with NaN values
Found 2295 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.725794077029308
              precision    recall  f1-score   support

           0       0.68      0.81      0.74      3076
           1       0.79      0.65      0.72      3441

    accuracy                           0.73      6517
   macro avg       0.73      0.73      0.73      6517
weighted avg       0.74      0.73      0.72      6517

Test Accuracy: 0.7246243483593989
              precision    recall  f1-score   support

           0       0.67      0.81      0.74      3079
           1       0.79      0.65      0.71      3443

    accuracy                           0.72      6522
   macro avg       0.73      0.73      0.72      6522
weighted avg       0.74      0.72      0.72      6522

Validation ROC AUC: 0.8073
Test ROC AUC: 0.8054
Validation PR AUC: 0.8453
Test PR AUC: 0.8459
Performance for the WEEK 5
Found 6578 rows with NaN values
Found 2231 rows with NaN values
Found 2222 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.7317784256559767
              precision    recall  f1-score   support

           0       0.69      0.80      0.74      3076
           1       0.79      0.68      0.73      3441

    accuracy                           0.73      6517
   macro avg       0.74      0.74      0.73      6517
weighted avg       0.74      0.73      0.73      6517

Test Accuracy: 0.7283042011652867
              precision    recall  f1-score   support

           0       0.68      0.80      0.73      3079
           1       0.79      0.67      0.72      3443

    accuracy                           0.73      6522
   macro avg       0.73      0.73      0.73      6522
weighted avg       0.74      0.73      0.73      6522

Validation ROC AUC: 0.8167
Test ROC AUC: 0.8107
Validation PR AUC: 0.8527
Test PR AUC: 0.8500
Performance for the WEEK 6
Found 6390 rows with NaN values
Found 2166 rows with NaN values
Found 2172 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.749884916372564
              precision    recall  f1-score   support

           0       0.71      0.80      0.75      3076
           1       0.80      0.70      0.75      3441

    accuracy                           0.75      6517
   macro avg       0.75      0.75      0.75      6517
weighted avg       0.76      0.75      0.75      6517

Test Accuracy: 0.7513032812020852
              precision    recall  f1-score   support

           0       0.71      0.81      0.75      3079
           1       0.80      0.70      0.75      3443

    accuracy                           0.75      6522
   macro avg       0.75      0.75      0.75      6522
weighted avg       0.76      0.75      0.75      6522

Validation ROC AUC: 0.8278
Test ROC AUC: 0.8323
Validation PR AUC: 0.8637
Test PR AUC: 0.8662
Performance for the WEEK 7
Found 5617 rows with NaN values
Found 1914 rows with NaN values
Found 1909 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.761546723952739
              precision    recall  f1-score   support

           0       0.71      0.82      0.77      3076
           1       0.82      0.71      0.76      3441

    accuracy                           0.76      6517
   macro avg       0.77      0.76      0.76      6517
weighted avg       0.77      0.76      0.76      6517

Test Accuracy: 0.7606562404170499
              precision    recall  f1-score   support

           0       0.71      0.83      0.77      3079
           1       0.82      0.70      0.76      3443

    accuracy                           0.76      6522
   macro avg       0.77      0.76      0.76      6522
weighted avg       0.77      0.76      0.76      6522

Validation ROC AUC: 0.8439
Test ROC AUC: 0.8402
Validation PR AUC: 0.8807
Test PR AUC: 0.8783
Performance for the WEEK 8
Found 5547 rows with NaN values
Found 1891 rows with NaN values
Found 1887 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.7636949516648764
              precision    recall  f1-score   support

           0       0.72      0.83      0.77      3076
           1       0.82      0.71      0.76      3441

    accuracy                           0.76      6517
   macro avg       0.77      0.77      0.76      6517
weighted avg       0.77      0.76      0.76      6517

Test Accuracy: 0.7651027292241643
              precision    recall  f1-score   support

           0       0.72      0.83      0.77      3079
           1       0.82      0.71      0.76      3443

    accuracy                           0.77      6522
   macro avg       0.77      0.77      0.77      6522
weighted avg       0.77      0.77      0.76      6522

Validation ROC AUC: 0.8499
Test ROC AUC: 0.8466
Validation PR AUC: 0.8856
Test PR AUC: 0.8834
Performance for the WEEK 9
Found 5500 rows with NaN values
Found 1878 rows with NaN values
Found 1874 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.7762774282645389
              precision    recall  f1-score   support

           0       0.73      0.83      0.78      3076
           1       0.83      0.73      0.78      3441

    accuracy                           0.78      6517
   macro avg       0.78      0.78      0.78      6517
weighted avg       0.78      0.78      0.78      6517

Test Accuracy: 0.7764489420423183
              precision    recall  f1-score   support

           0       0.73      0.83      0.78      3079
           1       0.83      0.73      0.77      3443

    accuracy                           0.78      6522
   macro avg       0.78      0.78      0.78      6522
weighted avg       0.78      0.78      0.78      6522

Validation ROC AUC: 0.8589
Test ROC AUC: 0.8618
Validation PR AUC: 0.8934
Test PR AUC: 0.8943
Performance for the WEEK 10
Found 5487 rows with NaN values
Found 1874 rows with NaN values
Found 1869 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.7811876630351389
              precision    recall  f1-score   support

           0       0.74      0.83      0.78      3076
           1       0.83      0.74      0.78      3441

    accuracy                           0.78      6517
   macro avg       0.78      0.78      0.78      6517
weighted avg       0.79      0.78      0.78      6517

Test Accuracy: 0.7781355412450168
              precision    recall  f1-score   support

           0       0.73      0.83      0.78      3079
           1       0.83      0.73      0.78      3443

    accuracy                           0.78      6522
   macro avg       0.78      0.78      0.78      6522
weighted avg       0.78      0.78      0.78      6522

Validation ROC AUC: 0.8632
Test ROC AUC: 0.8652
Validation PR AUC: 0.8970
Test PR AUC: 0.8964
Performance for the WEEK 11
Found 5483 rows with NaN values
Found 1874 rows with NaN values
Found 1869 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.7825686665643701
              precision    recall  f1-score   support

           0       0.74      0.83      0.78      3076
           1       0.83      0.74      0.78      3441

    accuracy                           0.78      6517
   macro avg       0.79      0.79      0.78      6517
weighted avg       0.79      0.78      0.78      6517

Test Accuracy: 0.7790555044464889
              precision    recall  f1-score   support

           0       0.73      0.84      0.78      3079
           1       0.83      0.73      0.78      3443

    accuracy                           0.78      6522
   macro avg       0.78      0.78      0.78      6522
weighted avg       0.79      0.78      0.78      6522

Validation ROC AUC: 0.8647
Test ROC AUC: 0.8639
Validation PR AUC: 0.8987
Test PR AUC: 0.8965
Performance for the WEEK 12
Found 5474 rows with NaN values
Found 1873 rows with NaN values
Found 1865 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.7891667945373638
              precision    recall  f1-score   support

           0       0.75      0.84      0.79      3076
           1       0.84      0.75      0.79      3441

    accuracy                           0.79      6517
   macro avg       0.79      0.79      0.79      6517
weighted avg       0.79      0.79      0.79      6517

Test Accuracy: 0.7888684452621895
              precision    recall  f1-score   support

           0       0.74      0.85      0.79      3079
           1       0.84      0.74      0.79      3443

    accuracy                           0.79      6522
   macro avg       0.79      0.79      0.79      6522
weighted avg       0.80      0.79      0.79      6522

Validation ROC AUC: 0.8694
Test ROC AUC: 0.8675
Validation PR AUC: 0.9013
Test PR AUC: 0.8994
Performance for the WEEK 13
Found 5464 rows with NaN values
Found 1870 rows with NaN values
Found 1863 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.8031302746662575
              precision    recall  f1-score   support

           0       0.77      0.83      0.80      3076
           1       0.84      0.78      0.81      3441

    accuracy                           0.80      6517
   macro avg       0.80      0.80      0.80      6517
weighted avg       0.81      0.80      0.80      6517

Test Accuracy: 0.7994480220791168
              precision    recall  f1-score   support

           0       0.76      0.83      0.80      3079
           1       0.84      0.77      0.80      3443

    accuracy                           0.80      6522
   macro avg       0.80      0.80      0.80      6522
weighted avg       0.80      0.80      0.80      6522

Validation ROC AUC: 0.8852
Test ROC AUC: 0.8826
Validation PR AUC: 0.9115
Test PR AUC: 0.9093
Performance for the WEEK 14
Found 5460 rows with NaN values
Found 1869 rows with NaN values
Found 1861 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.8149455270830137
              precision    recall  f1-score   support

           0       0.78      0.84      0.81      3076
           1       0.85      0.79      0.82      3441

    accuracy                           0.81      6517
   macro avg       0.82      0.82      0.81      6517
weighted avg       0.82      0.81      0.82      6517

Test Accuracy: 0.8052744556884391
              precision    recall  f1-score   support

           0       0.77      0.83      0.80      3079
           1       0.84      0.78      0.81      3443

    accuracy                           0.81      6522
   macro avg       0.81      0.81      0.81      6522
weighted avg       0.81      0.81      0.81      6522

Validation ROC AUC: 0.8913
Test ROC AUC: 0.8870
Validation PR AUC: 0.9163
Test PR AUC: 0.9130
Performance for the WEEK 15
Found 5449 rows with NaN values
Found 1867 rows with NaN values
Found 1857 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.8146386374098512
              precision    recall  f1-score   support

           0       0.77      0.86      0.81      3076
           1       0.86      0.77      0.82      3441

    accuracy                           0.81      6517
   macro avg       0.82      0.82      0.81      6517
weighted avg       0.82      0.81      0.81      6517

Test Accuracy: 0.8138607789021772
              precision    recall  f1-score   support

           0       0.77      0.86      0.81      3079
           1       0.86      0.77      0.81      3443

    accuracy                           0.81      6522
   macro avg       0.82      0.82      0.81      6522
weighted avg       0.82      0.81      0.81      6522

Validation ROC AUC: 0.8985
Test ROC AUC: 0.8957
Validation PR AUC: 0.9230
Test PR AUC: 0.9210
Performance for the WEEK 16
Found 5446 rows with NaN values
Found 1865 rows with NaN values
Found 1856 rows with NaN values


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Performance ===
Validation Accuracy: 0.8206229860365198
              precision    recall  f1-score   support

           0       0.78      0.87      0.82      3076
           1       0.87      0.78      0.82      3441

    accuracy                           0.82      6517
   macro avg       0.82      0.82      0.82      6517
weighted avg       0.83      0.82      0.82      6517

Test Accuracy: 0.8213738117141981
              precision    recall  f1-score   support

           0       0.78      0.87      0.82      3079
           1       0.87      0.78      0.82      3443

    accuracy                           0.82      6522
   macro avg       0.82      0.82      0.82      6522
weighted avg       0.83      0.82      0.82      6522

Validation ROC AUC: 0.9019
Test ROC AUC: 0.9023
Validation PR AUC: 0.9264
Test PR AUC: 0.9260
