In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier  
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, average_precision_score, precision_recall_curve
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_selection import SelectKBest, f_classif
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap

def prepare_data(set_dir, constant_dir=None):
    """Build the feature frame and binary target for one data split.

    Reads ``student_info.csv``, ``student_assessment.csv``, ``student_reg.csv``
    and ``student_vle.csv`` from ``set_dir`` and ``assessments.csv`` from
    ``constant_dir``, aggregates assessments and VLE activity per
    (module, presentation, student), and joins everything onto the
    ``student_info`` rows with left merges.

    Args:
        set_dir: Folder (``str`` or ``Path``) holding the split-specific CSVs.
        constant_dir: Folder holding ``assessments.csv``. Defaults to the
            notebook's original hard-coded location under the user's home.

    Returns:
        ``(X, y)`` where ``X`` is the merged frame without the key columns,
        ``final_result``, ``date_unregistration``, the score summary columns
        and ``n_assessments``; ``y`` is 1 for ``final_result`` in
        ``{'Fail', 'Withdrawn'}`` and 0 otherwise.
    """
    if constant_dir is None:
        constant_dir = (
            Path.home() / "OneDrive" / "Desktop"
            / "Logistic Regression Skewness Fixed" / "constant"
        )
    constant_dir = Path(constant_dir)
    set_dir = Path(set_dir)
    merge_keys = ['code_module', 'code_presentation', 'id_student']

    assessments = pd.read_csv(constant_dir / "assessments.csv")
    student_info = pd.read_csv(set_dir / "student_info.csv")
    student_assessment = pd.read_csv(set_dir / "student_assessment.csv")
    student_reg = pd.read_csv(set_dir / "student_reg.csv")
    student_vle = pd.read_csv(set_dir / "student_vle.csv")

    # Attach each submission's weight and type from the assessment catalogue.
    student_assessment = pd.merge(
        student_assessment,
        assessments[['id_assessment', 'weight', 'assessment_type']],
        on='id_assessment',
        how='left'
    )

    # Keep only submissions that carry weight; rows whose assessment was not
    # found in the catalogue (NaN weight) are dropped by this comparison too.
    student_assessment = student_assessment[student_assessment['weight'] > 0].copy()
  # student_assessment = student_assessment[student_assessment['assessment_type'] != 'Exam']

    # Precompute score * weight once so the per-student total is a plain
    # vectorised sum rather than a Python lambda that looks weights up by index.
    student_assessment['_weighted_points'] = (
        student_assessment['score'] * student_assessment['weight']
    )

    # Aggregate assessments per student
    student_agg = student_assessment.groupby(merge_keys).agg(
        mean_score=('score', 'mean'),
        max_score=('score', 'max'),
        min_score=('score', 'min'),
        n_assessments=('score', 'count'),
        weighted_score=('_weighted_points', 'sum')
    ).reset_index()
    student_agg['weighted_score'] = student_agg['weighted_score'] / 100

    # Aggregate VLE activity per student: total clicks and distinct sites.
    vle_agg = student_vle.groupby(merge_keys).agg(
        total_clicks=('sum_click', 'sum'),
        n_activities=('id_site', 'nunique')
    ).reset_index()

    # Left merges keep every student_info row even without assessments/VLE rows.
    df = student_info.merge(student_agg, on=merge_keys, how='left')
    df = df.merge(vle_agg, on=merge_keys, how='left')
    df = pd.merge(df, student_reg, on=merge_keys, how='left')

    nan_rows = df[df.isna().any(axis=1)]
    print(f"Found {len(nan_rows)} rows with NaN values")
    # print(nan_rows.head())

    assessment_cols = ['mean_score', 'max_score', 'min_score', 'weighted_score']
    df[assessment_cols] = df[assessment_cols].fillna(-1)  # -1 indicates no assessments
    df['n_assessments'] = df['n_assessments'].fillna(0)   # 0 assessments completed

    # Students without any VLE rows get zero activity.
    df['total_clicks'] = df['total_clicks'].fillna(0)
    df['n_activities'] = df['n_activities'].fillna(0)

    df = df.drop(columns=['date_unregistration', 'mean_score', 'max_score', 'min_score'], errors='ignore')

    df = df.drop(columns=['n_assessments'])  # because of multicollinearity
    # Dropping equity related  features
  # df = df.drop(columns=['disability_Y', 'age_band', 'imd_band', 'highest_education', 'gender_M'])
    # Dropping regions
  # df = df.drop(columns=[reg for reg in df.columns if reg.startswith('region_')])

    y = df['final_result'].apply(lambda x: 1 if x in ['Fail', 'Withdrawn'] else 0)  # binary target

    X = df.drop(columns=['code_module', 'code_presentation', 'id_student', 'final_result'])

    return X, y

def compute_vif(scaled_data, original_columns):
    """Print the variance inflation factor of every feature, highest first.

    Args:
        scaled_data: 2-D array of feature values, one column per feature.
        original_columns: Object whose ``.columns`` supplies the feature
            names for ``scaled_data`` (the script passes the unscaled
            training DataFrame).

    Returns:
        None. The sorted table is printed.
    """
    # Re-label the scaled array with the feature names.
    design = pd.DataFrame(scaled_data, columns=original_columns.columns)
    matrix = design.values

    # One VIF per column position of the design matrix.
    vif_table = pd.DataFrame({
        "Feature": list(design.columns),
        "VIF": [
            variance_inflation_factor(matrix, position)
            for position in range(matrix.shape[1])
        ],
    })

    # Largest VIF first so collinear features are at the top.
    print(vif_table.sort_values("VIF", ascending=False))

# Root folder holding the shared tables and the train/val/test split folders.
data_dir = Path.home() / "OneDrive" / "Desktop" / "Logistic Regression Skewness Fixed" / "constant"

X_train, y_train = prepare_data(data_dir / "train")
X_val, y_val = prepare_data(data_dir / "val")
X_test, y_test = prepare_data(data_dir / "test")

scaler = StandardScaler()

# Fitting only on training data, then transforming all sets
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

compute_vif(X_train_scaled, X_train)

penalties = ['l1', 'l2', 'elasticnet', 'none']
c_values = [0.01, 0.1, 1, 10, 100]

# Solver paired with each penalty in this sweep.
solver_for_penalty = {
    'l1': 'liblinear',
    'l2': 'lbfgs',
    'elasticnet': 'saga',
    'none': 'lbfgs',
}

# 100 is the library default and gave no extra headroom; use a larger budget
# so the slower solvers are not cut off before they finish.
max_iter = 1000

results = []

for penalty in penalties:
    solver = solver_for_penalty[penalty]
    for C in c_values:
        print(f"\n=== Penalty: {penalty.upper()} | C={C} | solver={solver} ===")

        # NOTE(review): newer scikit-learn releases spell the unpenalised
        # model as penalty=None and reject the string 'none'; on such a
        # version the fit below raises and the combination is reported as
        # skipped. Confirm the installed version before changing the value.
        # NOTE(review): C presumably has no effect when penalty is 'none',
        # so those five rows are expected to be identical - confirm.
        model = LogisticRegression(
            penalty=penalty,
            C=C,
            l1_ratio=0.5 if penalty == 'elasticnet' else None,  # only for elasticnet
            class_weight='balanced',         # Handles class imbalance
            solver=solver,
            max_iter=max_iter,
            random_state=42                  # For reproducibility
        )

        # Parameter validation happens inside fit, so only the fit is guarded;
        # metric errors further down are real data problems and should surface.
        try:
            model.fit(X_train_scaled, y_train)
        except ValueError as e:
            print(f"Skipping penalty={penalty}, C={C}, solver={solver}: {e}")
            continue

        # n_iter_ is the iteration count actually used; reaching max_iter
        # means the solver stopped on the budget rather than on tolerance.
        print(f"Iterations used: {model.n_iter_} (max_iter={max_iter})")

        y_val_probs = model.predict_proba(X_val_scaled)[:, 1]

        results.append({
            "penalty": penalty,
            "C": C,
            "val_acc": accuracy_score(y_val, model.predict(X_val_scaled)),
            "val_roc_auc": roc_auc_score(y_val, y_val_probs),
            "val_pr_auc": average_precision_score(y_val, y_val_probs)
        })

results_df = pd.DataFrame(results)
print("\nComparison of penalties:")
print(results_df)


Found 5428 rows with NaN values
Found 1857 rows with NaN values
Found 1848 rows with NaN values
                        Feature       VIF
21                 n_activities  2.981763
20                 total_clicks  2.491377
11              region_Scotland  1.866870
8          region_London Region  1.813003
19               weighted_score  1.778669
13          region_South Region  1.750985
10  region_North Western Region  1.749844
16  region_West Midlands Region  1.654758
9           region_North Region  1.586882
6   region_East Midlands Region  1.575933
14     region_South West Region  1.570907
15                 region_Wales  1.536377
17      region_Yorkshire Region  1.526794
12     region_South East Region  1.524634
7                region_Ireland  1.332628
1                      imd_band  1.209992
0             highest_education  1.118015
4               studied_credits  1.073264
2                      age_band  1.058426
3          num_of_prev_attempts  1.050986
5                     