## Setup

In [7]:
import numpy as np
import pandas as pd

from sklearn.metrics import precision_score, recall_score, make_scorer
from sklearn.model_selection import TimeSeriesSplit, cross_validate # GroupKFold, GridSearchCV,
from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from joblib import dump, load

from utils import (
    precision_at_k,
    recall_at_k,
    #get_topk_scorers,
    summarize_by_predset,
    #compute_group_proportions
)

## Data Loading

In [None]:
X_train_f = pd.read_csv("./output/X_train_f.csv") # 2010 - 2014, w. protected attributes
X_train_s = pd.read_csv("./output/X_train_s.csv") # 2010 - 2014, w/o protected attributes
y_train = pd.read_csv("./output/y_train.csv").iloc[:,0]

X_test_f = pd.read_csv("./output/X_test_f.csv")
X_test_s = pd.read_csv("./output/X_test_s.csv")
y_test = pd.read_csv("./output/y_test.csv").iloc[:,0]

## Correlation Analysis

In [3]:
# Computes the absolute value of the correlation matrix for the training features with protected attributes
corrM = X_train_f.corr().abs() # Corr matrix of X
corrM = corrM.unstack() # flatten
corrMo = corrM.sort_values(kind = "quicksort") # sort correlations
corrMo[corrMo < 1].tail(20) # Filters out the self-correlations (which equal 1) and prints the last 20 entries (lowest correlations)

# to spot which features (including protected‐attribute proxies) are most strongly correlated, so I can 
# watch out for multicollinearity or fairness-related leakage

ft_tot_dur_byage        ft_tot_dur                0.954161
ft_tot_dur              ft_tot_dur_byage          0.954161
maxbula.Missing.        lastjob_pt99999           0.954524
lastjob_pt99999         maxbula.Missing.          0.954524
seeking1_tot_dur_byage  seeking1_tot_dur          0.955420
seeking1_tot_dur        seeking1_tot_dur_byage    0.955420
lastjob_none            maxbula.Missing.          0.961120
maxbula.Missing.        lastjob_none              0.961120
lastjob_type99999       maxbula.Missing.          0.961120
maxbula.Missing.        tsince_lm_contact_cat5    0.961120
tsince_lm_contact_cat5  maxbula.Missing.          0.961120
maxbula.Missing.        lastjob_type99999         0.961120
                        lastjob_parallel99999     0.961120
lastjob_parallel99999   maxbula.Missing.          0.961120
emp1_total_dur          emp1_total_dur_byage      0.963495
emp1_total_dur_byage    emp1_total_dur            0.963495
secjob_tot_dur          secjob_tot_dur_byage      0.9658

In [4]:
tscv = TimeSeriesSplit(4) # Create splits by year

In [None]:
for train_index, test_index in tscv.split(X_train_f):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [   0    1    2 ... 4997 4998 4999] TEST: [5000 5001 5002 ... 9997 9998 9999]
TRAIN: [   0    1    2 ... 9997 9998 9999] TEST: [10000 10001 10002 ... 14997 14998 14999]
TRAIN: [    0     1     2 ... 14997 14998 14999] TEST: [15000 15001 15002 ... 19997 19998 19999]
TRAIN: [    0     1     2 ... 19997 19998 19999] TEST: [20000 20001 20002 ... 24997 24998 24999]


In [10]:
def precision_at_25(y_true, y_score, **kwargs):
    return precision_at_k(y_true, y_score, 0.25)
def precision_at_10(y_true, y_score, **kwargs):
    return precision_at_k(y_true, y_score, 0.10)
def recall_at_25(y_true, y_score, **kwargs):
    return recall_at_k(y_true, y_score, 0.25)
def recall_at_10(y_true, y_score, **kwargs):
    return recall_at_k(y_true, y_score, 0.10)

custom_precision25 = make_scorer(precision_at_25, needs_proba=True) # Precision at top 25%
custom_precision10 = make_scorer(precision_at_10, needs_proba=True) # Precision at top 10%
custom_recall25 = make_scorer(recall_at_25, needs_proba = True) # Recall at top 25%
custom_recall10 = make_scorer(recall_at_10, needs_proba = True) # Recall at top 10%

In [11]:
score = {'log_loss': 'neg_log_loss',
         'auc': 'roc_auc',
         'precision': 'precision', # uses default model threshold of 0.5
         'recall': 'recall',
         'precision_at_k25': custom_precision25, # uses custom threshold 
         'recall_at_k25': custom_recall25,
         'precision_at_k10': custom_precision10,
         'recall_at_k10': custom_recall10}

## 01 Logit Regression (w. protected attributes)

In [None]:
glm1 = LogisticRegression(penalty = None, solver = 'lbfgs', max_iter = 1000)
glm1.fit(X_train_f, y_train)

In [None]:
glmcv1 = cross_validate(estimator = glm1, 
                       X = X_train_f,
                       y = y_train,
                       cv = tscv,
                       n_jobs = -1, # use all available cores
                       scoring = score)

# !!! recall always 1.0, check if this is correct

In [None]:
coefs1 = pd.DataFrame(X_train_f.columns, columns = ['var'])
coefs1['coef'] = pd.DataFrame(glm1.coef_).transpose()

# Build a DataFrame of feature names + their learned coefficients, to inspect which variables 
# (including protected attrs) the model weights most heavily.

In [None]:
dump(glm1, './models/glm1.joblib')

## Predict

In [None]:
k75 = 0.75 # Top 75% 
k25 = 0.25 # Top 25% 
k10 = 0.1 # Top 10%

In [12]:
glm1_p = glm1.predict_proba(X_test_f)[:,1] # glm1

# Generate the predicted probability of the positive class for each test sample

NameError: name 'glm1' is not defined

In [None]:
threshold75 = np.sort(glm1_p)[::-1][int(k75*len(glm1_p))]
threshold25 = np.sort(glm1_p)[::-1][int(k25*len(glm1_p))]
threshold10 = np.sort(glm1_p)[::-1][int(k10*len(glm1_p))] # threshold10 is the score above which only the top 10% of test samples lie

In [None]:
glm1_c1 = glm1_p.copy()
glm1_c1[glm1_c1 < threshold10] = 0
glm1_c1[glm1_c1 >= threshold10] = 1

# Create a binary classification vector where only the top 10% by predicted probability are labeled “1”

In [None]:
glm1_c2 = glm1_p.copy()
glm1_c2[glm1_c2 < threshold25] = 0
glm1_c2[glm1_c2 >= threshold25] = 1

In [None]:
glm1_c3 = glm1_p.copy()
glm1_c3[(glm1_c3 <= threshold75) | (glm1_c3 >= threshold25)] = 0
glm1_c3[(glm1_c3 > threshold75) & (glm1_c3 < threshold25)] = 1

## Performance evaluation

In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [None]:
for preds, label in zip(
    [glm1_c1, glm1_c2, glm1_c3],
    ["Top 10%", "Top 25%", "Middle 25–75%"]
):
    acc = accuracy_score(y_test, preds)
    f1  = f1_score(y_test, preds)
    print(f"{label:15s} → Accuracy: {acc:.3f},  F1-score: {f1:.3f}")

## Combine and save

In [None]:
'''
Build a single DataFrame side by side with:
      - The true labels (‘y_test’)
      - The raw predicted probabilities (‘glm1_p’)
      - Each binary decision vector at different cutoffs (‘glm1_c1’, ‘glm1_c2’, ‘glm1_c3’).
'''

preds_test = pd.concat([pd.DataFrame(np.array(y_test), columns = ['y_test']),
                         pd.DataFrame(glm1_p, columns = ['glm1_p']),
                         pd.DataFrame(glm1_c1, columns = ['glm1_c1']),
                         pd.DataFrame(glm1_c2, columns = ['glm1_c2']),
                         pd.DataFrame(glm1_c3, columns = ['glm1_c3'])],
                        axis = 1)

In [None]:
preds_test.to_csv('./output/preds_test.csv', index = False)