#### Imports

In [70]:
import pandas as pd
import random
import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import itertools

#### Read Data

In [71]:
df = pd.read_csv('cleaned_train_with_features.csv')
X, y = df.drop('TARGET',axis=1), df['TARGET']

### Cross-Validation Splits

#### Stratified Split

In [55]:
# Stratified Split
def stratified_split(data, test_size=0.1, validation_size = 0.1):

    class_1 = data[data['TARGET'] == 1]
    class_0 = data[data['TARGET'] == 0]

    test_count_1 = int(len(class_1) * test_size)
    validation_count_1 = int(len(class_1) * validation_size)
    test_count_0 = int(len(class_0) * test_size)
    validation_count_0 = int(len(class_0) * validation_size)

    class_1 = class_1.sample(frac=1)
    class_0 = class_0.sample(frac=1)

    # Split each class into test, validation, and train sets
    test_data = pd.concat([class_1.iloc[:test_count_1], class_0.iloc[:test_count_0]])
    validation_data = pd.concat([
        class_1.iloc[test_count_1:test_count_1 + validation_count_1],
        class_0.iloc[test_count_0:test_count_0 + validation_count_0]
    ])
    train_data = pd.concat([
        class_1.iloc[test_count_1 + validation_count_1:],
        class_0.iloc[test_count_0 + validation_count_0:]
    ])

    # Split features and target for each set
    X_train = train_data.drop('TARGET',axis=1)
    X_test = test_data.drop('TARGET',axis=1)
    X_val = validation_data.drop('TARGET',axis=1)
    y_train = train_data['TARGET']
    y_test = test_data['TARGET']
    y_val = validation_data['TARGET']

    return X_train, X_test, X_val, y_train, y_test, y_val

In [6]:
X_train, X_test, X_val, y_train, y_test, y_val = stratified_split(df)
y_val.head

<bound method NDFrame.head of 187047    1
82891     1
39875     1
212962    1
13540     1
         ..
255389    0
214840    0
136169    0
218754    0
139408    0
Name: TARGET, Length: 26341, dtype: int64>

#### Custom Split

In [7]:
# Custom Split

def custom_split(data, valid_prop = 0.1, test_prop = 0.1, random_seed = 1738):

    train_prop = 1 - valid_prop - test_prop

    # define bins for age and income
    age_bins = [0, 40, 60, np.inf]
    age_labels = ['young', 'middle_aged', 'senior']
    income_bins = [0, 30000, 70000, np.inf]
    income_labels = ['low', 'medium', 'high']

    # convert age to years
    data['age'] = data['DAYS_BIRTH']/-365

    # create binned variables
    data['age_group'] = pd.cut(data['age'], bins=age_bins, labels=age_labels)
    data['income_group'] = pd.cut(data['AMT_INCOME_TOTAL'], bins=income_bins, labels=income_labels)

    # create a key for each group (combination of gender, age, and income)
    data['group_key'] = data['CODE_GENDER_M'].astype(str) + '_' + data['age_group'].astype(str) + '_' + data['income_group'].astype(str)

    # shuffle the data
    data = data.sample(frac=1, random_state=random_seed).reset_index(drop=True)

    # split the data based on key
    train_data = pd.DataFrame()
    val_data = pd.DataFrame()
    test_data = pd.DataFrame()

    for key, group in data.group_by('group_key'):
        n = len(group)
        n_train = int(n * train_prop)
        n_val = int(n * valid_prop)

        train_data = pd.concat([train_data, group[:n_train]])
        val_data = pd.concat([val_data, group[n_train:n_train + n_val]])
        test_data = pd.concat([test_data, group[n_train + n_val:]])

    # drop all unnecesary columns
    train_data = train_data.drop(columns=['age','age_group', 'income_group', 'strat_key'])
    val_data = val_data.drop(columns=['age','age_group', 'income_group', 'strat_key'])
    test_data = test_data.drop(columns=['age','age_group', 'income_group', 'strat_key'])

    X_train = train_data.drop(['TARGET'])
    y_train = train_data['TARGET']

    X_validation = val_data.drop(['TARGET'])
    y_validation = val_data['TARGET']

    X_test = test_data.drop(['TARGET'])
    y_test = test_data['TARGET']

    return X_train, y_train, X_validation, y_validation, X_test, y_test


#### Random Split

In [8]:
def random_train_test_split(X, y, test_size=0.2, val_size=0.2, random_seed=42):
    np.random.seed(random_seed)

    n = len(X)
    indices = np.arange(n)
    np.random.shuffle(indices)

    test_indices = indices[:int(n * test_size)]
    val_indices = indices[int(n * test_size):int(n * (test_size + val_size))]
    train_indices = indices[int(n * (test_size + val_size)):]

    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]
    X_val, y_val = X[val_indices], y[val_indices]

    return X_train, y_train, X_val, y_val, X_test, y_test

### Metrics

#### Recall

In [24]:
def recall(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    
    TP = np.sum((y_true == 1) & (y_pred == 1))
    FN = np.sum((y_true == 1) & (y_pred == 0))
    
    return float(TP / (TP + FN)) if (TP + FN) > 0 else 0.0

#### Precision

In [10]:
def precision(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    
    TP = np.sum((y_true == 1) & (y_pred == 1))
    FP = np.sum((y_true == 0) & (y_pred == 1))
    
    return float(TP / (TP + FP)) if (TP + FP) > 0 else 0.0

#### Accuracy

In [11]:
# Accuracy
def compute_accuracy(truth, predicted):
    return np.mean(truth == predicted)


#### F1 Score

In [12]:
# F1 Score
def calculate_f1(y_true, y_pred):
    TP = FP = TN = FN = 0

    for true, pred in zip(y_true, y_pred):
        if true == 1 and pred == 1:
            TP += 1
        elif true == 0 and pred == 1:
            FP += 1
        elif true == 0 and pred == 0:
            TN += 1
        elif true == 1 and pred == 0:
            FN += 1  ,

    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0
    if precision + recall == 0:
        return 0

    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

#### F2 Score

In [13]:
# F2 Score
def calculate_f2(y_true, y_pred, b=2):
    r = recall(y_true, y_pred)
    p = precision(y_true, y_pred)
    
    if r + p == 0:
        return 0
    
    f2 = (1 + b**2) * (r * p) / (b**2 * r + p)
    return f2

#### ROC-AUC

In [14]:
def calc_roc_auc(y_true, y_prob):
  sorted_indices = np.argsort(y_prob)
  true_sort = y_true[sorted_indices]
  prob_sort = y_prob[sorted_indices]

  TP = np.cumsum(true_sort)
  FP = np.cumsum(1 - true_sort)

  TPR = TP / TP[-1]
  FPR = FP / FP[-1]

  # rocauc
  rocauc_score = np.trapz(FPR, TPR)

  return rocauc_score

### Fairness Metric

In [53]:
# Equal Opportunity
def equal_opportunity(classes,truth,preds):
    df = classes
    df["truth"] = truth.values
    df_preds = pd.DataFrame(preds, columns=["preds"])
    df = df.join(df_preds)
    scores = {}

    for key,group in df.groupby(['CODE_GENDER_M', 'AGE_GROUP']):
        scores[key] = recall(group['truth'], group['preds'])

    values = list(scores.values())
    diffs = []

    for pair in itertools.combinations(values, 2):
        diffs.append(abs(pair[0] - pair[1]))

    average_difference = sum(diffs) / len(diffs)


    return scores, average_difference

# testing
age_bins = [0, 40, 60, np.inf]
age_labels = ['young', 'middle_aged', 'senior']

df['age'] = df['DAYS_BIRTH']/-365

df['AGE_GROUP'] = pd.cut(df['age'], bins=age_bins, labels=age_labels)

ex_classes = df[['CODE_GENDER_M', 'AGE_GROUP']]

preds = lda_model.predict(X)

#df_preds = pd.DataFrame(preds, columns=["preds"])
#classes.join(df_preds)
print(equal_opportunity(ex_classes,y,preds)[0], "\n", equal_opportunity(ex_classes,y,preds)[1])

#unique_values, counts = np.unique(preds, return_counts=True)
#print(unique_values)
#print(counts)


{(0, 'young'): 0.0019064124783362219, (0, 'middle_aged'): 0.0, (0, 'senior'): 0.0, (1, 'young'): 0.00962544465369324, (1, 'middle_aged'): 0.0034782608695652175, (1, 'senior'): 0.0} 
 0.004031227890366538


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["truth"] = truth.values
  for key,group in df.groupby(['CODE_GENDER_M', 'AGE_GROUP']):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["truth"] = truth.values
  for key,group in df.groupby(['CODE_GENDER_M', 'AGE_GROUP']):


### Models

#### Logistic Regression

In [36]:
# logistic regression
lr = LogisticRegression()
lr.fit(X, y)
w = lr.coef_[0]
b = lr.intercept_[0]
print(f'intercept: {b} \n weights: {w}')

intercept: -4.8759403511165345e-08 
 weights: [-6.17098035e-08 -5.11738321e-07  3.24970556e-06  2.17314844e-06
 -4.08774984e-06 -1.80579918e-09  1.05377936e-04  1.04870586e-06
  2.85868825e-05  1.38798174e-04 -4.87594035e-08 -4.81245275e-08
  8.92690052e-09 -4.83026440e-08 -2.50491797e-08 -9.06442081e-09
 -1.49171184e-07 -3.90096871e-08 -3.60710241e-08 -8.14982765e-07
 -8.63196704e-10 -1.03894884e-09 -1.25067933e-09  8.02450020e-09
  1.22751072e-08  8.01566297e-09 -9.23579637e-08 -1.86947865e-08
  2.68449653e-08 -1.81464868e-08  2.12597000e-08  1.02539475e-04
  0.00000000e+00  8.61561471e-09 -6.00593641e-11 -7.43621294e-10
  1.28446036e-09 -2.53485871e-11 -1.47495631e-08 -9.65573225e-10
 -1.67996890e-11 -1.50942330e-09 -3.25387655e-12 -2.24982981e-09
 -1.97631805e-09 -6.30331622e-10 -4.16666562e-09 -2.62366014e-10
 -3.85106962e-09 -6.50571215e-11  7.44024470e-11 -6.26550472e-11
 -2.69802639e-10  5.43492859e-10 -1.03080323e-09 -2.81655884e-08
 -1.26380721e-08  5.69097132e-08 -3.68803724

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Logistic Regression W/ Penalization

In [None]:
# scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# logistic regression w penalty
lr_penalty = LogisticRegression(penalty='l2', C=0.1)
lr_penalty.fit(X, y)
w = lr_penalty.coef_[0]
b = lr_penalty.intercept_[0]
print(f'intercept: {b} \n weights: {w}')

intercept: -4.9781205980785475e-08 
 weights: [-6.29774349e-08 -4.64958120e-07  3.17404965e-06  1.99527078e-06
 -4.00132795e-06 -1.84860477e-09  1.04920090e-04  1.05758003e-06
  2.94474735e-05  1.39724412e-04 -4.97812060e-08 -4.91318944e-08
  9.25746059e-09 -4.93129808e-08 -2.56061234e-08 -9.29348728e-09
 -1.52258031e-07 -3.94736910e-08 -3.64593397e-08 -8.33245052e-07
 -8.92672183e-10 -1.08029531e-09 -1.29193443e-09  8.22923853e-09
  1.26363114e-08  8.27006751e-09 -9.45893716e-08 -1.87506555e-08
  2.75656894e-08 -1.81917093e-08  2.18276236e-08  1.04428263e-04
  0.00000000e+00  9.02088395e-09 -6.15921339e-11 -7.57262878e-10
  1.32004091e-09 -2.59283591e-11 -1.51248000e-08 -9.91530921e-10
 -1.72214747e-11 -1.54882650e-09 -3.32840780e-12 -2.30770593e-09
 -2.02809573e-09 -6.46480071e-10 -4.27045635e-09 -2.69147229e-10
 -3.94890033e-09 -6.66345010e-11  7.64050565e-11 -6.45403092e-11
 -2.76156807e-10  5.58503023e-10 -1.05099974e-09 -2.88165937e-08
 -1.29116991e-08  5.88900257e-08 -3.77847825

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### SVC

In [30]:
# SVC Model
svc_model = SVC(gamma='auto', kernel='linear') #Can add standard scaler if you want to scale, but left it out to interpret results
svc_model.fit(X, y)
weights = svc_model.coef_
b = svc_model.intercept_
print(f'weights: {weights} \n constant: {b}')
print(f'Actual decision boundary: {-b/weights}')


#### LDA


In [75]:
# LDA Model
#print(df)
lda_model = LinearDiscriminantAnalysis()

X_train, X_test, X_val, y_train, y_test, y_val = stratified_split(df)


lda_model.fit(X_train,y_train)

preds = lda_model.predict(X_test)

unique_values, counts = np.unique(y_train, return_counts=True)
print(unique_values)
print(counts)

unique_values, counts = np.unique(y_test, return_counts=True)
print(unique_values)
print(counts)


unique_values, counts = np.unique(preds, return_counts=True)
print(unique_values)
print(counts)

#print(f'weights: {weights} \n constant: {b}')

[0 1]
[194410  16327]
[0 1]
[24301  2040]
[0 1]
[26328    13]
