#### Imports

In [3]:
import pandas as pd
import random
import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
import itertools

#### Read Data

In [4]:
df = pd.read_csv('cleaned_train_with_features.csv')
X, y = df.drop('TARGET',axis=1), df['TARGET']

### Data Splits

#### Stratified Split

In [7]:
# Stratified Split
def stratified_split(data, test_size=0.1, validation_size = 0.1):

    class_1 = data[data['TARGET'] == 1]
    class_0 = data[data['TARGET'] == 0]

    test_count_1 = int(len(class_1) * test_size)
    validation_count_1 = int(len(class_1) * validation_size)
    test_count_0 = int(len(class_0) * test_size)
    validation_count_0 = int(len(class_0) * validation_size)

    class_1 = class_1.sample(frac=1)
    class_0 = class_0.sample(frac=1)

    # Split each class into test, validation, and train sets
    test_data = pd.concat([class_1.iloc[:test_count_1], class_0.iloc[:test_count_0]])
    validation_data = pd.concat([
        class_1.iloc[test_count_1:test_count_1 + validation_count_1],
        class_0.iloc[test_count_0:test_count_0 + validation_count_0]
    ])
    train_data = pd.concat([
        class_1.iloc[test_count_1 + validation_count_1:],
        class_0.iloc[test_count_0 + validation_count_0:]
    ])

    # Split features and target for each set
    X_train = train_data.drop('TARGET',axis=1)
    X_test = test_data.drop('TARGET',axis=1)
    X_val = validation_data.drop('TARGET',axis=1)
    y_train = train_data['TARGET']
    y_test = test_data['TARGET']
    y_val = validation_data['TARGET']

    return X_train, X_test, X_val, y_train, y_test, y_val

In [54]:
X_train, X_test, X_val, y_train, y_test, y_val = stratified_split(df)
y_val.head

<bound method NDFrame.head of 99247     1
126942    1
15495     1
222802    1
190689    1
         ..
22811     0
187581    0
179885    0
254482    0
204452    0
Name: TARGET, Length: 26341, dtype: int64>

#### Custom Split

In [20]:
# Custom Split

def custom_split(data, valid_prop = 0.1, test_prop = 0.1, random_seed = 1738):

    train_prop = 1 - valid_prop - test_prop

    # define bins for age and income
    age_bins = [0, 40, 60, np.inf]
    age_labels = ['young', 'middle_aged', 'senior']
    income_bins = [0, 30000, 70000, np.inf]
    income_labels = ['low', 'medium', 'high']

    # convert age to years
    data['age'] = data['DAYS_BIRTH']/-365

    # create binned variables
    data['age_group'] = pd.cut(data['age'], bins=age_bins, labels=age_labels)
    data['income_group'] = pd.cut(data['AMT_INCOME_TOTAL'], bins=income_bins, labels=income_labels)

    # create a key for each group (combination of gender, age, and income)
    data['group_key'] = data['CODE_GENDER_M'].astype(str) + '_' + data['age_group'].astype(str) + '_' + data['income_group'].astype(str)

    # shuffle the data
    data = data.sample(frac=1, random_state=random_seed).reset_index(drop=True)

    # split the data based on key
    train_data = pd.DataFrame()
    val_data = pd.DataFrame()
    test_data = pd.DataFrame()

    for key, group in data.groupby('group_key'):
        n = len(group)
        n_train = int(n * train_prop)
        n_val = int(n * valid_prop)

        train_data = pd.concat([train_data, group[:n_train]])
        val_data = pd.concat([val_data, group[n_train:n_train + n_val]])
        test_data = pd.concat([test_data, group[n_train + n_val:]])

    # drop all unnecesary columns
    train_data = train_data.drop(columns=['age','age_group', 'income_group', 'group_key'])
    val_data = val_data.drop(columns=['age','age_group', 'income_group', 'group_key'])
    test_data = test_data.drop(columns=['age','age_group', 'income_group', 'group_key'])

    X_train = train_data.drop('TARGET', axis=1)
    y_train = train_data['TARGET']

    X_validation = val_data.drop('TARGET', axis=1)
    y_validation = val_data['TARGET']


    X_test = test_data.drop('TARGET', errors='ignore')
    y_test = test_data['TARGET']

    return X_train, X_test, X_validation, y_train, y_test, y_validation


#### Random Split

In [21]:
def random_train_test_split(X, y, test_size=0.2, val_size=0.2, random_seed=42):
    np.random.seed(random_seed)

    n = len(X)
    indices = np.arange(n)
    np.random.shuffle(indices)

    test_indices = indices[:int(n * test_size)]
    val_indices = indices[int(n * test_size):int(n * (test_size + val_size))]
    train_indices = indices[int(n * (test_size + val_size)):]

    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]
    X_val, y_val = X[val_indices], y[val_indices]

    return X_train, X_test, X_val, y_train, y_test, y_val

#### Balanced Training Data Split

In [22]:
# custom split 2

def balanced_train_split(data, valid_prop = 0.1, test_prop = 0.1, random_seed = 1738 ):

    class_1 = data[data['TARGET'] == 1]
    class_0 = data[data['TARGET'] == 0]

    test_count_1 = int(len(class_1) * test_prop)
    validation_count_1 = int(len(class_1) * valid_prop)
    test_count_0 = int(len(class_0) * test_prop)
    validation_count_0 = int(len(class_0) * valid_prop)

    class_1 = class_1.sample(frac=1, random_state = random_seed)
    class_0 = class_0.sample(frac=1, random_state = random_seed)

    # Split each class into test, validation, and train sets
    test_data = pd.concat([class_1.iloc[:test_count_1], class_0.iloc[:test_count_0]])
    validation_data = pd.concat([
        class_1.iloc[test_count_1:test_count_1 + validation_count_1],
        class_0.iloc[test_count_0:test_count_0 + validation_count_0]
    ])
    train_data = pd.concat([
        class_1.iloc[test_count_1 + validation_count_1:],
        class_0.iloc[test_count_0 + validation_count_0:]
    ])

    # randomly remove training observations in class 0
    #num_class_1_train = len(train_data[train_data['TARGET'] == 1])
    #n_obs = int(num_class_1_train * 2)
    class_0_train = train_data[train_data['TARGET'] == 0]#.sample(n=n_obs, random_state=random_seed)
    class_1_train = train_data[train_data['TARGET'] == 1]

    num_samples = int((len(class_0_train) - len(class_1_train)) / 1.5)
    #print(num_samples)

    class_1_train_dups = train_data[train_data['TARGET'] == 1].sample(n = num_samples, replace=True, random_state=random_seed)

    balanced_train_data = pd.concat([class_0_train, class_1_train, class_1_train_dups]).sample(frac=1, random_state=random_seed)

    # Split features and target for each set
    X_train = balanced_train_data.drop('TARGET', axis=1)
    y_train = balanced_train_data['TARGET']
    X_test = test_data.drop('TARGET', axis=1)
    y_test = test_data['TARGET']
    X_val = validation_data.drop('TARGET', axis=1)
    y_val = validation_data['TARGET']

    return X_train, X_test, X_val, y_train, y_test, y_val

#### K-Fold Cross Validation

In [9]:
# Split Data Randomly Into K Folds
def kfold_cv(data, k = 5, seed = 1738):
    shuffled_data = data.sample(frac = 1, random_state = seed).reset_index(drop=True)

    n = len(shuffled_data)

    split_size = n//k
    splits = []

    for i in range(k):
        start = (i)*split_size
        end = (i+1) * split_size
        splits.append(shuffled_data.iloc[start:end])

    return splits


kfold_cv(df)

[       TARGET  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
 0           0             0          180000.0    360000.0      15381.0   
 1           0             0          123750.0    254700.0      14350.5   
 2           0             1          405000.0   1006920.0      46791.0   
 3           0             1           81000.0   1288350.0      37669.5   
 4           0             0          225000.0    675000.0      33750.0   
 ...       ...           ...               ...         ...          ...   
 52678       0             0          171000.0    679500.0      22050.0   
 52679       0             0           85500.0     95940.0       9342.0   
 52680       0             1          121500.0    364428.0      10152.0   
 52681       0             1           90000.0    289597.5      14913.0   
 52682       0             1          112500.0    755190.0      35122.5   
 
        AMT_GOODS_PRICE  REGION_POPULATION_RELATIVE  DAYS_BIRTH  DAYS_EMPLOYED  \
 0             3

### Metrics

#### Recall

In [6]:
def recall(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    TP = np.sum((y_true == 1) & (y_pred == 1))
    FN = np.sum((y_true == 1) & (y_pred == 0))

    return float(TP / (TP + FN)) if (TP + FN) > 0 else 0.0

#### Precision

In [7]:
def precision(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    TP = np.sum((y_true == 1) & (y_pred == 1))
    FP = np.sum((y_true == 0) & (y_pred == 1))

    return float(TP / (TP + FP)) if (TP + FP) > 0 else 0.0

#### Accuracy

In [8]:
# Accuracy
def compute_accuracy(truth, predicted):
    return np.mean(truth == predicted)


#### F1 Score

In [9]:
# F1 Score
def calculate_f1(y_true, y_pred):
    TP = FP = TN = FN = 0

    for true, pred in zip(y_true, y_pred):
        if true == 1 and pred == 1:
            TP += 1
        elif true == 0 and pred == 1:
            FP += 1
        elif true == 0 and pred == 0:
            TN += 1
        elif true == 1 and pred == 0:
            FN += 1

    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0
    if precision + recall == 0:
        return 0

    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

#### F2 Score

In [10]:
# F2 Score
def calculate_f2(y_true, y_pred, b=2):
    r = recall(y_true, y_pred)
    p = precision(y_true, y_pred)

    if r + p == 0:
        return 0

    f2 = (1 + b**2) * (r * p) / (b**2 * r + p)
    return f2

#### ROC-AUC

In [11]:
def calc_roc_auc(y_true, y_prob):
  sorted_indices = np.argsort(y_prob)
  true_sort = y_true[sorted_indices]
  prob_sort = y_prob[sorted_indices]

  TP = np.cumsum(true_sort)
  FP = np.cumsum(1 - true_sort)

  TPR = TP / TP[-1]
  FPR = FP / FP[-1]



  # rocauc
  rocauc_score = np.trapz(FPR, TPR)

  return rocauc_score

### Fairness Metric

In [62]:
# Equal Opportunity
def equal_opportunity(classes,truth,preds):
    df = classes
    df["truth"] = truth.values
    df_preds = pd.DataFrame(preds, columns=["preds"])
    df = df.join(df_preds)
    scores = {}

    for key,group in df.groupby(['CODE_GENDER_M', 'AGE_GROUP']):
        scores[key] = recall(group['truth'], group['preds'])

    values = list(scores.values())
    diffs = []

    for pair in itertools.combinations(values, 2):
        diffs.append(abs(pair[0] - pair[1]))

    average_difference = sum(diffs) / len(diffs)


    return scores, average_difference

# testing
age_bins = [0, 40, 60, np.inf]
age_labels = ['young', 'middle_aged', 'senior']

df['age'] = df['DAYS_BIRTH']/-365

df['AGE_GROUP'] = pd.cut(df['age'], bins=age_bins, labels=age_labels)

ex_classes = df[['CODE_GENDER_M', 'AGE_GROUP']]

#preds = lda_model.predict(X)

#df_preds = pd.DataFrame(preds, columns=["preds"])
#classes.join(df_preds)
#print(equal_opportunity(ex_classes,y,preds)[0], "\n", equal_opportunity(ex_classes,y,preds)[1])

#unique_values, counts = np.unique(preds, return_counts=True)
#print(unique_values)
#print(counts)


### Cross-Validated Metrics

In [77]:
# Perform CV and Compute all Metrics

def calc_cv_metrics(model, folds):
    num_folds = len(folds)

    metrics = {"accuracy": [],
               "precision": [],
               "recall": [],
               "f1": [],
               "roc auc": []}

    for i in range(num_folds):
        # separate folds into training and testing
        test_data = folds[i]
        train_data = pd.DataFrame()
        other_folds = folds[i + 1:] + folds[:i]
        for fold in other_folds:
            train_data = pd.concat([train_data, fold])

        X_train = train_data.drop('TARGET', axis=1)
        y_train = train_data['TARGET']
        X_test = test_data.drop('TARGET', axis=1)
        y_test = test_data['TARGET']

        # fit model and get predictions
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        probs = model.predict_proba(X_test)[:,1]

        # calculate metrics and add to dictionary
        acc = compute_accuracy(y_test, preds)
        metrics["accuracy"].append(acc)

        p = precision(y_test, preds)
        metrics["precision"].append(p)

        r = recall(y_test, preds)
        metrics["recall"].append(r)

        f1 = calculate_f1(y_test, preds)
        metrics["f1"].append(f1)

        roc_auc = calc_roc_auc(np.array(y_test), probs)
        metrics["roc auc"].append(roc_auc)

    averages = {key: sum(values) / len(values) for key, values in metrics.items()}
    return averages

In [5]:
# Perform CV and Compute all Metrics

def calc_cv_metrics_svc(model, folds):
    num_folds = len(folds)

    metrics = {"accuracy": [],
               "precision": [],
               "recall": [],
               "f1": [],
               "roc auc": []}

    for i in range(num_folds):
        # separate folds into training and testing
        test_data = folds[i]
        train_data = pd.DataFrame()
        other_folds = folds[i + 1:] + folds[:i]
        for fold in other_folds:
            train_data = pd.concat([train_data, fold])

        X_train = train_data.drop('TARGET', axis=1)
        y_train = train_data['TARGET']
        X_test = test_data.drop('TARGET', axis=1)
        y_test = test_data['TARGET']

        # fit model and get predictions
        print('hit1')
        model.fit(X_train, y_train)
        print('hit2')

        calibrated_svc = CalibratedClassifierCV(model, method='sigmoid')  # 'sigmoid' for Platt scaling
        calibrated_svc.fit(X_train, y_train)
        print('hit2.5')

        preds = calibrated_svc.predict(X_test)
        print('hit3')
        probs = model.predict_proba(X_test)[:,1]
        print('hit4')

        # calculate metrics and add to dictionary
        acc = compute_accuracy(y_test, preds)
        metrics["accuracy"].append(acc)

        p = precision(y_test, preds)
        metrics["precision"].append(p)

        r = recall(y_test, preds)
        metrics["recall"].append(r)

        f1 = calculate_f1(y_test, preds)
        metrics["f1"].append(f1)

        roc_auc = calc_roc_auc(np.array(y_test), probs)
        metrics["roc auc"].append(roc_auc)

    averages = {key: sum(values) / len(values) for key, values in metrics.items()}
    return averages, model

### Models

#### Logistic Regression

In [35]:
# logistic regression
lr = LogisticRegression(penalty=None)
X_train, X_test, X_val, y_train, y_test, y_val = stratified_split(df)
lr.fit(X_train, y_train)

preds = lr.predict(X_test)

unique_values, counts = np.unique(y_train, return_counts=True)
print(unique_values)
print(counts)

unique_values, counts = np.unique(y_test, return_counts=True)
print(unique_values)
print(counts)


unique_values, counts = np.unique(preds, return_counts=True)
print(unique_values)
print(counts)

#print(f'weights: {weights} \n constant: {b}')

# Metrics
print("precision = ", precision(y_test, preds), "\nrecall = ", recall(y_test, preds), "\nf1 score = ", calculate_f1(y_test, preds))

[0 1]
[194410  16327]
[0 1]
[24301  2040]
[0 1]
[26338     3]
precision =  0.0 
recall =  0.0 
f1 score =  0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Logistic Regression W/ Penalization

In [34]:
# scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# logistic regression w penalty
lr_penalty = LogisticRegression(penalty='elasticnet', l1_ratio=0.5, C=1.5, solver="saga")
X_train, X_test, X_val, y_train, y_test, y_val = balanced_train_split(df)
lr_penalty.fit(X_train, y_train)

preds = lr_penalty.predict(X_test)

# Evaluate predictio
unique_values, counts = np.unique(y_train, return_counts=True)
print(unique_values)
print(counts)

unique_values, counts = np.unique(y_test, return_counts=True)
print(unique_values)
print(counts)


unique_values, counts = np.unique(preds, return_counts=True)
print(unique_values)
print(counts)

#print(f'weights: {weights} \n constant: {b}')

# Metrics
print("precision = ", precision(y_test, preds), "\nrecall = ", recall(y_test, preds), "\nf1 score = ", calculate_f1(y_test, preds))

[0 1]
[32654 16327]
[0 1]
[24301  2040]
[0 1]
[23338  3003]
precision =  0.2077922077922078 
recall =  0.3058823529411765 
f1 score =  0.24747174301011302




#### SVC

In [68]:
# SVC Model
svc_model = SVC(gamma='auto', kernel='linear') #Can add standard scaler if you want to scale, but left it out to interpret results
svc_model.fit(X, y)
weights = svc_model.coef_
b = svc_model.intercept_
print(f'weights: {weights} \n constant: {b}')
print(f'Actual decision boundary: {-b/weights}')


#### LDA


In [40]:
# LDA Model
#print(df)
lda_model = LinearDiscriminantAnalysis()

X_train, X_test, X_val, y_train, y_test, y_val = balanced_train_split(df)


lda_model.fit(X_train,y_train)

preds = lda_model.predict(X_test)
probabilities = lda_model.predict_proba(X_test)
print(probabilities)
print(preds)

unique_values, counts = np.unique(y_train, return_counts=True)
print(unique_values)
print(counts)

unique_values, counts = np.unique(y_test, return_counts=True)
print(unique_values)
print(counts)


unique_values, counts = np.unique(preds, return_counts=True)
print(unique_values)
print(counts)

#print(f'weights: {weights} \n constant: {b}')
#print(y_test)
# Metrics
print("precision = ", precision(y_test, preds), "\nrecall = ", recall(y_test, preds), "\nf1 score = ", calculate_f1(y_test, preds), "\nROC AUC =", calc_roc_auc(np.array(y_test),probabilities[:,1]))

[[0.83816371 0.16183629]
 [0.67399917 0.32600083]
 [0.55747821 0.44252179]
 ...
 [0.63029441 0.36970559]
 [0.62834239 0.37165761]
 [0.7193371  0.2806629 ]]
[0 0 0 ... 0 0 0]
[0 1]
[194410 135049]
[0 1]
[24301  2040]
[0 1]
[20726  5615]
precision =  0.16901157613535173 
recall =  0.46519607843137256 
f1 score =  0.24794252122795557 
ROC AUC = 0.705172969562295


# Cross Validation For Model Selection

In [54]:
# LDA
# Balanced Training Data Split
X_train, X_test, X_val, y_train, y_test, y_val = balanced_train_split(df)
train_data = X_train.join(y_train)
lda = LinearDiscriminantAnalysis()

folds = kfold_cv(train_data)
calc_cv_metrics(lda, folds)

{'accuracy': 0.8645878236529041,
 'precision': 0.8668503364143877,
 'recall': 0.9962975662743359,
 'f1': 0.9270770388752994,
 'roc auc': 0.7117389461773919}

In [None]:
#SVC
#Stratified Data Split
#Balanced Class Weights
X_train, X_test, X_val, y_train, y_test, y_val = stratified_split(df)
train_data = X_train.join(y_train)
C = [0.1,0.5,1,5, 10]
folds = kfold_cv(train_data)
models = []
metrics_list = []
for c in C:
    svc = SVC(C=c, class_weight='balanced', probability=True)
    metrics, svc_model = calc_cv_metrics_svc(svc, folds)
    models.append(svc_model)
    metrics.append(metrics)

for mod, met in zip(models, metrics_list):
    print(f'SVC w/ C: {mod.C}: {metrics}')

hit1
