#### Imports

In [1]:
import pandas as pd
import random
import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

#### Data Preprocessing and Cleaning

In [4]:
df = pd.read_csv('application_train_clean.csv')
df = df.drop(['Unnamed: 0','SK_ID_CURR'],axis=1)
print(df.columns[df.dtypes == object])
df = pd.get_dummies(df, columns = df.select_dtypes(include=['object']).columns, drop_first=True, dtype=int)
df.to_csv('cleaned_train.csv', index=False)
df.head()


Index(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
       'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'WEEKDAY_APPR_PROCESS_START',
       'ORGANIZATION_TYPE'],
      dtype='object')


Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA
0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,-3648,...,0,0,0,0,0,0,0,0,0,0
1,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,-1186,...,0,0,0,0,0,0,0,0,0,0
2,0,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225,-4260,...,0,0,0,0,0,0,0,0,0,0
3,0,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038,-4311,...,0,0,0,0,0,0,0,0,0,0
4,0,0,99000.0,490495.5,27517.5,454500.0,0.035792,-16941,-1588,-4970,...,0,0,0,0,0,0,0,0,0,0


In [2]:
df = pd.read_csv('cleaned_train.csv')

In [3]:
X, y = df.drop('TARGET',axis=1), df['TARGET']

### Cross-Validation Splits

#### Stratified Split

In [4]:
# Stratified Split
def stratified_split(data, test_size=0.1, validation_size = 0.1):

    class_1 = data[data['TARGET'] == 1]
    class_0 = data[data['TARGET'] == 0]

    test_count_1 = int(len(class_1) * test_size)
    validation_count_1 = int(len(class_1) * validation_size)
    test_count_0 = int(len(class_0) * test_size)
    validation_count_0 = int(len(class_0) * validation_size)

    class_1 = class_1.sample(frac=1)
    class_0 = class_0.sample(frac=1)

    # Split each class into test, validation, and train sets
    test_data = pd.concat([class_1.iloc[:test_count_1], class_0.iloc[:test_count_0]])
    validation_data = pd.concat([
        class_1.iloc[test_count_1:test_count_1 + validation_count_1],
        class_0.iloc[test_count_0:test_count_0 + validation_count_0]
    ])
    train_data = pd.concat([
        class_1.iloc[test_count_1 + validation_count_1:],
        class_0.iloc[test_count_0 + validation_count_0:]
    ])

    # Split features and target for each set
    X_train = train_data.drop('TARGET',axis=1)
    X_test = test_data.drop('TARGET',axis=1)
    X_val = validation_data.drop('TARGET',axis=1)
    y_train = train_data['TARGET']
    y_test = test_data['TARGET']
    y_val = validation_data['TARGET']

    return X_train, X_test, X_val, y_train, y_test, y_val

In [5]:
X_train, X_test, X_val, y_train, y_test, y_val = stratified_split(df)
y_val.head

<bound method NDFrame.head of 224243    1
173991    1
115205    1
68818     1
47700     1
         ..
50620     0
100886    0
214539    0
91403     0
29895     0
Name: TARGET, Length: 26341, dtype: int64>

#### Custom Split

In [6]:
# Custom Split

def custom_split(data, valid_prop = 0.1, test_prop = 0.1, random_seed = 1738):

    train_prop = 1 - valid_prop - test_prop

    # define bins for age and income
    age_bins = [0, 40, 60, np.inf]
    age_labels = ['young', 'middle_aged', 'senior']
    income_bins = [0, 30000, 70000, np.inf]
    income_labels = ['low', 'medium', 'high']

    # convert age to years
    data['age'] = data['DAYS_BIRTH']/-365

    # create binned variables
    data['age_group'] = pd.cut(data['age'], bins=age_bins, labels=age_labels)
    data['income_group'] = pd.cut(data['AMT_INCOME_TOTAL'], bins=income_bins, labels=income_labels)

    # create a key for each group (combination of gender, age, and income)
    data['group_key'] = data['CODE_GENDER_M'].astype(str) + '_' + data['age_group'].astype(str) + '_' + data['income_group'].astype(str)

    # shuffle the data
    data = data.sample(frac=1, random_state=random_seed).reset_index(drop=True)

    # split the data based on key
    train_data = pd.DataFrame()
    val_data = pd.DataFrame()
    test_data = pd.DataFrame()

    for key, group in data.group_by('group_key'):
        n = len(group)
        n_train = int(n * train_prop)
        n_val = int(n * valid_prop)

        train_data = pd.concat([train_data, group[:n_train]])
        val_data = pd.concat([val_data, group[n_train:n_train + n_val]])
        test_data = pd.concat([test_data, group[n_train + n_val:]])

    # drop all unnecesary columns
    train_data = train_data.drop(columns=['age','age_group', 'income_group', 'strat_key'])
    val_data = val_data.drop(columns=['age','age_group', 'income_group', 'strat_key'])
    test_data = test_data.drop(columns=['age','age_group', 'income_group', 'strat_key'])

    X_train = train_data.drop(['TARGET'])
    y_train = train_data['TARGET']

    X_validation = val_data.drop(['TARGET'])
    y_validation = val_data['TARGET']

    X_test = test_data.drop(['TARGET'])
    y_test = test_data['TARGET']

    return X_train, y_train, X_validation, y_validation, X_test, y_test


#### Random Split

In [7]:
def random_train_test_split(X, y, test_size=0.2, val_size=0.2, random_seed=42):
    np.random.seed(random_seed)

    n = len(X)
    indices = np.arange(n)
    np.random.shuffle(indices)

    test_indices = indices[:int(n * test_size)]
    val_indices = indices[int(n * test_size):int(n * (test_size + val_size))]
    train_indices = indices[int(n * (test_size + val_size)):]

    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]
    X_val, y_val = X[val_indices], y[val_indices]

    return X_train, y_train, X_val, y_val, X_test, y_test

### Metrics

#### Recall

In [8]:
def recall(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    TP = np.sum((y_true == 1) & (y_pred == 1))
    FN = np.sum((y_true == 1) & (y_pred == 0))

    return float(TP / (TP + FN)) if (TP + FN) > 0 else 0.0

#### Precision

In [9]:
def precision(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    TP = np.sum((y_true == 1) & (y_pred == 1))
    FP = np.sum((y_true == 0) & (y_pred == 1))

    return float(TP / (TP + FP)) if (TP + FP) > 0 else 0.0

#### Accuracy

In [10]:
# Accuracy
def compute_accuracy(truth, predicted):
    return np.mean(truth == predicted)


#### F1 Score

In [11]:
# F1 Score
def calculate_f1(y_true, y_pred):
    TP = FP = TN = FN = 0

    for true, pred in zip(y_true, y_pred):
        if true == 1 and pred == 1:
            TP += 1
        elif true == 0 and pred == 1:
            FP += 1
        elif true == 0 and pred == 0:
            TN += 1
        elif true == 1 and pred == 0:
            FN += 1  ,

    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0
    if precision + recall == 0:
        return 0

    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

#### F2 Score

In [12]:
# F2 Score
def calculate_f2(y_true, y_pred, b=2):
    r = recall(y_true, y_pred)
    p = precision(y_true, y_pred)

    if r + p == 0:
        return 0

    f2 = (1 + b**2) * (r * p) / (b**2 * r + p)
    return f2

#### ROC-AUC

In [13]:
def calc_roc_auc(y_true, y_prob):
  sorted_indices = np.argsort(y_prob)
  true_sort = y_true[sorted_indices]
  prob_sort = y_prob[sorted_indices]

  TP = np.cumsum(true_sort)
  FP = np.cumsum(1 - true_sort)

  TPR = TP / TP[-1]
  FPR = FP / FP[-1]

  # rocauc
  rocauc_score = np.trapz(FPR, TPR)

  return rocauc_score

### Models

#### Logistic Regression

In [14]:
# logistic regression
lr = LogisticRegression()
lr.fit(X, y)
w = lr.coef_[0]
b = lr.intercept_[0]
print(f'intercept: {b} \n weights: {w}')

intercept: -7.61617123548896e-08 
 weights: [-9.20745121e-08 -7.82268465e-07  3.09376823e-06  3.83392256e-06
 -3.85531612e-06 -2.97772617e-09  1.11785036e-04  1.20372071e-06
  2.37650110e-05  1.14889235e-04 -7.61617124e-08 -7.51256996e-08
  1.91869578e-08 -7.53717094e-08 -3.95068036e-08 -1.52292429e-08
 -2.28318903e-07 -4.96404524e-08 -4.45214285e-08 -1.30848943e-06
 -1.76998237e-09 -2.35790368e-09 -2.50407522e-09  1.39153318e-08
  2.27292003e-08  1.54017655e-08 -1.53657166e-07 -1.63871511e-08
  4.72020715e-08 -1.55511883e-08  3.72698940e-08  1.40631199e-04
  0.00000000e+00  2.11496486e-08 -1.03258035e-10 -1.05356774e-09
  2.15784443e-09 -4.11107380e-11 -2.49338378e-08 -1.71349204e-09
 -2.86347815e-11 -2.64412419e-09 -5.17452937e-12 -3.87559684e-09
 -3.43942271e-09 -1.08448408e-09 -7.12148851e-09 -4.53765732e-10
 -6.63137818e-09 -1.13831133e-10  1.28803384e-10 -1.19415074e-10
 -4.42801957e-10  9.73633975e-10 -1.54825652e-09 -4.52503807e-08
 -2.00152160e-08  1.18011109e-07 -6.23376446e-

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Logistic Regression W/ Penalization

In [15]:
# logistic regression w penalty
lr_penalty = LogisticRegression(penalty='l2', C=1.0)
lr_penalty.fit(X, y)
w = lr_penalty.coef_[0]
b = lr_penalty.intercept_[0]
print(f'intercept: {b} \n weights: {w}')

intercept: -7.61617123548896e-08 
 weights: [-9.20745121e-08 -7.82268465e-07  3.09376823e-06  3.83392256e-06
 -3.85531612e-06 -2.97772617e-09  1.11785036e-04  1.20372071e-06
  2.37650110e-05  1.14889235e-04 -7.61617124e-08 -7.51256996e-08
  1.91869578e-08 -7.53717094e-08 -3.95068036e-08 -1.52292429e-08
 -2.28318903e-07 -4.96404524e-08 -4.45214285e-08 -1.30848943e-06
 -1.76998237e-09 -2.35790368e-09 -2.50407522e-09  1.39153318e-08
  2.27292003e-08  1.54017655e-08 -1.53657166e-07 -1.63871511e-08
  4.72020715e-08 -1.55511883e-08  3.72698940e-08  1.40631199e-04
  0.00000000e+00  2.11496486e-08 -1.03258035e-10 -1.05356774e-09
  2.15784443e-09 -4.11107380e-11 -2.49338378e-08 -1.71349204e-09
 -2.86347815e-11 -2.64412419e-09 -5.17452937e-12 -3.87559684e-09
 -3.43942271e-09 -1.08448408e-09 -7.12148851e-09 -4.53765732e-10
 -6.63137818e-09 -1.13831133e-10  1.28803384e-10 -1.19415074e-10
 -4.42801957e-10  9.73633975e-10 -1.54825652e-09 -4.52503807e-08
 -2.00152160e-08  1.18011109e-07 -6.23376446e-

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### SVC

In [19]:
# SVC Model
#SVC
# Define the pipeline with PCA, StandardScaler, and SVC
clf = make_pipeline(
    PCA(n_components=0.95),          # Retain x% of the variance or set a fixed number of components
    StandardScaler(),
    SVC(kernel="linear", gamma="auto")
)

# Fit the model
clf.fit(X, y)

# Retrieve the trained SVC model from the pipeline
svc_model = clf.named_steps['svc']

# Access the coefficients and intercept
weights = svc_model.coef_
intercept = svc_model.intercept_

print(f'weights: {weights} \n constant: {b}')
print(f'Actual decision boundary: {-b/weights}')


weights: [[ 5.81331135e-05 -4.79375358e-06 -1.66996527e-04]] 
 constant: [-0.96448307]
Actual decision boundary: [[  16590.94127187 -201195.79725874   -5775.46784354]]


#### LDA


In [17]:
# LDA Model

lda_model = LinearDiscriminantAnalysis()

lda_model.fit(X,y)

weights = lda_model.coef_
b = lda_model.intercept_

print(f'weights: {weights} \n constant: {b}')

weights: [[-6.99163540e-03  1.31726591e-07  2.51570304e-06  7.90656431e-06
  -2.97870729e-06  3.30432518e+00  3.10964090e-05  6.09622973e-05
   1.22852234e-05  6.92994698e-05  2.55523588e-14 -1.13795177e+00
   2.32766005e-01 -1.04844708e-01 -6.25567688e-02 -3.06116334e-02
   1.04697501e-02 -1.36067968e-01  3.02887942e-01  2.76813800e-03
  -2.55896995e-01  1.49182311e-01 -1.99011053e-01  3.04084734e-01
   3.83162041e-03  3.11606065e-02 -2.58909916e+00  2.51117890e-02
   1.66423113e-01 -2.74171808e-02  6.51061324e-02  9.02112208e-05
  -5.85632194e-14  5.84966274e-01 -2.68696131e-01  5.20468085e-01
   5.29385595e-01  1.62834989e-01  3.62887028e-01  3.65029890e-01
  -1.06538072e+00 -1.13021294e-01  2.83997546e-01 -4.60871568e-01
  -5.49975143e-01 -5.08995001e-01 -4.66557311e-01 -1.02500534e+00
  -4.28563348e-01 -2.19961398e-01  5.62758668e-01  4.62208840e-01
  -2.76730968e-02  1.31825353e-01 -1.53816093e-02 -3.96546600e-03
  -1.66288010e-02  3.37600839e-02  2.41319189e-01  4.22802912e-01
 