In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from metrics import *

# Read the application table, then replace whatever index the CSV produced
# with a fresh 0..n-1 range before displaying the frame.
df = pd.read_csv("dataset/application_train.csv")
df = df.reset_index(drop=True)
df

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,...,0,0,0,0,,,,,,
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,...,0,0,0,0,,,,,,
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
def clean(data):
    """Fill missing values in every column of ``data`` and return the frame.

    Text columns (``object`` or pandas string dtype) get the empty string;
    every other column gets ``0``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. Its columns are overwritten, so the caller's object
        is modified.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with no missing values left.
    """
    for col in data.columns:
        column = data[col]
        # Assign the filled column back instead of calling
        # ``data[col].fillna(..., inplace=True)``: the in-place call acts on a
        # temporary selection and is a silent no-op under pandas Copy-on-Write.
        is_text = column.dtype == "object" or pd.api.types.is_string_dtype(column.dtype)
        data[col] = column.fillna("" if is_text else 0)
    return data
# clean() fills missing values column by column and returns the frame it was
# given, so `df` ends up bound to the NaN-free version of the loaded table.
df = clean(df)

In [8]:
# Class balance of the label column; normalize=True reports proportions rather than raw counts.
df["TARGET"].value_counts(normalize=True)

TARGET
0    0.919271
1    0.080729
Name: proportion, dtype: float64

# Cross Validation Splits

In [25]:
def dummies(data):
    """One-hot encode the categorical columns of ``data``.

    Columns whose values are all distinct (as many unique values as rows) are
    dropped first; this is a heuristic for filtering out identifier columns.
    The remaining text/categorical columns are expanded by
    ``pandas.get_dummies``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Encoded frame. Indicator columns and boolean columns are 0/1
        integers; all other numeric columns keep their original dtype and
        values.
    """
    n_rows = data.shape[0]
    dummyCols = [col for col in data.columns if data[col].nunique() < n_rows]  # filter out id columns automatically
    # Ask get_dummies for integer indicators directly. Casting the whole frame
    # with ``.astype(int)`` would also truncate every float feature
    # (e.g. 24700.5 -> 24700, anything in [0, 1) -> 0).
    encoded = pd.get_dummies(data[dummyCols], dtype=int)
    # Pre-existing boolean columns are passed through by get_dummies; keep
    # returning them as 0/1 integers.
    bool_cols = encoded.select_dtypes(include="bool").columns
    return encoded.astype({col: int for col in bool_cols})

def splits(data, mode="random", v=5, random_state=None):
    """Build ``v`` train/test partitions of ``data`` for cross-validation.

    The frame is re-indexed to 0..n-1, passed through ``dummies`` and split
    into features (everything except ``"TARGET"``) and the ``"TARGET"`` label.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``"TARGET"`` column. The caller's frame is not modified.
    mode : {"random", "stratified", "non-random"}
        * ``"random"``: each of the ``v`` partitions is an independent 80/20
          shuffle split, so test sets of different partitions may overlap.
        * ``"stratified"``: like ``"random"``, but 80% of each class is drawn
          separately so the training set keeps the class proportions.
        * ``"non-random"``: the rows are cut, in order, into ``v`` contiguous
          test blocks; every row is tested exactly once. When the row count is
          not divisible by ``v`` the first blocks are one row longer.
    v : int
        Number of partitions to produce.
    random_state : int, numpy.random.Generator or None
        Seed (or generator) for the random modes. ``None`` uses NumPy's global
        random state.

    Returns
    -------
    list of tuple
        ``(X_train, X_test, y_train, y_test)`` for each partition.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported values.
    """
    # reset_index returns a new frame, so the caller's index is left alone and
    # row labels coincide with row positions from here on.
    data = dummies(data.reset_index(drop=True))
    X = data.drop("TARGET", axis=1)
    y = data["TARGET"]
    n_rows = len(X)
    positions = np.arange(n_rows)
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    def make_fold(train_pos, test_pos):
        # Slice features and labels with the same positional indices.
        return (X.iloc[train_pos], X.iloc[test_pos], y.iloc[train_pos], y.iloc[test_pos])

    folds = []
    if mode == "random":
        n_train = int(n_rows * 0.8)
        for _ in range(v):
            train_pos = rng.choice(positions, size=n_train, replace=False)
            test_pos = np.setdiff1d(positions, train_pos)
            folds.append(make_fold(train_pos, test_pos))
    elif mode == "stratified":
        # Row positions of each class, in order of first appearance.
        class_positions = [np.flatnonzero((y == c).to_numpy()) for c in y.unique()]
        for _ in range(v):
            train_pos = np.concatenate(
                [rng.choice(p, size=int(len(p) * 0.8), replace=False) for p in class_positions]
            )
            test_pos = np.setdiff1d(positions, train_pos)
            folds.append(make_fold(train_pos, test_pos))
    elif mode == "non-random":
        # array_split spreads the remainder over the first blocks; a fixed
        # ``len(X) // v`` block size would leave the trailing rows untested.
        for test_pos in np.array_split(positions, v):
            train_pos = np.setdiff1d(positions, test_pos)
            folds.append(make_fold(train_pos, test_pos))
    else:
        raise ValueError(f"Invalid mode: {mode}")
    return folds

def cv(data, mode="random", v=5, models=[LogisticRegression(), SVC(probability=True), LinearDiscriminantAnalysis()]):
    """Fit every model on every partition from ``splits`` and tabulate its scores.

    Parameters
    ----------
    data : pandas.DataFrame
        Passed straight to ``splits``.
    mode : str
        Split strategy, forwarded to ``splits``.
    v : int
        Number of partitions, forwarded to ``splits``.
    models : list
        Estimators providing ``fit``, ``predict`` and ``predict_proba``. Each
        instance is re-fitted in place on every partition. The default SVC is
        built with ``probability=True`` because ``predict_proba`` is not
        available on an SVC otherwise.

    Returns
    -------
    pandas.DataFrame
        One row per (partition, model) with the columns ``model``, ``fold``,
        ``accuracy``, ``precision``, ``recall``, ``f1`` and ``roc_auc``.
        ``fold`` is the 1-based partition number.

    Notes
    -----
    ``accuracy``, ``precision``, ``recall``, ``f1`` and ``roc_auc`` are not
    defined in this notebook; presumably they come from ``from metrics import *``
    (TODO confirm their expected argument shapes, in particular whether
    ``roc_auc`` wants the full ``predict_proba`` matrix).
    """
    columns = ["model", "fold", "accuracy", "precision", "recall", "f1", "roc_auc"]
    records = []
    # Number the partitions themselves; a counter reset inside this loop would
    # record the model's position rather than the fold.
    for fold_number, (X_train, X_test, y_train, y_test) in enumerate(splits(data, mode, v), start=1):
        for m in models:
            m.fit(X_train, y_train)
            predictions = m.predict(X_test)
            probabilities = m.predict_proba(X_test)
            records.append(
                {
                    "model": str(m),
                    "fold": fold_number,
                    "accuracy": accuracy(y_test, predictions),
                    "precision": precision(y_test, predictions),
                    "recall": recall(y_test, predictions),
                    "f1": f1(y_test, predictions),
                    "roc_auc": roc_auc(y_test, probabilities),
                }
            )
    # Build the frame once from the collected rows: DataFrame.append no longer
    # exists in pandas 2.x, and growing a frame row by row is quadratic anyway.
    return pd.DataFrame(records, columns=columns)

In [26]:
# Try the splitter on a small subset; the fixed seed makes a re-run pick the same 500 rows.
sample = df.sample(n=500, random_state=42)
stuff = splits(sample, mode="non-random")
# Keep the first partition around for ad-hoc inspection.
X_train, X_test, y_train, y_test = stuff[0]

In [24]:
# Sanity check: print the training-label shape of every partition in `stuff`.
# Note: the loop variables are notebook-level names, so after this cell
# X_train / X_test / y_train / y_test refer to the LAST partition.
for X_train, X_test, y_train, y_test in stuff:
    print(y_train.shape)

(400,)
(400,)
(400,)
(400,)
(400,)
