In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
from joblib import Parallel, delayed
import numpy.random as random

In [2]:
df_path = "../data/application_train.csv"
df = pd.read_csv(df_path)
df.shape

(307511, 122)

In [14]:
df["DAYS_ID_PUBLISH"].name

'ORGANIZATION_TYPE'

In [15]:
class TargetEncoder:
    def __init__(self):
        self.encoder = None
        
    def fit(self, cat, target):
        colname_cat = cat.name
        colname_target = target.name
        
        concat = pd.concat([cat, target], axis=1)
        self.encoder = concat.groupby(colname_cat)[colname_target].mean()
    
    def transform(self, cat):
        target = cat.map(self.encoder)
        return target
    
    def fit_transform(self, cat, target):
        self.fit(cat, target)
        encoded = self.transform(cat)
        return encoded

In [34]:
df = df.dropna(subset=["AMT_GOODS_PRICE", "AMT_ANNUITY"])

In [35]:
df["DAYS_EMPLOYED"] = df["DAYS_EMPLOYED"].map(lambda x:x if x != 365243 else 0)
df["CREDIT_INCOME_RATIO"] = df["AMT_CREDIT"] / df["AMT_INCOME_TOTAL"]
df["CREDIT_GOODS_RATIO"] = df["AMT_CREDIT"] / df["AMT_GOODS_PRICE"]
df["CREDIT_ANNUITY_RATIO"] = df["AMT_CREDIT"] / df["AMT_ANNUITY"]
df["EMPLOYED_BIRTH_RATIO"] = df["DAYS_EMPLOYED"] / df["DAYS_BIRTH"]

te = TargetEncoder()
df["REGION_TARGET_ENCODED"] = te.fit_transform(df["REGION_POPULATION_RELATIVE"], df["TARGET"])
df["ORGANIZATION_TARGET_ENCODED"] = te.fit_transform(df["ORGANIZATION_TYPE"], df["TARGET"])

columns_to_use = ["DAYS_EMPLOYED", "CREDIT_INCOME_RATIO", "CREDIT_GOODS_RATIO",
                  "CREDIT_ANNUITY_RATIO", "REGION_TARGET_ENCODED",
                  "DAYS_BIRTH", "EMPLOYED_BIRTH_RATIO", "DAYS_ID_PUBLISH", "ORGANIZATION_TARGET_ENCODED"]

X = df[columns_to_use].values
y = df["TARGET"].values

In [36]:
skf = StratifiedKFold(n_splits=3)
for train_idx, test_idx in skf.split(X, y):
    train_tmp = X[train_idx]
    y_train_tmp = y[train_idx]
    Xfold3 = X[test_idx]
    yfold3 = y[test_idx]
    
skf2 = StratifiedKFold(n_splits=2)
for train_idx, test_idx in skf2.split(train_tmp, y_train_tmp):
    Xfold1 = train_tmp[train_idx]
    yfold1 = y_train_tmp[train_idx]
    Xfold2 = train_tmp[test_idx]
    yfold2 = y_train_tmp[test_idx]
    
Xfold1.shape, Xfold2.shape, Xfold3.shape, yfold1.shape, yfold2.shape, yfold3.shape

((102407, 9), (102407, 9), (102407, 9), (102407,), (102407,), (102407,))

In [38]:
old_model = RandomForestRegressor()
old_model.fit(Xfold1, yfold1)
ypred1 = old_model.predict(Xfold2)
auc = roc_auc_score(yfold2, ypred1)
print("AUC: ", auc)

AUC:  0.5859685617690992
