In [1]:
!pip install pyarrow

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import numpy as np

import os, sys, json
import gc

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer


# Stratégie pour le ML

À partir des différents datasets, nous devons en obtenir un seul pour pouvoir avancer sur la partie ML.

Nous allons :
- supprimer les colonnes pas assez renseignées.
- transformer les colonnes catégorielles en colonnes numériques via get_dummies

Ensuite, nous allons agréger les données selon la colonne SK_ID_CURR en obtenant pour chaque colonne originelle :
- la valeur minimale
- la valeur maximale
- la valeur moyenne
- la variance
- la somme des valeurs agrégées

Nous créons aussi pour chaque ligne une feature dont le nom est composé du basename du fichier originel avec comme suffixe '_NB_ROWS' pour indiquer le nombre de lignes agrégées.

In [3]:
# Minimum share of non-null values a column must have to be kept.
# Used below when filtering the application_train / application_test columns;
# load_csv takes its own `threshold` argument, whose default is also 0.5.
threshold = 0.5

In [4]:
def aggregate_df(df, by_col, df_name, aggregate_dummies=False):
    """Aggregate the numeric columns of ``df`` for each value of ``by_col``.

    For every int/float column other than ``by_col`` the result holds five
    columns named ``{df_name}_{column}_{min|max|mean|var|sum}``, plus a single
    ``{df_name}_NB_ROWS`` column with the number of rows of each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to aggregate. It is not modified.
    by_col : str
        Name of the grouping key column.
    df_name : str
        Prefix of every generated column name.
    aggregate_dummies : bool, default False
        When True, the frame is one-hot encoded with ``pd.get_dummies`` and the
        indicator columns are aggregated like the numeric ones. With the
        default, only the original int/float columns are aggregated and no
        one-hot encoding is computed at all.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``by_col`` value, ``by_col`` being the first column.
    """
    stats = ("min", "max", "mean", "var", "sum")
    nb_rows_name = f"{df_name}_NB_ROWS"

    numeric_cols = [
        c for c in df.select_dtypes(include=[int, float]).columns if c != by_col
    ]

    if aggregate_dummies:
        # Indicators are created as floats so that every statistic is defined on them.
        source = pd.get_dummies(df, dtype=float)
        original_cols = set(df.columns)
        dummy_cols = [c for c in source.columns if c not in original_cols]
    else:
        # Only original numeric columns are aggregated, so encoding the
        # categorical columns would just allocate memory for nothing.
        source = df
        dummy_cols = []

    cols_to_aggregate = numeric_cols + dummy_cols
    grouped = source.groupby(by_col)

    if not cols_to_aggregate:
        # Nothing to summarise: still report how many rows each key has.
        return grouped.size().rename(nb_rows_name).reset_index()

    dict_agg = {}
    for position, c in enumerate(cols_to_aggregate):
        named_aggs = []
        if position == 0:
            # The row count is attached to the first aggregated column only.
            named_aggs.append((nb_rows_name, "size"))
        named_aggs.extend((f"{df_name}_{c}_{stat}", stat) for stat in stats)

        # Should not be used anymore, was used in case of aggregating aggregated tables
        if c.endswith("_NB_ROWS"):
            named_aggs.append((c, "sum"))
        dict_agg[c] = named_aggs

    aggregated = grouped.agg(dict_agg)
    # Columns come back as (source column, output name) pairs: keep the output name.
    aggregated.columns = aggregated.columns.get_level_values(1)
    result = aggregated.reset_index()

    del source, grouped, aggregated
    gc.collect()

    return result

In [5]:
def load_csv(table, threshold=0.5, data_dir="../assets"):
    """Load ``{data_dir}/{table}.csv``, drop sparse columns and impute missing values.

    Parameters
    ----------
    table : str
        File name without the ``.csv`` extension.
    threshold : float, default 0.5
        Columns whose share of non-null values is strictly below this value
        are dropped.
    data_dir : str, default "../assets"
        Directory containing the CSV file.

    Returns
    -------
    pandas.DataFrame
        The remaining columns, where missing values are replaced by the most
        frequent value (non-numeric columns) or by the median (numeric columns).
    """
    df = pd.read_csv(f"{data_dir}/{table}.csv")

    filled_ratio = df.notna().mean()
    df = df.drop(columns=filled_ratio.index[filled_ratio < threshold])

    # Results are assigned back to the frame: filling `df[c]` in place acts on
    # a temporary object and is not guaranteed to update `df`.
    for c in df.select_dtypes(exclude='number').columns:
        counts = df[c].value_counts()
        if not counts.empty:  # an all-NaN column has no most frequent value
            df[c] = df[c].fillna(counts.index[0])

    for c in df.select_dtypes(include='number').columns:
        df[c] = df[c].fillna(df[c].median())

    return df

# Fusion de bureau_balance et bureau

In [6]:
df_bureau_balance = load_csv("bureau_balance")

In [7]:
df_bureau_balance.shape

(27299925, 3)

In [8]:
df_bureau_balance.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [9]:
df_bureau = load_csv("bureau")

In [10]:
# Day offsets below -20000 are replaced by NaN. This runs after load_csv has
# imputed the medians, so these values stay missing in df_bureau.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
for col in ("DAYS_CREDIT_ENDDATE", "DAYS_CREDIT_UPDATE"):
    df_bureau.loc[df_bureau[col] < -20000, col] = np.nan

In [11]:
# Attach the monthly bureau_balance rows to each bureau record (left join on SK_ID_BUREAU).
# NOTE(review): bureau_balance holds several rows per SK_ID_BUREAU, so this merge repeats a bureau
# row once per matching month; the per-client aggregates computed afterwards are then taken over
# (credit, month) rows rather than over credits — confirm this weighting is intended.
df_bureau = df_bureau.merge(df_bureau_balance, on="SK_ID_BUREAU", how="left")

In [12]:
df_bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,MONTHS_BALANCE,STATUS
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,0,91323.0,0.0,0.0,0.0,Consumer credit,-131.0,,
1,215354,5714463,Active,currency 1,-208,0,1075.0,-897.0,0,225000.0,171342.0,0.0,0.0,Credit card,-20.0,,
2,215354,5714464,Active,currency 1,-203,0,528.0,-897.0,0,464323.5,0.0,0.0,0.0,Consumer credit,-16.0,,
3,215354,5714465,Active,currency 1,-203,0,-330.0,-897.0,0,90000.0,0.0,0.0,0.0,Credit card,-16.0,,
4,215354,5714466,Active,currency 1,-629,0,1197.0,-897.0,0,2700000.0,0.0,0.0,0.0,Consumer credit,-21.0,,


In [13]:
df_bureau.drop(columns="SK_ID_BUREAU", inplace=True)

# Agrégation de bureau

In [14]:
df_bureau = aggregate_df(df_bureau, "SK_ID_CURR", "BUREAU")

In [15]:
df_bureau.head()

Unnamed: 0,SK_ID_CURR,BUREAU_NB_ROWS,BUREAU_DAYS_CREDIT_min,BUREAU_DAYS_CREDIT_max,BUREAU_DAYS_CREDIT_mean,BUREAU_DAYS_CREDIT_var,BUREAU_DAYS_CREDIT_sum,BUREAU_CREDIT_DAY_OVERDUE_min,BUREAU_CREDIT_DAY_OVERDUE_max,BUREAU_CREDIT_DAY_OVERDUE_mean,...,BUREAU_DAYS_CREDIT_UPDATE_min,BUREAU_DAYS_CREDIT_UPDATE_max,BUREAU_DAYS_CREDIT_UPDATE_mean,BUREAU_DAYS_CREDIT_UPDATE_var,BUREAU_DAYS_CREDIT_UPDATE_sum,BUREAU_MONTHS_BALANCE_min,BUREAU_MONTHS_BALANCE_max,BUREAU_MONTHS_BALANCE_mean,BUREAU_MONTHS_BALANCE_var,BUREAU_MONTHS_BALANCE_sum
0,100001,172,-1572,-49,-1009.284884,168743.397899,-173597,0,0,0.0,...,-155.0,-6.0,-127.651163,3292.906841,-21956.0,-51.0,0.0,-16.27907,154.518156,-2800.0
1,100002,110,-1437,-103,-996.781818,108311.29141,-109646,0,0,0.0,...,-1185.0,-7.0,-631.963636,225772.842702,-69516.0,-47.0,0.0,-24.554545,123.313511,-2701.0
2,100003,4,-2586,-606,-1400.75,827783.583333,-5603,0,0,0.0,...,-2131.0,-43.0,-816.0,824562.0,-3264.0,,,,,0.0
3,100004,2,-1326,-408,-867.0,421362.0,-1734,0,0,0.0,...,-682.0,-382.0,-532.0,45000.0,-1064.0,,,,,0.0
4,100005,21,-373,-62,-272.380952,17801.747619,-5720,0,0,0.0,...,-121.0,-11.0,-81.952381,2639.047619,-1721.0,-12.0,0.0,-4.333333,14.533333,-91.0


In [16]:
del df_bureau_balance
gc.collect()

0

In [17]:
# Make sure the output directory exists before writing. It has to be the same
# "../working" path that the to_parquet call below writes to.
os.makedirs("../working", exist_ok=True)
df_bureau.to_parquet("../working/bureau.parquet", engine="pyarrow")

In [18]:
del df_bureau
gc.collect()

0

# Agrégation de credit_card_balance

In [19]:
df_credit_card_balance = load_csv("credit_card_balance")

In [20]:
df_credit_card_balance.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,...,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,...,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,...,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,...,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0


In [21]:
df_credit_card_balance.shape

(3840312, 23)

In [22]:
df_credit_card_balance["SK_ID_PREV"].unique().shape

(104307,)

In [23]:
df_credit_card_balance.drop(columns="SK_ID_PREV", inplace=True)

In [24]:
df_credit_card_balance = aggregate_df(df_credit_card_balance, "SK_ID_CURR", "CREDIT_CARD_BALANCE")

In [25]:
df_credit_card_balance.head()

Unnamed: 0,SK_ID_CURR,CREDIT_CARD_BALANCE_NB_ROWS,CREDIT_CARD_BALANCE_MONTHS_BALANCE_min,CREDIT_CARD_BALANCE_MONTHS_BALANCE_max,CREDIT_CARD_BALANCE_MONTHS_BALANCE_mean,CREDIT_CARD_BALANCE_MONTHS_BALANCE_var,CREDIT_CARD_BALANCE_MONTHS_BALANCE_sum,CREDIT_CARD_BALANCE_AMT_BALANCE_min,CREDIT_CARD_BALANCE_AMT_BALANCE_max,CREDIT_CARD_BALANCE_AMT_BALANCE_mean,...,CREDIT_CARD_BALANCE_SK_DPD_min,CREDIT_CARD_BALANCE_SK_DPD_max,CREDIT_CARD_BALANCE_SK_DPD_mean,CREDIT_CARD_BALANCE_SK_DPD_var,CREDIT_CARD_BALANCE_SK_DPD_sum,CREDIT_CARD_BALANCE_SK_DPD_DEF_min,CREDIT_CARD_BALANCE_SK_DPD_DEF_max,CREDIT_CARD_BALANCE_SK_DPD_DEF_mean,CREDIT_CARD_BALANCE_SK_DPD_DEF_var,CREDIT_CARD_BALANCE_SK_DPD_DEF_sum
0,100006,6,-6,-1,-3.5,3.5,-21,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0,0,0.0,0.0,0
1,100011,74,-75,-2,-38.5,462.5,-2849,0.0,189000.0,54482.111149,...,0,0,0.0,0.0,0,0,0,0.0,0.0,0
2,100013,96,-96,-1,-48.5,776.0,-4656,0.0,161420.22,18159.919219,...,0,1,0.010417,0.010417,1,0,1,0.010417,0.010417,1
3,100021,17,-18,-2,-10.0,25.5,-170,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0,0,0.0,0.0,0
4,100023,8,-11,-4,-7.5,6.0,-60,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0,0,0.0,0.0,0


In [26]:
df_credit_card_balance.shape

(103558, 102)

In [27]:
df_credit_card_balance.to_parquet("../working/credit_card_balance.parquet", engine="pyarrow")

In [28]:
del df_credit_card_balance
gc.collect()

0

# Agrégation de installments_payments

In [29]:
df_installments_payments = load_csv("installments_payments")

In [30]:
df_installments_payments.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


In [31]:
df_installments_payments.shape

(13605401, 8)

In [32]:
df_installments_payments.drop(columns="SK_ID_PREV", inplace=True)

In [33]:
# Percentage and difference paid in each installment (amount paid and installment value)
df_installments_payments['PAYMENT_PERC'] = df_installments_payments['AMT_PAYMENT'] / df_installments_payments['AMT_INSTALMENT']
df_installments_payments['PAYMENT_DIFF'] = df_installments_payments['AMT_INSTALMENT'] - df_installments_payments['AMT_PAYMENT']

# Days past due and days before due (no negative values).
# Series.where keeps the strictly positive values and maps everything else to 0,
# which is the vectorized form of `x if x > 0 else 0` applied row by row.
days_past_due = df_installments_payments['DAYS_ENTRY_PAYMENT'] - df_installments_payments['DAYS_INSTALMENT']
days_before_due = df_installments_payments['DAYS_INSTALMENT'] - df_installments_payments['DAYS_ENTRY_PAYMENT']
df_installments_payments['DPD'] = days_past_due.where(days_past_due > 0, 0)
df_installments_payments['DBD'] = days_before_due.where(days_before_due > 0, 0)


In [34]:
df_installments_payments = aggregate_df(df_installments_payments, "SK_ID_CURR", "INSTALLMENTS_PAYMENTS")

In [35]:
df_installments_payments.head()

Unnamed: 0,SK_ID_CURR,INSTALLMENTS_PAYMENTS_NB_ROWS,INSTALLMENTS_PAYMENTS_NUM_INSTALMENT_VERSION_min,INSTALLMENTS_PAYMENTS_NUM_INSTALMENT_VERSION_max,INSTALLMENTS_PAYMENTS_NUM_INSTALMENT_VERSION_mean,INSTALLMENTS_PAYMENTS_NUM_INSTALMENT_VERSION_var,INSTALLMENTS_PAYMENTS_NUM_INSTALMENT_VERSION_sum,INSTALLMENTS_PAYMENTS_NUM_INSTALMENT_NUMBER_min,INSTALLMENTS_PAYMENTS_NUM_INSTALMENT_NUMBER_max,INSTALLMENTS_PAYMENTS_NUM_INSTALMENT_NUMBER_mean,...,INSTALLMENTS_PAYMENTS_DPD_min,INSTALLMENTS_PAYMENTS_DPD_max,INSTALLMENTS_PAYMENTS_DPD_mean,INSTALLMENTS_PAYMENTS_DPD_var,INSTALLMENTS_PAYMENTS_DPD_sum,INSTALLMENTS_PAYMENTS_DBD_min,INSTALLMENTS_PAYMENTS_DBD_max,INSTALLMENTS_PAYMENTS_DBD_mean,INSTALLMENTS_PAYMENTS_DBD_var,INSTALLMENTS_PAYMENTS_DBD_sum
0,100001,7,1.0,2.0,1.142857,0.142857,8.0,1,4,2.714286,...,0.0,11.0,1.571429,17.285714,11.0,0.0,36.0,8.857143,164.142857,62.0
1,100002,19,1.0,2.0,1.052632,0.052632,20.0,1,19,10.0,...,0.0,0.0,0.0,0.0,0.0,12.0,31.0,20.421053,24.25731,388.0
2,100003,25,1.0,2.0,1.04,0.04,26.0,1,12,5.08,...,0.0,0.0,0.0,0.0,0.0,1.0,14.0,7.16,13.89,179.0
3,100004,3,1.0,2.0,1.333333,0.333333,4.0,1,3,2.0,...,0.0,0.0,0.0,0.0,0.0,3.0,11.0,7.666667,17.333333,23.0
4,100005,9,1.0,2.0,1.111111,0.111111,10.0,1,9,5.0,...,0.0,1.0,0.111111,0.111111,1.0,0.0,37.0,23.666667,176.5,213.0


In [36]:
df_installments_payments.shape

(339587, 52)

In [37]:
df_installments_payments.to_parquet("../working/installments_payments.parquet")

In [38]:
del df_installments_payments
gc.collect()

0

# Agrégation de pos_cash_balance

In [39]:
df_pos_cash_balance = load_csv("POS_CASH_balance")

In [40]:
df_pos_cash_balance.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [41]:
df_pos_cash_balance.shape

(10001358, 8)

In [42]:
df_pos_cash_balance.drop(columns="SK_ID_PREV", inplace=True)

In [43]:
df_pos_cash_balance = aggregate_df(df_pos_cash_balance, "SK_ID_CURR", "POS_CASH")

In [44]:
df_pos_cash_balance.head()

Unnamed: 0,SK_ID_CURR,POS_CASH_NB_ROWS,POS_CASH_MONTHS_BALANCE_min,POS_CASH_MONTHS_BALANCE_max,POS_CASH_MONTHS_BALANCE_mean,POS_CASH_MONTHS_BALANCE_var,POS_CASH_MONTHS_BALANCE_sum,POS_CASH_CNT_INSTALMENT_min,POS_CASH_CNT_INSTALMENT_max,POS_CASH_CNT_INSTALMENT_mean,...,POS_CASH_SK_DPD_min,POS_CASH_SK_DPD_max,POS_CASH_SK_DPD_mean,POS_CASH_SK_DPD_var,POS_CASH_SK_DPD_sum,POS_CASH_SK_DPD_DEF_min,POS_CASH_SK_DPD_DEF_max,POS_CASH_SK_DPD_DEF_mean,POS_CASH_SK_DPD_DEF_var,POS_CASH_SK_DPD_DEF_sum
0,100001,9,-96,-53,-72.555556,435.277778,-653,4.0,4.0,4.0,...,0,7,0.777778,5.444444,7,0,7,0.777778,5.444444,7
1,100002,19,-19,-1,-10.0,31.666667,-190,24.0,24.0,24.0,...,0,0,0.0,0.0,0,0,0,0.0,0.0,0
2,100003,28,-77,-18,-43.785714,607.137566,-1226,6.0,12.0,10.107143,...,0,0,0.0,0.0,0,0,0,0.0,0.0,0
3,100004,4,-27,-24,-25.5,1.666667,-102,3.0,4.0,3.75,...,0,0,0.0,0.0,0,0,0,0.0,0.0,0
4,100005,11,-25,-15,-20.0,11.0,-220,9.0,12.0,11.727273,...,0,0,0.0,0.0,0,0,0,0.0,0.0,0


In [45]:
df_pos_cash_balance.shape

(337252, 27)

In [46]:
df_pos_cash_balance.to_parquet("../working/pos_cash_balance.parquet")

In [47]:
del df_pos_cash_balance
gc.collect()

0

# Agrégation de previous_application

In [48]:
df_previous_application = load_csv("previous_application")

In [49]:
# Values above 20000 in these day-offset columns are replaced by NaN.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
day_columns = ["DAYS_FIRST_DRAWING", "DAYS_FIRST_DUE", "DAYS_LAST_DUE_1ST_VERSION", "DAYS_LAST_DUE", "DAYS_TERMINATION"]
for c in day_columns:
    # Some of these columns may already have been dropped by load_csv for being too sparse.
    if c in df_previous_application.columns:
        df_previous_application.loc[df_previous_application[c] > 20000, c] = np.nan

# Ratio between the amount asked for and the amount of the credit.
df_previous_application['APP_CREDIT_PERC'] = df_previous_application['AMT_APPLICATION'] / df_previous_application['AMT_CREDIT']

df_previous_application = df_previous_application.drop(columns="SK_ID_PREV")

df_previous_application = aggregate_df(df_previous_application, "SK_ID_CURR", "PREVIOUS_APPLICATION")
df_previous_application.to_parquet("../working/previous_application.parquet")

In [50]:
df_previous_application.shape

(338857, 82)

In [51]:
df_previous_application.head()

Unnamed: 0,SK_ID_CURR,PREVIOUS_APPLICATION_NB_ROWS,PREVIOUS_APPLICATION_AMT_ANNUITY_min,PREVIOUS_APPLICATION_AMT_ANNUITY_max,PREVIOUS_APPLICATION_AMT_ANNUITY_mean,PREVIOUS_APPLICATION_AMT_ANNUITY_var,PREVIOUS_APPLICATION_AMT_ANNUITY_sum,PREVIOUS_APPLICATION_AMT_APPLICATION_min,PREVIOUS_APPLICATION_AMT_APPLICATION_max,PREVIOUS_APPLICATION_AMT_APPLICATION_mean,...,PREVIOUS_APPLICATION_NFLAG_INSURED_ON_APPROVAL_min,PREVIOUS_APPLICATION_NFLAG_INSURED_ON_APPROVAL_max,PREVIOUS_APPLICATION_NFLAG_INSURED_ON_APPROVAL_mean,PREVIOUS_APPLICATION_NFLAG_INSURED_ON_APPROVAL_var,PREVIOUS_APPLICATION_NFLAG_INSURED_ON_APPROVAL_sum,PREVIOUS_APPLICATION_APP_CREDIT_PERC_min,PREVIOUS_APPLICATION_APP_CREDIT_PERC_max,PREVIOUS_APPLICATION_APP_CREDIT_PERC_mean,PREVIOUS_APPLICATION_APP_CREDIT_PERC_var,PREVIOUS_APPLICATION_APP_CREDIT_PERC_sum
0,100001,1,3951.0,3951.0,3951.0,,3951.0,24835.5,24835.5,24835.5,...,0.0,0.0,0.0,,0.0,1.044079,1.044079,1.044079,,1.044079
1,100002,1,9251.775,9251.775,9251.775,,9251.775,179055.0,179055.0,179055.0,...,0.0,0.0,0.0,,0.0,1.0,1.0,1.0,,1.0
2,100003,3,6737.31,98356.995,56553.99,2146706000.0,169661.97,68809.5,900000.0,435436.5,...,0.0,1.0,0.666667,0.333333,2.0,0.868825,1.011109,0.949329,0.005324,2.847988
3,100004,1,5357.25,5357.25,5357.25,,5357.25,24282.0,24282.0,24282.0,...,0.0,0.0,0.0,,0.0,1.207699,1.207699,1.207699,,1.207699
4,100005,2,4813.2,11250.0,8031.6,20716200.0,16063.2,0.0,44617.5,22308.75,...,0.0,0.0,0.0,0.0,0.0,1.111173,1.111173,1.111173,,1.111173


In [52]:
gc.collect()

0

# Fusion de application_train et application_test

Nous commençons par fusionner ces 2 tables pour être sûrs d'avoir les mêmes colonnes (par exemple, pour éviter des valeurs de features catégorielles non présentes dans un des 2 datasets).

In [53]:
df_application_train = pd.read_csv("../assets/application_train.csv")
df_application_test = pd.read_csv("../assets/application_test.csv")

In [54]:
# Share of non-null values per column, measured on the training table only;
# the columns found too sparse there are removed from both tables.
s = df_application_train.notna().mean()
cols_to_remove = [c for c in s.index if s[c] < threshold]
if cols_to_remove:
    for application_table in (df_application_train, df_application_test):
        application_table.drop(columns=cols_to_remove, inplace=True)

In [55]:
# The test table gets an all-NaN TARGET column so that it can be concatenated with the
# training table. np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
df_application_test["TARGET"] = np.nan

In [56]:
# ignore_index avoids duplicated row labels (both tables start at 0).
df_application = pd.concat([df_application_train, df_application_test], ignore_index=True)

# Ratio features
df_application['DAYS_EMPLOYED_PERC'] = df_application['DAYS_EMPLOYED'] / df_application['DAYS_BIRTH']
df_application['INCOME_CREDIT_PERC'] = df_application['AMT_INCOME_TOTAL'] / df_application['AMT_CREDIT']
df_application['INCOME_PER_PERSON'] = df_application['AMT_INCOME_TOTAL'] / df_application['CNT_FAM_MEMBERS']
df_application['ANNUITY_INCOME_PERC'] = df_application['AMT_ANNUITY'] / df_application['AMT_INCOME_TOTAL']
df_application['PAYMENT_RATE'] = df_application['AMT_ANNUITY'] / df_application['AMT_CREDIT']

# Imputation: most frequent value for non-numeric columns, median for numeric ones.
# TARGET is left untouched so that the test rows keep their NaN marker.
# Results are assigned back to the frame: filling `df_application[c]` in place acts on a
# temporary object and is not guaranteed to update `df_application`.
# NOTE(review): the fill values are computed on train and test rows together — confirm this is acceptable.
for c in df_application.select_dtypes(exclude='number').columns:
    if c != "TARGET":
        df_application[c] = df_application[c].fillna(df_application[c].value_counts().index[0])

for c in df_application.select_dtypes(include='number').columns:
    if c != "TARGET":
        df_application[c] = df_application[c].fillna(df_application[c].median())


In [57]:
del df_application_train, df_application_test
gc.collect()

0

In [58]:
df_application.shape

(356255, 86)

# Fusion avec les autres datasets

In [59]:
df_application = pd.get_dummies(df_application)

# Each aggregated table is left-joined on SK_ID_CURR and released right away to limit peak memory.
df_previous_application = pd.read_parquet("../working/previous_application.parquet")
df_application = df_application.merge(df_previous_application, on="SK_ID_CURR", how="left")
del df_previous_application
gc.collect()

df_pos_cash_balance = pd.read_parquet("../working/pos_cash_balance.parquet")
df_application = df_application.merge(df_pos_cash_balance, on="SK_ID_CURR", how="left")
del df_pos_cash_balance
gc.collect()

df_installments_payments = pd.read_parquet("../working/installments_payments.parquet")
df_application = df_application.merge(df_installments_payments, on="SK_ID_CURR", how="left")
del df_installments_payments
gc.collect()

df_credit_card_balance = pd.read_parquet("../working/credit_card_balance.parquet")
df_application = df_application.merge(df_credit_card_balance, on="SK_ID_CURR", how="left")
del df_credit_card_balance
gc.collect()

df_bureau = pd.read_parquet("../working/bureau.parquet")
df_application = df_application.merge(df_bureau, on="SK_ID_CURR", how="left")
del df_bureau
gc.collect()

# Infinite values are replaced by NaN once, on the whole frame, before the
# per-column fixes below (a full-frame replace per column would be needlessly slow).
df_application = df_application.replace([np.inf, -np.inf], np.nan)

for c in df_application.columns:
    # Clients absent from an aggregated table have 0 rows in it.
    if c.endswith("_NB_ROWS") or c.endswith("_nb"):
        df_application.loc[df_application[c].isna(), c] = 0
    # A variance that is missing although the mean exists is set to 0.
    if c.endswith("_var"):
        mean_col = c[0:-4] + "_mean"
        if mean_col in df_application.columns:
            df_application.loc[df_application[c].isna() & ~df_application[mean_col].isna(), c] = 0

df_application.to_parquet("../working/df_application_all.parquet")

# Sauvegarde des datasets application_train et application_test

In [60]:
# Rows whose TARGET is missing form the test set (saved without the TARGET column);
# all the other rows form the training set.
target_missing = df_application["TARGET"].isna()

df_test_rows = df_application.loc[target_missing].drop(columns="TARGET").reset_index(drop=True)
df_test_rows.to_parquet("../working/df_application_test.parquet")

df_train_rows = df_application.loc[~target_missing].reset_index(drop=True)
df_train_rows.to_parquet("../working/df_application_train.parquet")

del df_test_rows, df_train_rows

In [61]:
del df_application
gc.collect()

0

# Création des datasets de Train et de Test

Les données étant assez volumineuses, nous allons créer plusieurs jeux de données :
- un jeu avec 10% des données initiales
- un jeu avec 25% des données initiales
- un jeu avec toutes les données

Ceci nous permettra de modéliser plus rapidement avec moins de données au départ et ainsi d'obtenir des intuitions plus rapidement

In [62]:
fracs = [0.1, 0.25, 1]
seed = 42  # shared by the sub-sampling and the train/test split so that a re-run gives the same files

df = pd.read_parquet("../working/df_application_train.parquet")
df = df.drop(columns="SK_ID_CURR")

for frac in fracs:
    out_dir = f"../working/train_test_data/{frac}"
    os.makedirs(out_dir, exist_ok=True)

    if frac<1:
        # Sample the same fraction inside each TARGET class to keep the class ratio.
        df_frac = df.groupby("TARGET").sample(frac=frac, random_state=seed)
    else:
        df_frac = df
    s = df_frac.groupby("TARGET").size()

    print(f"frac={frac}, size={df_frac.shape[0]}, ratio 0/1={s[0]/s[1]}")
    X_train, X_test, y_train, y_test = train_test_split(df_frac.drop(columns="TARGET"), df_frac["TARGET"], test_size=0.33, random_state=seed, shuffle=True)
    X_train.reset_index(drop=True).to_parquet(f"{out_dir}/X_train.parquet")
    X_test.reset_index(drop=True).to_parquet(f"{out_dir}/X_test.parquet")
    np.save(f"{out_dir}/y_train", y_train)
    np.save(f"{out_dir}/y_test", y_test)
    

frac=0.1, size=30751, ratio 0/1=11.389605157131346
frac=0.25, size=76878, ratio 0/1=11.387689332903642
frac=1, size=307511, ratio 0/1=11.387150050352467
