In [43]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
import time


from sklearn.base import TransformerMixin
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score,roc_curve,auc
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

import import_ipynb
from function_for_eda import *

<b>Bước 1</b>: Merging tất cả các bảng qua SK_ID_CURR feature <br>
<b>Bước 2</b>: Tạo một số feature mới liên quan đến AMT_INCOME ở bảng tổng(bảng đã merge tất cả các bảng)

In [44]:
def merge_all_tables(application_train, application_test, bureau_aggregated, previous_aggregated, 
                    installments_aggregated, pos_aggregated, cc_aggregated):
    '''
    Function to merge all the tables together with the application_train and application_test tables
    on SK_ID_CURR.
    
    Inputs:
        All the previously pre-processed Tables.
        
    Returns:
        Single merged tables, one for training data and one for test data
    '''

    #merging application_train and application_test with Aggregated bureau table
    app_train_merged = application_train.merge(bureau_aggregated, on = 'SK_ID_CURR', how = 'left')
    app_test_merged = application_test.merge(bureau_aggregated, on = 'SK_ID_CURR', how = 'left')
    #merging with aggregated previous_applications
    app_train_merged = app_train_merged.merge(previous_aggregated, on = 'SK_ID_CURR', how = 'left')
    app_test_merged = app_test_merged.merge(previous_aggregated, on = 'SK_ID_CURR', how = 'left')
    #merging with aggregated installments tables
    app_train_merged = app_train_merged.merge(installments_aggregated, on = 'SK_ID_CURR', how = 'left')
    app_test_merged = app_test_merged.merge(installments_aggregated, on = 'SK_ID_CURR', how = 'left')
    #merging with aggregated POS_Cash balance table
    app_train_merged = app_train_merged.merge(pos_aggregated, on = 'SK_ID_CURR', how = 'left')
    app_test_merged = app_test_merged.merge(pos_aggregated, on = 'SK_ID_CURR', how = 'left')
    #merging with aggregated credit card table
    app_train_merged = app_train_merged.merge(cc_aggregated, on = 'SK_ID_CURR', how = 'left')
    app_test_merged = app_test_merged.merge(cc_aggregated, on = 'SK_ID_CURR', how = 'left')

    return app_train_merged, app_test_merged

In [45]:
application_train = pd.read_csv('/kaggle/input/final-data/application_train_final.csv')
application_test = pd.read_csv('/kaggle/input/final-data/application_test_final.csv')
bureau_aggregated = pd.read_csv('/kaggle/input/final-data/bureau_balance_final.csv')
previous_aggregated = pd.read_csv('/kaggle/input/pre-app/previous_application_final.csv')
installments_aggregated = pd.read_csv('/kaggle/input/final-data/installments_payments_final.csv')
pos_aggregated  = pd.read_csv('/kaggle/input/final-data/pos_cash_final.csv')
cc_aggregated = pd.read_csv('/kaggle/input/final-data/credit_cat_final.csv')

In [46]:
train_data, test_data = merge_all_tables(application_train, application_test, 
                                         bureau_aggregated, previous_aggregated, 
                                         installments_aggregated, pos_aggregated, 
                                         cc_aggregated)

### NEW FEATURE

#### Đây đều là những features liên quan đến tỉ lệ của các khoản 'AMT_ANNUITY', 'AMT_RECEIVABLE_PRINCIPAL', 'AMT_GOODS',  'AMT_PAYMENT' so với INCOME của người đi vay

In [47]:
def create_new_features(data):
    '''
    Function to create few more features after the merging of features, by using the
    interactions between various tables.
    
    Inputs:
        data: DataFrame
    
    Returns:
        None
    '''
    
    #previous applications columns
    prev_annuity_columns = [ele for ele in previous_aggregated.columns if 'AMT_ANNUITY' in ele]
    for col in prev_annuity_columns:
        data['PREV_' + col + '_INCOME_RATIO'] = data[col] / (data['AMT_INCOME_TOTAL'] + 0.00001)
    prev_goods_columns = [ele for ele in previous_aggregated.columns if 'AMT_GOODS' in ele]
    for col in prev_goods_columns:
        data['PREV_' + col + '_INCOME_RATIO'] = data[col] / (data['AMT_INCOME_TOTAL'] + 0.00001)
  
    #credit_card_balance columns
    cc_amt_principal_cols = [ele for ele in cc_aggregated.columns if 'AMT_RECEIVABLE_PRINCIPAL' in ele]
    for col in cc_amt_principal_cols:
        data['CC_' + col + '_INCOME_RATIO'] = data[col] / (data['AMT_INCOME_TOTAL'] + 0.00001)
    cc_amt_recivable_cols = [ele for ele in cc_aggregated.columns if 'AMT_RECIVABLE' in ele]
    for col in cc_amt_recivable_cols:
        data['CC_' + col + '_INCOME_RATIO'] = data[col] / (data['AMT_INCOME_TOTAL'] + 0.00001)
    cc_amt_total_receivable_cols = [ele for ele in cc_aggregated.columns if 'TOTAL_RECEIVABLE' in ele]
    for col in cc_amt_total_receivable_cols:
        data['CC_' + col + '_INCOME_RATIO'] = data[col] / (data['AMT_INCOME_TOTAL'] + 0.00001)
    
    #installments_payments columns
    installments_payment_cols = [ele for ele in installments_aggregated.columns if 'AMT_PAYMENT' in ele and 'RATIO' not in ele and 'DIFF' not in ele]
    for col in installments_payment_cols:
        data['INSTALLMENTS_' + col + '_INCOME_RATIO'] = data[col] / (data['AMT_INCOME_TOTAL'] + 0.00001)
    #https://www.kaggle.com/c/home-credit-default-risk/discussion/64821
    installments_max_installment = ['AMT_INSTALMENT_MEAN_MAX', 'AMT_INSTALMENT_SUM_MAX']
    for col in installments_max_installment:
        data['INSTALLMENTS_ANNUITY_' + col + '_RATIO'] = data['AMT_ANNUITY'] / (data[col] + 0.00001)

    bureau_overdue_cols = [ele for ele in bureau_aggregated.columns if 'AMT_CREDIT' in ele and 'OVERDUE' in ele]
    for col in bureau_overdue_cols:
        data['BUREAU_' + col + '_INCOME_RATIO'] = data[col] / (data['AMT_INCOME_TOTAL'] + 0.00001)
    bureau_amt_annuity_cols = [ele for ele in bureau_aggregated.columns if 'AMT_ANNUITY' in ele and 'CREDIT'  not in ele]
    for col in bureau_amt_annuity_cols:
        data['BUREAU_' + col + '_INCOME_RATIO'] = data[col] / (data['AMT_INCOME_TOTAL'] + 0.00001)   

In [48]:
create_new_features(train_data)
create_new_features(test_data)

print("After Pre-processing, aggregation, merging and Feature Engineering,")
print(f"Final Shape of Training Data = {train_data.shape}")
print(f"Final Shape of Test Data = {test_data.shape}")

After Pre-processing, aggregation, merging and Feature Engineering,
Final Shape of Training Data = (246009, 1649)
Final Shape of Test Data = (61502, 1648)


In [51]:
train_data.replace([np.inf, -np.inf], np.nan, inplace=True)
test_data.replace([np.inf, -np.inf], np.nan, inplace=True)

train_data = remove_missing_col(train_data)
test_data = remove_missing_col(test_data)

train_data = fill_nan(train_data)
test_data = fill_nan(test_data)

train_data = replace_outlier(train_data)
test_data = replace_outlier(test_data)


In [52]:
empty_columns = []
for col in train_data.columns:
    if len(train_data[col].unique()) <=1:
        empty_columns.append(col)
    
print(f"There are {len(empty_columns)} columns with just 1 unique value")
print("Removing these from dataset")
train_data = train_data.drop(empty_columns, axis = 1)
test_data = test_data.drop(empty_columns, axis = 1)

There are 5 columns with just 1 unique value
Removing these from dataset


In [53]:
# Align the training and testing data, keep only columns present in both dataframes
train_data = train_data.drop('SK_ID_CURR', axis =1)
train_labels = train_data.pop('TARGET')
train_data, test_data = train_data.align(test_data, join = 'inner', axis = 1)

# Add the target back in
train_data['TARGET'] = train_labels

print('Training Features shape: ', train_data.shape)
print('Testing Features shape: ', test_data.shape)

Training Features shape:  (246009, 1204)
Testing Features shape:  (61502, 1203)


In [54]:
#train_data.to_csv('data_train_final.csv', index = False)

In [55]:
#test_data.to_csv('data_test_final.csv', index = False)

## Test on data train tổng

In [56]:
train,test1 = train_test_split(train_data,test_size=.25,random_state = 123)

#separating dependent and independent variables
train_x1 = train[[i for i in train.columns if i not in ['SK_ID_CURR'] + [ 'TARGET']]]
scaler = StandardScaler()
train_x1 = scaler.fit_transform(train_x1)
train_y1 = np.array(train[["TARGET"]])

test_x1 = test1[[i for i in test1.columns if i not in ['SK_ID_CURR'] + [ 'TARGET']]]
test_x1 = scaler.fit_transform(test_x1)
test_y1 = np.array(test1[["TARGET"]])

In [57]:
from sklearn.linear_model import LogisticRegression

# Make the model with the specified regularization parameter
log_reg = LogisticRegression(solver = 'newton-cholesky', penalty = 'l2', C= 0.001, max_iter = 500)
#log_reg = LogisticRegression(solver = 'saga', max_iter = 97, penalty = 'l2', C= 0.00160957244252) # thay bộ best param vào đây


log_reg.fit(train_x1, train_y1)
log_reg_pred = log_reg.predict_proba(test_x1)[:, 1]

In [58]:
from sklearn.metrics import roc_auc_score
auc_roc = roc_auc_score(test_y1,log_reg_pred)
gini = 2 * auc_roc - 1
print(f"AUC-ROC Score: {auc_roc}")
print(f"Gini Score: {gini}")

AUC-ROC Score: 0.7806501776667601
Gini Score: 0.5613003553335203
