In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from datetime import date, timedelta # date and time

import re # regex

from matplotlib import pyplot as plt # data viz
import seaborn as sns
%matplotlib inline

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression

import os
import gc


In [2]:
SEED = 47

debug = False
nrows_train =  233154
nrows_test =  112392

In [3]:
train = pd.read_csv("train.csv", index_col="UniqueID", 
                    nrows=[nrows_train if debug else None].pop())
test = pd.read_csv("test_bqCt9Pv.csv", index_col="UniqueID", 
                   nrows=[nrows_test if debug else None].pop())
sample = pd.read_csv("sample_submission_24jSKY6.csv", index_col="UniqueID", 
                     nrows=[nrows_test if debug else None].pop())

df = pd.concat([train, test], sort=False)

In [4]:
df.head()

Unnamed: 0_level_0,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,DisbursalDate,...,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
420825,50578,58400,89.55,67,22807,45,1441,01-01-84,Salaried,03-08-18,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0.0
537409,47145,65550,73.23,67,22807,45,1502,31-07-85,Self employed,26-09-18,...,0,0,1991,0,0,1,1yrs 11mon,1yrs 11mon,0,1.0
417566,53278,61360,89.63,67,22807,45,1497,24-08-85,Self employed,01-08-18,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0.0
624493,57513,66113,88.48,67,22807,45,1501,30-12-93,Self employed,26-10-18,...,0,0,31,0,0,0,0yrs 8mon,1yrs 3mon,1,1.0
539055,52378,60300,88.39,67,22807,45,1495,09-12-77,Self employed,26-09-18,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,1,1.0


In [5]:
print("Train shape: {}, \nTest shape: {}, \nFull df shape: {}".format(train.shape, test.shape, df.shape))

Train shape: (233154, 40), 
Test shape: (112392, 39), 
Full df shape: (345546, 40)


In [6]:
pd.set_option('display.max_colwidth', -1)
data_dictionary = pd.read_excel("Data Dictionary.xlsx").iloc[:, :2]
data_dictionary.style.set_properties(**{'text-align': 'left'})

Unnamed: 0,Variable Name,Description
0,UniqueID,Identifier for customers
1,loan_default,Payment default in the first EMI on due date
2,disbursed_amount,Amount of Loan disbursed
3,asset_cost,Cost of the Asset
4,ltv,Loan to Value of the asset
5,branch_id,Branch where the loan was disbursed
6,supplier_id,Vehicle Dealer where the loan was disbursed
7,manufacturer_id,"Vehicle manufacturer(Hero, Honda, TVS etc.)"
8,Current_pincode,Current pincode of the customer
9,Date.of.Birth,Date of birth of the customer


In [7]:
df.dtypes

disbursed_amount                       int64  
asset_cost                             int64  
ltv                                    float64
branch_id                              int64  
supplier_id                            int64  
manufacturer_id                        int64  
Current_pincode_ID                     int64  
Date.of.Birth                          object 
Employment.Type                        object 
DisbursalDate                          object 
State_ID                               int64  
Employee_code_ID                       int64  
MobileNo_Avl_Flag                      int64  
Aadhar_flag                            int64  
PAN_flag                               int64  
VoterID_flag                           int64  
Driving_flag                           int64  
Passport_flag                          int64  
PERFORM_CNS.SCORE                      int64  
PERFORM_CNS.SCORE.DESCRIPTION          object 
PRI.NO.OF.ACCTS                        int64  
PRI.ACTIVE.AC

In [8]:
print("There are {} same employees in train and test".format(len(set(train.Employee_code_ID) & set(test.Employee_code_ID))))

There are 3015 same employees in train and test


### Feature Engineering

Convert date variables to datetime format

In [9]:
def fix_date(df, col_names = None):
    """
    Convert the dates columns to datetime format and fixes the 2068 -> 1968
    """
    for col in col_names:
        df[col] = pd.to_datetime(df[col])
        future = df[col] > pd.Timestamp(year=2019, month=1, day=1)
        df.loc[future, col] -= timedelta(days=365.25*100)

In [10]:
%%time
fix_date(df, col_names = ["DisbursalDate", "Date.of.Birth"])

CPU times: user 1min 17s, sys: 484 ms, total: 1min 17s
Wall time: 1min 12s


In [11]:
def string_to_month(string):
    year, month = re.findall(r"[0-9]+", string)
    months = int(year) * 12 + int(month)
    return months

In [12]:
df.loc[:, "AVERAGE.ACCT.AGE"] = df.loc[:, "AVERAGE.ACCT.AGE"].apply(string_to_month)
df.loc[:, "CREDIT.HISTORY.LENGTH"] = df.loc[:, "CREDIT.HISTORY.LENGTH"].apply(string_to_month)

Calculate the age of the client in years at the moment of disbursal

In [13]:
def calculate_age(born, today):
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

In [14]:
import collections
import functools

import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin


class BayesianTargetEncoder(BaseEstimator, TransformerMixin):

    """
    Reference: https://www.wikiwand.com/en/Bayes_estimator#/Practical_example_of_Bayes_estimators
    Args:
        columns (list of strs): Columns to encode.
        weighting (int or dict): Value(s) used to weight each prior.
        suffix (str): Suffix used for naming the newly created variables.
    """

    def __init__(self, columns=None, prior_weight=100, suffix='_mean'):
        self.columns = columns
        self.prior_weight = prior_weight
        self.suffix = suffix

    def fit(self, X, y=None, **fit_params):

        if not isinstance(X, pd.DataFrame):
            raise ValueError('X has to be a pandas.DataFrame')

        if not isinstance(y, pd.Series):
            raise ValueError('y has to be a pandas.Series')

        X = X.copy()

        # Default to using all the categorical columns
        columns = X.select_dtypes(['object', 'category']).columns\
            if self.columns is None\
            else self.columns

        names = []
        for cols in columns:
            if isinstance(cols, list):
                name = '_'.join(cols)
                names.append('_'.join(cols))
                X[name] = functools.reduce(
                    lambda a, b: a.astype(str) + '_' + b.astype(str),
                    [X[col] for col in cols]
                )
            else:
                names.append(cols)

        # Compute prior and posterior probabilities for each feature
        X = pd.concat((X[names], y.rename('y')), axis='columns')
        self.prior_ = y.mean()
        self.posteriors_ = {}

        for name in names:
            agg = X.groupby(name)['y'].agg(['count', 'mean'])
            counts = agg['count']
            means = agg['mean']
            pw = self.prior_weight
            self.posteriors_[name] = collections.defaultdict(
                lambda: self.prior_,
                ((pw * self.prior_ + counts * means) / (pw + counts)).to_dict()
            )

        return self

    def transform(self, X, y=None):

        if not isinstance(X, pd.DataFrame):
            raise ValueError('X has to be a pandas.DataFrame')

        for cols in self.columns:

            if isinstance(cols, list):
                name = '_'.join(cols)
                x = functools.reduce(
                    lambda a, b: a.astype(str) + '_' + b.astype(str),
                    [X[col] for col in cols]
                )
            else:
                name = cols
                x = X[name]

            posteriors = self.posteriors_[name]
            X[name + self.suffix] = x.map(posteriors).astype(float)

        return X

In [15]:
# for desc before we encode
df['description_long'] = df['PERFORM_CNS.SCORE.DESCRIPTION'].apply(lambda x: re.sub('Not Scored: ', '', x))
df['description_short'] = df['PERFORM_CNS.SCORE.DESCRIPTION'].apply(lambda x: re.sub('(Not Scored: ).*', 'Not Scored', x))

## Bayesian Target Encoding


In [16]:
encoder = BayesianTargetEncoder(
        columns=['PERFORM_CNS.SCORE.DESCRIPTION', 'State_ID'],
        prior_weight=3,
        suffix='')
%time encoder.fit_transform(df, df['loan_default'])

CPU times: user 4.09 s, sys: 485 ms, total: 4.57 s
Wall time: 381 ms


Unnamed: 0_level_0,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,DisbursalDate,...,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default,description_long,description_short
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
420825,50578,58400,89.55,67,22807,45,1441,1984-01-01,Salaried,2018-03-08,...,0,0,0,0,0,0,0,0.0,No Bureau History Available,No Bureau History Available
537409,47145,65550,73.23,67,22807,45,1502,1985-07-31,Self employed,2018-09-26,...,1991,0,0,1,23,23,0,1.0,I-Medium Risk,I-Medium Risk
417566,53278,61360,89.63,67,22807,45,1497,1985-08-24,Self employed,2018-01-08,...,0,0,0,0,0,0,0,0.0,No Bureau History Available,No Bureau History Available
624493,57513,66113,88.48,67,22807,45,1501,1993-12-30,Self employed,2018-10-26,...,31,0,0,0,8,15,1,1.0,L-Very High Risk,L-Very High Risk
539055,52378,60300,88.39,67,22807,45,1495,1977-09-12,Self employed,2018-09-26,...,0,0,0,0,0,0,1,1.0,No Bureau History Available,No Bureau History Available
518279,54513,61900,89.66,67,22807,45,1501,1990-08-09,Self employed,2018-09-19,...,1347,0,0,0,21,24,0,0.0,A-Very Low Risk,A-Very Low Risk
529269,46349,61500,76.42,67,22807,45,1502,1988-01-06,Salaried,2018-09-23,...,0,0,0,0,0,0,0,0.0,No Bureau History Available,No Bureau History Available
510278,43894,61900,71.89,67,22807,45,1501,1989-04-10,Salaried,2018-09-16,...,0,0,0,0,2,2,0,0.0,Not Enough Info available on the customer,Not Scored
490213,53713,61973,89.56,67,22807,45,1497,1991-11-15,Self employed,2018-05-09,...,0,0,0,0,56,56,1,0.0,D-Very Low Risk,D-Very Low Risk
510980,52603,61300,86.95,67,22807,45,1492,1968-01-06,Salaried,2018-09-16,...,2608,0,0,0,19,19,0,0.0,A-Very Low Risk,A-Very Low Risk


In [17]:
df['age'] = df.apply(lambda row: calculate_age(row['Date.of.Birth'], row['DisbursalDate']), axis=1)
df['loan_to_asset_ratio'] = df['disbursed_amount'] / df['asset_cost']
df['disbursed_more'] = (df['SEC.DISBURSED.AMOUNT'] > df['SEC.SANCTIONED.AMOUNT']) * 1
df['disbursed_less'] = (df['SEC.DISBURSED.AMOUNT'] < df['SEC.SANCTIONED.AMOUNT']) * 1
df['PRI_loan_emi_ratio'] = df['PRI.CURRENT.BALANCE'] / df['PRIMARY.INSTAL.AMT']
df['SEC_loan_emi_ratio'] = df['SEC.CURRENT.BALANCE'] / df['SEC.INSTAL.AMT']
df['emi_total'] = df['PRIMARY.INSTAL.AMT'] + df['SEC.INSTAL.AMT']
df['balance_total'] = df['PRI.CURRENT.BALANCE'] + df['SEC.CURRENT.BALANCE']
df['total_loan_emi_ratio'] = df['balance_total'] / df['emi_total']
df['PRI_balance_disbursed_ratio'] = df['PRI.CURRENT.BALANCE'] / df['PRI.DISBURSED.AMOUNT']
df['SEC_balance_disbursed_ratio'] = df['SEC.CURRENT.BALANCE'] / df['SEC.DISBURSED.AMOUNT']
df['disbursed_total'] = df['PRI.DISBURSED.AMOUNT'] + df['SEC.DISBURSED.AMOUNT']
df['total_balance_disbursed_ratio'] = df['balance_total'] / df['disbursed_total']
df['six_month_ratio'] = df['NEW.ACCTS.IN.LAST.SIX.MONTHS'] / df['balance_total']

df['disbursal_dayofweek'] = df['DisbursalDate'].dt.dayofweek
df['disbursal_day'] = df['DisbursalDate'].dt.day

df['employee_counts'] = df.groupby('Employee_code_ID')['Employee_code_ID'].transform('count')
df['EMPLOYEE_LTV_mean'] = df.groupby('Employee_code_ID')['ltv'].transform('mean')
df['EMPLOYEE_LTV_median'] = df.groupby('Employee_code_ID')['ltv'].transform('median')
df['EMPLOYEE_LTV_min'] = df.groupby('Employee_code_ID')['ltv'].transform('min')
df['EMPLOYEE_LTV_max'] = df.groupby('Employee_code_ID')['ltv'].transform('max')
df['EMPLOYEE_LTV_std'] = df.groupby('Employee_code_ID')['ltv'].transform('std')

df['EMPLOYEE_DA_mean'] = df.groupby('Employee_code_ID')['disbursed_amount'].transform('mean')
df['EMPLOYEE_DA_median'] = df.groupby('Employee_code_ID')['disbursed_amount'].transform('median')
df['EMPLOYEE_DA_min'] = df.groupby('Employee_code_ID')['disbursed_amount'].transform('min')
df['EMPLOYEE_DA_max'] = df.groupby('Employee_code_ID')['disbursed_amount'].transform('max')
df['EMPLOYEE_DA_std'] = df.groupby('Employee_code_ID')['disbursed_amount'].transform('std')

df['STATE_LTV_mean'] = df.groupby('State_ID')['ltv'].transform('mean')
df['STATE_LTV_median'] = df.groupby('State_ID')['ltv'].transform('median')
df['STATE_LTV_min'] = df.groupby('State_ID')['ltv'].transform('min')
df['STATE_LTV_max'] = df.groupby('State_ID')['ltv'].transform('max')

df['branch_counts'] = df.groupby('branch_id')['branch_id'].transform('count')
df['COUNTS_branch_/_employee'] = df['branch_counts']/df['employee_counts']
df['BRANCH_LTV_median'] = df.groupby('branch_id')['ltv'].transform('median')
df['BRANCH_LTV_max'] = df.groupby('branch_id')['ltv'].transform('max')
df['BRANCH_LTV_min'] = df.groupby('branch_id')['ltv'].transform('min')
df['BRANCH_LTV_mean'] = df.groupby('branch_id')['ltv'].transform('mean')

df['BRANCH_disbursedAMT_median'] = df.groupby('branch_id')['disbursed_amount'].transform('median')
df['BRANCH_disbursedAMT_max'] = df.groupby('branch_id')['disbursed_amount'].transform('max')
df['BRANCH_disbursedAMT_min'] = df.groupby('branch_id')['disbursed_amount'].transform('min')
df['BRANCH_disbursedAMT_mean'] = df.groupby('branch_id')['disbursed_amount'].transform('mean')

df['RATIO_PRI_sactioned_vs_disbursed'] = df['PRI.SANCTIONED.AMOUNT']/df['PRI.DISBURSED.AMOUNT']
df['RATIO_SEC_sactioned_vs_disbursed'] = df['SEC.SANCTIONED.AMOUNT']/df['SEC.DISBURSED.AMOUNT']
df['RATIO_PRI_active_vs_overdue'] = df['PRI.OVERDUE.ACCTS']/df['PRI.ACTIVE.ACCTS']
df['RATIO_SEC_active_vs_overdue'] = df['SEC.OVERDUE.ACCTS']/df['SEC.ACTIVE.ACCTS']

df['voluntary_info_count'] = df['MobileNo_Avl_Flag'] + df['Aadhar_flag'] + df['PAN_flag'] + df['VoterID_flag'] + df['Driving_flag'] + df['Passport_flag']

df['NEW.ACCTS.IN.LAST.SIX.MONTHS_/_CREDIT.HISTORY.LENGTH'] = df['NEW.ACCTS.IN.LAST.SIX.MONTHS'] / df['CREDIT.HISTORY.LENGTH']
df['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS_/_NEW.ACCTS.IN.LAST.SIX.MONTHS'] = df['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS'] / df['NEW.ACCTS.IN.LAST.SIX.MONTHS']
df['AVERAGE.ACCT.AGE_/_CREDIT.HISTORY.LENGTH'] = df['AVERAGE.ACCT.AGE'] / df['CREDIT.HISTORY.LENGTH']
df['PRI.NO.OF.ACCTS /DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS'] = df['PRI.NO.OF.ACCTS'] / df['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS']
df['PRI.ACTIVE.ACCTS/DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS'] = df['PRI.ACTIVE.ACCTS'] / df['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS']

df['Credit.History_/_AVERAGE.ACCT.AGE'] = df['CREDIT.HISTORY.LENGTH'] / df['AVERAGE.ACCT.AGE']
df['CREDIT.HISTORY.LENGTH/NO.OF_INQUIRIES'] = df["CREDIT.HISTORY.LENGTH"] / df["NO.OF_INQUIRIES"]

df['CREDIT.HISTORY.LENGTH_/_age'] = df["CREDIT.HISTORY.LENGTH"] / df["age"]
df['PRI.ACTIVE.ACCTS + PRI.OVERDUE.ACCTS ) / PRI.NO.OF.ACCTS'] = (df['PRI.ACTIVE.ACCTS'] + df['PRI.OVERDUE.ACCTS'] ) / df['PRI.NO.OF.ACCTS']

df['PRI.ACTIVE.ACCTS_/_PRI.NO.OF.ACCTS'] = df['PRI.ACTIVE.ACCTS'] / df['PRI.NO.OF.ACCTS']
df['ltv'] = np.round(df['ltv'], decimals = 0)


In [18]:
score_desc = [n for n in df['PERFORM_CNS.SCORE.DESCRIPTION'].unique()]
score_cats = {'No Bureau History Available':1.5,
             'I-Medium Risk':2,
             'L-Very High Risk':4,
             'A-Very Low Risk':0.5,
             'Not Scored: Not Enough Info available on the customer':1.5,
             'D-Very Low Risk':0.5,
             'M-Very High Risk':4,
             'B-Very Low Risk':0.5,
             'C-Very Low Risk':0.5,
             'E-Low Risk':1,
             'H-Medium Risk':2,
             'F-Low Risk':1,
             'K-High Risk':3,
             'Not Scored: No Activity seen on the customer (Inactive)':1.5,
             'Not Scored: Sufficient History Not Available':1.5,
             'Not Scored: No Updates available in last 36 months':1.5,
             'G-Low Risk':1,
             'J-High Risk':3,
             'Not Scored: Only a Guarantor':1.5,
             'Not Scored: More than 50 active Accounts found':1.5}

Deal with NANs

In [19]:
na_cols = df.columns[df.isna().any()].tolist()

In [20]:
# replace NA values with "NA"
df["Employment.Type"].fillna(value="NA", inplace=True)
df.drop("loan_default", axis=1).fillna(0, inplace=True)
for col in na_cols[2:]:
    if col not in ["Employment.Type", "loan_default"]:
        df[col].fillna(0, inplace=True)

In [21]:
categorical_cols = ['branch_id', 'supplier_id', 'manufacturer_id', 
                    'Current_pincode_ID', 'Employment.Type', 'State_ID', 
                    'Employee_code_ID','description_long', 'description_short','voluntary_info_count']

binary_cols = ['MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 
               'VoterID_flag', 'Driving_flag', 'Passport_flag',
               'disbursed_more', 'disbursed_less',
#                'Is_No Bureau History Available', 'Is_I-Medium Risk',
#                'Is_L-Very High Risk', 'Is_A-Very Low Risk',
#                'Is_Not Scored: Not Enough Info available on the customer',
#                'Is_D-Very Low Risk', 'Is_M-Very High Risk', 'Is_B-Very Low Risk',
#                'Is_C-Very Low Risk', 'Is_E-Low Risk', 'Is_H-Medium Risk',
#                'Is_F-Low Risk', 'Is_K-High Risk', 
#                'Is_Not Scored: No Activity seen on the customer (Inactive)',
#                'Is_Not Scored: Sufficient History Not Available',
#                'Is_Not Scored: No Updates available in last 36 months',
#                'Is_G-Low Risk', 'Is_J-High Risk', 'Is_Not Scored: Only a Guarantor',
#                'Is_Not Scored: More than 50 active Accounts found'
              ]

drop_cols = ['Date.of.Birth', 'DisbursalDate', 
            'MobileNo_Avl_Flag',
             'Passport_flag',
             'SEC.NO.OF.ACCTS',
             'SEC.ACTIVE.ACCTS',
             'SEC.OVERDUE.ACCTS',
             'SEC.CURRENT.BALANCE',
             'SEC.SANCTIONED.AMOUNT',
             'SEC.DISBURSED.AMOUNT',
             'SEC.INSTAL.AMT',
             'disbursed_more',
             'disbursed_less',
             'SEC_loan_emi_ratio',
             'SEC_balance_disbursed_ratio',
             'RATIO_SEC_sactioned_vs_disbursed',
             'RATIO_SEC_active_vs_overdue',
             'Is_No Bureau History Available',
             'Is_I-Medium Risk',
             'Is_L-Very High Risk',
             'Is_A-Very Low Risk',
             'Is_D-Very Low Risk',
             'Is_B-Very Low Risk',
             'Is_E-Low Risk',
             'Is_H-Medium Risk',
             'Is_F-Low Risk',
             'Is_Not Scored: No Activity seen on the customer (Inactive)',
             'Is_Not Scored: No Updates available in last 36 months',
             'Is_G-Low Risk',
             'Is_J-High Risk',
             'Is_Not Scored: Only a Guarantor',
             'Is_Not Scored: More than 50 active Accounts found',
             'loan_default', 'Current_pincode_ID', 'supplier_id', 'Employee_code_ID',
             'voluntary_info_count','branch_id',
             'PRI.CURRENT.BALANCE/NO.OF_INQUIRIES','NEW.ACCTS.6MO_/_NO.INQ','NEW.ACCTS_/_AVG.ACCT.AGE']

In [22]:
# Categorical features with Binary encode (0 or 1; two categories)
for bin_feature in binary_cols:
    df[bin_feature], uniques = pd.factorize(df[bin_feature])

In [23]:
for cat_feature in categorical_cols:
    df[cat_feature] = df[cat_feature].astype('category')
    
df['State_ID'] = df['State_ID'].astype(float)

## Cleaning Dataset

In [24]:
def create_label_encoding_with_min_count(df, column, min_count=50):
    column_counts = df.groupby([column])[column].transform("count").astype(int)
    column_values = np.where(column_counts >= min_count, df[column], "")
    df[column+"_label"] = LabelEncoder().fit_transform(column_values)

In [25]:
for col in categorical_cols:
    if col not in ["State_ID", 'PERFORM_CNS.SCORE.DESCRIPTION']:
        create_label_encoding_with_min_count(df, col, min_count=300)

In [26]:
new_cat_cols = [c for c in df.columns if ("_label" in c) ]

In [27]:
df.drop([x for x in categorical_cols if x not in ["State_ID", 'PERFORM_CNS.SCORE.DESCRIPTION']], axis=1, inplace=True)


In [28]:
%%time
df = pd.get_dummies(df, columns = new_cat_cols)

CPU times: user 5.62 s, sys: 628 ms, total: 6.25 s
Wall time: 1.9 s


In [29]:
feats = [f for f in df.columns if f not in drop_cols]

### Modeling

In [30]:

# lgbm_params = {'bagging_fraction': 0.7338059252308635, 
#                'feature_fraction': 0.5468194040408916, 
#                'learning_rate': 0.03430358037778243, 
#                'max_depth': 200, 
#                'min_child_weight': 19, 
#                'min_split_gain': 0.3656101768694246, 
#                'n_estimators': 803, 
#                'num_leaves': 250}

# lgb1 = LGBMClassifier(
#                 **lgbm_params,
#                 n_jobs=-1,
#                 colsample_bytree=0.95,
#                 subsample=0.87,
#                 reg_alpha=0.04,
#                 reg_lambda=0.073,
#                 silent=-1,
#                 verbose=-1,
#                 seed=SEED
#                 )

# xgb_params = {'max_depth': 5, 
#              'min_child_weight': 4.833822465839179, 
#              'colsample_bytree': 0.7480334635236953, 
#              'gamma': 0.05510955496635193, 
#              'max_delta_step': 1.0935343938811055, 
#              'subsample': 0.6529358763366296}
# xgb = XGBClassifier(**xgb_params,verbose_eval=500, seed=SEED, eval_metric='auc', n_jobs=-1, verbose=500)

# cb_params = {'rsm': 0.7094968413711019, 'l2_leaf_reg': 49.997096517399854, 'random_strength': 0.02763731343920781}
# ctb = CatBoostClassifier(**cb_params,
#         loss_function='Logloss',
#         eval_metric='AUC',
#         random_seed=SEED,
#         verbose=500
#     )


# rf_params = {'max_features':  0.04 , 'min_samples_split': 99 , 'n_estimators': 837 }
# rf1 = RandomForestClassifier(**rf_params, random_state=SEED, n_jobs=-1)
# rf2 = RandomForestClassifier(**rf_params, random_state=SEED, max_depth=2, n_jobs=-1)
# rf3 = RandomForestClassifier(**rf_params, random_state=SEED, max_depth=5, n_jobs=-1)

# xt_params = {'max_features': 0.1, 'min_samples_split': 99 , 'n_estimators': 848 }
# xt = ExtraTreesClassifier(**xt_params, random_state=SEED, n_jobs=-1)


# lgb2 = LGBMClassifier(**lgbm_params, colsample_bytree=0.5, subsample=0.5,
#                                           n_jobs=-1, random_state=SEED)

# lgb3 = LGBMClassifier(**lgbm_params, colsample_bytree=0.5, subsample=0.5,
#                                           n_jobs=-1, random_state=SEED)

# lgb4 = LGBMClassifier(**lgbm_params, colsample_bytree=0.5, subsample=0.5,
#                                           n_jobs=-1, random_state=SEED)

# lgb5 = LGBMClassifier(**lgbm_params, colsample_bytree=0.5, subsample=0.5,
#                                           n_jobs=-1, random_state=SEED)

# knn = KNeighborsClassifier(n_neighbors=10, n_jobs=-1)

# lm = LogisticRegression(C=0.022697694378729906,random_state=SEED)
lgb1 = LGBMClassifier(
                n_jobs=-1,
                n_estimators=10000,
                learning_rate=0.02,
                num_leaves=32,
                colsample_bytree=0.95,
                subsample=0.87,
                max_depth=8,
                reg_alpha=0.04,
                reg_lambda=0.073,
                min_split_gain=0.02,
                min_child_weight=40,
                silent=-1,
                verbose=-1,
                seed=SEED
                )

lgb2 = LGBMClassifier(num_leaves=2, learning_rate=0.07, n_estimators=1400,
                                          colsample_bytree=0.5, subsample=0.5,
                                          n_jobs=-1, random_state=SEED)

lgb3 = LGBMClassifier(num_leaves=3, learning_rate=0.07, n_estimators=800,
                                          colsample_bytree=0.5, subsample=0.5,
                                          n_jobs=-1, random_state=SEED)

lgb4 = LGBMClassifier(num_leaves=4, learning_rate=0.07, n_estimators=800,
                                          colsample_bytree=0.5, subsample=0.5,
                                          n_jobs=-1, random_state=SEED)

lgb5 = LGBMClassifier(num_leaves=5, learning_rate=0.07, n_estimators=600,
                                          colsample_bytree=0.5, subsample=0.5,
                                          n_jobs=-1, random_state=SEED)

xgb = XGBClassifier(verbose_eval=500, seed=SEED, eval_metric='auc', n_jobs=-1, verbose=500)


ctb = CatBoostClassifier(random_seed=SEED, loss_function='Logloss', eval_metric='AUC', verbose=500)

rf1 = RandomForestClassifier(n_estimators=100, random_state=SEED, n_jobs=-1)
rf2 = RandomForestClassifier(n_estimators=100, random_state=SEED, max_depth=2, n_jobs=-1)
rf3 = RandomForestClassifier(n_estimators=100, random_state=SEED, max_depth=5, n_jobs=-1)

xt = ExtraTreesClassifier(n_estimators=100, random_state=SEED, n_jobs=-1)

knn = KNeighborsClassifier(n_neighbors=10, n_jobs=-1)

lm = LogisticRegression(random_state=SEED)

models = [lgb1, xgb, rf1, ctb] # , lgb2, lgb3, lgb4, lgb5, rf2, rf3, lm, 

In [31]:
df.loc[:, feats] = df[feats].replace([np.inf, -np.inf], np.nan)
df.loc[:, feats] = df[feats].fillna(-1)

s = StandardScaler()
df.loc[:, feats] = s.fit_transform(df[feats])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [32]:
# Divide in training/validation and test data
train_df = df[df['loan_default'].notnull()]
test_df = df[df['loan_default'].isnull()]
y = df[df['loan_default'].notnull()]['loan_default']

del df
gc.collect()

21

In [33]:
# GBDT withStratified KFold
def kfold_gbm(train_df, test_df, clf, feats=feats, num_folds=5, stratified=True, debug=False, random_state=47, feature_importance=False):
    clf_name = type(clf).__name__
    print("Starting {}. Train shape: {}, test shape: {}".format(clf_name, train_df.shape, test_df.shape))
    
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=random_state)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)
    
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['loan_default'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['loan_default'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['loan_default'].iloc[valid_idx]

        # GBM
        if clf_name in ['CatBoostClassifier', 'XGBClassifier']:
            clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
                    verbose=500, early_stopping_rounds=200)
        
        elif clf_name in ['LGBMClassifier']:
            clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
                    verbose=500, early_stopping_rounds=200, eval_metric='auc')
        # sklearn
        else:
            clf.fit(train_x, train_y)

        if clf_name == 'LGBMClassifier':
            oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
            sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
        elif clf_name == 'CatBoostClassifier':
            oof_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]
            sub_preds += clf.predict_proba(test_df[feats])[:, 1] / folds.n_splits
        elif clf_name == 'XGBClassifier':
            oof_preds[valid_idx] = clf.predict_proba(valid_x, ntree_limit=clf.best_ntree_limit)[:, 1]
            sub_preds += clf.predict_proba(test_df[feats], ntree_limit=clf.best_ntree_limit)[:, 1] / folds.n_splits
        else:
            oof_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]
            sub_preds += clf.predict_proba(test_df[feats])[:, 1] / folds.n_splits
        
        if feature_importance:
            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = feats
            fold_importance_df["importance"] = clf.feature_importances_
            fold_importance_df["fold"] = n_fold + 1
            feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del train_x, train_y, valid_x, valid_y
        gc.collect()
    
    full_score = roc_auc_score(train_df['loan_default'], oof_preds)
    print('Full AUC score %.6f \n\n' % full_score)
    
    # Write submission file and plot feature importance
    if not debug:
        sample['loan_default'] = sub_preds
        sample.to_csv("submission_kernel_{}.csv".format(clf_name), index=True)
    
    if feature_importance:
        display_importances(feature_importance_df)
        return oof_preds, sub_preds, feature_importance_df
    else:
        return oof_preds, sub_preds

In [34]:
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].\
    groupby("feature").mean().\
    sort_values(by="importance", ascending=False)[:40].index
    
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('GBDT_Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('GBDT_importances.png')

In [35]:
def stacker_n_blender(X_train=train_df, X_test=test_df, y=y, models=models, seed=SEED, debug=True):
    np.random.seed(42)
    X_train_meta = 0.03*np.random.randn(X_train.shape[0], len(models))
    X_test_meta = np.zeros((X_test.shape[0], len(models)))

    for t, model in enumerate(models):
        oof_preds, sub_preds = kfold_gbm(X_train, X_test, clf=model, random_state=SEED, debug=debug)
        if debug: print("{} Score: {}:.4f\n".format(type(model).__name__,roc_auc_score(y_valid, sub_preds) ))

        X_train_meta[:, t] += oof_preds
        X_test_meta[:, t] = sub_preds
        
        del model
        gc.collect()
    
    ens_model = Ridge(0.001).fit(X_train_meta, y.values.reshape(-1, 1))
    stack = ens_model.predict(X_test_meta)
    blend = X_test_meta.mean(axis=1)
    
    return stack, blend

In [36]:
if debug:
#     train_X, valid_X = train_test_split(train_df.sort_values("DisbursalDate"), test_size=0.3, shuffle=False)
#     y_train = train_X['loan_default']
#     y_valid = valid_X['loan_default']
    stack, blend = stacker_n_blender(train_X, valid_X, y_train, models=models, debug=True)

else:
    stack, blend = stacker_n_blender(train_df, test_df, y, models=models, debug=False)

Starting LGBMClassifier. Train shape: (233154, 759), test shape: (112392, 759)
Training until validation scores don't improve for 200 rounds.
[500]	training's auc: 0.706294	training's binary_logloss: 0.478198	valid_1's auc: 0.668718	valid_1's binary_logloss: 0.492238
[1000]	training's auc: 0.726389	training's binary_logloss: 0.469194	valid_1's auc: 0.671019	valid_1's binary_logloss: 0.491317
[1500]	training's auc: 0.744129	training's binary_logloss: 0.460953	valid_1's auc: 0.671892	valid_1's binary_logloss: 0.491117
Early stopping, best iteration is:
[1443]	training's auc: 0.742086	training's binary_logloss: 0.461924	valid_1's auc: 0.671968	valid_1's binary_logloss: 0.491047
Fold  1 AUC : 0.671968
Training until validation scores don't improve for 200 rounds.
[500]	training's auc: 0.704538	training's binary_logloss: 0.478909	valid_1's auc: 0.672927	valid_1's binary_logloss: 0.49097
[1000]	training's auc: 0.725698	training's binary_logloss: 0.469573	valid_1's auc: 0.674899	valid_1's bin

In [39]:
if debug: 
    print(roc_auc_score(y_valid, stack))
    print(roc_auc_score(y_valid, blend))

0.6636712114749956
0.6619565855954478


* `xgb + lgb + ctb = 0.6517`
* `gbdt + rf + xt = 0.6534`
* `gbdt + more rf + xt = 0.6531`
* `gbdt + more rf + xt + more lgb = 0.6525`
* `gbdt + more rf + xt + more lgb = 0.6528`
* `gbdt + rf + xt + MORE regularization = 0.6534`
* `SCALED gbdt + rf + xt = 0.6550`
* `SCALED gbdt + rf + xt + knn + lm = 0.6528`

In [38]:
sample['loan_default'] = stack
sample.to_csv("submission_kernel_stack_older.csv", index=True)
sample['loan_default'] = blend
sample.to_csv("submission_kernel_blend_older.csv", index=True)
