**Import the required libraries**

In [1]:
import os
import pandas as pd
from scipy.stats import pearsonr
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier
import numpy as np
# Parent of the current working directory; the input CSV paths are built from it.
parDir = os.path.normpath(os.sep.join((os.getcwd(), os.pardir)))

#Please make sure the train and test CSV files are in the input/india-ml-hiring-av directory under the parent of the current working directory. If they are elsewhere, change the paths in the code below.

In [2]:
# Read the training and test CSVs (in that order) from <parDir>/input/india-ml-hiring-av.
train, test = (
    pd.read_csv(parDir + csv_path)
    for csv_path in ("/input/india-ml-hiring-av/train.csv",
                     "/input/india-ml-hiring-av/test.csv")
)

**Categorical features are listed below**

In [3]:
# Columns that are label-encoded before modelling.
categorical_cols = [
    'source',
    'financial_institution',
    'origination_date',
    'first_payment_date',
    'loan_purpose',
]

**Numerical features are listed below**


In [4]:
# Numeric columns that are standardised with StandardScaler before modelling.
values_to_scale = [
    'co-borrower_credit_score',
    'borrower_credit_score',
    'debt_to_income_ratio',
    'loan_term',
]

In [5]:
def encodeCategoricals(df,columns):
    """Label-encode the given columns of ``df`` in place and return ``df``.

    Every column gets its own freshly fitted ``LabelEncoder``, so the integer
    codes are only meaningful within this one frame: a different frame encoded
    by a separate call may map the same category to a different code.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place; no copy is made.
    columns : list of str
        Names of the columns to replace with integer codes.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for name in columns:
        # Assign the code array positionally. Routing the codes through an
        # intermediate DataFrame (as DataFrame.apply does) puts them on a
        # fresh 0..n-1 index, and assigning that back aligns on index labels,
        # which yields NaN/misplaced codes when df has a non-default index.
        df[name] = LabelEncoder().fit_transform(df[name])
    return df
def scaleNumericVariables(df,columns):
    """Standardise the given columns of ``df`` in place and return ``df``.

    A new ``StandardScaler`` is fitted on ``df[columns]`` on every call, and
    the scaled values are rounded to 3 decimal places before being written
    back.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale. It is modified in place; no copy is made.
    columns : list of str
        Names of the numeric columns to standardise.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    sc = StandardScaler()
    # Scale the columns that were actually requested (not a module-level list),
    # so the function works for any column selection.
    scaled = np.round(sc.fit_transform(df[columns]), 3)
    # Reuse df's own index: assignment aligns on index labels, so a frame built
    # on a default 0..n-1 index would misalign whenever df is indexed differently.
    df[columns] = pd.DataFrame(scaled, columns=columns, index=df.index)
    return df
    

**Transform the variables into the form required by the model**

In [6]:
# Label-encode the categorical columns first, then standardise the numeric ones.
train = scaleNumericVariables(
    encodeCategoricals(train, categorical_cols),
    values_to_scale,
)

**Find the correlation of each variable with the target to guide feature removal**

In [7]:
# Print the Pearson correlation of every column with the target column m13.
for col in train.columns:
    # pearsonr returns (coefficient, p-value); only the coefficient is reported.
    corr = pearsonr(train[col], train['m13'])[0]
    print('%.3f' % corr, col)

-0.003 loan_id
0.008 source
-0.002 financial_institution
0.054 interest_rate
-0.017 unpaid_principal_bal
0.024 loan_term
-0.011 origination_date
-0.012 first_payment_date
0.016 loan_to_value
-0.040 number_of_borrowers
0.038 debt_to_income_ratio
-0.094 borrower_credit_score
0.023 loan_purpose
0.006 insurance_percent
-0.043 co-borrower_credit_score
0.006 insurance_type
0.092 m1
0.147 m2
0.157 m3
0.219 m4
0.260 m5
0.288 m6
0.309 m7
0.339 m8
0.369 m9
0.368 m10
0.410 m11
0.466 m12
1.000 m13


**The final variables that are used in the model are listed below.**

In [8]:
# Model inputs: columns m1..m12 followed by the selected borrower/loan columns.
fea_to_use = ['m%d' % month for month in range(1, 13)] + [
    'co-borrower_credit_score',
    'borrower_credit_score',
    'loan_purpose',
    'debt_to_income_ratio',
    'number_of_borrowers',
    'loan_term',
]

# Target column, kept as a one-element list so it can be concatenated with fea_to_use.
target = ['m13']

In [9]:
# Keep only the selected feature columns plus the target column.
train = train.loc[:, fea_to_use + target]

**Perform Train Test Split**

In [10]:
# Hold out 20% of the rows; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train[fea_to_use],
    train[target],
    test_size=0.2,
    random_state=4,
)

**Define a custom metric for scoring the test set.**

In [11]:
def xg_f1(y,t):
    """Custom evaluation metric: binary F1 of thresholded predictions.

    Parameters
    ----------
    y : iterable of float
        Predicted scores; values strictly greater than 0.5 count as class 1,
        everything else as class 0.
    t : object exposing ``get_label()``
        Holder of the true labels (presumably an xgboost DMatrix - confirm).

    Returns
    -------
    tuple
        ``('f1', score)`` where ``score`` is ``f1_score(..., average='binary')``.
    """
    true_labels = t.get_label()
    # Threshold the continuous scores into hard 0./1. class predictions.
    hard_preds = np.where(np.asarray(y) > 0.5, 1., 0.)
    return 'f1', f1_score(true_labels, hard_preds, average='binary')

**The parameters below have been fine-tuned to give optimal performance on the test set.**

In [12]:
# Empty grid: the search evaluates a single candidate (the estimator exactly as
# configured below), so GridSearchCV is used here for 5-fold CV plus the refit.
param_test1 = {}

gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.00001,
        n_estimators=11,
        max_depth=9,
        subsample=0.4,
        colsample_bytree=0.4,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=30,
        seed=27,
        verbosity=2,
    ),
    param_grid=param_test1,
    n_jobs=-1,
    # NOTE(review): `iid` was removed from GridSearchCV in later scikit-learn
    # releases - confirm against the installed version before upgrading.
    iid=False,
    cv=5,
    verbose=3,
)

# The held-out split is passed as eval_set so xg_f1 is reported after each boosting round.
# NOTE(review): y_train/y_test are one-column DataFrames (train[target]), not 1-D arrays.
gsearch1.fit(
    X_train,
    y_train,
    verbose=True,
    eval_set=[(X_test, y_test)],
    eval_metric=xg_f1,
)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    8.7s finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[17:04:55] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 76 extra nodes, 0 pruned nodes, max_depth=9
[0]	validation_0-error:0.008272	validation_0-f1:0.388535
[17:04:55] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 108 extra nodes, 0 pruned nodes, max_depth=9
[1]	validation_0-error:0.007324	validation_0-f1:0.401408
[17:04:56] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 78 extra nodes, 0 pruned nodes, max_depth=9
[2]	validation_0-error:0.00517	validation_0-f1:0.473684
[17:04:56] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 128 extra nodes, 0 pruned nodes, max_depth=9
[3]	validation_0-error:0.005342	validation_0-f1:0.504
[17:04:56] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 106 extra nodes, 0 pruned nodes, max_depth=9
[4]	validation_0-error:0.00461	validation_0-f1:0.532751
[17:04:56] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning e

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.4, gamma=0,
                                     learning_rate=1e-05, max_delta_step=0,
                                     max_depth=9, min_child_weight=1,
                                     missing=None, n_estimators=11, n_jobs=-1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=30, seed=27, silent=None,
                                     subsample=0.4, verbosity=2),
             iid=False, n_jobs=-1, param_grid={}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring=None, verbose=3)

**Perform the same preprocessing steps on test data as the train data**

In [13]:
# Encode, scale, then keep only the model's feature columns.
# NOTE(review): both helpers fit new encoders/scaler on the test frame itself,
# so the codes and scaling are not guaranteed to match those used on train - confirm.
test_data = scaleNumericVariables(
    encodeCategoricals(test, categorical_cols),
    values_to_scale,
)[fea_to_use]

**Predict on Test Set and submit final data**

In [14]:
# Predict m13 for the test rows, attach loan_id, and order the columns as loan_id, m13.
preds_valid = (
    pd.DataFrame(gsearch1.predict(test_data), columns=target)
    .assign(loan_id=test['loan_id'])
    .loc[:, ['loan_id', 'm13']]
)
# Write the submission file without the index column.
preds_valid.to_csv("sample_submission.csv", index=False)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
