In [41]:
# imports
import warnings

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [42]:
# read in training data
train_raw = pd.read_csv('data/train.csv')

# Integer code for every label of each categorical column.
# Series.map() turns any label that is absent from its dict into NaN without
# raising, so the loop below reports such labels instead of letting them
# disappear silently.
# NOTE(review): confirm these labels match the values actually present in
# data/train.csv (e.g. that 'HOME' is the literal spelling used there and that
# no grade beyond 'F' occurs).
category_codes = {
    'person_home_ownership': {'RENT': 0, 'MORTGAGE': 1, 'OWN': 2, 'OTHER': 3},
    'loan_intent': {'PERSONAL': 0, 'EDUCATION': 1, 'MEDICAL': 2, 'VENTURE': 3, 'HOME': 4},
    'loan_grade': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5},
    'cb_person_default_on_file': {'Y': 1, 'N': 0},
}

# target column, and the feature frame without the row id and the target
labels = train_raw['loan_status']
features = train_raw.drop(['id', 'loan_status'], axis=1)

for column, codes in category_codes.items():
    # labels present in the data that the mapping does not know about
    unmapped = set(features[column].dropna().unique()) - set(codes)
    if unmapped:
        warnings.warn(
            f"{column}: labels {sorted(map(str, unmapped))} have no code and will be encoded as NaN"
        )
    features[column] = features[column].map(codes)

# split data into training and validation sets (80/20, fixed seed for reproducibility)
xtrain, xval, ytrain, yval = train_test_split(features, labels, test_size=0.2, random_state=0)

In [58]:
# create and train model
params = {
    'objective': 'binary:logistic',
    # stated explicitly: this is the metric reported for each round and the
    # one early stopping monitors
    'eval_metric': 'logloss',
    'max_depth': 6,
    'learning_rate': 0.1
}

# upper bound on boosting rounds; early stopping normally ends training sooner
num_round = 1000

# build each DMatrix once and reuse it for both training and evaluation
dtrain = xgb.DMatrix(xtrain, label=ytrain)
dval = xgb.DMatrix(xval, label=yval)

# early stopping watches the LAST entry of the evaluation list, so the
# validation set has to stay last
watchlist = [(dtrain, 'train'), (dval, 'eval')]

# num_boost_round / evals are passed by keyword: passing evals positionally is
# deprecated in current XGBoost releases.
# Training stops once the validation metric has not improved for 25 rounds.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=25,
)

[0]	train-logloss:0.37183	eval-logloss:0.37241
[1]	train-logloss:0.34006	eval-logloss:0.34044
[2]	train-logloss:0.31567	eval-logloss:0.31588
[3]	train-logloss:0.29576	eval-logloss:0.29599
[4]	train-logloss:0.27946	eval-logloss:0.27965
[5]	train-logloss:0.26569	eval-logloss:0.26575
[6]	train-logloss:0.25402	eval-logloss:0.25403
[7]	train-logloss:0.24403	eval-logloss:0.24396
[8]	train-logloss:0.23505	eval-logloss:0.23494
[9]	train-logloss:0.22751	eval-logloss:0.22738
[10]	train-logloss:0.22084	eval-logloss:0.22078




[11]	train-logloss:0.21495	eval-logloss:0.21503
[12]	train-logloss:0.20987	eval-logloss:0.21005
[13]	train-logloss:0.20493	eval-logloss:0.20533
[14]	train-logloss:0.20062	eval-logloss:0.20127
[15]	train-logloss:0.19709	eval-logloss:0.19795
[16]	train-logloss:0.19379	eval-logloss:0.19488
[17]	train-logloss:0.19065	eval-logloss:0.19215
[18]	train-logloss:0.18777	eval-logloss:0.18948
[19]	train-logloss:0.18540	eval-logloss:0.18728
[20]	train-logloss:0.18313	eval-logloss:0.18538
[21]	train-logloss:0.18116	eval-logloss:0.18377
[22]	train-logloss:0.17927	eval-logloss:0.18214
[23]	train-logloss:0.17766	eval-logloss:0.18089
[24]	train-logloss:0.17613	eval-logloss:0.17968
[25]	train-logloss:0.17476	eval-logloss:0.17852
[26]	train-logloss:0.17352	eval-logloss:0.17749
[27]	train-logloss:0.17219	eval-logloss:0.17636
[28]	train-logloss:0.17108	eval-logloss:0.17543
[29]	train-logloss:0.17002	eval-logloss:0.17456
[30]	train-logloss:0.16900	eval-logloss:0.17381
[31]	train-logloss:0.16785	eval-logloss:

In [49]:
# predict on test data
test = pd.read_csv('data/test.csv')

# Same integer codes that were applied to the training features.
# Series.map() turns any label that is absent from its dict into NaN without
# raising, so unmapped labels are reported rather than silently dropped.
# NOTE(review): confirm these labels match the values present in data/test.csv.
category_codes = {
    'person_home_ownership': {'RENT': 0, 'MORTGAGE': 1, 'OWN': 2, 'OTHER': 3},
    'loan_intent': {'PERSONAL': 0, 'EDUCATION': 1, 'MEDICAL': 2, 'VENTURE': 3, 'HOME': 4},
    'loan_grade': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5},
    'cb_person_default_on_file': {'Y': 1, 'N': 0},
}

for column, codes in category_codes.items():
    unmapped = set(test[column].dropna().unique()) - set(codes)
    if unmapped:
        warnings.warn(
            f"{column}: labels {sorted(map(str, unmapped))} have no code and will be encoded as NaN"
        )
    test[column] = test[column].map(codes)

# The booster returned by xgb.train() with early stopping is the one from the
# last round, not the best one, so restrict prediction to the trees up to and
# including the best iteration (iteration_range's upper bound is exclusive).
dtest = xgb.DMatrix(test.drop('id', axis=1))
probabilities = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))

# threshold the predicted probabilities at 0.5 -> 1.0 / 0.0 class labels
predictions = (probabilities > 0.5).astype(float)

# write predictions to file
output = pd.DataFrame({'id': test['id'], 'loan_status': predictions})
output.to_csv('predictions.csv', index=False)