In [3]:
# imports
import warnings

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb

In [4]:
# read in training data
# Keep the raw frame under its own name so the target and the feature matrix
# are both derived from it instead of overwriting the loaded table.
train_raw = pd.read_csv('data/train.csv')
ytrain = train_raw['loan_status']
xtrain = train_raw.drop(['id', 'loan_status'], axis=1)

# Integer codes for the string-valued feature columns.
category_codes = {
    'person_home_ownership': {'RENT': 0, 'MORTGAGE': 1, 'OWN': 2, 'OTHER': 3},
    'loan_intent': {'PERSONAL': 0, 'EDUCATION': 1, 'MEDICAL': 2, 'VENTURE': 3, 'HOME': 4},
    'loan_grade': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5},
    'cb_person_default_on_file': {'Y': 1, 'N': 0},
}
for column, codes in category_codes.items():
    raw_values = xtrain[column]
    xtrain[column] = raw_values.map(codes)
    # Series.map yields NaN for every value that has no key in the dict, so a
    # category missing from the table above would silently become a missing
    # value. Report such categories instead of hiding them.
    unmapped = raw_values[xtrain[column].isna() & raw_values.notna()].unique()
    if len(unmapped) > 0:
        warnings.warn(
            f"{column}: values without an integer code were encoded as NaN: {list(unmapped)}"
        )

# split data into training and validation sets
# 10% of the rows are held out; random_state pins the shuffle for reproducibility.
xtrain, xval, ytrain, yval = train_test_split(xtrain, ytrain, test_size=0.1, random_state=0)

In [21]:
# create and train model

# Candidate hyper-parameters for the cross-validated search. Every entry except
# max_depth has a single value, so the search compares two tree depths.
param_grid = {
    'objective': ['binary:logistic'],
    'device': ['gpu'],
    'max_depth': [6, 5],
    'eta': [0.1],
    'subsample': [1],
    'colsample_bytree': [1]
}

xgb_model = xgb.XGBClassifier()  # Or XGBRegressor for regression
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=5, verbose=10)
grid_search.fit(xtrain, ytrain)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Train the final booster with the combination the search actually selected,
# rather than a hand-copied dict that can drift from the search result.
params = dict(grid_search.best_params_)

num_round = 1000
# Build each DMatrix once and reuse it for both training and monitoring.
dtrain = xgb.DMatrix(xtrain, label=ytrain)
dval = xgb.DMatrix(xval, label=yval)
# Early stopping monitors the last entry of this list, i.e. the 'eval' set.
watchlist = [(dtrain, 'train'), (dval, 'eval')]

model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=10,
)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5; 1/2] START colsample_bytree=1, device=gpu, eta=0.1, max_depth=6, objective=binary:logistic, subsample=1
[CV 1/5; 1/2] END colsample_bytree=1, device=gpu, eta=0.1, max_depth=6, objective=binary:logistic, subsample=1;, score=0.949 total time=   0.5s
[CV 2/5; 1/2] START colsample_bytree=1, device=gpu, eta=0.1, max_depth=6, objective=binary:logistic, subsample=1
[CV 2/5; 1/2] END colsample_bytree=1, device=gpu, eta=0.1, max_depth=6, objective=binary:logistic, subsample=1;, score=0.947 total time=   0.3s
[CV 3/5; 1/2] START colsample_bytree=1, device=gpu, eta=0.1, max_depth=6, objective=binary:logistic, subsample=1
[CV 3/5; 1/2] END colsample_bytree=1, device=gpu, eta=0.1, max_depth=6, objective=binary:logistic, subsample=1;, score=0.942 total time=   0.3s
[CV 4/5; 1/2] START colsample_bytree=1, device=gpu, eta=0.1, max_depth=6, objective=binary:logistic, subsample=1
[CV 4/5; 1/2] END colsample_bytree=1, device=gpu, eta=0.



[27]	train-logloss:0.17974	eval-logloss:0.18024
[28]	train-logloss:0.17853	eval-logloss:0.17920
[29]	train-logloss:0.17758	eval-logloss:0.17842
[30]	train-logloss:0.17650	eval-logloss:0.17752
[31]	train-logloss:0.17566	eval-logloss:0.17685
[32]	train-logloss:0.17481	eval-logloss:0.17605
[33]	train-logloss:0.17383	eval-logloss:0.17537
[34]	train-logloss:0.17298	eval-logloss:0.17465
[35]	train-logloss:0.17240	eval-logloss:0.17423
[36]	train-logloss:0.17185	eval-logloss:0.17388
[37]	train-logloss:0.17120	eval-logloss:0.17330
[38]	train-logloss:0.17008	eval-logloss:0.17241
[39]	train-logloss:0.16957	eval-logloss:0.17203
[40]	train-logloss:0.16884	eval-logloss:0.17149
[41]	train-logloss:0.16833	eval-logloss:0.17116
[42]	train-logloss:0.16779	eval-logloss:0.17077
[43]	train-logloss:0.16722	eval-logloss:0.17033
[44]	train-logloss:0.16688	eval-logloss:0.17013
[45]	train-logloss:0.16573	eval-logloss:0.16928
[46]	train-logloss:0.16516	eval-logloss:0.16900
[47]	train-logloss:0.16472	eval-logloss:

In [60]:
# predict on test data
test = pd.read_csv('data/test.csv')

# Apply the same integer codes that were used for the training features.
category_codes = {
    'person_home_ownership': {'RENT': 0, 'MORTGAGE': 1, 'OWN': 2, 'OTHER': 3},
    'loan_intent': {'PERSONAL': 0, 'EDUCATION': 1, 'MEDICAL': 2, 'VENTURE': 3, 'HOME': 4},
    'loan_grade': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5},
    'cb_person_default_on_file': {'Y': 1, 'N': 0},
}
for column, codes in category_codes.items():
    raw_values = test[column]
    test[column] = raw_values.map(codes)
    # Series.map yields NaN for values that have no key in the dict; surface
    # such categories instead of letting them silently become missing values.
    unmapped = raw_values[test[column].isna() & raw_values.notna()].unique()
    if len(unmapped) > 0:
        warnings.warn(
            f"{column}: values without an integer code were encoded as NaN: {list(unmapped)}"
        )

# With the native Booster API, predict() uses every trained tree, including the
# rounds added after the best validation score. Restrict prediction to the best
# iteration when early stopping recorded one; (0, 0) means "use all trees".
best_iteration = getattr(model, 'best_iteration', None)
iteration_range = (0, best_iteration + 1) if best_iteration is not None else (0, 0)
probabilities = model.predict(
    xgb.DMatrix(test.drop('id', axis=1)),
    iteration_range=iteration_range,
)

# Threshold the predicted probabilities at 0.5; labels are stored as 0.0 / 1.0.
predictions = (probabilities > 0.5).astype(float)

# write predictions to file
output = pd.DataFrame({'id': test['id'], 'loan_status': predictions})
output.to_csv('predictions.csv', index=False)