In [None]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 19 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [None]:
import pandas as pd 
import numpy as np
import torch
from torch import nn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


In [None]:
# Read the labelled training data from the working directory and display it.
train = pd.read_csv(filepath_or_buffer="train.csv")
train

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,551,15806307,Trevisano,720,Spain,Male,38,5,114051.97,2,0,1,107577.29,0
1,6897,15709621,Martin,682,France,Female,54,4,62397.41,1,1,0,113088.60,1
2,4588,15619340,Palmer,672,France,Female,31,5,119903.67,1,1,1,132925.17,0
3,291,15620746,Napolitani,592,Spain,Female,40,4,104257.86,1,1,0,110857.33,0
4,1673,15646372,Yao,753,Spain,Male,42,5,120387.73,1,0,1,126378.57,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,5345,15584532,Yu,568,France,Female,35,6,121079.60,2,1,1,124890.50,1
7996,5837,15606641,Liao,602,Germany,Female,45,7,145846.07,1,1,0,99276.02,0
7997,7335,15739692,Ferri,679,Spain,Female,43,5,132810.01,1,1,0,130780.85,1
7998,9552,15791373,Worsnop,715,France,Male,38,4,118729.45,1,0,0,95484.52,0


In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns the author treats as less important features; they are removed before modelling.
ID_COLUMNS = ["RowNumber", "CustomerId", "Surname"]
# Integer codes for the two gender labels (same coding the notebook applies to the test set).
GENDER_CODES = {"Male": 0, "Female": 1}

df = train.drop(columns=ID_COLUMNS)

# Turn country names into integer codes. `le` is kept in scope on purpose: the test set is
# later transformed with this same fitted encoder so that the codes agree.
le = LabelEncoder()
df["Geography"] = le.fit_transform(df["Geography"])

# Encode Gender on its own column. A frame-wide `df.replace("Male", 0)` scans every column
# and relies on pandas implicitly downcasting the resulting object column to integers,
# which newer pandas releases deprecate.
df["Gender"] = df["Gender"].map(GENDER_CODES)
# `map` yields NaN for labels missing from GENDER_CODES; fail loudly instead of training on them.
assert df["Gender"].notna().all(), "unexpected Gender value (expected 'Male' or 'Female')"

df.dtypes

CreditScore          int64
Geography            int64
Gender               int64
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [None]:
# Balance the two classes with SMOTE oversampling. The resampled X_res / y_res are only
# inspected here; the later cells train on the original X / y defined in this cell.
from imblearn.over_sampling import SMOTE

X = df.drop(columns=["Exited"])
y = df.Exited
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)
(X_res.shape, y_res.shape)

((12736, 10), (12736,))

In [None]:
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from xgboost import XGBClassifier

# Hyperparameter search space for XGBoost.
# - `learning_rate` is the name documented by the scikit-learn wrapper (`eta` is its alias).
# - The range starts at 0.01, not 0.0: a zero learning rate scales every tree to nothing,
#   so it is not a meaningful candidate.
# NOTE(review): the previous run reported eta=0.0 as the best value, which suggests the
# `eta` keyword was not taking effect in that environment — confirm after re-running.
param_grid = {
    "max_depth": list(range(3, 9)),
    "learning_rate": [step / 100 for step in range(1, 100)],
}

# Five random 80/20 splits; the seed makes the splits (and the reported best score /
# parameters) reproducible across kernel restarts.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Exhaustive search: 6 depths x 99 learning rates x 5 splits, plus one final refit on all
# of X / y. This is slow; coarsen the learning-rate grid if runtime matters.
grid = GridSearchCV(XGBClassifier(), param_grid=param_grid, cv=cv, verbose=0)
grid.fit(X, y)

print(grid.best_score_)
print(grid.best_params_)


0.8647500000000001
{'eta': 0.0, 'max_depth': 4}


In [None]:
from xgboost import XGBClassifier

# Final XGBoost model, fitted on the full training set.
# The learning rate has to be strictly positive: XGBoost scales every new tree by it, so a
# value of 0 leaves the ensemble at its initial constant prediction.
# NOTE(review): 0.1 is a placeholder step size — replace max_depth / learning_rate with the
# values reported by the hyperparameter search once it has been re-run.
clf = XGBClassifier(max_depth=5, learning_rate=0.1)
clf.fit(X, y)

XGBClassifier(eta=0, max_depth=5)

In [None]:
from catboost import CatBoostClassifier
# Quick CatBoost baseline with library-default settings, trained on the same X / y.
# It is a simple comparison run and is not used in the later search for the best method.
clf_2 = CatBoostClassifier()
clf_2.fit(X, y)

In [None]:
# Load the unlabelled test set and count the missing values in each column.
test = pd.read_csv(filepath_or_buffer="test.csv")
test.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64

In [None]:
# Remove the same three columns that were dropped from the training data, then check
# whether any remaining column contains missing values.
df_test = test.drop(["RowNumber", "CustomerId", "Surname"], axis=1)
df_test.isna().any()

CreditScore        False
Geography          False
Gender             False
Age                False
Tenure             False
Balance            False
NumOfProducts      False
HasCrCard          False
IsActiveMember     False
EstimatedSalary    False
dtype: bool

In [None]:

# Rebuild the test features from the raw `test` frame so this cell can be re-run safely:
# encoding an already-encoded frame would fail (Geography) or produce NaN (Gender).
df_test = test.drop(columns=["RowNumber", "CustomerId", "Surname"])

# Reuse the encoder fitted on the training data so country codes match between train and test.
df_test["Geography"] = le.transform(df_test["Geography"])

# Encode Gender on its own column (Male -> 0, Female -> 1, the same coding as the training
# data) instead of a frame-wide `replace`, which scans every column and relies on implicit
# dtype downcasting.
df_test["Gender"] = df_test["Gender"].map({"Male": 0, "Female": 1})
# `map` yields NaN for unmapped labels; fail loudly rather than predict on them.
assert df_test["Gender"].notna().all(), "unexpected Gender value (expected 'Male' or 'Female')"

In [None]:
# Preview the encoded test features that are passed to the models.
# NOTE(review): the saved output of this cell still lists a RowNumber column even though
# that column is dropped earlier — the output looks stale; re-run from a fresh kernel to confirm.
df_test

Unnamed: 0,RowNumber,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,2209,726,0,1,39,3,62397.41,1,0,0,106139.31
1,9924,661,0,0,42,3,130339.64,1,1,0,125776.38
2,4617,545,0,0,34,6,62397.41,2,1,1,78372.28
3,6077,731,2,1,42,3,86717.08,1,1,0,136026.27
4,9240,533,0,1,50,6,62397.41,1,1,1,135205.58
...,...,...,...,...,...,...,...,...,...,...,...
1995,7872,628,0,0,35,6,62397.41,2,1,1,96201.09
1996,4257,639,1,0,41,5,98154.30,2,1,0,109531.64
1997,2273,548,0,1,46,3,119078.34,2,1,1,116725.67
1998,315,602,2,1,44,7,62397.41,2,1,1,128135.95


In [None]:
# Predict the Exited label for every test row with the fitted XGBoost model and show the result.
y_test = clf.predict(df_test)
y_test

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# CatBoost predictions, stored under their own name so they do not overwrite `y_test`
# (the XGBoost predictions that are later written into the submission frame).
y_test_catboost = clf_2.predict(df_test)

In [None]:
# Load the submission template. Its first column is an unnamed row index (presumably left
# by an earlier `to_csv`); reading it as the index stops it from becoming a stray
# "Unnamed: 0" data column that would be written back out as an extra column.
upload = pd.read_csv("upload.csv", index_col=0)
upload

Unnamed: 0.1,Unnamed: 0,RowNumber,Exited
0,0,2209,0
1,1,9924,0
2,2,4617,0
3,3,6077,0
4,4,9240,0
...,...,...,...
1995,1995,7872,0
1996,1996,4257,0
1997,1997,2273,0
1998,1998,315,0


In [None]:
# Overwrite the Exited column of the submission frame with the predictions held in `y_test`,
# then display the frame.
upload = upload.assign(Exited=y_test)
upload

Unnamed: 0.1,Unnamed: 0,RowNumber,Exited
0,0,2209,0
1,1,9924,0
2,2,4617,0
3,3,6077,0
4,4,9240,1
...,...,...,...
1995,1995,7872,0
1996,1996,4257,0
1997,1997,2273,0
1998,1998,315,0


In [None]:
# Write the filled-in submission frame to disk.
# NOTE(review): the file name encodes model settings (XGBoost, depth 5, eta 0) — keep it in
# sync with the model that actually produced the predictions.
upload.to_csv(path_or_buf="upload_xgb_depth5_eta0.csv")