In [28]:
!pip install catboost



In [29]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from catboost import CatBoostClassifier
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [30]:
def cleaner(df):
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary = summary.reset_index()
    summary['Name'] = summary['index']
    summary = summary[['Name','dtypes']]
    summary['Missing'] = df.isnull().sum().values    
    summary['Uniques'] = df.nunique().values
    summary['First Value'] = df.loc[0].values
    summary['Second Value'] = df.loc[1].values
    summary
    return summary

In [31]:
!ls

catboost_info  data.csv  sample_data


In [32]:
file_loc = "data.csv"

In [33]:
df = pd.read_csv(file_loc)

In [34]:
df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,5849.0,0.0,0.0,360.0,Yes,1
1,4583.0,1508.0,128.0,360.0,Yes,0
2,3000.0,0.0,66.0,360.0,Yes,1
3,2583.0,2358.0,120.0,360.0,Yes,1
4,6000.0,0.0,141.0,360.0,Yes,1


In [35]:
print ("Total number of rows in dataset = {}".format(df.shape[0]))
print ("Total number of columns in dataset = {}".format(df.shape[1]))

Total number of rows in dataset = 614
Total number of columns in dataset = 6


In [36]:
result = cleaner(df)
result

Dataset Shape: (614, 6)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value
0,ApplicantIncome,float64,2,503,5849,4583
1,CoapplicantIncome,float64,2,287,0,1508
2,LoanAmount,float64,3,203,0,128
3,Loan_Amount_Term,float64,2,11,360,360
4,Credit_History,object,0,2,Yes,Yes
5,Loan_Status,int64,0,2,1,0


In [37]:
target = "Loan_Status"
X = df.loc[:, df.columns != target]
y = df.loc[:, target]

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33, 
                                                    random_state=42)

In [39]:
X_train.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
244,3406.0,4417.0,123.0,360.0,Yes
393,1993.0,1625.0,113.0,180.0,Yes
310,2917.0,0.0,84.0,360.0,Yes
408,8300.0,0.0,152.0,300.0,No
572,16666.0,0.0,275.0,360.0,Yes


In [40]:
features = list(X_train.columns)

In [41]:
cat_features = ["Credit_History"]

In [42]:
model_cb = CatBoostClassifier(task_type='GPU', iterations=100, 
                              random_state = 42, 
                              eval_metric="F1")

In [43]:
model_cb.fit(X_train, y_train, cat_features= cat_features, plot=True, 
             eval_set=(X_test, y_test))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.199227
0:	learn: 0.8524046	test: 0.8333333	best: 0.8333333 (0)	total: 39.2ms	remaining: 3.88s
1:	learn: 0.8557845	test: 0.8327645	best: 0.8333333 (0)	total: 86.4ms	remaining: 4.23s
2:	learn: 0.8446602	test: 0.8510638	best: 0.8510638 (2)	total: 123ms	remaining: 3.96s
3:	learn: 0.8502415	test: 0.8439716	best: 0.8510638 (2)	total: 150ms	remaining: 3.6s
4:	learn: 0.8639241	test: 0.8231293	best: 0.8510638 (2)	total: 166ms	remaining: 3.15s
5:	learn: 0.8630573	test: 0.8247423	best: 0.8510638 (2)	total: 174ms	remaining: 2.73s
6:	learn: 0.8648649	test: 0.8362369	best: 0.8510638 (2)	total: 188ms	remaining: 2.5s
7:	learn: 0.8639241	test: 0.8203390	best: 0.8510638 (2)	total: 210ms	remaining: 2.41s
8:	learn: 0.8669797	test: 0.8227425	best: 0.8510638 (2)	total: 218ms	remaining: 2.21s
9:	learn: 0.8603175	test: 0.8231293	best: 0.8510638 (2)	total: 223ms	remaining: 2.01s
10:	learn: 0.8553459	test: 0.8215488	best: 0.8510638 (2)	total: 236ms	remaining: 1.91s
11:	learn: 0.8584906	te

<catboost.core.CatBoostClassifier at 0x7f9f7c260610>

In [44]:
y_pred = model_cb.predict(X_test)

In [45]:
f1_score(y_test, y_pred)

0.851063829787234

In [47]:
 accuracy_score(y_test, y_pred)

0.7931034482758621