# Packages to import

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

# 1. Data Preparation

In [2]:
train_df = pd.read_csv('data/lc_trainingset.csv')
test_df = pd.read_csv('data/lc_testset.csv')

In [3]:
print(train_df.shape)
print(test_df.shape)

(316824, 28)
(78237, 27)


In [4]:
test_df.sample(n=1)

Unnamed: 0,id,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,...,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,address
63502,63502,4125,36 months,15.99,145.01,D,D2,assistant grocery manager,5 years,MORTGAGE,...,4,0,2331,25.1,12,f,INDIVIDUAL,3.0,0,Unit 5713 Box 1505\nDPO AA 00813


## 1b. Cleaning / Relabelling
- Loan Status

In [5]:
train_df.loan_status.value_counts()

Fully Paid     254546
Charged Off     62278
Name: loan_status, dtype: int64

In [6]:
# loan status
def change_loan_status(loan_status):
    if loan_status == 'Fully Paid':
        return 0
    else:
        return 1
    
train_df['loan_status'] = train_df['loan_status'].apply(change_loan_status)

---
## Starter Code for Capstone Project
* estimate signal that can be retrieved from raw df
* only numerical columns
* Train/Test AUC = 0.71

In [7]:
train_df.columns

Index(['id', 'loan_amnt', 'term', 'int_rate', 'installment', 'grade',
       'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc',
       'verification_status', 'issue_d', 'purpose', 'title', 'dti',
       'earliest_cr_line', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util',
       'total_acc', 'initial_list_status', 'application_type', 'mort_acc',
       'pub_rec_bankruptcies', 'address', 'loan_status'],
      dtype='object')

In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316824 entries, 0 to 316823
Data columns (total 28 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    316824 non-null  object 
 1   loan_amnt             316824 non-null  float64
 2   term                  316824 non-null  object 
 3   int_rate              316824 non-null  float64
 4   installment           316824 non-null  float64
 5   grade                 316824 non-null  object 
 6   sub_grade             316824 non-null  object 
 7   emp_title             298514 non-null  object 
 8   emp_length            302162 non-null  object 
 9   home_ownership        316824 non-null  object 
 10  annual_inc            316824 non-null  float64
 11  verification_status   316824 non-null  object 
 12  issue_d               316824 non-null  object 
 13  purpose               316824 non-null  object 
 14  title                 315423 non-null  object 
 15  

In [9]:
# Select only numerical columns. Make sure to remove 'loan_status' (outcome/label) from shortlisted feature (X) columns

selected_feature_cols = list(train_df.select_dtypes(exclude='object').drop(columns='loan_status').columns)
print(selected_feature_cols)

['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'mort_acc', 'pub_rec_bankruptcies']


In [10]:
# Create catboost model, train on subset of columns

X = train_df[selected_feature_cols]
X_kaggle = test_df[selected_feature_cols]
y = train_df.loan_status

catboost_model = CatBoostClassifier(random_state=123).fit(X, y, verbose=True)  # This may take up to a minute
kaggle_preds = catboost_model.predict_proba(X_kaggle)[:,1]

print(kaggle_preds[:5])

Learning rate set to 0.120443
0:	learn: 0.6336976	total: 89.3ms	remaining: 1m 29s
1:	learn: 0.5897418	total: 119ms	remaining: 59.4s
2:	learn: 0.5583506	total: 150ms	remaining: 49.8s
3:	learn: 0.5333304	total: 181ms	remaining: 45.1s
4:	learn: 0.5145889	total: 209ms	remaining: 41.6s
5:	learn: 0.5006366	total: 237ms	remaining: 39.2s
6:	learn: 0.4902708	total: 266ms	remaining: 37.7s
7:	learn: 0.4827164	total: 296ms	remaining: 36.7s
8:	learn: 0.4775081	total: 319ms	remaining: 35.1s
9:	learn: 0.4727626	total: 348ms	remaining: 34.4s
10:	learn: 0.4690262	total: 375ms	remaining: 33.8s
11:	learn: 0.4661634	total: 404ms	remaining: 33.2s
12:	learn: 0.4639334	total: 434ms	remaining: 32.9s
13:	learn: 0.4620236	total: 464ms	remaining: 32.7s
14:	learn: 0.4606482	total: 496ms	remaining: 32.6s
15:	learn: 0.4596162	total: 525ms	remaining: 32.3s
16:	learn: 0.4585895	total: 554ms	remaining: 32s
17:	learn: 0.4577267	total: 583ms	remaining: 31.8s
18:	learn: 0.4571540	total: 610ms	remaining: 31.5s
19:	learn: 

163:	learn: 0.4453000	total: 4.08s	remaining: 20.8s
164:	learn: 0.4452476	total: 4.1s	remaining: 20.7s
165:	learn: 0.4452059	total: 4.12s	remaining: 20.7s
166:	learn: 0.4451601	total: 4.14s	remaining: 20.7s
167:	learn: 0.4451499	total: 4.16s	remaining: 20.6s
168:	learn: 0.4451052	total: 4.19s	remaining: 20.6s
169:	learn: 0.4450704	total: 4.21s	remaining: 20.6s
170:	learn: 0.4450300	total: 4.24s	remaining: 20.5s
171:	learn: 0.4449873	total: 4.26s	remaining: 20.5s
172:	learn: 0.4449506	total: 4.28s	remaining: 20.5s
173:	learn: 0.4449266	total: 4.31s	remaining: 20.4s
174:	learn: 0.4448785	total: 4.33s	remaining: 20.4s
175:	learn: 0.4448444	total: 4.35s	remaining: 20.4s
176:	learn: 0.4448046	total: 4.38s	remaining: 20.4s
177:	learn: 0.4447606	total: 4.4s	remaining: 20.3s
178:	learn: 0.4447235	total: 4.42s	remaining: 20.3s
179:	learn: 0.4446902	total: 4.45s	remaining: 20.3s
180:	learn: 0.4446699	total: 4.47s	remaining: 20.2s
181:	learn: 0.4446398	total: 4.5s	remaining: 20.2s
182:	learn: 0.4

324:	learn: 0.4400713	total: 7.93s	remaining: 16.5s
325:	learn: 0.4400329	total: 7.95s	remaining: 16.4s
326:	learn: 0.4400017	total: 7.97s	remaining: 16.4s
327:	learn: 0.4399745	total: 8s	remaining: 16.4s
328:	learn: 0.4399416	total: 8.02s	remaining: 16.4s
329:	learn: 0.4399183	total: 8.05s	remaining: 16.3s
330:	learn: 0.4398929	total: 8.07s	remaining: 16.3s
331:	learn: 0.4398649	total: 8.09s	remaining: 16.3s
332:	learn: 0.4398353	total: 8.12s	remaining: 16.3s
333:	learn: 0.4398120	total: 8.14s	remaining: 16.2s
334:	learn: 0.4397878	total: 8.17s	remaining: 16.2s
335:	learn: 0.4397645	total: 8.19s	remaining: 16.2s
336:	learn: 0.4397384	total: 8.22s	remaining: 16.2s
337:	learn: 0.4397090	total: 8.24s	remaining: 16.1s
338:	learn: 0.4396842	total: 8.26s	remaining: 16.1s
339:	learn: 0.4396567	total: 8.29s	remaining: 16.1s
340:	learn: 0.4396166	total: 8.31s	remaining: 16.1s
341:	learn: 0.4395847	total: 8.34s	remaining: 16s
342:	learn: 0.4395610	total: 8.36s	remaining: 16s
343:	learn: 0.43952

484:	learn: 0.4358830	total: 11.8s	remaining: 12.5s
485:	learn: 0.4358636	total: 11.8s	remaining: 12.5s
486:	learn: 0.4358351	total: 11.9s	remaining: 12.5s
487:	learn: 0.4358147	total: 11.9s	remaining: 12.5s
488:	learn: 0.4357884	total: 11.9s	remaining: 12.4s
489:	learn: 0.4357619	total: 11.9s	remaining: 12.4s
490:	learn: 0.4357323	total: 12s	remaining: 12.4s
491:	learn: 0.4357036	total: 12s	remaining: 12.4s
492:	learn: 0.4356798	total: 12s	remaining: 12.3s
493:	learn: 0.4356628	total: 12s	remaining: 12.3s
494:	learn: 0.4356487	total: 12.1s	remaining: 12.3s
495:	learn: 0.4356265	total: 12.1s	remaining: 12.3s
496:	learn: 0.4356085	total: 12.1s	remaining: 12.2s
497:	learn: 0.4355790	total: 12.1s	remaining: 12.2s
498:	learn: 0.4355551	total: 12.2s	remaining: 12.2s
499:	learn: 0.4355246	total: 12.2s	remaining: 12.2s
500:	learn: 0.4354923	total: 12.2s	remaining: 12.2s
501:	learn: 0.4354689	total: 12.2s	remaining: 12.1s
502:	learn: 0.4354502	total: 12.2s	remaining: 12.1s
503:	learn: 0.435425

646:	learn: 0.4322167	total: 15.7s	remaining: 8.56s
647:	learn: 0.4321943	total: 15.7s	remaining: 8.54s
648:	learn: 0.4321753	total: 15.7s	remaining: 8.52s
649:	learn: 0.4321470	total: 15.8s	remaining: 8.49s
650:	learn: 0.4321219	total: 15.8s	remaining: 8.47s
651:	learn: 0.4321067	total: 15.8s	remaining: 8.44s
652:	learn: 0.4320895	total: 15.8s	remaining: 8.42s
653:	learn: 0.4320599	total: 15.9s	remaining: 8.39s
654:	learn: 0.4320379	total: 15.9s	remaining: 8.37s
655:	learn: 0.4320156	total: 15.9s	remaining: 8.35s
656:	learn: 0.4319869	total: 15.9s	remaining: 8.32s
657:	learn: 0.4319632	total: 16s	remaining: 8.3s
658:	learn: 0.4319431	total: 16s	remaining: 8.27s
659:	learn: 0.4319230	total: 16s	remaining: 8.25s
660:	learn: 0.4319009	total: 16s	remaining: 8.22s
661:	learn: 0.4318907	total: 16.1s	remaining: 8.2s
662:	learn: 0.4318627	total: 16.1s	remaining: 8.17s
663:	learn: 0.4318323	total: 16.1s	remaining: 8.15s
664:	learn: 0.4318033	total: 16.1s	remaining: 8.12s
665:	learn: 0.4317767	

805:	learn: 0.4288720	total: 19.6s	remaining: 4.71s
806:	learn: 0.4288514	total: 19.6s	remaining: 4.69s
807:	learn: 0.4288238	total: 19.6s	remaining: 4.66s
808:	learn: 0.4287961	total: 19.6s	remaining: 4.64s
809:	learn: 0.4287677	total: 19.7s	remaining: 4.61s
810:	learn: 0.4287488	total: 19.7s	remaining: 4.59s
811:	learn: 0.4287271	total: 19.7s	remaining: 4.56s
812:	learn: 0.4287105	total: 19.7s	remaining: 4.54s
813:	learn: 0.4286857	total: 19.8s	remaining: 4.51s
814:	learn: 0.4286695	total: 19.8s	remaining: 4.49s
815:	learn: 0.4286492	total: 19.8s	remaining: 4.47s
816:	learn: 0.4286183	total: 19.8s	remaining: 4.44s
817:	learn: 0.4285931	total: 19.9s	remaining: 4.42s
818:	learn: 0.4285665	total: 19.9s	remaining: 4.39s
819:	learn: 0.4285510	total: 19.9s	remaining: 4.37s
820:	learn: 0.4285238	total: 19.9s	remaining: 4.35s
821:	learn: 0.4285070	total: 20s	remaining: 4.32s
822:	learn: 0.4284949	total: 20s	remaining: 4.3s
823:	learn: 0.4284612	total: 20s	remaining: 4.27s
824:	learn: 0.42843

967:	learn: 0.4255214	total: 23.4s	remaining: 773ms
968:	learn: 0.4254973	total: 23.4s	remaining: 749ms
969:	learn: 0.4254844	total: 23.4s	remaining: 725ms
970:	learn: 0.4254743	total: 23.5s	remaining: 701ms
971:	learn: 0.4254615	total: 23.5s	remaining: 676ms
972:	learn: 0.4254350	total: 23.5s	remaining: 652ms
973:	learn: 0.4254135	total: 23.5s	remaining: 628ms
974:	learn: 0.4253885	total: 23.5s	remaining: 604ms
975:	learn: 0.4253669	total: 23.6s	remaining: 580ms
976:	learn: 0.4253535	total: 23.6s	remaining: 555ms
977:	learn: 0.4253316	total: 23.6s	remaining: 531ms
978:	learn: 0.4253077	total: 23.6s	remaining: 507ms
979:	learn: 0.4252890	total: 23.7s	remaining: 483ms
980:	learn: 0.4252675	total: 23.7s	remaining: 459ms
981:	learn: 0.4252485	total: 23.7s	remaining: 435ms
982:	learn: 0.4252252	total: 23.7s	remaining: 410ms
983:	learn: 0.4252007	total: 23.8s	remaining: 386ms
984:	learn: 0.4251901	total: 23.8s	remaining: 362ms
985:	learn: 0.4251796	total: 23.8s	remaining: 338ms
986:	learn: 

In [11]:
output_dataframe = pd.DataFrame({
    'Id':list(range(len(kaggle_preds))),
    'Predicted':kaggle_preds
})
output_dataframe.to_csv('catboost_sample_predictions.csv', index=False)

## You may continue from here
---