In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, f1_score
from catboost import CatBoostClassifier
from tqdm import tqdm
from IPython.display import clear_output
from tensorflow import keras
from keras.layers import Dense, LSTM, Dropout
import pickle
import seaborn as sns
from matplotlib import pyplot as plt
pd.set_option('display.max_columns', None)

In [2]:
def validate(model, val_data):
    y = model.predict(val_data[0])
    print('Accuracy =', accuracy_score(y, val_data[1]))
    print('ROC AUC =', roc_auc_score(y, val_data[1]))
    print('F1 =', f1_score(y, val_data[1]))

In [3]:
orig_data = pd.read_csv('train.csv')
data  = orig_data.copy()

In [4]:
data.describe()

Unnamed: 0,index,age,fnlwgt,education-num,Y
count,10200.0,10200.0,10200.0,10200.0,10200.0
mean,5099.5,27.740686,130951.434804,9.498039,0.242157
std,2944.630707,8.389498,13332.415015,2.597184,0.42841
min,0.0,15.0,12461.0,1.0,0.0
25%,2549.75,22.0,132570.0,8.0,0.0
50%,5099.5,25.0,132616.0,9.0,0.0
75%,7649.25,35.0,132660.0,12.0,0.0
max,10199.0,64.0,132861.0,15.0,1.0


In [5]:
data.dropna(inplace=True)

In [6]:
X = data.drop(columns='Y')
y = data['Y']
test_x = pd.read_csv('test.csv')

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=45)
val_data = (X_test, y_test)
cat_f = [] # Categorical columns for catboost
for col in X.columns:
    if X[col].dtype == np.uint8:
        cat_f.append(col)

for col in test_x.columns:
    if test_x[col].dtype == np.uint8:
        cat_f.append(col)

In [7]:
# X_test = test_x
# Get all object variables from X_train
cat_vars = [var for var in X_train.columns if X_train[var].dtype == "O"]

# Initialize model with cat_vars
rf = CatBoostClassifier(
                   cat_features = cat_vars,
                   learning_rate=0.13,
                   eval_metric="AUC",
                   iterations=1500,
                   )
# Fit & Predict
rf.fit(X_train, y_train, plot=True, eval_set=(X_test, y_test))
pred_rf = rf.predict(X_test)



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.9234381	best: 0.9234381 (0)	total: 186ms	remaining: 4m 39s
1:	test: 0.9280139	best: 0.9280139 (1)	total: 215ms	remaining: 2m 41s
2:	test: 0.9315027	best: 0.9315027 (2)	total: 255ms	remaining: 2m 7s
3:	test: 0.9324426	best: 0.9324426 (3)	total: 282ms	remaining: 1m 45s
4:	test: 0.9356586	best: 0.9356586 (4)	total: 314ms	remaining: 1m 33s
5:	test: 0.9377562	best: 0.9377562 (5)	total: 348ms	remaining: 1m 26s
6:	test: 0.9389435	best: 0.9389435 (6)	total: 379ms	remaining: 1m 20s
7:	test: 0.9394528	best: 0.9394528 (7)	total: 415ms	remaining: 1m 17s
8:	test: 0.9403795	best: 0.9403795 (8)	total: 442ms	remaining: 1m 13s
9:	test: 0.9411659	best: 0.9411659 (9)	total: 499ms	remaining: 1m 14s
10:	test: 0.9416921	best: 0.9416921 (10)	total: 527ms	remaining: 1m 11s
11:	test: 0.9417339	best: 0.9417339 (11)	total: 562ms	remaining: 1m 9s
12:	test: 0.9421582	best: 0.9421582 (12)	total: 591ms	remaining: 1m 7s
13:	test: 0.9424124	best: 0.9424124 (13)	total: 637ms	remaining: 1m 7s
14:	test: 0.9422

118:	test: 0.9437207	best: 0.9450316 (42)	total: 4.33s	remaining: 50.2s
119:	test: 0.9437190	best: 0.9450316 (42)	total: 4.37s	remaining: 50.2s
120:	test: 0.9437055	best: 0.9450316 (42)	total: 4.4s	remaining: 50.2s
121:	test: 0.9436370	best: 0.9450316 (42)	total: 4.43s	remaining: 50.1s
122:	test: 0.9436217	best: 0.9450316 (42)	total: 4.47s	remaining: 50s
123:	test: 0.9436353	best: 0.9450316 (42)	total: 4.51s	remaining: 50.1s
124:	test: 0.9435921	best: 0.9450316 (42)	total: 4.54s	remaining: 50s
125:	test: 0.9435870	best: 0.9450316 (42)	total: 4.6s	remaining: 50.2s
126:	test: 0.9435651	best: 0.9450316 (42)	total: 4.63s	remaining: 50.1s
127:	test: 0.9435354	best: 0.9450316 (42)	total: 4.68s	remaining: 50.2s
128:	test: 0.9434712	best: 0.9450316 (42)	total: 4.71s	remaining: 50s
129:	test: 0.9434762	best: 0.9450316 (42)	total: 4.74s	remaining: 49.9s
130:	test: 0.9434669	best: 0.9450316 (42)	total: 4.77s	remaining: 49.9s
131:	test: 0.9434204	best: 0.9450316 (42)	total: 4.8s	remaining: 49.8s
1

238:	test: 0.9422073	best: 0.9450316 (42)	total: 8.54s	remaining: 45.1s
239:	test: 0.9421354	best: 0.9450316 (42)	total: 8.57s	remaining: 45s
240:	test: 0.9420753	best: 0.9450316 (42)	total: 8.6s	remaining: 44.9s
241:	test: 0.9420854	best: 0.9450316 (42)	total: 8.63s	remaining: 44.9s
242:	test: 0.9420677	best: 0.9450316 (42)	total: 8.66s	remaining: 44.8s
243:	test: 0.9419805	best: 0.9450316 (42)	total: 8.7s	remaining: 44.8s
244:	test: 0.9419484	best: 0.9450316 (42)	total: 8.72s	remaining: 44.7s
245:	test: 0.9419061	best: 0.9450316 (42)	total: 8.75s	remaining: 44.6s
246:	test: 0.9418646	best: 0.9450316 (42)	total: 8.79s	remaining: 44.6s
247:	test: 0.9418494	best: 0.9450316 (42)	total: 8.84s	remaining: 44.6s
248:	test: 0.9418232	best: 0.9450316 (42)	total: 8.9s	remaining: 44.7s
249:	test: 0.9418071	best: 0.9450316 (42)	total: 8.97s	remaining: 44.9s
250:	test: 0.9418511	best: 0.9450316 (42)	total: 9.01s	remaining: 44.9s
251:	test: 0.9418418	best: 0.9450316 (42)	total: 9.06s	remaining: 44.

357:	test: 0.9404942	best: 0.9450316 (42)	total: 12.8s	remaining: 40.8s
358:	test: 0.9404104	best: 0.9450316 (42)	total: 12.8s	remaining: 40.7s
359:	test: 0.9404510	best: 0.9450316 (42)	total: 12.9s	remaining: 40.7s
360:	test: 0.9403867	best: 0.9450316 (42)	total: 12.9s	remaining: 40.6s
361:	test: 0.9403927	best: 0.9450316 (42)	total: 12.9s	remaining: 40.6s
362:	test: 0.9403977	best: 0.9450316 (42)	total: 13s	remaining: 40.6s
363:	test: 0.9403740	best: 0.9450316 (42)	total: 13s	remaining: 40.6s
364:	test: 0.9404586	best: 0.9450316 (42)	total: 13s	remaining: 40.5s
365:	test: 0.9404553	best: 0.9450316 (42)	total: 13.1s	remaining: 40.5s
366:	test: 0.9404053	best: 0.9450316 (42)	total: 13.1s	remaining: 40.4s
367:	test: 0.9404206	best: 0.9450316 (42)	total: 13.1s	remaining: 40.4s
368:	test: 0.9404341	best: 0.9450316 (42)	total: 13.2s	remaining: 40.4s
369:	test: 0.9403698	best: 0.9450316 (42)	total: 13.2s	remaining: 40.3s
370:	test: 0.9403935	best: 0.9450316 (42)	total: 13.2s	remaining: 40.3

479:	test: 0.9397717	best: 0.9450316 (42)	total: 16.9s	remaining: 35.8s
480:	test: 0.9397421	best: 0.9450316 (42)	total: 16.9s	remaining: 35.8s
481:	test: 0.9397599	best: 0.9450316 (42)	total: 16.9s	remaining: 35.8s
482:	test: 0.9397616	best: 0.9450316 (42)	total: 17s	remaining: 35.7s
483:	test: 0.9397548	best: 0.9450316 (42)	total: 17s	remaining: 35.7s
484:	test: 0.9397167	best: 0.9450316 (42)	total: 17.1s	remaining: 35.7s
485:	test: 0.9397700	best: 0.9450316 (42)	total: 17.1s	remaining: 35.7s
486:	test: 0.9397632	best: 0.9450316 (42)	total: 17.1s	remaining: 35.6s
487:	test: 0.9397353	best: 0.9450316 (42)	total: 17.2s	remaining: 35.6s
488:	test: 0.9397489	best: 0.9450316 (42)	total: 17.2s	remaining: 35.6s
489:	test: 0.9397353	best: 0.9450316 (42)	total: 17.2s	remaining: 35.5s
490:	test: 0.9396727	best: 0.9450316 (42)	total: 17.3s	remaining: 35.5s
491:	test: 0.9395602	best: 0.9450316 (42)	total: 17.3s	remaining: 35.5s
492:	test: 0.9395941	best: 0.9450316 (42)	total: 17.4s	remaining: 35

594:	test: 0.9383547	best: 0.9450316 (42)	total: 20.9s	remaining: 31.9s
595:	test: 0.9383826	best: 0.9450316 (42)	total: 21s	remaining: 31.8s
596:	test: 0.9383818	best: 0.9450316 (42)	total: 21s	remaining: 31.8s
597:	test: 0.9382701	best: 0.9450316 (42)	total: 21s	remaining: 31.7s
598:	test: 0.9382210	best: 0.9450316 (42)	total: 21.1s	remaining: 31.7s
599:	test: 0.9381973	best: 0.9450316 (42)	total: 21.1s	remaining: 31.6s
600:	test: 0.9381457	best: 0.9450316 (42)	total: 21.1s	remaining: 31.6s
601:	test: 0.9381660	best: 0.9450316 (42)	total: 21.2s	remaining: 31.6s
602:	test: 0.9381491	best: 0.9450316 (42)	total: 21.2s	remaining: 31.5s
603:	test: 0.9382388	best: 0.9450316 (42)	total: 21.2s	remaining: 31.5s
604:	test: 0.9382303	best: 0.9450316 (42)	total: 21.3s	remaining: 31.5s
605:	test: 0.9382058	best: 0.9450316 (42)	total: 21.3s	remaining: 31.4s
606:	test: 0.9382185	best: 0.9450316 (42)	total: 21.3s	remaining: 31.4s
607:	test: 0.9381754	best: 0.9450316 (42)	total: 21.4s	remaining: 31.4

711:	test: 0.9378615	best: 0.9450316 (42)	total: 25.1s	remaining: 27.8s
712:	test: 0.9378310	best: 0.9450316 (42)	total: 25.1s	remaining: 27.8s
713:	test: 0.9378260	best: 0.9450316 (42)	total: 25.2s	remaining: 27.7s
714:	test: 0.9378133	best: 0.9450316 (42)	total: 25.2s	remaining: 27.7s
715:	test: 0.9378471	best: 0.9450316 (42)	total: 25.3s	remaining: 27.7s
716:	test: 0.9378649	best: 0.9450316 (42)	total: 25.3s	remaining: 27.6s
717:	test: 0.9379156	best: 0.9450316 (42)	total: 25.3s	remaining: 27.6s
718:	test: 0.9378107	best: 0.9450316 (42)	total: 25.4s	remaining: 27.6s
719:	test: 0.9378471	best: 0.9450316 (42)	total: 25.4s	remaining: 27.5s
720:	test: 0.9378192	best: 0.9450316 (42)	total: 25.4s	remaining: 27.5s
721:	test: 0.9380045	best: 0.9450316 (42)	total: 25.5s	remaining: 27.5s
722:	test: 0.9379825	best: 0.9450316 (42)	total: 25.5s	remaining: 27.4s
723:	test: 0.9379436	best: 0.9450316 (42)	total: 25.5s	remaining: 27.4s
724:	test: 0.9379190	best: 0.9450316 (42)	total: 25.6s	remaining

830:	test: 0.9370121	best: 0.9450316 (42)	total: 29.6s	remaining: 23.8s
831:	test: 0.9369884	best: 0.9450316 (42)	total: 29.6s	remaining: 23.8s
832:	test: 0.9369242	best: 0.9450316 (42)	total: 29.7s	remaining: 23.8s
833:	test: 0.9369132	best: 0.9450316 (42)	total: 29.7s	remaining: 23.7s
834:	test: 0.9369208	best: 0.9450316 (42)	total: 29.7s	remaining: 23.7s
835:	test: 0.9369462	best: 0.9450316 (42)	total: 29.7s	remaining: 23.6s
836:	test: 0.9369555	best: 0.9450316 (42)	total: 29.8s	remaining: 23.6s
837:	test: 0.9369707	best: 0.9450316 (42)	total: 29.8s	remaining: 23.5s
838:	test: 0.9370282	best: 0.9450316 (42)	total: 29.8s	remaining: 23.5s
839:	test: 0.9370840	best: 0.9450316 (42)	total: 29.9s	remaining: 23.5s
840:	test: 0.9370637	best: 0.9450316 (42)	total: 29.9s	remaining: 23.4s
841:	test: 0.9370883	best: 0.9450316 (42)	total: 29.9s	remaining: 23.4s
842:	test: 0.9371238	best: 0.9450316 (42)	total: 30s	remaining: 23.4s
843:	test: 0.9371348	best: 0.9450316 (42)	total: 30s	remaining: 23

947:	test: 0.9362880	best: 0.9450316 (42)	total: 33.5s	remaining: 19.5s
948:	test: 0.9363032	best: 0.9450316 (42)	total: 33.6s	remaining: 19.5s
949:	test: 0.9362821	best: 0.9450316 (42)	total: 33.6s	remaining: 19.4s
950:	test: 0.9362964	best: 0.9450316 (42)	total: 33.6s	remaining: 19.4s
951:	test: 0.9364056	best: 0.9450316 (42)	total: 33.6s	remaining: 19.4s
952:	test: 0.9364267	best: 0.9450316 (42)	total: 33.7s	remaining: 19.3s
953:	test: 0.9364106	best: 0.9450316 (42)	total: 33.7s	remaining: 19.3s
954:	test: 0.9364140	best: 0.9450316 (42)	total: 33.8s	remaining: 19.3s
955:	test: 0.9364191	best: 0.9450316 (42)	total: 33.8s	remaining: 19.2s
956:	test: 0.9364250	best: 0.9450316 (42)	total: 33.8s	remaining: 19.2s
957:	test: 0.9364166	best: 0.9450316 (42)	total: 33.9s	remaining: 19.2s
958:	test: 0.9364081	best: 0.9450316 (42)	total: 33.9s	remaining: 19.1s
959:	test: 0.9364377	best: 0.9450316 (42)	total: 33.9s	remaining: 19.1s
960:	test: 0.9363565	best: 0.9450316 (42)	total: 34s	remaining: 

1063:	test: 0.9357567	best: 0.9450316 (42)	total: 37.5s	remaining: 15.4s
1064:	test: 0.9357516	best: 0.9450316 (42)	total: 37.5s	remaining: 15.3s
1065:	test: 0.9357914	best: 0.9450316 (42)	total: 37.6s	remaining: 15.3s
1066:	test: 0.9358134	best: 0.9450316 (42)	total: 37.6s	remaining: 15.3s
1067:	test: 0.9358151	best: 0.9450316 (42)	total: 37.7s	remaining: 15.2s
1068:	test: 0.9358083	best: 0.9450316 (42)	total: 37.7s	remaining: 15.2s
1069:	test: 0.9358405	best: 0.9450316 (42)	total: 37.7s	remaining: 15.2s
1070:	test: 0.9358244	best: 0.9450316 (42)	total: 37.8s	remaining: 15.1s
1071:	test: 0.9357982	best: 0.9450316 (42)	total: 37.8s	remaining: 15.1s
1072:	test: 0.9357144	best: 0.9450316 (42)	total: 37.9s	remaining: 15.1s
1073:	test: 0.9357601	best: 0.9450316 (42)	total: 37.9s	remaining: 15s
1074:	test: 0.9357576	best: 0.9450316 (42)	total: 37.9s	remaining: 15s
1075:	test: 0.9357736	best: 0.9450316 (42)	total: 38s	remaining: 15s
1076:	test: 0.9357965	best: 0.9450316 (42)	total: 38s	remai

1176:	test: 0.9349987	best: 0.9450316 (42)	total: 41.7s	remaining: 11.5s
1177:	test: 0.9350275	best: 0.9450316 (42)	total: 41.8s	remaining: 11.4s
1178:	test: 0.9350021	best: 0.9450316 (42)	total: 41.8s	remaining: 11.4s
1179:	test: 0.9349979	best: 0.9450316 (42)	total: 41.8s	remaining: 11.3s
1180:	test: 0.9349784	best: 0.9450316 (42)	total: 41.9s	remaining: 11.3s
1181:	test: 0.9349666	best: 0.9450316 (42)	total: 41.9s	remaining: 11.3s
1182:	test: 0.9349666	best: 0.9450316 (42)	total: 42s	remaining: 11.2s
1183:	test: 0.9350123	best: 0.9450316 (42)	total: 42s	remaining: 11.2s
1184:	test: 0.9349590	best: 0.9450316 (42)	total: 42s	remaining: 11.2s
1185:	test: 0.9349674	best: 0.9450316 (42)	total: 42.1s	remaining: 11.1s
1186:	test: 0.9349742	best: 0.9450316 (42)	total: 42.1s	remaining: 11.1s
1187:	test: 0.9349378	best: 0.9450316 (42)	total: 42.1s	remaining: 11.1s
1188:	test: 0.9349581	best: 0.9450316 (42)	total: 42.2s	remaining: 11s
1189:	test: 0.9349150	best: 0.9450316 (42)	total: 42.2s	rem

1290:	test: 0.9352948	best: 0.9450316 (42)	total: 45.9s	remaining: 7.43s
1291:	test: 0.9352813	best: 0.9450316 (42)	total: 46s	remaining: 7.4s
1292:	test: 0.9352339	best: 0.9450316 (42)	total: 46s	remaining: 7.36s
1293:	test: 0.9352305	best: 0.9450316 (42)	total: 46s	remaining: 7.33s
1294:	test: 0.9352246	best: 0.9450316 (42)	total: 46.1s	remaining: 7.29s
1295:	test: 0.9352068	best: 0.9450316 (42)	total: 46.1s	remaining: 7.25s
1296:	test: 0.9352339	best: 0.9450316 (42)	total: 46.1s	remaining: 7.22s
1297:	test: 0.9352627	best: 0.9450316 (42)	total: 46.2s	remaining: 7.18s
1298:	test: 0.9353050	best: 0.9450316 (42)	total: 46.2s	remaining: 7.15s
1299:	test: 0.9353058	best: 0.9450316 (42)	total: 46.2s	remaining: 7.11s
1300:	test: 0.9353134	best: 0.9450316 (42)	total: 46.3s	remaining: 7.08s
1301:	test: 0.9354369	best: 0.9450316 (42)	total: 46.3s	remaining: 7.04s
1302:	test: 0.9354462	best: 0.9450316 (42)	total: 46.3s	remaining: 7.01s
1303:	test: 0.9354039	best: 0.9450316 (42)	total: 46.4s	re

1408:	test: 0.9348744	best: 0.9450316 (42)	total: 50s	remaining: 3.23s
1409:	test: 0.9348972	best: 0.9450316 (42)	total: 50s	remaining: 3.19s
1410:	test: 0.9348930	best: 0.9450316 (42)	total: 50s	remaining: 3.15s
1411:	test: 0.9348837	best: 0.9450316 (42)	total: 50s	remaining: 3.12s
1412:	test: 0.9348870	best: 0.9450316 (42)	total: 50.1s	remaining: 3.08s
1413:	test: 0.9348921	best: 0.9450316 (42)	total: 50.1s	remaining: 3.05s
1414:	test: 0.9348744	best: 0.9450316 (42)	total: 50.2s	remaining: 3.01s
1415:	test: 0.9348541	best: 0.9450316 (42)	total: 50.2s	remaining: 2.98s
1416:	test: 0.9348566	best: 0.9450316 (42)	total: 50.2s	remaining: 2.94s
1417:	test: 0.9348710	best: 0.9450316 (42)	total: 50.3s	remaining: 2.91s
1418:	test: 0.9349513	best: 0.9450316 (42)	total: 50.3s	remaining: 2.87s
1419:	test: 0.9349412	best: 0.9450316 (42)	total: 50.3s	remaining: 2.83s
1420:	test: 0.9349649	best: 0.9450316 (42)	total: 50.4s	remaining: 2.8s
1421:	test: 0.9349700	best: 0.9450316 (42)	total: 50.4s	rema

In [8]:
test_x = pd.read_csv('test.csv')
test_x.columns

Index(['index', 'age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'native-country'],
      dtype='object')

In [10]:

id = test_x['index']

sub_pool = test_x



pred = rf.predict(sub_pool, prediction_type = 'Class')



In [11]:
submission = pd.DataFrame({"id": id, "Y": pred.astype(np.int32)})

submission.to_csv('submission_01.csv', index=False, header = 0)