In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix
import seaborn as sns

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
train = pd.read_csv('./data/datathon2020_train.csv')
test = pd.read_csv('./data/datathon2020_test.csv')

### Dropping columns with >50% NaN values

In [3]:
train.isna().sum()
test.isna().sum()

ID              0
F2              0
F3              0
F4              0
F8              0
F11             0
F25             0
F39            14
F52           364
F53           350
F54           349
F55           328
F69            23
F72           453
F73           437
F74           418
F75           389
F76             3
F77             3
F78             1
F79             3
F81           271
F82           181
F83           165
F84           463
F85           462
F86           394
F92           361
F93           349
F94           348
F95           328
F98             0
F99             0
F109            0
F110            0
F116            0
F121            0
F122            0
F123            0
F124            0
F125            0
F126            0
F127            0
F128            0
F129            0
F130            0
F131            0
F132            0
F133            0
F136            0
F139            0
COVID_TCPM      0
dtype: int64

ID               0
F2               0
F3               0
F4               0
F8               3
F11              8
F25              0
F39             91
F52           1789
F53           1690
F54           1690
F55           1629
F69            113
F72           2286
F73           2179
F74           2102
F75           1925
F76             15
F77             15
F78             10
F79             11
F81           1314
F82            967
F83            908
F84           2293
F85           2286
F86           1970
F92           1780
F93           1685
F94           1685
F95           1625
F98              0
F99              0
F109             1
F110             1
F116             0
F121             0
F122             0
F123             0
F124             0
F125             0
F126             0
F127             0
F128             0
F129             0
F130             0
F131             0
F132             0
F133             0
F136             0
F139             0
COVID_TCPM       0
dtype: int64

In [4]:
na_percents_train = train.isna().sum() / len(train) * 100
cols_to_drop_train = na_percents_train[na_percents_train > 50.0].index.tolist()
cols_to_drop_train.extend(['F8', 'F11', 'F136'])
cols_to_drop_train

['F52',
 'F53',
 'F54',
 'F55',
 'F72',
 'F73',
 'F74',
 'F75',
 'F81',
 'F84',
 'F85',
 'F86',
 'F92',
 'F93',
 'F94',
 'F95',
 'F8',
 'F11',
 'F136']

In [5]:
na_percents_test = test.isna().sum() / len(test) * 100
cols_to_drop_test = na_percents_test[na_percents_test > 50.0].index.tolist()
cols_to_drop_test.extend(['F8', 'F11', 'F136'])
cols_to_drop_test

['F52',
 'F53',
 'F54',
 'F55',
 'F72',
 'F73',
 'F74',
 'F75',
 'F81',
 'F84',
 'F85',
 'F86',
 'F92',
 'F93',
 'F94',
 'F95',
 'F8',
 'F11',
 'F136']

In [6]:
train_drops= train.drop(columns=cols_to_drop_train, axis=1)
test_drops = test.drop(columns=cols_to_drop_test, axis=1)

In [7]:
train_drops
test_drops

Unnamed: 0,ID,F2,F3,F4,F25,F39,F69,F76,F77,F78,...,F126,F127,F128,F129,F130,F131,F132,F133,F139,COVID_TCPM
0,10681,1,1,1,9.028250e+04,0.367070,83.449504,4988.544333,5949.288232,6872.472783,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.102565,low
1,3300,1,1,1,2.885228e+05,0.420663,65.779902,42724.435510,19536.915530,85912.946920,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.186848,high
2,12038,1,1,1,9.469753e+04,0.343825,48.131525,9826.923968,11719.510030,13461.595530,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.131743,low
3,10236,0,1,1,1.753151e+05,0.558702,9.809284,3483.049562,4605.716581,5039.788915,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.703491,low
4,4111,1,1,1,8.909464e+04,0.508010,3.993893,1599.962445,863.509687,240.446778,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.531441,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,5740,0,1,1,2.051100e+05,0.094838,12.001842,1.645217,21.250187,8.256998,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,high
496,554,1,1,1,1.733145e+06,0.515650,36.352887,275506.494200,314674.056100,256327.107500,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.731324,high
497,12566,1,1,1,3.038501e+06,0.315763,377.297391,242647.813300,289379.600600,447452.384600,...,0.807399,0.946529,1.200975,1.281726,7113.034668,8993.411865,10894.03711,12675.01086,0.034969,low
498,13005,0,1,1,7.359616e+05,0.439870,169.007052,23943.459760,29724.676760,20989.280990,...,12.276258,16.289423,19.483387,20.412186,88959.949020,281104.611600,456585.33630,735961.58750,0.342686,low


Unnamed: 0,ID,F2,F3,F4,F25,F39,F69,F76,F77,F78,...,F126,F127,F128,F129,F130,F131,F132,F133,F139,COVID_TCPM
0,12272,1,1,1,3.010114e+05,0.270918,57.889697,45187.287610,53889.947660,61039.963870,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.008360,low
1,6526,1,1,1,8.668036e+04,0.303313,21.221158,86.386399,109.933379,129.268833,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.000000,low
2,11320,2,1,1,2.972672e+05,0.359829,86.350782,63083.100680,75232.254990,97333.212900,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.165453,low
3,4566,0,0,1,9.294967e+04,0.274426,7.550097,2205.528121,2213.771912,2076.353124,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.029401,low
4,4866,1,1,1,1.546434e+05,0.475473,0.018488,697.059179,1023.343533,1299.903914,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.334338,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2481,1575,1,1,1,3.985660e+06,0.262157,33.443898,28464.539720,52332.439850,62765.784400,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.040463,low
2482,13009,0,0,1,7.704883e+04,0.388311,20.378557,1659.035053,2059.541799,1499.421578,...,2.263137,2.593406,2.593406,2.688027,13306.76338,37205.58549,55806.48722,77048.82957,0.221624,low
2483,11032,0,1,1,5.288064e+04,0.556476,82.200772,4693.905448,6078.296876,6370.126262,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.794566,low
2484,7213,0,1,1,1.618023e+05,0.405874,74.565311,3321.549966,4226.918927,4747.838824,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.107804,low


### Train-Test Splits

In [8]:
train_drops['COVID_TCPM'] = train_drops['COVID_TCPM'].apply(lambda x: 0 if x == 'low' else 1)
train_drops['COVID_TCPM'].value_counts()
train_drops_copy = train_drops.copy()

0    252
1    248
Name: COVID_TCPM, dtype: int64

In [9]:
y_train = train_drops_copy['COVID_TCPM']
train_data = train_drops_copy.drop(columns=['COVID_TCPM'], axis=1)

In [10]:
test_drops['COVID_TCPM'] = test_drops['COVID_TCPM'].apply(lambda x: 0 if x == 'low' else 1)
test_drops['COVID_TCPM'].value_counts()
test_drops_copy = test_drops.copy()

0    2486
Name: COVID_TCPM, dtype: int64

In [11]:
y_test = test_drops_copy['COVID_TCPM']
test_data = test_drops_copy.drop(columns=['COVID_TCPM'], axis=1)

### Impute NaN with mean

In [12]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train = pd.DataFrame(imp.fit_transform(train_data), columns=train_data.columns)
X_test = pd.DataFrame(imp.fit_transform(test_data), columns=test_data.columns)

### Decision Tree Classifier

In [19]:
clf_dt = dt(criterion='entropy', splitter='best', max_depth=None, min_samples_split=10, min_samples_leaf=6, max_features='auto', ccp_alpha=0.1)
clf_dt = clf_dt.fit(X_train, y_train)
clf_dt.score(X_train, y_train)

0.504

In [20]:
clf_dt.score(X_test, y_test)
y_pred = clf_dt.predict(X_test)

1.0

In [15]:
cross_val_score(clf_dt, X_train, y_train, cv=10)

array([0.84, 0.52, 0.64, 0.5 , 0.84, 0.46, 0.7 , 0.5 , 0.5 , 0.66])

### Submission

In [16]:
ID = np.array(X_test['ID'].astype('int64'))
submission = pd.DataFrame([ID, y_pred])
submission = submission.T
submission = submission.rename(columns={0:'ID', 1:'COVID_TCPM'})
submission['COVID_TCPM'] = submission['COVID_TCPM'].apply(lambda x: 'low' if x == 0 else 'high')
submission.to_csv('submission.csv', index = False)

In [17]:
feature_imp = pd.DataFrame(np.transpose(clf_dt.feature_importances_))
feature_imp = feature_imp.T
mapper = {i:v for i, v in enumerate(X_train.columns)}
feature_imp.rename(columns=mapper, inplace=True)
feature_imp = feature_imp.T
feature_imp[feature_imp > 0.0]

Unnamed: 0,0
ID,
F2,
F3,
F4,
F25,
F39,
F69,
F76,
F77,
F78,


In [18]:
np.transpose(clf_dt.feature_importances_)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])