In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import confusion_matrix
import seaborn as sns

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
train = pd.read_csv('datathon2020_train.csv')
test = pd.read_csv('datathon2020_test.csv')
data_df = pd.concat([train, test])
data_df

Unnamed: 0,ID,F2,F3,F4,F8,F11,F25,F39,F52,F53,...,F127,F128,F129,F130,F131,F132,F133,F136,F139,COVID_TCPM
0,10681,1,1,1,Tropical and Subtropical Moist Broadleaf Forests,"Mild temperate with dry winter, and Warm summer",9.028250e+04,0.367070,,,...,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,available,0.102565,low
1,3300,1,1,1,Temperate Broadleaf and Mixed Forests,"Mild temperate, fully humid, and Hot summer",2.885228e+05,0.420663,,,...,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,available,0.186848,high
2,12038,1,1,1,Temperate Broadleaf and Mixed Forests,"Mild temperate, fully humid, and Hot summer",9.469753e+04,0.343825,,,...,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,available,0.131743,low
3,10236,0,1,1,Tropical and Subtropical Moist Broadleaf Forests,Tropical monsoon,1.753151e+05,0.558702,,,...,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,preliminary,0.703491,low
4,4111,1,1,1,Temperate Broadleaf and Mixed Forests,"Snow, fully humid, and Warm summer",8.909464e+04,0.508010,,,...,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,available,0.531441,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2481,1575,1,1,1,"Mediterranean Forests, Woodlands, and Scrub","Mild temperate with dry summer, and Hot summer",3.985660e+06,0.262157,168320.0651,367945.1962,...,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,available,0.040463,low
2482,13009,0,0,1,Tropical and Subtropical Moist Broadleaf Forests,Tropical rain forest,7.704883e+04,0.388311,,,...,2.593406,2.593406,2.688027,13306.76338,37205.58549,55806.48722,77048.82957,preliminary,0.221624,low
2483,11032,0,1,1,Tropical and Subtropical Moist Broadleaf Forests,Tropical rain forest,5.288064e+04,0.556476,,,...,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,available,0.794566,low
2484,7213,0,1,1,Tropical and Subtropical Dry Broadleaf Forests,Tropical savannah with dry winter,1.618023e+05,0.405874,,,...,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,available,0.107804,low


In [3]:
data_df.isna().sum()

ID               0
F2               0
F3               0
F4               0
F8               3
F11              8
F25              0
F39            105
F52           2153
F53           2040
F54           2039
F55           1957
F69            136
F72           2739
F73           2616
F74           2520
F75           2314
F76             18
F77             18
F78             11
F79             14
F81           1585
F82           1148
F83           1073
F84           2756
F85           2748
F86           2364
F92           2141
F93           2034
F94           2033
F95           1953
F98              0
F99              0
F109             1
F110             1
F116             0
F121             0
F122             0
F123             0
F124             0
F125             0
F126             0
F127             0
F128             0
F129             0
F130             0
F131             0
F132             0
F133             0
F136             0
F139             0
COVID_TCPM       0
dtype: int64

In [4]:
na_percents = data_df.isna().sum() / len(data_df)*100
cols_to_drop = na_percents[na_percents > 50.0].index.tolist()
data_df.drop(columns=cols_to_drop, axis=1, inplace=True)
data_df.isna().sum()

ID               0
F2               0
F3               0
F4               0
F8               3
F11              8
F25              0
F39            105
F69            136
F76             18
F77             18
F78             11
F79             14
F82           1148
F83           1073
F98              0
F99              0
F109             1
F110             1
F116             0
F121             0
F122             0
F123             0
F124             0
F125             0
F126             0
F127             0
F128             0
F129             0
F130             0
F131             0
F132             0
F133             0
F136             0
F139             0
COVID_TCPM       0
dtype: int64

In [5]:
imp1 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp1 = imp1.fit(data_df[['F8', 'F11', 'F136']])
F8_F11_noNull = pd.DataFrame(imp1.transform(data_df[['F8', 'F11', 'F136']]), columns=['F8', 'F11', 'F136'])
F8_F11_noNull

Unnamed: 0,F8,F11,F136
0,Tropical and Subtropical Moist Broadleaf Forests,"Mild temperate with dry winter, and Warm summer",available
1,Temperate Broadleaf and Mixed Forests,"Mild temperate, fully humid, and Hot summer",available
2,Temperate Broadleaf and Mixed Forests,"Mild temperate, fully humid, and Hot summer",available
3,Tropical and Subtropical Moist Broadleaf Forests,Tropical monsoon,preliminary
4,Temperate Broadleaf and Mixed Forests,"Snow, fully humid, and Warm summer",available
...,...,...,...
2981,"Mediterranean Forests, Woodlands, and Scrub","Mild temperate with dry summer, and Hot summer",available
2982,Tropical and Subtropical Moist Broadleaf Forests,Tropical rain forest,preliminary
2983,Tropical and Subtropical Moist Broadleaf Forests,Tropical rain forest,available
2984,Tropical and Subtropical Dry Broadleaf Forests,Tropical savannah with dry winter,available


In [6]:
data_df_copy = data_df.drop(['F8', 'F11', 'F136', 'COVID_TCPM'], axis=1)
imp2 = SimpleImputer(missing_values=np.nan, strategy='mean')
imp2 = imp2.fit(data_df_copy)
rest_data = pd.DataFrame(imp2.transform(data_df_copy), columns=data_df_copy.columns)
rest_data

Unnamed: 0,ID,F2,F3,F4,F25,F39,F69,F76,F77,F78,...,F125,F126,F127,F128,F129,F130,F131,F132,F133,F139
0,10681.0,1.0,1.0,1.0,9.028250e+04,0.367070,83.449504,4988.544333,5949.288232,6872.472783,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.102565
1,3300.0,1.0,1.0,1.0,2.885228e+05,0.420663,65.779902,42724.435510,19536.915530,85912.946920,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.186848
2,12038.0,1.0,1.0,1.0,9.469753e+04,0.343825,48.131525,9826.923968,11719.510030,13461.595530,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.131743
3,10236.0,0.0,1.0,1.0,1.753151e+05,0.558702,9.809284,3483.049562,4605.716581,5039.788915,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.703491
4,4111.0,1.0,1.0,1.0,8.909464e+04,0.508010,3.993893,1599.962445,863.509687,240.446778,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.531441
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2981,1575.0,1.0,1.0,1.0,3.985660e+06,0.262157,33.443898,28464.539720,52332.439850,62765.784400,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.040463
2982,13009.0,0.0,0.0,1.0,7.704883e+04,0.388311,20.378557,1659.035053,2059.541799,1499.421578,...,13.0,2.263137,2.593406,2.593406,2.688027,13306.76338,37205.58549,55806.48722,77048.82957,0.221624
2983,11032.0,0.0,1.0,1.0,5.288064e+04,0.556476,82.200772,4693.905448,6078.296876,6370.126262,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.794566
2984,7213.0,0.0,1.0,1.0,1.618023e+05,0.405874,74.565311,3321.549966,4226.918927,4747.838824,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.107804


In [7]:
clean_full_data_df = pd.concat([rest_data, F8_F11_noNull], axis=1)
clean_full_data_df['F8'] = clean_full_data_df['F8'].astype('category')
clean_full_data_df['F11'] = clean_full_data_df['F11'].astype('category')
clean_full_data_df['F136'] = clean_full_data_df['F136'].astype('category')
clean_categ_data = pd.get_dummies(clean_full_data_df, columns=['F8', 'F11', 'F136'], prefix=['F8_Area_Condition', 'F11_Climate', 'F136'])
clean_categ_data.head(5)

Unnamed: 0,ID,F2,F3,F4,F25,F39,F69,F76,F77,F78,...,"F11_Climate_Steppe (semi-arid), and Cold arid;Snow with dry summer, and Hot summer","F11_Climate_Steppe (semi-arid), and Hot arid",F11_Climate_Tropical monsoon,F11_Climate_Tropical rain forest,F11_Climate_Tropical savannah with dry summer,F11_Climate_Tropical savannah with dry winter,F11_Climate_Tundra,F136_available,F136_missing,F136_preliminary
0,10681.0,1.0,1.0,1.0,90282.4989,0.36707,83.449504,4988.544333,5949.288232,6872.472783,...,0,0,0,0,0,0,0,1,0,0
1,3300.0,1.0,1.0,1.0,288522.8183,0.420663,65.779902,42724.43551,19536.91553,85912.94692,...,0,0,0,0,0,0,0,1,0,0
2,12038.0,1.0,1.0,1.0,94697.534,0.343825,48.131525,9826.923968,11719.51003,13461.59553,...,0,0,0,0,0,0,0,1,0,0
3,10236.0,0.0,1.0,1.0,175315.1001,0.558702,9.809284,3483.049562,4605.716581,5039.788915,...,0,0,1,0,0,0,0,0,0,1
4,4111.0,1.0,1.0,1.0,89094.63635,0.50801,3.993893,1599.962445,863.509687,240.446778,...,0,0,0,0,0,0,0,1,0,0


In [8]:
X = clean_categ_data.copy()
y = data_df['COVID_TCPM']
le = LabelEncoder()
le.fit(y)
le.classes_

LabelEncoder()

array(['high', 'low'], dtype=object)

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=500)
len(X_test)

2486

In [52]:
clf_dt = dt(criterion='entropy', splitter='best', max_depth=6, min_samples_split=10, min_samples_leaf=5, max_features='auto')
clf_dt = clf_dt.fit(X_train, y_train)

In [53]:
clf_dt.score(X_train, y_train)

0.922

In [54]:
Y_pred = clf_dt.predict(X_test)
Y_pred

array(['low', 'low', 'low', ..., 'low', 'low', 'low'], dtype=object)

In [55]:
clf_dt.score(X_test, y_test)

0.9014481094127111

In [56]:
cm = confusion_matrix(y_test, Y_pred)
cm

array([[  11,  196],
       [  49, 2230]], dtype=int64)

In [57]:
ID = np.array(X_test['ID'].astype('int64'))
ID

array([4881, 8090, 1546, ..., 5017, 2440, 1899], dtype=int64)

In [58]:
submission = pd.DataFrame([ID, Y_pred])
submission = submission.T
submission.rename(columns={0:'ID', 1:'COVID_TCPM'}, inplace=True)
submission.set_index(['ID'])

Unnamed: 0_level_0,COVID_TCPM
ID,Unnamed: 1_level_1
4881,low
8090,low
1546,low
2135,low
8761,low
...,...
2558,low
3393,low
5017,low
2440,low


In [60]:
submission.to_csv('submission.csv', index=False)

In [61]:
clf_dt2 = dt()
params_dt = {'criterion': ['gini', 'entropy'],'min_samples_split': [2,5,10,13], 'min_samples_leaf': [2,3,5,7]}
clf_gs_dt = GridSearchCV(clf_dt2, params_dt, cv=5, return_train_score=True)
clf_gs_dt = clf_gs_dt.fit(X_train, y_train)

In [62]:
clf_gs_dt.score(X_train, y_train)

0.932

In [63]:
clf_gs_dt.score(X_test, y_test)

0.8584070796460177