<a href="https://colab.research.google.com/github/lassewardenaer/covid/blob/main/covid2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
# Example code for training model and creating submission file.
# Author: Peter Sadowski Jan 22 2022
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from category_encoders.one_hot import OneHotEncoder #pip install category_encoders
import multiprocessing as mp 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

# Load training data.
df_train = pd.read_csv('./train_small.csv', nrows=20000000) # Can read from zip files directly.
df_train = df_train.replace({'death_yn':{np.nan:0}}) # Assume no info means survived.

# Load test data.
df_test = pd.read_csv('./test.csv.zip')
df_all_smoke = pd.read_csv('./smoking_data.csv')

state_to_smoke = dict()
for state in df_all_smoke['LocationAbbr'].values:
  state_to_smoke[state] = df_all_smoke[df_all_smoke['LocationAbbr'] == state]['Data_Value'].to_numpy()[0]

state_to_smoke['VI'] = 0
state_to_smoke['NJ'] = 0
state_to_smoke[np.nan] = 0
smoke_no_nan = [x for x in list(state_to_smoke.values())]
state_to_smoke['VI'] = np.sum(smoke_no_nan) // (len(state_to_smoke)-2)
state_to_smoke['NJ'] = np.sum(smoke_no_nan) // (len(state_to_smoke)-2)
state_to_smoke[np.nan] = np.sum(smoke_no_nan) // (len(state_to_smoke)-2)

state_to_smoke_rate_train = [state_to_smoke[state] for state in df_train['res_state']]
df_smoke_train = pd.DataFrame({'smoke_data': state_to_smoke_rate_train})

state_to_smoke_rate_test = [state_to_smoke[state] for state in df_test['res_state']]
df_smoke_test = pd.DataFrame({'smoke_data': state_to_smoke_rate_test})

df_train = df_train.drop(
    ['case_month',
     'res_state',
     'state_fips_code',
     'res_county',
     'county_fips_code',
     'case_positive_specimen_interval',
     'case_onset_interval',
     'process',
     'exposure_yn',
     'labconfirmed_yn',
     'symptomatic_yn'],
      axis=1)

df_test = df_test.drop(
    ['case_month',
     'res_state',
     'state_fips_code',
     'res_county',
     'county_fips_code',
     'case_positive_specimen_interval',
     'case_onset_interval',
     'process',
     'exposure_yn',
     'labconfirmed_yn',
     'symptomatic_yn'],
      axis=1)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [26]:
df_train = df_train.replace({'icu_yn':{np.nan:0}})
df_train = df_train.replace({'icu_yn':{'0':0}})
df_train = df_train.replace({'icu_yn':{'1':1}})
df_train = df_train.replace({'icu_yn':{'nul':0}})

df_test = df_test.replace({'icu_yn':{np.nan:0}})
df_test = df_test.replace({'icu_yn':{'0':0}})
df_test = df_test.replace({'icu_yn':{'1':1}})
df_test = df_test.replace({'icu_yn':{'nul':0}})

df_smoke_train = df_smoke_train.replace({'smoke_data':{np.nan:state_to_smoke['VI']}})
df_smoke_test = df_smoke_test.replace({'smoke_data':{np.nan:state_to_smoke['VI']}})

df_train = df_train.replace({'age_group':{'Unknown':np.nan}})
df_train = df_train.replace({'age_group':{'Missing':np.nan}})

df_test = df_test.replace({'age_group':{'Unknown':np.nan}})
df_test = df_test.replace({'age_group':{'Missing':np.nan}})

df_train = df_train.replace({'hosp_yn':{np.nan:0}})
df_test = df_test.replace({'hosp_yn':{np.nan:0}})

df_train = df_train.replace({'underlying_conditions_yn':{np.nan:0}})
df_test = df_test.replace({'underlying_conditions_yn':{np.nan:0}})

In [27]:
#Cleaning of data

def encode(atr, df, df_test, drop_atr):
  oe = OneHotEncoder(return_df=True, use_cat_names=True)
  df = oe.fit_transform(df[[atr]])
  df_test = oe.transform(df_test[[atr]])
  df = df.drop(drop_atr, axis=1)
  df_test = df_test.drop(drop_atr, axis=1)
  return [df, df_test]

In [28]:
[race_encoded, race_encoded_test] = encode('race', df_train, df_test, ['race_nan', 'race_Unknown', 'race_Missing']);
[sex_encoded, sex_encoded_test] = encode('sex', df_train, df_test, ['sex_nan', 'sex_Unknown', 'sex_Missing']);
[age_group_encoded, age_group_encoded_test] = encode('age_group', df_train, df_test, ['age_group_nan'])

In [29]:
# Merging of data
df_train = df_train.drop(['race', 'age_group', 'ethnicity', 'sex'], axis=1)
df_test = df_test.drop(['race', 'age_group', 'ethnicity', 'sex'], axis=1)

df_train = pd.concat([df_train.reset_index(drop=True), race_encoded.reset_index(drop=True)], axis=1)
df_train = pd.concat([df_train.reset_index(drop=True), sex_encoded.reset_index(drop=True)], axis=1)
df_train = pd.concat([df_train.reset_index(drop=True), age_group_encoded.reset_index(drop=True)], axis=1)
df_train = pd.concat([df_train.reset_index(drop=True), df_smoke_train.reset_index(drop=True)], axis=1)

df_test = pd.concat([df_test.reset_index(drop=True), race_encoded_test.reset_index(drop=True)], axis=1)
df_test = pd.concat([df_test.reset_index(drop=True), sex_encoded_test.reset_index(drop=True)], axis=1)
df_test = pd.concat([df_test.reset_index(drop=True), age_group_encoded_test.reset_index(drop=True)], axis=1)
df_test = pd.concat([df_test.reset_index(drop=True), df_smoke_test.reset_index(drop=True)], axis=1)

In [30]:
y = df_train['death_yn']
X = df_train.drop(['death_yn'], axis=1);
X_test = df_test

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Using Random Forest
parameters = {
    'n_estimators': np.arange(10, 200, 40, dtype=int),
    'max_depth': np.arange(1, 20, 5, dtype=int),
    'max_leaf_nodes': np.arange(2, 250, 60, dtype=int)
}

model = RandomForestClassifier(n_jobs = mp.cpu_count())

In [None]:
# Using k nearest neigbours
parameters = {
    'n_estimators': [np.arange(10, 200, 30, dtype=int)],
    'max_depth': [np.arange(3, 10, dtype=int)]
}

model = KNeighborsClassifier()

In [None]:
# Use of grid search to test different parameters
gcv = GridSearchCV(model, parameters, n_jobs=-1)
gcv.fit(X, y)
print(gcv.best_estimator_)
print(gcv.best_score_)


In [24]:
X_train

Unnamed: 0,county_fips_code,hosp_yn,icu_yn,underlying_conditions_yn,race_White,race_Black,race_Asian,race_American Indian/Alaska Native,race_Multiple/Other,race_Native Hawaiian/Other Pacific Islander,sex_Male,sex_Female,age_group_0 - 17 years,age_group_18 to 49 years,age_group_65+ years,age_group_50 to 64 years,smoke_data
75220,18053.0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,1,0,0,19.2
48955,47025.0,0.0,0.0,0.0,1,0,0,0,0,0,1,0,0,1,0,0,19.9
44966,16027.0,0.0,0.0,0.0,0,0,0,0,0,0,1,0,1,0,0,0,15.3
13568,35043.0,0.0,0.0,0.0,1,0,0,0,0,0,1,0,1,0,0,0,16.0
92727,12019.0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0,1,14.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,41047.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,1,0,0,14.5
54886,1121.0,0.0,0.0,0.0,0,0,0,0,0,0,1,0,0,0,1,0,20.2
76820,18181.0,0.0,0.0,0.0,1,0,0,0,0,0,1,0,0,0,0,1,19.2
860,51147.0,1.0,0.0,0.0,0,0,0,0,0,0,1,0,0,0,1,0,14.0


In [58]:
model = RandomForestClassifier(
    n_jobs = mp.cpu_count(),
    max_samples=300,
    max_leaf_nodes=200
    )
model.fit(X_train, y_train)
ypred = model.predict_proba(X_test)[:,1]
ypred_val = model.predict_proba(X_val)[:,1]

In [59]:
print(model.score(X_val, y_val))
print(roc_auc_score(y_val, ypred_val))
print(model.get_params())

0.9966
0.9867719363482901
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': 200, 'max_samples': 300, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': 4, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [None]:
#. Creating submission file
submission = pd.DataFrame(ypred, columns=['prediction']) # Create new dataframe.
submission['Id'] = submission.index  # Kaggle expects two columns: Id, prediction.
submission.to_csv('sample_submission.csv', index=False)