In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()  # apply seaborn's default plot theme globally

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

In [2]:
# Read the training cases and the cases to predict from the working directory.
train_csv, test_csv = "covid_train.csv", "covid_test.csv"
dataset = pd.read_csv(train_csv)
test_set = pd.read_csv(test_csv)

In [3]:
# Ordinal code for each age bracket, shared by the training and test frames.
age_group_codes = {
    '<20': 1, '20s': 2, '30s': 3, '40s': 4, '50s': 5,
    '60s': 6, '70s': 7, '80s': 8, '90s': 9,
}


def _clean_cases(frame):
    """Return a cleaned copy of ``frame``.

    Steps, in order:
      1. a missing ``Outbreak_Related`` value becomes "No";
      2. every row that still holds a NaN in *any* column is dropped;
      3. ``Age_Group`` labels are swapped for their ordinal codes
         (labels absent from ``age_group_codes`` are left as they are).
    """
    return (
        frame
        .assign(Outbreak_Related=lambda cases: cases['Outbreak_Related'].fillna("No"))
        .dropna()
        .assign(Age_Group=lambda cases: cases['Age_Group'].replace(age_group_codes))
    )


# NOTE(review): dropna() also removes incomplete test rows, so those ids never
# receive a prediction -- confirm this is intended.
dataset = _clean_cases(dataset)
test_set = _clean_cases(test_set)

In [4]:
## One-hot encoding of the training set (the fitted encoder is reused for the test set)
from sklearn.preprocessing import OneHotEncoder

# Every column except the last is treated as a feature; the last one is the target.
dataset_features = dataset.iloc[:, :-1]
dataset_target = dataset.iloc[:, -1]

categorical_cols = ['Client_Gender', 'Case_AcquisitionInfo', 'Reporting_PHU_City', 'Outbreak_Related']

# Create the encoder. drop='first' omits one indicator column per feature, and
# handle_unknown="error" makes transform() fail loudly on a category that was
# not present when the encoder was fitted.
encoder_options = dict(categories="auto", handle_unknown="error", drop='first')
try:
    # Current scikit-learn spells the dense-output switch `sparse_output`.
    encoder = OneHotEncoder(sparse_output=False, **encoder_options)
except TypeError:
    # Older releases only accept the original `sparse` keyword.
    encoder = OneHotEncoder(sparse=False, **encoder_options)

# Fit the encoder on the training categoricals and encode them.
array_hot_encoded = encoder.fit_transform(dataset_features[categorical_cols])

# `get_feature_names` was replaced by `get_feature_names_out`; use whichever
# this scikit-learn version provides.
feature_name_getter = getattr(encoder, "get_feature_names_out", None) or encoder.get_feature_names

# Wrap the encoded array in a DataFrame that keeps the original row index.
data_hot_encoded = pd.DataFrame(array_hot_encoded, index=dataset_features.index)
data_hot_encoded.columns = feature_name_getter(categorical_cols)

# Concatenate the encoded columns with the remaining feature columns to get the final dataset.
data_other_cols = dataset_features.drop(columns=categorical_cols)
X_train_data = pd.concat([data_hot_encoded, data_other_cols], axis=1)


In [5]:
# Apply the encoder fitted on the training data to the test data.
X_test_array = encoder.transform(test_set[categorical_cols])

# Label the encoded test columns with the training frame's encoded column names.
# The same fitted encoder produced both arrays, so the labels line up, and no
# version-specific feature-name method has to be called here.
test_data_encoded = pd.DataFrame(
    X_test_array,
    index=test_set.index,
    columns=data_hot_encoded.columns,
)

# Re-attach the remaining test columns next to the encoded ones.
test_data_other_cols = test_set.drop(columns=categorical_cols)
X_test_with_id = pd.concat([test_data_encoded, test_data_other_cols], axis=1)

# "id" is not a model feature, so it is removed from the prediction matrix.
# NOTE(review): later cells turn these frames into plain arrays, so the test
# columns must be in the same order as X_train_data -- confirm.
X_test_without_id = X_test_with_id.drop(columns=["id"])

In [6]:
# Convert the prepared frames to plain NumPy arrays for the model,
# then print each shape as a quick sanity check.
X_test = X_test_without_id.to_numpy()
X_train = X_train_data.to_numpy()
y_train = dataset_target.to_numpy()
for matrix in (X_test, X_train, y_train):
    print(matrix.shape)

(3713, 45)
(14845, 45)
(14845,)


In [9]:
# Tune a random forest with an exhaustive grid search, scored by accuracy
# under shuffled 10-fold cross-validation.
# The estimator is deliberately NOT fitted beforehand: GridSearchCV clones it
# for every candidate, so a prior fit would only cost time.
rfclassifier = RandomForestClassifier(random_state = 0)
cv = KFold(n_splits=10, random_state=0, shuffle=True)

# 2 criteria x 5 forest sizes x 4 depths = 40 candidate settings.
parameters = [{'criterion': ['gini','entropy'], 'n_estimators': [5, 10, 50, 150, 200] , 'max_depth': [3, 5, 10, None]}]
grid_search = GridSearchCV(estimator = rfclassifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = cv,
                           n_jobs = -1)  # n_jobs=-1: run candidates on all available cores
grid_search.fit(X_train_data, y_train)

# Keep the winning score, the winning settings and the full per-candidate table.
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
results = grid_search.cv_results_
print("Best Accuracy obtained for Random Forest Classifier: {:.2f} %".format(best_accuracy*100))
print("Best Parameters found for Random Forest Classifier:", best_parameters)

Best Accuracy obtained for Random Forest Classifier: 67.18 %
Best Parameters found for Random Forest Classifier: {'criterion': 'gini', 'max_depth': 10, 'n_estimators': 50}


In [10]:
# Train the final forest on all training rows, using the hyper-parameters the
# grid search selected (`best_parameters`) rather than hand-copied values, so
# the model cannot drift out of sync with the search result.
rfclassifier_final = RandomForestClassifier(random_state = 0, **best_parameters)
rfclassifier_final.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, n_estimators=50, random_state=0)

In [11]:
# Predict an outcome label for every row of the encoded test matrix.
y_pred = rfclassifier_final.predict(X_test)

In [12]:
# Echo the predicted labels as the cell output.
y_pred

array(['Fatal', 'Fatal', 'Not Resolved', ..., 'Fatal', 'Fatal', 'Fatal'],
      dtype=object)

In [13]:
# Build the submission table: one row per test case, pairing its id with the
# predicted outcome.
ids = test_set['id'].values
column_values = ['id', 'Outcome1']

# Building the frame from a dict lets each column keep its own dtype; stacking
# ids and labels into one NumPy array first would coerce the ids to object.
final = pd.DataFrame({'id': ids, 'Outcome1': y_pred}, columns=column_values)
final

Unnamed: 0,id,Outcome1
0,0,Fatal
1,1,Fatal
2,2,Not Resolved
3,3,Resolved
4,4,Not Resolved
...,...,...
3708,3708,Fatal
3709,3709,Not Resolved
3710,3710,Fatal
3711,3711,Fatal


In [14]:
# Write the submission file; index=False keeps the DataFrame's row index out of the CSV.
final.to_csv("Group_31.csv", index = False)