# Import Necessary Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.ticker as ticker
import seaborn as sns
from scipy import stats
import missingno as msno

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from sklearn.feature_selection import RFECV

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm_notebook, tqdm
tqdm.pandas(tqdm_notebook)

import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.tree import DecisionTreeRegressor  
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import r2_score,median_absolute_error,mean_squared_error,mean_absolute_error,accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn import preprocessing # label encoding
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split functionn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from IPython.display import Image  
from sklearn.naive_bayes import GaussianNB

# Define Path for Dataset & Variable Initialization

In [15]:
# Locations of every CSV used below, resolved relative to the working directory.
# The paths are assembled with os.path.join rather than "\dataset\..." literals:
# inside a normal string "\t" is a TAB character, which corrupted the two
# tests_* paths, and a hard-coded backslash separator only works on Windows.
path = os.getcwd()
dataset_dir = os.path.join(path, "dataset")
malaysia_case_dir = os.path.join(dataset_dir, "cases_malaysia.csv")
state_case_dir = os.path.join(dataset_dir, "cases_state.csv")
checkIn_dir = os.path.join(dataset_dir, "covid19-public", "mysejahtera", "checkin_state.csv")
clusters_dir = os.path.join(dataset_dir, "clusters.csv")
hospital_dir = os.path.join(dataset_dir, "hospital.csv")
pkrc_dir = os.path.join(dataset_dir, "pkrc.csv")
malaysia_tests_dir = os.path.join(dataset_dir, "tests_malaysia.csv")
states_tests_dir = os.path.join(dataset_dir, "tests_state.csv")

**Time frame is set from 1/7/2021 until 31/8/2021** 
</p> This is because the tests_state dataset published by MOH only has data available from 1 July 2021 to 21 September 2021. In addition, recent data is more helpful for predicting future new Covid-19 cases in Malaysia, because the figures from 2020 to 2021 changed substantially within short periods of time.

In [None]:
start_date = "2021-07-01"
end_date = "2021-08-31"

# <font color='FireBrick'>Question 3 (i)</font> 

## Exploratory Data Analysis (EDA)

We perform EDA by identifying missing values with the isna() function and outliers with box plots. We did not remove the outliers because they are important to the datasets and statistically significant.

### Exploratory Data Analysis for Malaysia Case Dataset

In [None]:
malaysia_case_df = pd.read_csv(malaysia_case_dir)
after_start_date = malaysia_case_df["date"] >= start_date
before_end_date = malaysia_case_df["date"] <= end_date
between_two_dates = after_start_date & before_end_date
malaysia_case_df = malaysia_case_df.loc[between_two_dates]
malaysia_case_df.head()

In [None]:
malaysia_case_df.info()

In [None]:
malaysia_case_df.describe()

In [None]:
nRow, nCol = malaysia_case_df.shape
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
malaysia_case_df.isna().sum()

##### Identify Missing Value
</p> Visualize the number of missing values as a bar chart

In [None]:
msno.bar(malaysia_case_df)

##### Identify Outliers
</p> Visualize the data using box plot, check the distribution of data and outliers

In [None]:
# One box plot per numeric column of the national case data, on a 4x3 grid.
# Each entry is (column plotted, panel title). Keeping the pair together
# prevents the title from drifting away from the plotted column, which is how
# the cluster_import panel had ended up labelled 'cluster_workplace'.
malaysia_case_panels = [
    ("cases_new", "New Case"),
    ("cases_import", "Case Import"),
    ("cases_recovered", "Case Recovered"),
    ("cluster_import", "cluster_import"),
    ("cluster_religious", "cluster_religious"),
    ("cluster_community", "cluster_community"),
    ("cluster_highRisk", "cluster_highRisk"),
    ("cluster_education", "cluster_education"),
    ("cluster_detentionCentre", "cluster_detentionCentre"),
    ("cluster_workplace", "cluster_workplace"),
]

fig, axes = plt.subplots(4, 3, figsize=(15, 5), sharey=True)
# fig.suptitle('Outliers Visualization')
plt.subplots_adjust(left=None, bottom= 0.1, right=None, top=2, wspace=0.2, hspace=0.6)

panels = axes.ravel()  # row-major: axes[0][0], axes[0][1], ...
for panel, (column, title) in zip(panels, malaysia_case_panels):
    sns.boxplot(data=malaysia_case_df, x=malaysia_case_df[column], ax=panel)
    panel.set_title(title)

# The grid has 12 slots but only 10 columns are plotted; hide the empty frames.
for unused_panel in panels[len(malaysia_case_panels):]:
    unused_panel.set_visible(False)

### Exploratory Data Analysis for State Case Dataset

In [None]:
state_case_df = pd.read_csv(state_case_dir)
after_start_date = state_case_df["date"] >= start_date
before_end_date = state_case_df["date"] <= end_date
between_two_dates = after_start_date & before_end_date
state_case_df = state_case_df.loc[between_two_dates]
state_case_df.head()

In [None]:
state_case_df.info()

In [None]:
state_case_df.describe()

In [None]:
state_case_df.groupby([state_case_df['date']]).sum()

In [None]:
state_case_df.isna().sum()

##### Identify Missing Value
</p> Visualize the number of missing values as a bar chart

In [None]:
msno.bar(state_case_df)

##### Identify Outliers
</p> Visualize the data using box plot, check the distribution of data and outliers

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
# fig.suptitle('Outliers Visualization')
plt.subplots_adjust(left=None, bottom= 0.1, right=None, top=1, wspace=0.2, hspace=0.6)

# sns.boxplot(data=state_case_df,x=state_case_df["cases_new"],ax=axes[0][0])
# axes[0][0].set_title('Date')
# sns.boxplot(data=state_case_df,x=state_case_df["cases_new"],ax=axes[0][1])
# axes[0][1].set_title('State')
sns.boxplot(data=state_case_df,x=state_case_df["cases_import"],ax=axes[0])
axes[0].set_title('Import Case')
sns.boxplot(data=state_case_df,x=state_case_df["cases_new"],ax=axes[1])
axes[1].set_title('New Case')
sns.boxplot(data=state_case_df,x=state_case_df["cases_recovered"],ax=axes[2])
axes[2].set_title('Recovered Case')

### Exploratory Data Analysis for Clusters Dataset

In [None]:
clusters_df = pd.read_csv(clusters_dir)
after_start_date = clusters_df["date_announced"] >= start_date
before_end_date = clusters_df["date_announced"] <= end_date
between_two_dates = after_start_date & before_end_date
clusters_df = clusters_df.loc[between_two_dates]
clusters_df['date'] = clusters_df.date_announced
clusters_df.head()

In [None]:
clusters_df.info()

In [None]:
clusters_df.describe()

In [None]:
# Per-state cluster summary for the four states studied in this notebook.
# The result is stored under its own name instead of overwriting clusters_df:
# the Question 3 (iii) preprocessing still filters clusters_df on its 'state'
# column and groups on 'date_announced', both of which disappear once the
# frame is replaced by this 4-row aggregate.
focus_states = ['Johor','Pahang','Kedah','Selangor']
clusters_grouped = clusters_df.groupby('state')

# numeric_only=True keeps text columns out of the sum.
clusters_by_state_df = clusters_grouped.sum(numeric_only=True).loc[focus_states]
# Number of cluster rows per state.
cluster_count_by_state = clusters_grouped.count().loc[focus_states]['cluster']
clusters_by_state_df['cluster_total'] = cluster_count_by_state
clusters_by_state_df

In [None]:
clusters_df.isna().sum()

##### Identify Missing Value
</p> Visualize the number of missing values as a bar chart

In [None]:
msno.bar(clusters_df)

##### Identify Outliers
</p> Visualize the data using box plot, check the distribution of data and outliers

In [None]:
fig, axes = plt.subplots(3, 3, figsize=(15, 5), sharey=True)
# fig.suptitle('Outliers Visualization')
plt.subplots_adjust(left=None, bottom= 0.1, right=None, top=2, wspace=0.2, hspace=0.6)

sns.boxplot(data=clusters_df,x=clusters_df["cases_new"],ax=axes[0][0])
axes[0][0].set_title('cases_new')

sns.boxplot(data=clusters_df,x=clusters_df["cases_total"],ax=axes[0][1])
axes[0][1].set_title('cases_total')

sns.boxplot(data=clusters_df,x=clusters_df["cases_active"],ax=axes[0][2])
axes[0][2].set_title('cases_active')

sns.boxplot(data=clusters_df,x=clusters_df["tests"],ax=axes[1][0])
axes[1][0].set_title('tests')

sns.boxplot(data=clusters_df,x=clusters_df["icu"],ax=axes[1][1])
axes[1][1].set_title('icu')

sns.boxplot(data=clusters_df,x=clusters_df["deaths"],ax=axes[1][2])
axes[1][2].set_title('deaths')

sns.boxplot(data=clusters_df,x=clusters_df["recovered"],ax=axes[2][0])
axes[2][0].set_title('recovered')

### Exploratory Data Analysis for Malaysia Tests Dataset

In [None]:
malaysia_tests_df = pd.read_csv(malaysia_tests_dir)
after_start_date = malaysia_tests_df["date"] >= start_date
before_end_date = malaysia_tests_df["date"] <= end_date
between_two_dates = after_start_date & before_end_date
malaysia_tests_df = malaysia_tests_df.loc[between_two_dates]
malaysia_tests_df.head()

In [None]:
malaysia_tests_df.info()

In [None]:
malaysia_tests_df.describe()

In [None]:
malaysia_tests_df.isna().sum()

##### Identify Missing Value
</p> Visualize the number of missing values as a bar chart

In [None]:
msno.bar(malaysia_tests_df)

##### Identify Outliers
</p> Visualize the data using box plot, check the distribution of data and outliers

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
# fig.suptitle('Outliers Visualization')
plt.subplots_adjust(left=None, bottom= 0.1, right=None, top=0.5, wspace=0.2, hspace=0.6)

sns.boxplot(data=malaysia_tests_df, x = malaysia_tests_df["rtk-ag"],ax=axes[0])
axes[0].set_title('rtk-ag')

sns.boxplot(data=malaysia_tests_df,x = malaysia_tests_df["pcr"],ax=axes[1])
axes[1].set_title('pcr')

### Exploratory Data Analysis for State Tests Dataset

In [None]:
states_tests_df = pd.read_csv(states_tests_dir)
after_start_date = states_tests_df["date"] >= start_date
before_end_date = states_tests_df["date"] <= end_date
between_two_dates = after_start_date & before_end_date
states_tests_df = states_tests_df.loc[between_two_dates]
states_tests_df.head()

In [None]:
states_tests_df.info()

In [None]:
states_tests_df.describe()

In [None]:
states_tests_df.isna().sum()

##### Identify Missing Value
</p> Visualize the number of missing values as a bar chart

In [None]:
msno.bar(states_tests_df)

##### Identify Outliers
</p> Visualize the data using box plot, check the distribution of data and outliers

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
# fig.suptitle('Outliers Visualization')
plt.subplots_adjust(left=None, bottom= 0.1, right=None, top=0.5, wspace=0.2, hspace=0.6)

sns.boxplot(data=states_tests_df, x = states_tests_df["rtk-ag"],ax=axes[0])
axes[0].set_title('rtk-ag')

sns.boxplot(data=states_tests_df,x = states_tests_df["pcr"],ax=axes[1])
axes[1].set_title('pcr')

### Exploratory Data Analysis for PKRC Dataset

In [None]:
pkrc_df = pd.read_csv(pkrc_dir)
after_start_date = pkrc_df["date"] >= start_date
before_end_date = pkrc_df["date"] <= end_date
between_two_dates = after_start_date & before_end_date
pkrc_df = pkrc_df.loc[between_two_dates]
pkrc_df.head()

In [None]:
pkrc_df.info()

In [None]:
pkrc_df.describe()

In [None]:
pkrc_df.isna().sum()

##### Identify Missing Value
</p> Visualize the number of missing values as a bar chart

In [None]:
msno.bar(pkrc_df)

##### Identify Outliers
</p> Visualize the data using box plot, check the distribution of data and outliers

In [None]:
# One box plot per numeric PKRC column, on a 4x3 grid.
# The title is taken from the plotted column, so each panel is labelled with
# its own data; the hand-written version set the 'pkrc_noncovid' title on
# axes[2][0], overwriting the 'discharge_total' label and leaving axes[3][0] untitled.
pkrc_box_columns = [
    "beds", "admitted_pui", "admitted_covid",
    "admitted_total", "discharge_pui", "discharge_covid",
    "discharge_total", "pkrc_covid", "pkrc_pui",
    "pkrc_noncovid",
]

fig, axes = plt.subplots(4, 3, figsize=(15, 5), sharey=True)
# fig.suptitle('Outliers Visualization')
plt.subplots_adjust(left=None, bottom= 0.1, right=None, top=2, wspace=0.2, hspace=0.6)

panels = axes.ravel()  # row-major: axes[0][0], axes[0][1], ...
for panel, column in zip(panels, pkrc_box_columns):
    sns.boxplot(data=pkrc_df, x=pkrc_df[column], ax=panel)
    panel.set_title(column)

# The grid has 12 slots but only 10 columns are plotted; hide the empty frames.
for unused_panel in panels[len(pkrc_box_columns):]:
    unused_panel.set_visible(False)

### Exploratory Data Analysis for Mysejahtera CheckIn Dataset

In [None]:
checkIn_df = pd.read_csv(checkIn_dir)
after_start_date = checkIn_df["date"] >= start_date
before_end_date = checkIn_df["date"] <= end_date
between_two_dates = after_start_date & before_end_date
checkIn_df = checkIn_df.loc[between_two_dates]
checkIn_df.head()

In [None]:
checkIn_df.info()

In [None]:
checkIn_df.describe()

In [None]:
checkIn_null_df=pd.DataFrame({'Column':checkIn_df.isna().sum().index, 'Count of Null Values':checkIn_df.isna().sum().values})  
checkIn_null_df

##### Identify Missing Value
</p> Visualize the number of missing values as a bar chart

In [None]:
msno.bar(checkIn_df)

##### Identify Outliers
</p> Visualize the data using box plot, check the distribution of data and outliers

In [None]:
# One box plot per MySejahtera check-in measure, side by side.
# Titles are derived from the plotted column; previously the 'unique_loc'
# title was written to axes[1], relabelling the unique_ind panel and leaving
# the third panel without a title.
checkin_box_columns = ["checkins", "unique_ind", "unique_loc"]

fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
# fig.suptitle('Outliers Visualization')
plt.subplots_adjust(left=None, bottom= 0.1, right=None, top=0.5, wspace=0.2, hspace=0.6)

for panel, column in zip(axes, checkin_box_columns):
    sns.boxplot(data=checkIn_df, x=checkIn_df[column], ax=panel)
    panel.set_title(column)

### Exploratory Data Analysis for Hospital Dataset

In [None]:
hospital_df = pd.read_csv(hospital_dir)
after_start_date = hospital_df["date"] >= start_date
before_end_date = hospital_df["date"] <= end_date
between_two_dates = after_start_date & before_end_date
hospital_df = hospital_df.loc[between_two_dates]
hospital_df.head()

In [None]:
hospital_df.info()

In [None]:
hospital_df.describe()

In [None]:
hospital_df_null_df=pd.DataFrame({'Column':hospital_df.isna().sum().index, 'Count of Null Values':hospital_df.isna().sum().values})  
hospital_df_null_df

##### Identify Missing Value
</p> Visualize the number of missing values as a bar chart

In [None]:
msno.bar(hospital_df)

##### Identify Outliers
</p> Visualize the data using box plot, check the distribution of data and outliers

In [None]:
# One box plot per numeric hospital column, filling a 4x3 grid in row-major
# order; every panel is titled with the name of the column it shows.
fig, axes = plt.subplots(4, 3, figsize=(15, 5), sharey=True)
# fig.suptitle('Outliers Visualization')
plt.subplots_adjust(left=None, bottom= 0.1, right=None, top=2, wspace=0.2, hspace=0.6)

hospital_box_columns = [
    "beds", "beds_covid", "beds_noncrit",
    "admitted_pui", "admitted_covid", "admitted_total",
    "discharged_pui", "discharged_covid", "discharged_total",
    "hosp_covid", "hosp_pui", "hosp_noncovid",
]

for panel, column in zip(axes.ravel(), hospital_box_columns):
    sns.boxplot(data=hospital_df, x=hospital_df[column], ax=panel)
    panel.set_title(column)

# <font color='FireBrick'>Question 3 (ii)</font>

### Data Preprocessing with One-Hot Encoding

Before finding the relationship between the states, we perform One-Hot encoding to preprocess the state cases dataset.

In [17]:
state_case_df = pd.get_dummies(state_case_df, prefix='', columns=['state'])
state_case_df

NameError: name 'state_case_df' is not defined

In [None]:
state_case_import_df = state_case_df.loc[:,'_Johor':].multiply(state_case_df["cases_import"], axis="index")
state_case_new_df = state_case_df.loc[:,'_Johor':].multiply(state_case_df["cases_new"], axis="index")
state_case_recovered_df = state_case_df.loc[:,'_Johor':].multiply(state_case_df["cases_recovered"], axis="index")

In [None]:
# get_dummies(prefix='') names each state column "_<state>". Remove that
# leading underscore from all three per-state frames, not only the import one,
# so the correlation heatmap built from state_case_new_df shows clean labels.
# lstrip('_') is used instead of str[1:] so re-running this cell does not
# chop a real letter off names that are already clean.
for per_state_df in (state_case_import_df, state_case_new_df, state_case_recovered_df):
    per_state_df.columns = per_state_df.columns.str.lstrip('_')
state_case_import_df.head()

In [None]:
# Re-attach the reporting date to each per-state frame so they can be grouped
# by day in the next cell. The three frames were derived from state_case_df
# and share its row index, so the column aligns row by row. (The bare name
# `date` that was used here is never defined anywhere in the notebook.)
state_case_import_df['date'] = state_case_df['date']
state_case_new_df['date'] = state_case_df['date']
state_case_recovered_df['date'] = state_case_df['date']

In [None]:
state_case_import_df = state_case_import_df.groupby([state_case_import_df['date']]).sum()
state_case_new_df = state_case_new_df.groupby([state_case_new_df['date']]).sum()
state_case_recovered_df = state_case_recovered_df.groupby([state_case_recovered_df['date']]).sum()

In [None]:
corr = state_case_new_df.corr()
fig, ax = plt.subplots(figsize=(20,10))  
ax = sns.heatmap(
    corr, 
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
    annot = True,
    linewidths = 2
)

ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);
ax.set_title('New Case Correlation Heatmap')

# <font color='FireBrick'>Question 3 (iii)</font>

Pre-processing the needed datasets: **Cluster, State Cases, Tests, Mysejahtera check-ins, PKRC, Hospital**. </p> We use the data of 4 states in Malaysia to perform feature selection: **Pahang, Johor, Kedah, Selangor**. </p>

In [None]:
state = ["Pahang","Kedah","Johor","Selangor"]


def load_state_window(csv_path, date_col="date"):
    """Reload one raw CSV restricted to the study window and the four focus states.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    date_col : str, default "date"
        Name of the column compared against start_date / end_date (both inclusive).

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows whose date lies inside the window and
        whose 'state' value is one of the entries of `state`.
    """
    raw = pd.read_csv(csv_path)
    keep = raw[date_col].between(start_date, end_date) & raw["state"].isin(state)
    return raw.loc[keep].copy()


# The frames are reloaded from disk rather than filtered in place because the
# earlier sections have already reshaped them: state_case_df was one-hot
# encoded (its 'state' column is gone) and clusters_df may have been replaced
# by a per-state aggregate. Reloading also makes this cell safe to re-run.
clusters_df = load_state_window(clusters_dir, date_col="date_announced")
clusters_df['date'] = clusters_df.date_announced
state_case_df = load_state_window(state_case_dir)
states_tests_df = load_state_window(states_tests_dir)
# `mysejahtera` is the check-in data; it was referenced here without ever being defined.
mysejahtera = load_state_window(checkIn_dir)
# Suffix the PKRC / hospital columns so they stay distinguishable after the merge.
pkrc_df = load_state_window(pkrc_dir).add_suffix('_pkrc')
hospital_df = load_state_window(hospital_dir).add_suffix('_hospital')

In [None]:
# Cluster Dataset
# One-hot encode the cluster category, drop the per-cluster counters that are
# not used as features, then count clusters per (date_announced, state).
# The groupby statement had been pasted twice on one line, which is a SyntaxError.
clusters_df = (
    # dtype=int so the indicator columns are summed as counts
    pd.get_dummies(clusters_df, prefix='cluster', columns=['category'], dtype=int)
    .drop(columns=['cases_new','cases_total','cases_active','tests','icu','deaths','recovered'])
    .groupby(['date_announced','state'])
    # numeric_only=True keeps text columns out of the sum
    .sum(numeric_only=True)
)
clusters_df

In [None]:
# State Cases Dataset
## Group by date and state
state_case_df=state_case_df.groupby(['date','state']).sum()
state_case_df

In [None]:
# State Tests Dataset
## Group by date and state
states_tests_df=states_tests_df.groupby(['date','state']).sum()
states_tests_df

In [None]:
# MySejahtera Checkins Dataset
# Total check-ins per (date, state), in a single column named 'Checkins number'.
# This is a direct groupby rather than the one-hot / multiply / stack route:
# that route grouped on a 'date' column that was never added to the one-hot
# product, and it overwrote `mysejahtera` so the cell could not be re-run.
mysejahtera_checkins = (
    mysejahtera
    .groupby(['date','state'])[['checkins']]
    .sum()
    .rename(columns={'checkins': 'Checkins number'})
)
mysejahtera_checkins

In [None]:
# PKRC Dataset
## Group by date and state
pkrc_df = pkrc_df.groupby(['date_pkrc','state_pkrc']).sum()
pkrc_df

In [None]:
# Hospital Dataset
## Group by date and state
hospital_df = hospital_df.groupby(['date_hospital','state_hospital']).sum()
hospital_df

## Merge Datasets

In [None]:
# Align the six per-(date, state) tables side by side on their shared index;
# combinations missing from a table are filled with 0.
df_final = pd.concat(
    [clusters_df, states_tests_df, state_case_df, mysejahtera_checkins, pkrc_df, hospital_df],
    axis=1,
).fillna(0)

# The inputs name their index levels differently (date / date_announced /
# date_pkrc / date_hospital), so give the merged index explicit names and turn
# it into ordinary columns: every later cell selects df_final['state'] and
# drops 'date' and 'state' as columns.
df_final.index.names = ['date', 'state']
# Guard against a leftover 'date'/'state' data column colliding with reset_index.
df_final = df_final.drop(columns=['date', 'state'], errors='ignore').reset_index()
df_final

## Perform Boruta Classifier

In [None]:
# Rows of the merged table for each of the four focus states.
rslt_df_ph = df_final.loc[df_final['state'].eq("Pahang")]
rslt_df_kd = df_final.loc[df_final['state'].eq("Kedah")]
rslt_df_jh = df_final.loc[df_final['state'].eq("Johor")]
rslt_df_sl = df_final.loc[df_final['state'].eq("Selangor")]

### Random forest classifier

In [None]:
rf = RandomForestClassifier(n_jobs=-1, class_weight="balanced",criterion = "entropy")
rf

### Boruta Model

In [None]:
def ranking(ranks, names, order=1):
    """Min-max scale a list of ranks and pair each scaled value with a name.

    Parameters
    ----------
    ranks : sequence of numbers
        One rank per feature.
    names : iterable
        Feature names, in the same order as `ranks`.
    order : int, default 1
        Multiplier applied to the ranks before scaling; pass -1 to invert
        them so that the smallest rank receives the largest score.

    Returns
    -------
    dict
        Maps each name to its scaled rank, rounded to two decimals.
    """
    scaler = MinMaxScaler()
    # The scaler works column-wise, so present the ranks as a single column.
    rank_column = order * np.array([ranks]).T
    scaled = scaler.fit_transform(rank_column).T[0]
    rounded = (round(value, 2) for value in scaled)
    return dict(zip(names, rounded))

In [None]:
feat_selector = BorutaPy(rf, n_estimators="auto", random_state=1)

In [None]:
# Boruta feature selection for cases_new on all four states combined.
df = df_final
y = df.cases_new
# Keyword form: a positional axis argument is rejected by recent pandas releases.
X = df.drop(columns=["cases_new","date","state"])
colnames = X.columns
feat_selector.fit(X.values, y.values.ravel())  # closing parenthesis was missing
print(feat_selector.support_)
print(feat_selector.ranking_)
# order=-1 inverts the Boruta ranks so that rank 1 receives the top score of 1.0.
boruta_score = ranking(list(map(float, feat_selector.ranking_)), colnames, order=-1)
boruta_score = pd.DataFrame(list(boruta_score.items()), columns=['Features', 'Score'])
boruta_score = boruta_score.sort_values("Score",ascending = False)
print('---------Top 5----------')
# Show the Boruta scores; rfe_score does not exist until the RFE section has run.
display(boruta_score.head(5))
sns_boruta_plot = sns.catplot(x="Score", y="Features", data = boruta_score[0:35], kind = "bar", 
               height=14, aspect=1.5, palette='RdYlBu')
plt.title("Boruta Top Features for All 4 states")

In [None]:
# Boruta feature selection for cases_new, Pahang rows only.
df = rslt_df_ph
y = df.cases_new
# Keyword form: a positional axis argument is rejected by recent pandas releases.
X = df.drop(columns=["cases_new","date","state"])
colnames = X.columns
feat_selector.fit(X.values, y.values.ravel())  # closing parenthesis was missing
print(feat_selector.support_)
print(feat_selector.ranking_)
# order=-1 inverts the Boruta ranks so that rank 1 receives the top score of 1.0.
boruta_score = ranking(list(map(float, feat_selector.ranking_)), colnames, order=-1)
boruta_score = pd.DataFrame(list(boruta_score.items()), columns=['Features', 'Score'])
boruta_score = boruta_score.sort_values("Score",ascending = False)
print('---------Top 5----------')
# Show the Boruta scores; rfe_score does not exist until the RFE section has run.
display(boruta_score.head(5))
sns_boruta_plot = sns.catplot(x="Score", y="Features", data = boruta_score[0:35], kind = "bar", 
               height=14, aspect=1.5, palette='RdYlBu')
plt.title("Boruta Top Features for Pahang")

In [None]:
# Boruta feature selection for cases_new, Kedah rows only.
df = rslt_df_kd
y = df.cases_new
# Keyword form: a positional axis argument is rejected by recent pandas releases.
X = df.drop(columns=["cases_new","date","state"])
colnames = X.columns
feat_selector.fit(X.values, y.values.ravel())  # closing parenthesis was missing
print(feat_selector.support_)
print(feat_selector.ranking_)
# order=-1 inverts the Boruta ranks so that rank 1 receives the top score of 1.0.
boruta_score = ranking(list(map(float, feat_selector.ranking_)), colnames, order=-1)
boruta_score = pd.DataFrame(list(boruta_score.items()), columns=['Features', 'Score'])
boruta_score = boruta_score.sort_values("Score",ascending = False)
print('---------Top 5----------')
# Show the Boruta scores; rfe_score does not exist until the RFE section has run.
display(boruta_score.head(5))
sns_boruta_plot = sns.catplot(x="Score", y="Features", data = boruta_score[0:35], kind = "bar", 
               height=14, aspect=1.5, palette='RdYlBu')
plt.title("Boruta Top Features for Kedah")

In [None]:
# Boruta feature selection for cases_new, Johor rows only.
df = rslt_df_jh
y = df.cases_new
# Keyword form: a positional axis argument is rejected by recent pandas releases.
X = df.drop(columns=["cases_new","date","state"])
colnames = X.columns
feat_selector.fit(X.values, y.values.ravel())  # closing parenthesis was missing
print(feat_selector.support_)
print(feat_selector.ranking_)
# order=-1 inverts the Boruta ranks so that rank 1 receives the top score of 1.0.
boruta_score = ranking(list(map(float, feat_selector.ranking_)), colnames, order=-1)
boruta_score = pd.DataFrame(list(boruta_score.items()), columns=['Features', 'Score'])
boruta_score = boruta_score.sort_values("Score",ascending = False)
print('---------Top 5----------')
# Show the Boruta scores; rfe_score does not exist until the RFE section has run.
display(boruta_score.head(5))
sns_boruta_plot = sns.catplot(x="Score", y="Features", data = boruta_score[0:35], kind = "bar", 
               height=14, aspect=1.5, palette='RdYlBu')
plt.title("Boruta Top Features for Johor")

In [None]:
# Boruta feature selection for cases_new, Selangor rows only.
df = rslt_df_sl
y = df.cases_new
# Keyword form: a positional axis argument is rejected by recent pandas releases.
X = df.drop(columns=["cases_new","date","state"])
colnames = X.columns
feat_selector.fit(X.values, y.values.ravel())  # closing parenthesis was missing
print(feat_selector.support_)
print(feat_selector.ranking_)
# order=-1 inverts the Boruta ranks so that rank 1 receives the top score of 1.0.
boruta_score = ranking(list(map(float, feat_selector.ranking_)), colnames, order=-1)
boruta_score = pd.DataFrame(list(boruta_score.items()), columns=['Features', 'Score'])
boruta_score = boruta_score.sort_values("Score",ascending = False)
print('---------Top 5----------')
# Show the Boruta scores; rfe_score does not exist until the RFE section has run.
display(boruta_score.head(5))
sns_boruta_plot = sns.catplot(x="Score", y="Features", data = boruta_score[0:35], kind = "bar", 
               height=14, aspect=1.5, palette='RdYlBu')
plt.title("Boruta Top Features for Selangor")

## Perform RFE Classifier

In [None]:
# Recursive feature elimination (2-fold cross-validated) for cases_new on all four states.
df = df_final
y = df.cases_new
# Keyword form: a positional axis argument is rejected by recent pandas releases.
X = df.drop(columns=["cases_new","date","state"])
# Take the feature names from this cell's X instead of relying on whatever
# `colnames` the last executed Boruta cell happened to leave behind.
colnames = X.columns
rfe = RFECV(rf, min_features_to_select = 1, cv =2)
rfe.fit(X, y)
# order=-1 inverts the RFE ranks so that rank 1 receives the top score of 1.0.
rfe_score = ranking(list(map(float, rfe.ranking_)), colnames, order=-1)
rfe_score = pd.DataFrame(list(rfe_score.items()), columns=['Features', 'Score'])
rfe_score = rfe_score.sort_values("Score", ascending = False)
print('---------Top 5----------')
display(rfe_score.head(5))
sns_rfe_plot = sns.catplot(x="Score", y="Features", data = rfe_score[0:35], kind = "bar", 
               height=14, aspect=1.9, palette='coolwarm')
# Name the data subset in the title, as the Boruta plots do, so the five RFE figures can be told apart.
plt.title("RFE Features Ranking for All 4 states")

In [None]:
# Recursive feature elimination (2-fold cross-validated) for cases_new, Pahang rows only.
df = rslt_df_ph
y = df.cases_new
# Keyword form: a positional axis argument is rejected by recent pandas releases.
X = df.drop(columns=["cases_new","date","state"])
# Take the feature names from this cell's X instead of relying on whatever
# `colnames` the last executed Boruta cell happened to leave behind.
colnames = X.columns
rfe = RFECV(rf, min_features_to_select = 1, cv =2)
rfe.fit(X, y)
# order=-1 inverts the RFE ranks so that rank 1 receives the top score of 1.0.
rfe_score = ranking(list(map(float, rfe.ranking_)), colnames, order=-1)
rfe_score = pd.DataFrame(list(rfe_score.items()), columns=['Features', 'Score'])
rfe_score = rfe_score.sort_values("Score", ascending = False)
print('---------Top 5----------')
display(rfe_score.head(5))
sns_rfe_plot = sns.catplot(x="Score", y="Features", data = rfe_score[0:35], kind = "bar", 
               height=14, aspect=1.9, palette='coolwarm')
# Name the data subset in the title, as the Boruta plots do, so the five RFE figures can be told apart.
plt.title("RFE Features Ranking for Pahang")

In [None]:
# Recursive feature elimination (2-fold cross-validated) for cases_new, Kedah rows only.
df = rslt_df_kd
y = df.cases_new
# Keyword form: a positional axis argument is rejected by recent pandas releases.
X = df.drop(columns=["cases_new","date","state"])
# Take the feature names from this cell's X instead of relying on whatever
# `colnames` the last executed Boruta cell happened to leave behind.
colnames = X.columns
rfe = RFECV(rf, min_features_to_select = 1, cv =2)
rfe.fit(X, y)
# order=-1 inverts the RFE ranks so that rank 1 receives the top score of 1.0.
rfe_score = ranking(list(map(float, rfe.ranking_)), colnames, order=-1)
rfe_score = pd.DataFrame(list(rfe_score.items()), columns=['Features', 'Score'])
rfe_score = rfe_score.sort_values("Score", ascending = False)
print('---------Top 5----------')
display(rfe_score.head(5))
sns_rfe_plot = sns.catplot(x="Score", y="Features", data = rfe_score[0:35], kind = "bar", 
               height=14, aspect=1.9, palette='coolwarm')
# Name the data subset in the title, as the Boruta plots do, so the five RFE figures can be told apart.
plt.title("RFE Features Ranking for Kedah")

In [None]:
# Recursive feature elimination (2-fold cross-validated) for cases_new, Johor rows only.
df = rslt_df_jh
y = df.cases_new
# Keyword form: a positional axis argument is rejected by recent pandas releases.
X = df.drop(columns=["cases_new","date","state"])
# Take the feature names from this cell's X instead of relying on whatever
# `colnames` the last executed Boruta cell happened to leave behind.
colnames = X.columns
rfe = RFECV(rf, min_features_to_select = 1, cv =2)
rfe.fit(X, y)
# order=-1 inverts the RFE ranks so that rank 1 receives the top score of 1.0.
rfe_score = ranking(list(map(float, rfe.ranking_)), colnames, order=-1)
rfe_score = pd.DataFrame(list(rfe_score.items()), columns=['Features', 'Score'])
rfe_score = rfe_score.sort_values("Score", ascending = False)
print('---------Top 5----------')
display(rfe_score.head(5))
sns_rfe_plot = sns.catplot(x="Score", y="Features", data = rfe_score[0:35], kind = "bar", 
               height=14, aspect=1.9, palette='coolwarm')
# Name the data subset in the title, as the Boruta plots do, so the five RFE figures can be told apart.
plt.title("RFE Features Ranking for Johor")

In [None]:
# Recursive feature elimination (2-fold cross-validated) for cases_new, Selangor rows only.
df = rslt_df_sl
y = df.cases_new
# Keyword form: a positional axis argument is rejected by recent pandas releases.
X = df.drop(columns=["cases_new","date","state"])
# Take the feature names from this cell's X instead of relying on whatever
# `colnames` the last executed Boruta cell happened to leave behind.
colnames = X.columns
rfe = RFECV(rf, min_features_to_select = 1, cv =2)
rfe.fit(X, y)
# order=-1 inverts the RFE ranks so that rank 1 receives the top score of 1.0.
rfe_score = ranking(list(map(float, rfe.ranking_)), colnames, order=-1)
rfe_score = pd.DataFrame(list(rfe_score.items()), columns=['Features', 'Score'])
rfe_score = rfe_score.sort_values("Score", ascending = False)
print('---------Top 5----------')
display(rfe_score.head(5))
sns_rfe_plot = sns.catplot(x="Score", y="Features", data = rfe_score[0:35], kind = "bar", 
               height=14, aspect=1.9, palette='coolwarm')
# Name the data subset in the title, as the Boruta plots do, so the five RFE figures can be told apart.
plt.title("RFE Features Ranking for Selangor")

# <font color='FireBrick'>Question 3 (iv)</font>

## Regression 

In [None]:
# Predict cases_new on all four states with two regressors scored on the same 70/30 split.
X = df_final.drop(['cases_new','date','state'], axis=1)  #predict newcases
y = df_final['cases_new']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 2)

model = LinearRegression()
rfr = RandomForestRegressor(random_state = 2)  # seeded so the reported scores are reproducible
for regressor_name, regressor in (("Linear Regressor", model), ("Random Forest Regressor", rfr)):
    # Fit on the training rows only: fitting the forest on all of X let it see
    # the test rows it was then scored on.
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(regressor_name)
    # f-strings instead of "text" + float, which raises TypeError.
    print(f"Median absolute error : {median_absolute_error(y_test, y_pred)}")
    print(f"Mean absolute error : {mean_absolute_error(y_test, y_pred)}")
    print(f"Mean squared error : {mse}")
    print(f"Root mean square error : {np.sqrt(mse)}")
    print(f"R squared: {r2_score(y_test, y_pred)}")

In [None]:
# Predict cases_new for Pahang with two regressors scored on the same 70/30 split.
X = rslt_df_ph.drop(['cases_new','date','state'], axis=1)  #predict newcases
y = rslt_df_ph['cases_new']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 2)

model = LinearRegression()
rfr = RandomForestRegressor(random_state = 2)  # seeded so the reported scores are reproducible
for regressor_name, regressor in (("Linear Regressor", model), ("Random Forest Regressor", rfr)):
    # Fit on the training rows only: fitting the forest on all of X let it see
    # the test rows it was then scored on.
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(regressor_name)
    # f-strings instead of "text" + float, which raises TypeError.
    print(f"Median absolute error : {median_absolute_error(y_test, y_pred)}")
    print(f"Mean absolute error : {mean_absolute_error(y_test, y_pred)}")
    print(f"Mean squared error : {mse}")
    print(f"Root mean square error : {np.sqrt(mse)}")
    print(f"R squared: {r2_score(y_test, y_pred)}")

In [None]:
# Predict cases_new for Kedah with two regressors scored on the same 70/30 split.
X = rslt_df_kd.drop(['cases_new','date','state'], axis=1)  #predict newcases
y = rslt_df_kd['cases_new']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 2)

model = LinearRegression()
rfr = RandomForestRegressor(random_state = 2)  # seeded so the reported scores are reproducible
for regressor_name, regressor in (("Linear Regressor", model), ("Random Forest Regressor", rfr)):
    # Fit on the training rows only: fitting the forest on all of X let it see
    # the test rows it was then scored on.
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(regressor_name)
    # f-strings instead of "text" + float, which raises TypeError.
    print(f"Median absolute error : {median_absolute_error(y_test, y_pred)}")
    print(f"Mean absolute error : {mean_absolute_error(y_test, y_pred)}")
    print(f"Mean squared error : {mse}")
    print(f"Root mean square error : {np.sqrt(mse)}")
    print(f"R squared: {r2_score(y_test, y_pred)}")

In [None]:
# Predict cases_new for Johor with two regressors scored on the same 70/30 split.
X = rslt_df_jh.drop(['cases_new','date','state'], axis=1)  #predict newcases
y = rslt_df_jh['cases_new']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 2)

model = LinearRegression()
rfr = RandomForestRegressor(random_state = 2)  # seeded so the reported scores are reproducible
for regressor_name, regressor in (("Linear Regressor", model), ("Random Forest Regressor", rfr)):
    # Fit on the training rows only: fitting the forest on all of X let it see
    # the test rows it was then scored on.
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(regressor_name)
    # f-strings instead of "text" + float, which raises TypeError.
    print(f"Median absolute error : {median_absolute_error(y_test, y_pred)}")
    print(f"Mean absolute error : {mean_absolute_error(y_test, y_pred)}")
    print(f"Mean squared error : {mse}")
    print(f"Root mean square error : {np.sqrt(mse)}")
    print(f"R squared: {r2_score(y_test, y_pred)}")

In [None]:
# Predict cases_new for Selangor with two regressors scored on the same 70/30 split.
X = rslt_df_sl.drop(['cases_new','date','state'], axis=1)  #predict newcases
y = rslt_df_sl['cases_new']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 2)

model = LinearRegression()
rfr = RandomForestRegressor(random_state = 2)  # seeded so the reported scores are reproducible
for regressor_name, regressor in (("Linear Regressor", model), ("Random Forest Regressor", rfr)):
    # Fit on the training rows only: fitting the forest on all of X let it see
    # the test rows it was then scored on.
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(regressor_name)
    # f-strings instead of "text" + float, which raises TypeError.
    print(f"Median absolute error : {median_absolute_error(y_test, y_pred)}")
    print(f"Mean absolute error : {mean_absolute_error(y_test, y_pred)}")
    print(f"Mean squared error : {mse}")
    print(f"Root mean square error : {np.sqrt(mse)}")
    print(f"R squared: {r2_score(y_test, y_pred)}")

## Classifier

In [None]:
def getBinsRange(df):
    """Return the bin edges used to band ``df['cases_new']`` into three groups.

    The edges are ``[min, Q1, Q3, inf]``: everything up to the first quartile
    falls in the first band, the interquartile range in the second, and
    anything above the third quartile in the last.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``cases_new`` column.

    Returns
    -------
    list
        Four edges, suitable for ``pd.cut(..., bins=...)`` together with the
        three entries of ``labels``.

    Notes
    -----
    ``pd.cut`` needs distinct edges, so a column whose minimum, Q1 and Q3 are
    not all different cannot be binned with this result as-is.
    Missing values are skipped by ``Series.min`` / ``Series.quantile``.
    """
    cases = df['cases_new']
    # Series.quantile is used instead of np.percentile(..., interpolation=...)
    # because NumPy deprecated that keyword (renamed to `method` in 1.22);
    # the 'midpoint' rule itself is unchanged.
    # First quartile (Q1)
    Q1 = cases.quantile(0.25, interpolation='midpoint')
    # Third quartile (Q3)
    Q3 = cases.quantile(0.75, interpolation='midpoint')

    return [cases.min(), Q1, Q3, np.inf]

# One label per band produced by the edges from getBinsRange.
labels = ['Low','Medium','High']

In [None]:
# Perform binning: tag every row Low / Medium / High from its cases_new value.
df_final['cases_new_category'] = (pd.cut(df_final['cases_new'].values, bins=getBinsRange(df_final),labels=labels, include_lowest=True))
# cases_new is dropped from the features because the target is derived from it.
X = df_final.drop(['cases_new','date','state','cases_new_category'], axis=1)
y = df_final.cases_new_category # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)
# max_depth prunes the tree. splitter='random' makes the fit stochastic, so
# random_state is pinned to keep the reported scores reproducible.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5, splitter='random', random_state=1)
# Train the decision tree classifier
clf = clf.fit(X_train,y_train)
# Predict the response for the test dataset
y_pred = clf.predict(X_test)
print("Decision Tree Classifier")
print('Precision= {:.2f}'.format(precision_score(y_test, y_pred, average="weighted")))
print('Recall= {:.2f}'. format(recall_score(y_test, y_pred, average="weighted")))
print('F1= {:.2f}'. format(f1_score(y_test, y_pred, average="weighted")))
print('Accuracy= {:.2f}'. format(accuracy_score(y_test, y_pred)))

# NOTE: this split uses random_state=2 while the tree above used 1, so the two
# models are scored on different test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 2)
model = GaussianNB()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("Gaussian Naive Bayes")
print('Precision= {:.2f}'.format(precision_score(y_test, y_pred, average="weighted")))
print('Recall= {:.2f}'. format(recall_score(y_test, y_pred, average="weighted")))
print('F1= {:.2f}'. format(f1_score(y_test, y_pred, average="weighted")))
print('Accuracy= {:.2f}'. format(accuracy_score(y_test, y_pred)))

In [None]:
# Perform binning: tag every row Low / Medium / High from its cases_new value.
rslt_df_ph['cases_new_category'] = (pd.cut(rslt_df_ph['cases_new'].values, bins=getBinsRange(rslt_df_ph),labels=labels, include_lowest=True))
# cases_new is dropped from the features because the target is derived from it.
X = rslt_df_ph.drop(['cases_new','date','state','cases_new_category'], axis=1)
y = rslt_df_ph.cases_new_category # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)
# max_depth prunes the tree. splitter='random' makes the fit stochastic, so
# random_state is pinned to keep the reported scores reproducible.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5, splitter='random', random_state=1)
# Train the decision tree classifier
clf = clf.fit(X_train,y_train)
# Predict the response for the test dataset
y_pred = clf.predict(X_test)
print("Decision Tree Classifier")
print('Precision= {:.2f}'.format(precision_score(y_test, y_pred, average="weighted")))
print('Recall= {:.2f}'. format(recall_score(y_test, y_pred, average="weighted")))
print('F1= {:.2f}'. format(f1_score(y_test, y_pred, average="weighted")))
print('Accuracy= {:.2f}'. format(accuracy_score(y_test, y_pred)))

# NOTE: this split uses random_state=2 while the tree above used 1, so the two
# models are scored on different test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 2)
model = GaussianNB()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("Gaussian Naive Bayes")
print('Precision= {:.2f}'.format(precision_score(y_test, y_pred, average="weighted")))
print('Recall= {:.2f}'. format(recall_score(y_test, y_pred, average="weighted")))
print('F1= {:.2f}'. format(f1_score(y_test, y_pred, average="weighted")))
print('Accuracy= {:.2f}'. format(accuracy_score(y_test, y_pred)))

In [None]:
# Perform binning: tag every row Low / Medium / High from its cases_new value.
rslt_df_kd['cases_new_category'] = (pd.cut(rslt_df_kd['cases_new'].values, bins=getBinsRange(rslt_df_kd),labels=labels, include_lowest=True))
# cases_new is dropped from the features because the target is derived from it.
X = rslt_df_kd.drop(['cases_new','date','state','cases_new_category'], axis=1)
y = rslt_df_kd.cases_new_category # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)
# max_depth prunes the tree. splitter='random' makes the fit stochastic, so
# random_state is pinned to keep the reported scores reproducible.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5, splitter='random', random_state=1)
# Train the decision tree classifier
clf = clf.fit(X_train,y_train)
# Predict the response for the test dataset
y_pred = clf.predict(X_test)
print("Decision Tree Classifier")
print('Precision= {:.2f}'.format(precision_score(y_test, y_pred, average="weighted")))
print('Recall= {:.2f}'. format(recall_score(y_test, y_pred, average="weighted")))
print('F1= {:.2f}'. format(f1_score(y_test, y_pred, average="weighted")))
print('Accuracy= {:.2f}'. format(accuracy_score(y_test, y_pred)))

# NOTE: this split uses random_state=2 while the tree above used 1, so the two
# models are scored on different test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 2)
model = GaussianNB()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("Gaussian Naive Bayes")
print('Precision= {:.2f}'.format(precision_score(y_test, y_pred, average="weighted")))
print('Recall= {:.2f}'. format(recall_score(y_test, y_pred, average="weighted")))
print('F1= {:.2f}'. format(f1_score(y_test, y_pred, average="weighted")))
print('Accuracy= {:.2f}'. format(accuracy_score(y_test, y_pred)))

In [None]:
# Perform binning: tag every row Low / Medium / High from its cases_new value.
rslt_df_jh['cases_new_category'] = (pd.cut(rslt_df_jh['cases_new'].values, bins=getBinsRange(rslt_df_jh),labels=labels, include_lowest=True))
# cases_new is dropped from the features because the target is derived from it.
X = rslt_df_jh.drop(['cases_new','date','state','cases_new_category'], axis=1)
y = rslt_df_jh.cases_new_category # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)
# max_depth prunes the tree. splitter='random' makes the fit stochastic, so
# random_state is pinned to keep the reported scores reproducible.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5, splitter='random', random_state=1)
# Train the decision tree classifier
clf = clf.fit(X_train,y_train)
# Predict the response for the test dataset
y_pred = clf.predict(X_test)
print("Decision Tree Classifier")
print('Precision= {:.2f}'.format(precision_score(y_test, y_pred, average="weighted")))
print('Recall= {:.2f}'. format(recall_score(y_test, y_pred, average="weighted")))
print('F1= {:.2f}'. format(f1_score(y_test, y_pred, average="weighted")))
print('Accuracy= {:.2f}'. format(accuracy_score(y_test, y_pred)))

# NOTE: this split uses random_state=2 while the tree above used 1, so the two
# models are scored on different test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 2)
model = GaussianNB()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("Gaussian Naive Bayes")
print('Precision= {:.2f}'.format(precision_score(y_test, y_pred, average="weighted")))
print('Recall= {:.2f}'. format(recall_score(y_test, y_pred, average="weighted")))
print('F1= {:.2f}'. format(f1_score(y_test, y_pred, average="weighted")))
print('Accuracy= {:.2f}'. format(accuracy_score(y_test, y_pred)))

In [None]:
# Perform binning: tag every row Low / Medium / High from its cases_new value.
rslt_df_sl['cases_new_category'] = (pd.cut(rslt_df_sl['cases_new'].values, bins=getBinsRange(rslt_df_sl),labels=labels, include_lowest=True))
# cases_new is dropped from the features because the target is derived from it.
X = rslt_df_sl.drop(['cases_new','date','state','cases_new_category'], axis=1)
y = rslt_df_sl.cases_new_category # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)
# max_depth prunes the tree. splitter='random' makes the fit stochastic, so
# random_state is pinned to keep the reported scores reproducible.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5, splitter='random', random_state=1)
# Train the decision tree classifier
clf = clf.fit(X_train,y_train)
# Predict the response for the test dataset
y_pred = clf.predict(X_test)
print("Decision Tree Classifier")
print('Precision= {:.2f}'.format(precision_score(y_test, y_pred, average="weighted")))
print('Recall= {:.2f}'. format(recall_score(y_test, y_pred, average="weighted")))
print('F1= {:.2f}'. format(f1_score(y_test, y_pred, average="weighted")))
print('Accuracy= {:.2f}'. format(accuracy_score(y_test, y_pred)))

# NOTE: this split uses random_state=2 while the tree above used 1, so the two
# models are scored on different test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 2)
model = GaussianNB()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("Gaussian Naive Bayes")
print('Precision= {:.2f}'.format(precision_score(y_test, y_pred, average="weighted")))
print('Recall= {:.2f}'. format(recall_score(y_test, y_pred, average="weighted")))
print('F1= {:.2f}'. format(f1_score(y_test, y_pred, average="weighted")))
print('Accuracy= {:.2f}'. format(accuracy_score(y_test, y_pred)))