<a href="https://colab.research.google.com/github/hendradarwin/covid-19-prediction/blob/master/Covid_World_New_Death_Case_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicting New Global COVID-19 Death Cases


## Load Data and Import Libraries Section

In [None]:
# Use some functions from tensorflow_docs
!pip install -q git+https://github.com/tensorflow/docs

In [None]:
# %tensorflow_version 2.x # make sure that collab use tensorflow 2
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow import keras
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
import os
import datetime
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling
# from google.colab import drive
# drive.mount('/content/drive')

# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Ask the inline backend for high-resolution ('retina') figures.
%config InlineBackend.figure_format='retina'

# Seaborn theme: white grid background, muted palette, fonts scaled by 1.5.
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
# Default matplotlib figure size as (width, height).
rcParams['figure.figsize'] = 16, 10


In [None]:
# !rm '/root/.keras/datasets/global_total.csv'

## Load Data

In [None]:

# Common location of the three CSV files used by this notebook.
DATASET_BASE_URL = "https://raw.githubusercontent.com/virgiawan/covid-19-prediction/linear-regression/dataset/corona-virus"

# Each file is downloaded once; later cells read the 'World' column of every
# frame and the 'date' column of df_new_deaths.
df_new_cases = pd.read_csv(f"{DATASET_BASE_URL}/new_cases.csv")
df_total_cases = pd.read_csv(f"{DATASET_BASE_URL}/total_cases.csv")
df_new_deaths = pd.read_csv(f"{DATASET_BASE_URL}/new_deaths.csv")

In [None]:
# Stack the three 'World' series as rows, then transpose so each series is a column.
np_data = np.array([
    df_new_cases['World'],
    df_total_cases['World'],
    df_new_deaths['World'],
]).T
# Wrap the (rows, 3) array in a labelled frame; copy so the frame owns its data.
dataset = pd.DataFrame(
    np_data,
    columns=['new_cases', 'total_cases', 'new_deaths'],
    copy=True,
)
dataset

## Data Exploration

In [None]:
# One panel per column (new_cases, total_cases, new_deaths) with a figure-level title.
dataset.plot(subplots=True, title='World New Cases, Total Cases and New Deaths')
# Finish with plt.show(), like the other plotting cells, so the Axes repr is not echoed.
plt.show()

In [None]:
# Correlation between new_cases and new_deaths.
# np.corrcoef returns the 2x2 correlation matrix; [0, 1] is the cross-correlation entry.
cor_new_and_death_cases = np.corrcoef(dataset['new_cases'], dataset['new_deaths'])[0, 1]
print("Coefisien correlation between new cases and new deaths %f" % cor_new_and_death_cases)

# Scatter plot of the same pair of columns.
dataset.plot.scatter(
    x='new_cases',
    y='new_deaths',
    color='red',
    title='Scatter Plot New Cases and New Death Cases',
)
plt.show()

In [None]:
# Correlation between total_cases and new_deaths.
# np.corrcoef returns the 2x2 correlation matrix; [0, 1] is the cross-correlation entry.
cor_total_and_death_cases = np.corrcoef(dataset['total_cases'],
                                        dataset['new_deaths'])[0, 1]
# The message names the pair that is actually being correlated (total cases, not new cases).
print("Coefisien correlation between total cases and new deaths %f"
      %(cor_total_and_death_cases))
dataset.plot(kind='scatter', x='total_cases',
             y='new_deaths', color='red',
             title='Scatter Plot Total Cases and New Death Cases')
plt.show()

From the scatter plots and the correlation coefficients, we can see that both pairs, `new cases and new deaths` and `total cases and new deaths`, are strongly correlated.

## Data Preprocessing

In [None]:

# Keep only rows whose new_deaths value is not 0 and renumber them from 0.
dataset = dataset.loc[dataset['new_deaths'].ne(0)].reset_index(drop=True)
# Raise the display limit so every remaining row is shown instead of a truncated table.
pd.set_option('display.max_rows', len(dataset) + 1)
dataset

## Split the data (data training and testing)

In [None]:
# Chronological 70/30 split: the first 70% of the rows train, the remainder tests.
train_size = int(len(dataset) * 0.7)
test_size = len(dataset) - train_size
train_new_cases = dataset['new_cases'].iloc[:train_size]
test_new_cases = dataset['new_cases'].iloc[train_size:]

In [None]:
# Same 70/30 positional split, applied to the total_cases column.
train_size = int(len(dataset) * 0.7)
test_size = len(dataset) - train_size
train_total_cases = dataset['total_cases'].iloc[:train_size]
test_total_cases = dataset['total_cases'].iloc[train_size:]

In [None]:
# Same 70/30 positional split, applied to the new_deaths column (the prediction target).
train_size = int(len(dataset) * 0.7)
test_size = len(dataset) - train_size
train_new_deaths = dataset['new_deaths'].iloc[:train_size]
test_new_deaths = dataset['new_deaths'].iloc[train_size:]

## Normalize the data

In [None]:
def norm_data(all_data, reverse = False):
  """Build a z-score normalizer (or its inverse) from the statistics of ``all_data``.

  The mean (``np.mean``) and standard deviation (``np.std``) are computed once,
  when this factory is called. The returned function therefore does not rescan
  ``all_data`` on every call and is unaffected by later in-place changes to it.

  Args:
    all_data: reference values the mean and standard deviation are taken from.
    reverse: when True, return the inverse mapping (normalized values back to
      the original scale) instead of the forward mapping.

  Returns:
    A one-argument function computing ``(data - mean) / std`` when ``reverse``
    is False, or ``data * std + mean`` when ``reverse`` is True.
  """
  mean = np.mean(all_data)
  std = np.std(all_data)
  if reverse:
    return lambda data: (data * std) + mean
  return lambda data: (data - mean) / std

In [None]:
# NOTE(review): every normalizer below takes its mean/std from the full `dataset`
# column, i.e. from training and test rows together.

# new cases
fun_norm_new_cases = norm_data(dataset['new_cases'])
norm_train_new_cases, norm_test_new_cases = (
    fun_norm_new_cases(train_new_cases),
    fun_norm_new_cases(test_new_cases),
)

# total cases
fun_norm_total_cases = norm_data(dataset['total_cases'])
norm_train_total_cases, norm_test_total_cases = (
    fun_norm_total_cases(train_total_cases),
    fun_norm_total_cases(test_total_cases),
)

# new deaths (the prediction target)
fun_norm_death_cases = norm_data(dataset['new_deaths'])
norm_train_death_cases, norm_test_death_cases = (
    fun_norm_death_cases(train_new_deaths),
    fun_norm_death_cases(test_new_deaths),
)

## Prepare input data

In [None]:
# Two feature columns per sample: normalized new cases and normalized total cases.
train_input = np.column_stack((norm_train_new_cases, norm_train_total_cases))
test_input = np.column_stack((norm_test_new_cases, norm_test_total_cases))

# Report the shapes of the model inputs and targets.
print("train input shape %s" % (train_input.shape,))
print("test input shape %s" % (test_input.shape,))
print("train death cases shape %s" % (norm_train_death_cases.shape,))
print("test death cases shape %s" % (norm_test_death_cases.shape,))

### Create a model

In [None]:
model = tf.keras.models.Sequential([
  # Append a trailing axis so the LSTM receives (batch, steps, 1).
  tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=-1), input_shape=[None]),
  tf.keras.layers.LSTM(64, return_sequences=True),
  # The last recurrent layer returns only its final output, so the network emits
  # one value per sample, matching the one-value-per-row targets passed to fit().
  tf.keras.layers.LSTM(64),
  # No input_shape here: it only has an effect on the first layer of a Sequential model.
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(64, activation='relu'),
  tf.keras.layers.Dense(1),
  # Scale the raw output by a constant factor of 10.
  tf.keras.layers.Lambda(lambda x: x * 10.0)
])

# Learning-rate schedule: starts at 0.001 and is multiplied by 10 every 20 epochs.
# NOTE(review): this grows without bound; confirm it is meant for the real
# training run and not only for a short learning-rate range test.
lr_schedule = tf.keras.callbacks.LearningRateScheduler(lambda epoch: 0.001 * 10**(epoch / 20))
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='mse',
              optimizer=optimizer,
              metrics=['mse'])

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Upper bound on the number of training epochs; the callback below may stop earlier.
EPOCHS = 1000

class myCallback(tf.keras.callbacks.Callback):
  """Stop training as soon as an epoch ends with a training loss of at most 0.2."""

  def on_epoch_end(self, epoch, logs=None):
    """Check the epoch's 'loss' entry and request a stop once it is <= 0.2.

    Args:
      epoch: index of the epoch that just finished (unused).
      logs: metrics dict supplied by Keras; may be None or lack 'loss'.
    """
    # Tolerate a missing logs dict or 'loss' entry instead of raising TypeError.
    loss = (logs or {}).get('loss')
    if loss is not None and loss <= 0.2:
      print("\nReached loss below 20%, so cancelling training!")
      self.model.stop_training = True

accuracy_callback = myCallback()
# Train with the learning-rate schedule, the tensorflow_docs progress dots
# and the early-stop callback defined above.
history = model.fit(train_input, norm_train_death_cases,
                    epochs=EPOCHS,
                    callbacks=[lr_schedule,tfdocs.modeling.EpochDots(), accuracy_callback])

In [None]:
# The scheduler callback logs the learning rate as "lr" in older Keras releases
# and as "learning_rate" in newer ones; accept either key.
lr_values = history.history.get("lr", history.history.get("learning_rate"))
# Log-scaled x axis because the scheduled learning rate grows exponentially.
plt.semilogx(lr_values, history.history["loss"])
plt.title("Learning rate vs Loss")
plt.xlabel("Learning rate")
plt.ylabel("Loss")
plt.show()

In [None]:
# Score the model on the test rows; both numbers are measured on normalized targets.
loss, mse = model.evaluate(test_input, norm_test_death_cases, verbose=0)

# Predict normalized new deaths for the test inputs.
pred_norm_value = model.predict(test_input)
# Inverse of the new_deaths normalization, to map predictions back to the original scale.
fun_unorm_death_cases = norm_data(dataset['new_deaths'], True)
# Undo the normalization and round to whole numbers in one step.
pred_new_deaths = np.round(fun_unorm_death_cases(pred_norm_value))

# Put the predictions next to the real values, one row per test sample.
np_compare = np.array([pred_new_deaths.flatten(), test_new_deaths]).T
dt_compare = pd.DataFrame({
    'pred_new_deaths': np_compare[:, 0],
    'real_new_deaths': np_compare[:, 1],
})
dt_compare.plot()
print("Loss: %f" % loss)
print("MSE: %f" % mse)

In [None]:
# Display the predicted vs. real new-death values for the test rows.
dt_compare

In [None]:
# Dates of the last len(test_new_deaths) rows of the raw new_deaths file.
# Slicing from an explicit start position (rather than a negative offset) keeps
# the result empty when the test set is empty, instead of selecting every date.
# NOTE(review): the test rows come from `dataset` after zero-death rows were
# dropped, while the dates come from the unfiltered file; the two only line up
# if none of the dropped rows fall inside this tail. Confirm against the data.
date_start = len(df_new_deaths) - len(test_new_deaths)
list_date = df_new_deaths['date'].iloc[date_start:].tolist()
# assign() returns a new frame, so keep the result and display it.
dt_compare_with_date = dt_compare.assign(date=list_date)
dt_compare_with_date
