# COVID19 - April 2020 Forecast
### A simple LSTM to predict time series

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import RepeatVector
from keras.layers import TimeDistributed

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the week-2 competition training set into a DataFrame.
train_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/covid19-global-forecasting-week-2/train.csv',
)

## Exploration

In [None]:
# Preview the first rows, then print the column/dtype summary.
display(train_df.head())
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in display() (that would render a stray "None").
train_df.info()

In [None]:
# Count distinct names. nunique() ignores missing values, whereas
# len(unique()) would count NaN (rows without a province) as one extra province.
print('We have', train_df['Country_Region'].nunique(), 'countries/regions in the dataset.')
print('We have', train_df['Province_State'].nunique(), 'provinces/states in the dataset.')

In [None]:
# Sum the fatalities reported on the most recent date in the dataset.
first_date = train_df['Date'].min()
last_date = train_df['Date'].max()
on_last_date = train_df['Date'] == last_date
num_fatalities = train_df.loc[on_last_date, 'Fatalities'].sum()
print('Between {} and {} there are {} fatalities.'.format(first_date, last_date, int(num_fatalities)))

In [None]:
# Pick the Czechia row(s) for the most recent date once and read both figures from it.
is_cz_latest = (train_df['Country_Region'] == 'Czechia') & (train_df['Date'] == train_df['Date'].max())
cz_latest = train_df.loc[is_cz_latest]
cz_sick = int(cz_latest['ConfirmedCases'].values[0])
cz_fatalities = int(cz_latest['Fatalities'].values[0])
print('In Czech Republic there are {} confirmed cases and {} fatalities.'.format(cz_sick, cz_fatalities))

In [None]:
# Confirmed cases over time across all rows of the training set.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
timetrend_sick = sns.lineplot(x=train_df['Date'], y=train_df['ConfirmedCases'])

In [None]:
# Fatalities over time across all rows of the training set.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
timetrend_deceased = sns.lineplot(x=train_df['Date'], y=train_df['Fatalities'])

## Transformation, Pre-processing

In [None]:
# Build a per-row region label: the country alone when no province is given,
# otherwise "<country> - <province>".
country_col = train_df['Country_Region']
province_col = train_df['Province_State']
has_no_province = province_col.isna()
train_df['UniqueRegion'] = np.where(has_no_province, country_col, country_col + ' - ' + province_col)

In [None]:
# Calculate number of new sick per day

countries = train_df['UniqueRegion'].unique()

baseline_length = len(train_df.loc[train_df['UniqueRegion'] == 'Afghanistan']) # Country chosen arbitrarily

# Sanity check: no region may have more rows than the baseline region.
# sort=False keeps regions in order of first appearance, so the first
# offending region reported matches a row-order scan.
rows_per_region = train_df.groupby('UniqueRegion', sort=False).size()
oversized_regions = rows_per_region[rows_per_region > baseline_length]
if not oversized_regions.empty:
    raise NameError('Too many rows for country {}'.format(oversized_regions.index[0]))

# Day-over-day difference of the cumulative count, computed within each region.
# A single grouped diff replaces the per-region chained assignment
# (train_df['SickPerDay'].loc[...] = ...), which is not guaranteed to write
# back into train_df. The first row of every region has no predecessor -> 0.
train_df['SickPerDay'] = (
    train_df.groupby('UniqueRegion', sort=False)['ConfirmedCases'].diff().fillna(0)
)

# Show an example
display(train_df.loc[train_df['UniqueRegion'] == 'Czechia'].tail())

In [None]:
# Cumulative confirmed cases in Czechia, starting from the first reported case.
cz_confirmed_df = train_df.loc[(train_df['UniqueRegion'] == 'Czechia') & (train_df['ConfirmedCases'] > 0)]
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.lineplot(x=cz_confirmed_df['Date'], y=cz_confirmed_df['ConfirmedCases'])

In [None]:
# Daily new cases in Czechia, restricted to days with at least one new case.
cz_has_new_cases = (train_df['UniqueRegion'] == 'Czechia') & (train_df['SickPerDay'] > 0)
cz_new_cases_df = train_df.loc[cz_has_new_cases]
sns.lineplot(x=cz_new_cases_df['Date'], y=cz_new_cases_df['SickPerDay'])

In [None]:
# Rank regions on the most recent date by each metric and keep the ten largest.
latest_rows = train_df.loc[train_df['Date'] == train_df['Date'].max()]

top10_most_cases = (
    latest_rows[['UniqueRegion', 'ConfirmedCases']]
    .sort_values(by='ConfirmedCases', ascending=False)
    .head(10)
)
top10_most_deceased = (
    latest_rows[['UniqueRegion', 'Fatalities']]
    .sort_values(by='Fatalities', ascending=False)
    .head(10)
)
top10_most_sick_per_day = (
    latest_rows[['UniqueRegion', 'SickPerDay']]
    .sort_values(by='SickPerDay', ascending=False)
    .head(10)
)

In [None]:
# Keep the full history of only those regions that are in the top-10 by confirmed cases.
top10_region_names = top10_most_cases['UniqueRegion'].values
in_top10 = train_df['UniqueRegion'].isin(top10_region_names)
top10_most_cases_df = train_df.loc[in_top10]

In [None]:
# Reshape the top-10 data so that every region gets its own column of
# confirmed cases, with rows aligned on 'Date'.

main_df = pd.DataFrame()

for position, region in enumerate(top10_most_cases_df['UniqueRegion'].unique()):
    region_rows = top10_most_cases_df['UniqueRegion'] == region
    region_df = top10_most_cases_df.loc[region_rows][['Date', 'ConfirmedCases']]
    region_df = region_df.rename(columns={'ConfirmedCases': region})

    if position == 0:
        # The first region seeds the frame and fixes the date ordering.
        main_df = region_df.sort_values(by='Date')
    else:
        # Later regions are joined on their shared dates.
        main_df = pd.merge(main_df, region_df, on=['Date'])

main_df = main_df.set_index('Date')
main_df.head()

In [None]:
main_df.plot(figsize=(20,10))

In [None]:
# Transform main data into a horizontal dataframe

def transform_horizontally(input_df, value_column):
    """Reshape a long per-region frame into one column per region.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain the columns 'UniqueRegion', 'Date' and `value_column`.
    value_column : str
        Name of the column whose values fill the per-region columns.

    Returns
    -------
    pandas.DataFrame
        A frame with a 'Date' column followed by one column per distinct
        'UniqueRegion' value (in order of first appearance). Regions are
        combined with pd.merge on 'Date', so only dates present for every
        region are kept. An empty DataFrame is returned when `input_df`
        has no regions.
    """
    horizontal_df = pd.DataFrame()

    for i, uniqueRegion in enumerate(input_df['UniqueRegion'].unique()):
        # Filter with input_df's own mask. Using the notebook-global train_df
        # here would select the wrong rows (or fail) for any other input frame.
        region_df = (
            input_df.loc[input_df['UniqueRegion'] == uniqueRegion, ['Date', value_column]]
            .rename(columns={value_column: uniqueRegion})
        )

        if i == 0:
            # The first region seeds the result and fixes the date ordering.
            horizontal_df = region_df.sort_values(by='Date')
        else:
            horizontal_df = pd.merge(horizontal_df, region_df, on=['Date'])

    return horizontal_df

In [None]:
# Build the wide (one column per region) frames for both metrics, ordered by date.
confirmed_horizontal_df = transform_horizontally(train_df, 'ConfirmedCases')
confirmed_horizontal_df = confirmed_horizontal_df.sort_values(by='Date')

fatalities_horizontal_df = transform_horizontally(train_df, 'Fatalities')
fatalities_horizontal_df = fatalities_horizontal_df.sort_values(by='Date')

# Preview each frame together with its dimensions.
for wide_df in (confirmed_horizontal_df, fatalities_horizontal_df):
    display(wide_df.head())
    display(wide_df.shape)

In [None]:
# Drop the 'Date' column and keep only the per-region values as a numpy array.

confirmed_values_df = confirmed_horizontal_df.drop('Date', axis='columns')
np_confirmed = confirmed_values_df.to_numpy()
np_confirmed

In [None]:
# Rescale every column into [0, 1] before training the LSTM.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(np_confirmed)
np_confirmed_scaled = scaler.transform(np_confirmed)

In [None]:
# Split a multivariate sequence into samples
# Credits to: https://machinelearningmastery.com/how-to-develop-lstm-models-for-time-series-forecasting

def split_sequences(sequences, n_steps_in, n_steps_out):
    """Cut a 2-D sequence array into sliding (input, output) windows.

    Each sample pairs `n_steps_in` consecutive rows with the `n_steps_out`
    rows that immediately follow them; windows advance one row at a time and
    stop as soon as the output part would run past the end of `sequences`.

    Returns a tuple `(X, y)` of numpy arrays built from the collected windows.
    """
    total_rows = len(sequences)
    window_len = n_steps_in + n_steps_out
    # Number of start positions whose full window still fits in the data.
    n_windows = max(0, min(total_rows, total_rows - window_len + 1))

    inputs = [sequences[start:start + n_steps_in, :] for start in range(n_windows)]
    targets = [sequences[start + n_steps_in:start + window_len, :] for start in range(n_windows)]
    return np.array(inputs), np.array(targets)

In [None]:
# Window sizes: number of past time steps fed to the model and number of
# future time steps it predicts.
n_steps_in, n_steps_out = 11, 1

X, y = split_sequences(
    np_confirmed_scaled,
    n_steps_in=n_steps_in,
    n_steps_out=n_steps_out,
)

In [None]:
# The size of X's third axis is the number of features (columns) per time step.
n_features = np.shape(X)[2]
display(n_features)

In [None]:
# Define model: an LSTM encoder, its output repeated n_steps_out times,
# three stacked sequence-returning LSTMs (with dropout after the first two)
# and a per-time-step Dense layer that emits n_features values.
model = Sequential([
    LSTM(500, activation='relu', input_shape=(n_steps_in, n_features)),
    RepeatVector(n_steps_out),
    LSTM(1000, activation='relu', return_sequences=True),
    Dropout(0.2),
    LSTM(1000, activation='relu', return_sequences=True),
    Dropout(0.2),
    LSTM(1000, activation='relu', return_sequences=True),
    TimeDistributed(Dense(n_features)),
])
model.compile(optimizer='adam', loss='mse')

In [None]:
# Train silently (verbose=0) for 300 epochs on the windowed samples.
model.fit(x=X, y=y, epochs=300, verbose=0)

In [None]:
# Use the n_steps_in rows that end n_steps_out rows before the end of the
# data as the model input, so the forecast covers the last known rows.
# The start offset is derived from both window sizes: a hard-coded "-1" only
# yields exactly n_steps_in rows when n_steps_out == 1, and would make the
# reshape below fail otherwise.
pred_window = np_confirmed_scaled[-(n_steps_in + n_steps_out):-n_steps_out]
X_pred = pred_window.reshape((1, n_steps_in, n_features))
y_pred = model.predict(X_pred)

In [None]:
# Compare the last known row with the model's first predicted time step.
print(list(np_confirmed[-1]))
# Undo the min-max scaling, then round to the nearest whole number.
# astype(int) alone truncates toward zero, which biases the counts downward
# and contradicts the name "rounded_pred".
unscaled_pred = scaler.inverse_transform(y_pred[0])[0]
rounded_pred = [int(round(float(value))) for value in unscaled_pred]
print(rounded_pred)