In [1]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as seabornInstance 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from datetime import datetime
from datetime import timedelta
from statsmodels.tsa.arima_model import ARIMA
import itertools
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_dataset(url, country):
    raw_dataset = pd.read_csv(url)
    df = raw_dataset[raw_dataset["Country"]==country].reset_index(drop=True) 
    df.insert(5, 'Day', range(1, 1 + len(df)))
    for i in range(0, len(df)):
        df.loc[i, 'Active']    = df.loc[i, 'Confirmed'] - df.loc[i, 'Recovered']
        df.loc[i, 'New_Cases'] = 0 if i == 0 else df.loc[i, 'Confirmed'] - df.loc[i-1, 'Confirmed']
        df.loc[i, 'Spread_Rate'] = None if i == 0 else df.loc[i, 'Confirmed'] / df.loc[i-1, 'Confirmed']
        df.loc[i, 'New_Cases_Diff'] = 0 if i == 0 else df.loc[i, 'New_Cases'] - df.loc[i-1, 'New_Cases']
        df.loc[i, 'Spread_Rate_Diff'] = (0 if i == 0 else df.loc[i, 'Spread_Rate'] - df.loc[i-1, 'Spread_Rate']) * 100
    return df[['Country', 'Date','Day','Confirmed','Recovered','Deaths','Active','New_Cases','New_Cases_Diff','Spread_Rate','Spread_Rate_Diff']].reset_index(drop=True) 


In [3]:
def get_time_series_dataset(ds, column, train_cnt):
    ds['Date'] = pd.to_datetime(ds['Date'], format='%Y-%m-%d')
    return ds[['Date', column]].set_index('Date').tail(train_cnt) 

In [4]:
def get_arima_order(train):
    p=d=q=range(0,5)
    pdq = list(itertools.product(p,d,q))
    import warnings
    warnings.filterwarnings('ignore')
    d={}
    for param in pdq:
        try:
            model_arima = ARIMA(train,order=param)
            model_arima_fit = model_arima.fit()
            d[param]=model_arima_fit.aic
        except:
            continue
    return min(d, key=lambda k: d[k]);

In [5]:
def get_model(ds):
    X = ds.values
    order = get_arima_order(X)
    model_arima = ARIMA(X,order)
    model_arima_fit = model_arima.fit()
    return model_arima_fit

In [6]:
## Function to find the prediction when the curve starts flattening
def find_flatten_point(model, steps, threshold):
    predictions = model.forecast(steps)[0]
    last_prediction = predictions[len(predictions)-1]
    last_before_prediction = predictions[len(predictions)-2]
    if(last_prediction <= last_before_prediction):
        print("achieved flatten the curve at "+str(steps)+" step and the value is "+str(last_prediction))
        return steps
    elif (last_prediction > last_before_prediction and steps >= threshold):
        print('Threashold reached.. Flatten the curve never achieved in the prediction')
        return None
    else:
        return find_flatten_point(model, steps+1, threshold)

In [7]:
##  Function to find when the prediction goes below zero
def get_last_prediction(model, steps):
    predictions = model.forecast(steps)[0]
    return predictions[-1]

def find_zero_case(model, steps, threshold):
    last_value = get_last_prediction(model, steps)
    if(last_value < 0):
        return steps
    elif (last_value > 0 and steps >= threshold):
        return None
    else:
        return find_zero_case(model, steps + 1, threshold)

In [8]:
def get_flatten_point_pattern(ds, column, train_start, train_end, prediction_threshold):
    if(train_start > train_end):
        return
    time_series_ds = get_time_series_dataset(ds, column, train_cnt=train_start)
    model = get_model(time_series_ds)
    prediction = find_zero_case(model, 5, prediction_threshold)
    print("<<< Zero Case Prediction for "+str(train_start)+" cnt is "+str(prediction)+" >>>")
    train_start = train_start + 1
    return get_flatten_point_pattern(ds, column, train_start, train_end, prediction_threshold)

In [9]:
def tailor_predictions(predictions, negative_value_threshold):
    result = []
    negative_cnt = 0
    for i in range(0, len(predictions)):
        if (negative_cnt > negative_value_threshold):
            return result
        if (predictions[i] < 0):
            negative_cnt = negative_cnt + 1
        result.append(predictions[i])
    return result

In [10]:
def create_predictions_dataframe(dataset, predictions):
    df = dataset.tail(1).reset_index(drop=True)[['Country', 'Date', 'Day','Confirmed','New_Cases']]
    df.loc[0, 'Day'] = 0
    for i in range(0, len(predictions)):
        day =  df.Day[-1:].values[0] + 1
        new_cases = predictions[i]
        confirmed = int(df.Confirmed[-1:].values[0] + new_cases)
        dateString = pd.to_datetime(str(df.Date[-1:].values[0])).strftime('%Y-%m-%d')
        date = datetime.strftime(datetime.strptime(dateString, '%Y-%m-%d') + timedelta(days=1), '%Y-%m-%d')
        df = df.append({'Country': 'Canada', 'Date' : date, 'Day': day, 'Confirmed': confirmed, 'New_Cases': new_cases} , ignore_index=True)
    df['New_Cases'] = df['New_Cases'].astype(int)  
    return df  

In [11]:
def plot_model_predictions(df, country, column, peak_day):
    x=df.Day.values
    y=df[column].values
    plt.scatter(x, y, color='green')
    plt.plot(x, y)
    plt.title(country+' Predictions')
    plt.xlabel('Day')
    plt.ylabel(column)
    plt.axvline(x=peak_day)
    return plt

In [12]:
## Configure Parameters

In [13]:
url='/Users/laya/Desktop/work/projects/data/country/covid-19/data/countries-aggregated.csv'
country = 'Canada'
column = 'New_Cases'
dataset = get_dataset(url, country)
dataset.tail(15)

Unnamed: 0,Country,Date,Day,Confirmed,Recovered,Deaths,Active,New_Cases,New_Cases_Diff,Spread_Rate,Spread_Rate_Diff
85,Canada,2020-04-16,86,30809,9698,1259,21111.0,2600.0,1426.0,1.092169,4.874398
86,Canada,2020-04-17,87,32814,10545,1356,22269.0,2005.0,-595.0,1.065078,-2.709078
87,Canada,2020-04-18,88,34356,10964,1401,23392.0,1542.0,-463.0,1.046992,-1.808625
88,Canada,2020-04-19,89,35633,11847,1565,23786.0,1277.0,-265.0,1.03717,-0.98225
89,Canada,2020-04-20,90,37658,12543,1727,25115.0,2025.0,748.0,1.056829,1.965971
90,Canada,2020-04-21,91,39402,13188,1910,26214.0,1744.0,-281.0,1.046312,-1.05178
91,Canada,2020-04-22,92,41663,14454,2078,27209.0,2261.0,517.0,1.057383,1.107133
92,Canada,2020-04-23,93,43299,14761,2241,28538.0,1636.0,-625.0,1.039267,-1.811542
93,Canada,2020-04-24,94,44919,15149,2402,29770.0,1620.0,-16.0,1.037414,-0.18532
94,Canada,2020-04-25,95,46371,16013,2571,30358.0,1452.0,-168.0,1.032325,-0.508941


In [14]:
## Check the flatten point of new cases for different training set

In [15]:
get_flatten_point_pattern(dataset, column=column, train_start=5, train_end=20, prediction_threshold=250)

<<< Zero Case Prediction for 5 cnt is 91 >>>
<<< Zero Case Prediction for 6 cnt is 5 >>>
<<< Zero Case Prediction for 7 cnt is None >>>
<<< Zero Case Prediction for 8 cnt is None >>>
<<< Zero Case Prediction for 9 cnt is None >>>
<<< Zero Case Prediction for 10 cnt is None >>>
<<< Zero Case Prediction for 11 cnt is 21 >>>
<<< Zero Case Prediction for 12 cnt is 11 >>>
<<< Zero Case Prediction for 13 cnt is 14 >>>
<<< Zero Case Prediction for 14 cnt is 37 >>>
<<< Zero Case Prediction for 15 cnt is None >>>
<<< Zero Case Prediction for 16 cnt is 31 >>>
<<< Zero Case Prediction for 17 cnt is 16 >>>
<<< Zero Case Prediction for 18 cnt is 16 >>>
<<< Zero Case Prediction for 19 cnt is 26 >>>
<<< Zero Case Prediction for 20 cnt is 11 >>>


In [16]:
### Execute the model for best train set and ARIMA Order

In [19]:
train_cnt = 15
ts_dataset = get_time_series_dataset(dataset, column=column, train_cnt=train_cnt)
X = ts_dataset.values.astype('float32')
order=get_arima_order(X)
print("ARIMA Order is "+str(order))
model_arima = ARIMA(X, order)
model_arima_fit = model_arima.fit()
predictions = model_arima_fit.forecast(steps=150)[0]
predictions = tailor_predictions(predictions, negative_value_threshold=25)
df = create_predictions_dataframe(dataset, predictions)
print("Prediction Dataframe size is "+str(df.shape))

ARIMA Order is (0, 2, 3)
Prediction Dataframe size is (151, 5)


In [20]:
predictions

[2066.7454675418153,
 2155.8429673002142,
 2450.084656821732,
 2781.6907145308905,
 3150.66114042769,
 3556.9959345121306,
 4000.695096784212,
 4481.758627243935,
 5000.1865258912985,
 5555.978792726303,
 6149.135427748949,
 6779.656430959236,
 7447.541802357164,
 8152.791541942733,
 8895.405649715942,
 9675.384125676792,
 10492.726969825284,
 11347.434182161416,
 12239.50576268519,
 13168.941711396605,
 14135.742028295661,
 15139.906713382357,
 16181.435766656696,
 17260.329188118674,
 18376.586977768293,
 19530.209135605553,
 20721.195661630456,
 21949.546555843,
 23215.261818243183,
 24518.341448831008,
 25858.785447606475,
 27236.593814569584,
 28651.76654972033,
 30104.30365305872,
 31594.20512458475,
 33121.47096429842,
 34686.101172199735,
 36288.09574828869,
 37927.45469256528,
 39604.17800502952,
 41318.2656856814,
 43069.71773452092,
 44858.534151548076,
 46684.71493676287,
 48548.26009016531,
 50449.169611755395,
 52387.44350153312,
 54363.081759498484,
 56376.08438565149,
 

In [18]:
negative_cases = list(filter(lambda n: n < 0, predictions))
peak_day = predictions.index(negative_cases[0]) + 1
peak_date = df.loc[peak_day, 'Date']
print('Peak Count is expected on '+peak_date)

IndexError: list index out of range

In [None]:
plot_model_predictions(df, country, 'Confirmed', peak_day).show()