# **A very simple notebook to play with the data**

I copy stuff from this notebook to have a basic model: https://www.kaggle.com/code/jaimerv/covid-19-basic-model-not-leaky/edit



## Import Libraries and Define Auxiliary Functions


In [1]:
import math
import os
import random
from collections import defaultdict
from itertools import product
from datetime import timedelta, datetime, timezone
from pandas._libs.tslibs.timestamps import Timestamp


import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from xgboost import XGBClassifier

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse

%matplotlib inline

## Get data from the daily_reports

In [3]:
LAST_DAYS = 30 #Number of days to get data from

now = datetime.now()

dfs = []  # empty list which will hold your dataframes
df_temp_2 = pd.DataFrame()
for d in range(1, LAST_DAYS): #NOTE: do the same that has been done for the first day but for the whole period
    date = now - timedelta(days=d)
    date_str = date.strftime("%m-%d-%Y")
    # print(date_str)
    source_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/' + date_str + '.csv'
    df_temp = pd.read_csv(source_url)
    df_temp.rename(columns={"Last_Update": "Date"}, inplace=True) #Renane dataframe column from "Last_Update" to "Date"
    df_temp_2 = df_temp[["Admin2", "Province_State", "Country_Region","Confirmed", "Deaths"]] #TODO: consider also other columns in future versions like Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
    df_temp_2["Date"] = date.strftime("%Y-%m-%d") #TODO: fix this so that no warning comes
    dfs.append(df_temp_2)  # append dataframe to list
    
res = pd.concat(dfs, ignore_index=True)  # concatenate list of dataframes
res.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Unnamed: 0,Admin2,Province_State,Country_Region,Confirmed,Deaths,Date
0,,,Afghanistan,187966,7753,2022-08-09
1,,,Albania,318638,3566,2022-08-09
2,,,Algeria,268478,6878,2022-08-09
3,,,Andorra,45793,154,2022-08-09
4,,,Angola,102636,1917,2022-08-09


In [5]:
# group by Country_Region and sum Confirmed and Deaths
df = res.groupby(['Province_State','Country_Region','Date']).agg({'Confirmed':'sum', 'Deaths':'sum'})
df.reset_index(inplace=True)
df.rename(columns={"Confirmed": "ConfirmedCases", "Deaths": "Fatalities"}, inplace=True)
df

Unnamed: 0,Province_State,Country_Region,Date,ConfirmedCases,Fatalities
0,Abruzzo,Italy,2022-07-12,461690,3404
1,Abruzzo,Italy,2022-07-13,464800,3407
2,Abruzzo,Italy,2022-07-14,468074,3407
3,Abruzzo,Italy,2022-07-15,471058,3412
4,Abruzzo,Italy,2022-07-16,473897,3413
...,...,...,...,...,...
17830,Zuid-Holland,Netherlands,2022-08-05,1752542,5202
17831,Zuid-Holland,Netherlands,2022-08-06,1752542,5202
17832,Zuid-Holland,Netherlands,2022-08-07,1752542,5202
17833,Zuid-Holland,Netherlands,2022-08-08,1753564,5202


## Prepare train-test set
Code from COVID-19 Basic Model (Not Leaky)

In [6]:
loc_group = ["Province_State", "Country_Region"]


def preprocess(df):
    df["Date"] = df["Date"].astype("datetime64[ms]")
    for col in loc_group:
        df[col].fillna("none", inplace=True) #NOTE: replace all NaN with none
    return df

df = preprocess(df)
df.head()

Unnamed: 0,Province_State,Country_Region,Date,ConfirmedCases,Fatalities
0,Abruzzo,Italy,2022-07-12,461690,3404
1,Abruzzo,Italy,2022-07-13,464800,3407
2,Abruzzo,Italy,2022-07-14,468074,3407
3,Abruzzo,Italy,2022-07-15,471058,3412
4,Abruzzo,Italy,2022-07-16,473897,3413


In [7]:
TARGETS = ["ConfirmedCases", "Fatalities"]

for col in TARGETS:
    df[col] = np.log1p(df[col]) #NOTE :not sure why he does that

In [8]:
for col in TARGETS:
    df["prev_{}".format(col)] = df.groupby(loc_group)[col].shift() #NOTE: the prev_ columns basically has the same than the others but delayed one day
df.head()

Unnamed: 0,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,prev_ConfirmedCases,prev_Fatalities
0,Abruzzo,Italy,2022-07-12,13.042651,8.133,,
1,Abruzzo,Italy,2022-07-13,13.049365,8.133881,13.042651,8.133
2,Abruzzo,Italy,2022-07-14,13.056384,8.133881,13.049365,8.133881
3,Abruzzo,Italy,2022-07-15,13.062739,8.135347,13.056384,8.133881
4,Abruzzo,Italy,2022-07-16,13.068747,8.13564,13.062739,8.135347


In [9]:
df = df[df["Date"] > df["Date"].min()].copy() #NOTE: removes the first day since it has NaNs in the "prev" columns
df.head(50)

Unnamed: 0,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,prev_ConfirmedCases,prev_Fatalities
1,Abruzzo,Italy,2022-07-13,13.049365,8.133881,13.042651,8.133
2,Abruzzo,Italy,2022-07-14,13.056384,8.133881,13.049365,8.133881
3,Abruzzo,Italy,2022-07-15,13.062739,8.135347,13.056384,8.133881
4,Abruzzo,Italy,2022-07-16,13.068747,8.13564,13.062739,8.135347
5,Abruzzo,Italy,2022-07-17,13.073337,8.136226,13.068747,8.13564
6,Abruzzo,Italy,2022-07-18,13.075582,8.136811,13.073337,8.136226
7,Abruzzo,Italy,2022-07-19,13.08345,8.137396,13.075582,8.136811
8,Abruzzo,Italy,2022-07-20,13.088817,8.138273,13.08345,8.137396
9,Abruzzo,Italy,2022-07-21,13.094328,8.139441,13.088817,8.138273
10,Abruzzo,Italy,2022-07-22,13.099672,8.140898,13.094328,8.139441


In [10]:
from datetime import timedelta
TEST_DAYS = 7 #Number of days to test the model
TEST_FIRST = now - timedelta(days=TEST_DAYS)
TEST_FIRST = TEST_FIRST.replace(hour=0, minute=0, second=0, microsecond=0)
TEST_FIRST = Timestamp(TEST_FIRST)


TEST_FIRST

Timestamp('2022-08-03 00:00:00')

In [11]:
dev_df, test_df = df[df["Date"] < TEST_FIRST].copy(), df[df["Date"] >= TEST_FIRST].copy() 
dev_df.shape, test_df.shape
#dev_df to train the model and test_df to test the model

((12915, 7), (4305, 7))

In [12]:
test_df.head()

Unnamed: 0,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,prev_ConfirmedCases,prev_Fatalities
22,Abruzzo,Italy,2022-08-03,13.146252,8.174985,13.143046,8.173575
23,Abruzzo,Italy,2022-08-04,13.149525,8.175548,13.146252,8.174985
24,Abruzzo,Italy,2022-08-05,13.15258,8.175829,13.149525,8.175548
25,Abruzzo,Italy,2022-08-06,13.155194,8.176954,13.15258,8.175829
26,Abruzzo,Italy,2022-08-07,13.157194,8.177235,13.155194,8.176954


# Modeling

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

model = Pipeline([('poly', PolynomialFeatures(degree=2, include_bias=False)),
                  ('linear', LinearRegression())])
model

Pipeline(steps=[('poly', PolynomialFeatures(include_bias=False)),
                ('linear', LinearRegression())])

In [14]:
features = ["prev_{}".format(col) for col in TARGETS] #NOTE: take as the features the previous number of ConfirmedCases and Fatalities
features

['prev_ConfirmedCases', 'prev_Fatalities']

In [15]:
dev_df[features]

Unnamed: 0,prev_ConfirmedCases,prev_Fatalities
1,13.042651,8.133000
2,13.049365,8.133881
3,13.056384,8.133881
4,13.062739,8.135347
5,13.068747,8.135640
...,...,...
17823,14.374243,8.556414
17824,14.374554,8.556414
17825,14.374554,8.556414
17826,14.374554,8.556414


In [16]:
dev_df[TARGETS]

Unnamed: 0,ConfirmedCases,Fatalities
1,13.049365,8.133881
2,13.056384,8.133881
3,13.062739,8.135347
4,13.068747,8.135640
5,13.073337,8.136226
...,...,...
17823,14.374554,8.556414
17824,14.374554,8.556414
17825,14.374554,8.556414
17826,14.375367,8.556414


In [17]:
model.fit(dev_df[features], dev_df[TARGETS]) #NOTE: fit the model

Pipeline(steps=[('poly', PolynomialFeatures(include_bias=False)),
                ('linear', LinearRegression())])

In [18]:
[mean_squared_error(dev_df[TARGETS[i]], model.predict(dev_df[features])[:, i]) for i in range(len(TARGETS))] #NOTE: check the mean_squared_error from the training dataset

[0.031765502480923274, 0.004218249461442678]

In [19]:
test_df.loc[test_df["Date"] == TEST_FIRST]

Unnamed: 0,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,prev_ConfirmedCases,prev_Fatalities
22,Abruzzo,Italy,2022-08-03,13.146252,8.174985,13.143046,8.173575
51,Acre,Brazil,2022-08-03,11.882632,7.610358,11.882632,7.610358
80,Adygea Republic,Russia,2022-08-03,10.813297,6.911747,10.813056,6.911747
109,Aguascalientes,Mexico,2022-08-03,11.287116,8.170186,11.282619,8.170186
138,Aichi,Japan,2022-08-03,13.636189,7.693026,13.614692,7.688455
...,...,...,...,...,...,...,...
17712,Zaporizhia Oblast,Ukraine,2022-08-03,12.278723,8.677099,12.278723,8.677099
17741,Zeeland,Netherlands,2022-08-03,12.069147,5.973810,12.068791,5.973810
17770,Zhejiang,China,2022-08-03,8.073715,0.693147,8.070906,0.693147
17799,Zhytomyr Oblast,Ukraine,2022-08-03,12.169419,8.171599,12.169419,8.171599


In [20]:
#TODO: fix the UserWarnings that appear when running this cell

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def evaluate(df):
    error = 0
    for col in TARGETS:
        error += rmse(df[col].values, df["pred_{}".format(col)].values) #NOTE: checks the error between the predicted columns and the target columns
    return np.round(error/len(TARGETS), 5)


def predict(test_df, first_day, num_days, val=False):

    y_pred = np.clip(model.predict(test_df.loc[test_df["Date"] == first_day][features]), None, 16)#NOTE: here he is predicting the targets for the first day and 
                                                                                                    #saturating (clip) them with max=16
 
    for i, col in enumerate(TARGETS):
        test_df["pred_{}".format(col)] = 0
        test_df.loc[test_df["Date"] == first_day, "pred_{}".format(col)] = y_pred[:, i] #NOTE: here he sets the predicted columns

    if val:
        print(first_day, evaluate(test_df[test_df["Date"] == first_day])) #NOTE: print the date of the first day and the error between the predicted targets and the real targets

    for d in range(1, num_days): #NOTE: do the same that has been done for the first day but for the whole period
        y_pred = np.clip(model.predict(y_pred), None, 16)
        date = first_day + timedelta(days=d)

        for i, col in enumerate(TARGETS):
            test_df.loc[test_df["Date"] == date, "pred_{}".format(col)] = y_pred[:, i]

        if val:
            print(date, evaluate(test_df[test_df["Date"] == date])) #NOTE: when we see all the errors we can see that the farther the date from the first day the higher the error
        
    return test_df

test_df = predict(test_df, TEST_FIRST, TEST_DAYS, val=True) #NOTE: he makes predictions for TEST_DAYS number of days
evaluate(test_df) #NOTE: the error of all the predictions

2022-08-03 00:00:00 0.02766
2022-08-04 00:00:00 0.04321
2022-08-05 00:00:00 0.0611
2022-08-06 00:00:00 0.07638
2022-08-07 00:00:00 0.09059
2022-08-08 00:00:00 0.10535
2022-08-09 00:00:00 0.30238




0.13264

In [21]:
test_df.head()

Unnamed: 0,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,prev_ConfirmedCases,prev_Fatalities,pred_ConfirmedCases,pred_Fatalities
22,Abruzzo,Italy,2022-08-03,13.146252,8.174985,13.143046,8.173575,13.142967,8.174456
23,Abruzzo,Italy,2022-08-04,13.149525,8.175548,13.146252,8.174985,13.142889,8.175336
24,Abruzzo,Italy,2022-08-05,13.15258,8.175829,13.149525,8.175548,13.142811,8.176215
25,Abruzzo,Italy,2022-08-06,13.155194,8.176954,13.15258,8.175829,13.142733,8.177093
26,Abruzzo,Italy,2022-08-07,13.157194,8.177235,13.155194,8.176954,13.142656,8.17797
