<a href="https://colab.research.google.com/github/ldejuan/covid/blob/master/covid19_ml_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#
# Model on delta log of the amount of either Fatalities or Confirmed Cases
#


In [0]:
import numpy as np
import pandas as pd
import zipfile
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import MultiTaskLasso

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from scipy import interpolate



In [0]:
# Read the train, test and sample-submission tables straight from the zip archive.
PATH_DATA= "./drive/My Drive/covid/"
fileNameZip = 'covid19-global-forecasting-week-2.zip'
archiveName = "%s%s"%(PATH_DATA,fileNameZip)
trainName= 'train.csv'
testName = 'test.csv'
submissionNane = 'submission.csv'  # (sic) misspelled name kept so any cell that refers to it still works
# Map slash-style headers onto the underscore names the rest of the notebook uses.
columnRenames = {"Country/Region":"Country_Region",
                 "Province/State":"Province_State"}
if not zipfile.is_zipfile(archiveName):
  # Stop here with a clear message: silently skipping the load only defers the
  # failure to a NameError on dfTrain in a later cell.
  raise FileNotFoundError("%s is missing or is not a zip archive"%archiveName)
with zipfile.ZipFile(archiveName) as archive:
  # Each member handle is closed as soon as pandas has consumed it.
  with archive.open(trainName,'r') as trainFile:
    dfTrain = pd.read_csv(trainFile).rename(columns=columnRenames)
  with archive.open(testName,'r') as testFile:
    dfTest = pd.read_csv(testFile).rename(columns=columnRenames)
  with archive.open(submissionNane,'r') as submissionFile:
    dfSubmission = pd.read_csv(submissionFile)

In [0]:
#
# Replace Province_State by None
#
# Only Province_State is filled with the 'none' placeholder. A frame-wide
# NaN -> 'none' replacement would also write that string into any numeric
# column that happens to have a gap, turning it into an object column.
dfTrain['Province_State'] = dfTrain['Province_State'].fillna('none')
dfTest['Province_State'] = dfTest['Province_State'].fillna('none')
# Parse the dates so that date arithmetic and merges on Date work on timestamps.
dfTrain['Date'] = pd.to_datetime(dfTrain['Date'])
dfTest['Date'] = pd.to_datetime(dfTest['Date'])


In [0]:
# NOTE(review): as laid out in this notebook, dfTrainAgg is first assigned in a later cell,
# and no column called day_from_Fatality is created in any visible cell (the closest name,
# days_from_Fatalities, is added to dfTrainFeatures). Run top to bottom on a fresh kernel,
# this cell is therefore expected to fail -- confirm whether it can be dropped.
dfTrainAgg.day_from_Fatality

In [0]:
# Mount Google Drive under /content/drive.
# NOTE(review): the data-loading cell reads from "./drive/My Drive/covid/" and sits above
# this cell, so on a fresh kernel the mount presumably has to happen first -- confirm the
# intended cell order (and that the working directory is /content).
from google.colab import drive
drive.mount('/content/drive')

In [0]:
#
# Create features per country
#
startFatalityThreshold = 5  # not read by aggByCountry
def aggByCountry(x):
  """Broadcast the column totals of ``x`` onto each of its rows.

  Adds Country_Fatalities and Country_ConfirmedCases, each holding the sum of
  the Fatalities / ConfirmedCases column over all rows of ``x``.
  ``x`` is modified in place and returned.
  """
  totals = (('Fatalities', 'Country_Fatalities'),
            ('ConfirmedCases', 'Country_ConfirmedCases'))
  for source, target in totals:
    x[target] = x[source].to_numpy().sum()
  return x
# One group per (country, date): every row of a group receives the totals of that group.
# group_keys=False keeps the original row index. From pandas 2.0 the default would
# otherwise prepend Country_Region/Date to the index, next to the columns of the same
# name, which makes the later groupby calls on those names ambiguous.
dfTrainAgg = dfTrain.groupby(["Country_Region","Date"], group_keys=False).apply(aggByCountry)

In [0]:
#
# Make logarithm features and log difference per country & province
#
def log_corrected(x):
  """Natural logarithm of ``x`` with log(0) mapped to 0. instead of -inf.

  Note that a count of 0 and a count of 1 therefore both map to 0.

  Args:
    x: scalar, numpy array or pandas Series of non-negative numbers.

  Returns:
    The logarithm in the container type numpy produces for ``x`` (a Series keeps
    its index); a plain number for scalar input.
  """
  # Zeros are expected input, so numpy's divide-by-zero warning for log(0) is noise here.
  with np.errstate(divide='ignore'):
    y = np.log(x)
  if np.ndim(y) == 0:
    # A scalar cannot be patched through a boolean mask.
    return 0. if y == -np.inf else y
  y[y == -np.inf] = 0.
  return y

def makeLogFeatures(df):
  """Add log_* and diff_log_* columns for the four count columns of ``df``.

  For Fatalities, ConfirmedCases, Country_Fatalities and Country_ConfirmedCases
  this writes log_<name> (via log_corrected) and diff_log_<name>, the row-to-row
  difference of that log with the first row set to 0.
  Prints a progress line, modifies ``df`` in place and returns it.
  """
  print ('processing ..%s: %s'%(df.Country_Region.iloc[0],df.Province_State.iloc[0]))
  counts = ('Fatalities', 'ConfirmedCases',
            'Country_Fatalities', 'Country_ConfirmedCases')
  # All log columns first, then all differences, so the column order stays the same.
  for name in counts:
    df['log_' + name] = log_corrected(df[name])
  for name in counts:
    df['diff_log_' + name] = df['log_' + name].diff().fillna(value = 0.)
  return df
# Build the log features separately for every (country, province) time series.
# group_keys=False keeps the original row index. From pandas 2.0 the default would
# otherwise prepend the group keys to the index, next to the columns of the same name.
dfTrainFeatures = dfTrainAgg.groupby(["Country_Region","Province_State"], group_keys=False).apply(makeLogFeatures)
#
# Make days_from_Fatalities
#
startFatalityThreshold = 3
def calculateThreshold(df):
  """Add days_from_Fatalities, the day offset from the first date over the threshold.

  The reference date is the earliest Date whose Country_Fatalities is strictly
  greater than startFatalityThreshold; earlier rows get negative offsets.
  When no row exceeds the threshold every row gets the sentinel -1000.
  ``df`` is modified in place and returned.
  """
  overThreshold = df.Country_Fatalities > startFatalityThreshold
  firstDate = df.loc[overThreshold, 'Date'].min()
  if pd.isnull(firstDate):
    daysFromStart = -1000.
  else:
    # Dividing by one day turns the timedeltas into float day counts.
    daysFromStart = (df.Date - firstDate) / np.timedelta64(1,'D')
  df['days_from_Fatalities'] = daysFromStart
  return df

# Compute days_from_Fatalities separately for every (country, province) time series.
# group_keys=False keeps the original row index. From pandas 2.0 the default would
# otherwise prepend the group keys to the index, next to the columns of the same name.
dfTrainFeatures = dfTrainFeatures.groupby(["Country_Region","Province_State"], group_keys=False).apply(calculateThreshold)


In [0]:
#
# Create Forecast features and outputs 
#
def createShiftedFeature(df):
  """Add lagged inputs and forward-shifted targets for one time series.

  Reads diff_log_Fatalities, diff_log_ConfirmedCases, diff_log_Country_Fatalities
  and diff_log_Country_ConfirmedCases. Shifts work on row position, so the rows
  are assumed to be in date order.

  Columns written (<T> is Fatalities or Confirmed):
    diff_log_Cntry_<T>_{1,3,5}  country-level series shifted back by 1/3/5 rows
    diff_log_<T>_{1,3,5}        province-level series shifted back by 1/3/5 rows
    diff_log_<T>_p{1,3,5,21}    province-level series shifted forward (targets)

  The province-level lags are taken from the province-level series; they used
  to be plain copies of the country-level lags.
  Rows with no value that far away get NaN. ``df`` is modified in place and returned.
  """
  lags = (1, 3, 5)
  horizons = (1, 3, 5, 21)
  # (column suffix, province-level source, country-level source)
  series = (
      ('Fatalities', 'diff_log_Fatalities', 'diff_log_Country_Fatalities'),
      ('Confirmed', 'diff_log_ConfirmedCases', 'diff_log_Country_ConfirmedCases'),
  )
  for suffix, localCol, countryCol in series:
    # Inputs: past values of the country-level and of the province-level series.
    for lag in lags:
      df['diff_log_Cntry_%s_%d'%(suffix, lag)] = df[countryCol].shift(lag)
    for lag in lags:
      df['diff_log_%s_%d'%(suffix, lag)] = df[localCol].shift(lag)
    # Outputs: future values of the province-level series.
    for horizon in horizons:
      df['diff_log_%s_p%d'%(suffix, horizon)] = df[localCol].shift(-horizon)
  return df

# Lagged inputs and shifted targets are built per (country, province) time series.
# group_keys=False keeps the original row index. From pandas 2.0 the default would
# otherwise prepend the group keys to the index, next to the columns of the same name.
df2 = dfTrainFeatures.groupby(["Country_Region","Province_State"], group_keys=False).apply(createShiftedFeature)
# Single categorical key identifying one (country, province) pair.
df2['Country_Province'] = df2['Country_Region'] + df2['Province_State']
# Last row of every series: the feature vector the forecasts are made from.
dfLast = df2.groupby(['Country_Region','Province_State']).tail(1).copy()

In [0]:
#
# Test a multi-regression
#


# Numeric model inputs: current growth rates, their lags, and the day offset.
numeric_features = (
    ['diff_log_Fatalities',
     'diff_log_ConfirmedCases',
     'diff_log_Country_Fatalities',
     'diff_log_Country_ConfirmedCases']
    + ['diff_log_Cntry_Fatalities_%d' % lag for lag in (1, 3, 5)]
    + ['diff_log_Cntry_Confirmed_1']
    + ['diff_log_Fatalities_%d' % lag for lag in (1, 3, 5)]
    + ['days_from_Fatalities']
)

# Categorical model inputs.
categorical_features = ['Country_Region', 'Country_Province']

# Numeric columns first, then the categorical ones.
features = [*numeric_features, *categorical_features]

def modelPrediction(model_forecast, random_state=None):
  """Fit a preprocessing + random-forest pipeline for one forecast column of ``df2``.

  Numeric features are median-imputed and standardised, categorical features are
  imputed with the constant 'missing' and one-hot encoded. Rows of ``df2`` with a
  missing feature or target are dropped, 10% of the rest is held out, and the
  R^2 score on that hold-out is printed.

  Args:
    model_forecast: name of the target column in ``df2``.
    random_state: optional seed passed to both the train/test split and the
      forest so a run can be reproduced. The default (None) leaves both unseeded.

  Returns:
    The fitted sklearn Pipeline (steps 'preprocessor' and 'classifier').
  """
  numeric_transformer = Pipeline(steps=[
      ('imputer', SimpleImputer(strategy='median')),
      ('scaler', StandardScaler())])

  # OneHotEncoder produces sparse output by default. The explicit `sparse` keyword is
  # left out because scikit-learn deprecated it in 1.2 and removed it afterwards.
  categorical_transformer = Pipeline(steps=[
      ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
      ('onehot', OneHotEncoder(handle_unknown='ignore'))])

  preprocessor = ColumnTransformer(
      transformers=[
          ('num', numeric_transformer, numeric_features),
          ('cat', categorical_transformer, categorical_features)])

  # Append the regressor to the preprocessing pipeline. The step keeps its historical
  # name 'classifier' so existing lookups by step name continue to work.
  clf = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestRegressor(n_estimators=100,
                                              max_depth = 30,
                                              random_state = random_state))])

  # Drop rows with a missing feature or a missing target (the shifted columns
  # are NaN at the edges of every series).
  df3 = df2.dropna(subset=features + [model_forecast])

  X_train, X_test, y_train, y_test = train_test_split(
      df3[features], df3[model_forecast], test_size=0.1, random_state=random_state)

  clf.fit(X_train, y_train)
  print("model score: %.3f" % clf.score(X_test, y_test))
  return clf
  


In [0]:
# One target column per series (fatalities, confirmed cases) and forecast horizon.
model_forecasts = ['diff_log_%s_p%d'%(target, horizon)
                   for target in ('Fatalities', 'Confirmed')
                   for horizon in (1, 3, 5, 21)]

# Start from the identifying columns of the last known row of every series,
# then add one prediction column per target, each from its own fitted model.
dfPredict = dfLast.loc[:, ['Country_Region','Province_State','Date']].copy()
for model_forecast in model_forecasts:
  print ('processing ... %s'%model_forecast)
  fitted = modelPrediction(model_forecast)
  dfPredict[model_forecast] = fitted.predict(dfLast[features])

In [0]:
# Eyeball the predictions for a single country.
dfPredict.loc[dfPredict['Country_Region'].eq('Spain')]

In [0]:
#
# Aggregate with date and dfTest first
#
# Preview the per-horizon predictions. This cell used to call df4.head(), but no live
# statement assigns df4, so it raised NameError on a fresh kernel.
dfPredict.head()

In [0]:
# Save the per-horizon predictions under PATH_DATA; index = False leaves the row index out of the file.
dfPredict.to_csv('%spredict_ml_2.csv'%PATH_DATA, index = False)

In [0]:
#
# Transforming the results into rows
#
def aggByColumns(df):
  """Turn the wide per-horizon predictions of the first row of ``df`` into rows.

  Returns a new four-row frame (horizons 1, 3, 5 and 21 days) with the columns
  diff_log_Fatalities, diff_log_ConfirmedCases and Date, where Date is the Date
  of the first row of ``df`` plus the horizon.
  """
  horizons = (1, 3, 5, 21)
  baseDate = df.Date.iloc[0]
  fatalities = [df['diff_log_Fatalities_p%d'%h].iloc[0] for h in horizons]
  confirmed = [df['diff_log_Confirmed_p%d'%h].iloc[0] for h in horizons]
  dates = [baseDate + np.timedelta64(h,'D') for h in horizons]
  return pd.DataFrame({'diff_log_Fatalities' : fatalities,
                       'diff_log_ConfirmedCases' : confirmed,
                       'Date' : dates})

# Long format: four rows (one per horizon) for every (country, province) pair.
# reset_index turns the group keys back into ordinary columns.
dfResult = (
    dfPredict
    .groupby(['Country_Region','Province_State'])
    .apply(aggByColumns)
    .reset_index()
)

In [0]:
#
# get the output for submission and merge the results
#
# change rows in test first
# Keys identifying one row of a (country, province) series on a given day.
mergeKeys = ['Country_Region','Province_State','Date']
# Every test row, enriched with the training features where the dates overlap.
dfResultAll = dfTest.merge(dfTrainFeatures, on=mergeKeys, how='left')
dfResultM = dfResult.set_index(mergeKeys)
dfResultAllM = dfResultAll.set_index(mergeKeys)
# Overwrite the growth rates with the predicted ones wherever a prediction exists
# for that key (update works in place and aligns on the index).
dfResultAllM.update(dfResultM[['diff_log_ConfirmedCases','diff_log_Fatalities']])
dfResultAllM = dfResultAllM.reset_index()

In [0]:
#
# Fill diff_log_ConfirmedCases and diff_log_Fatalities backwards, then forwards
#
# Fill every (country, province) series on its own: backwards first, then forwards.
# Filling the whole frame in one pass lets the trailing gap of one region be
# back-filled with the first value of whichever region follows it.
# A series with no value at all stays NaN.
fillColumns = ['diff_log_Fatalities','diff_log_ConfirmedCases']
dfResultAllM[fillColumns] = dfResultAllM\
.groupby(['Country_Region','Province_State'])[fillColumns]\
.transform(lambda series: series.bfill().ffill())

In [0]:
# Last 20 rows for one country, as a quick check of the filled growth rates.
dfResultAllM.loc[dfResultAllM['Country_Region'].eq('Spain')].tail(20)

In [0]:
#
# recalculate the ConfirmedCases and Fatalities from diff log
#
def completionForward(df):
  """Rebuild Fatalities and ConfirmedCases from the first row and the log differences.

  For each of the two columns, row 0 keeps its value and row i becomes
  value[0] * exp(sum of diff_log[1..i]); the diff_log value of row 0 is ignored.
  Prints a progress line, modifies ``df`` in place and returns it.
  """
  print ('processing ..%s'%df.Country_Region.iloc[0])
  for column in ('Fatalities', 'ConfirmedCases'):
    logGrowth = np.cumsum(df['diff_log_%s'%column].iloc[1:])
    # Leading 0. keeps the first row at its starting level (exp(0) == 1).
    factors = np.exp(np.hstack(([0.], logGrowth)))
    df[column] = df[column].iloc[0]*factors
  return df

# Rebuild the cumulative counts separately for every (country, province) series.
# group_keys=False keeps the original row index. From pandas 2.0 the default would
# otherwise prepend the group keys to the index, next to the columns of the same name.
submission = dfResultAllM.groupby(["Country_Region","Province_State"], group_keys=False).apply(completionForward)

In [71]:

# Write the forecast columns to submission.csv under PATH_DATA.
# NOTE(review): unlike the predict_ml_2.csv export, no index = False is passed here, so the
# row index is written as an extra leading column -- confirm that this is intended.
submission[['Date','ForecastId','ConfirmedCases','Fatalities']].to_csv('%ssubmission.csv'%PATH_DATA)

# Marker printed once the file has been written.
print ('Success Finished')
  

Success Finished
