<a href="https://colab.research.google.com/github/ldejuan/covid/blob/master/covid19_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import zipfile
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from sklearn.ensemble import RandomForestRegressor

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from scipy import interpolate


In [0]:
# Read the train, test and sample-submission CSVs straight from the zip archive.
PATH_DATA= "./drive/My Drive/covid/"
fileNameZip = 'covid19-global-forecasting-week-2.zip'
archiveName = "%s%s"%(PATH_DATA,fileNameZip)
trainName= 'train.csv'
testName = 'test.csv'
submissionNane = 'submission.csv'  # (sic) spelling kept in case another cell uses the name

# Slash-style headers are normalised to the underscore names used in the
# rest of the notebook; frames that already use underscores are unaffected.
columnRenames = {"Country/Region":"Country_Region",
                 "Province/State":"Province_State"}

if not zipfile.is_zipfile(archiveName):
  # Fail here with a clear message instead of silently skipping the load and
  # hitting a NameError on dfTrain in a later cell.
  raise FileNotFoundError("%s is missing or is not a valid zip archive" % archiveName)

with zipfile.ZipFile(archiveName) as archive:
  # Each member is opened in its own `with` so the handle is closed once parsed.
  with archive.open(trainName,'r') as member:
    dfTrain = pd.read_csv(member).rename(columns=columnRenames)
  with archive.open(testName,'r') as member:
    dfTest = pd.read_csv(member).rename(columns=columnRenames)
  with archive.open(submissionNane,'r') as member:
    dfSubmission = pd.read_csv(member)

In [0]:
#
# Replace missing Province_State by 'none' and parse the Date columns
#
# Only Province_State is filled: a frame-wide replace of NaN would also write
# the string 'none' into any numeric column that happens to contain NaN.
dfTrain['Province_State'] = dfTrain['Province_State'].fillna('none')
dfTest['Province_State'] = dfTest['Province_State'].fillna('none')
dfTrain['Date'] = pd.to_datetime(dfTrain['Date'])
dfTest['Date'] = pd.to_datetime(dfTest['Date'])


In [0]:
#
# Create features per country
#
def aggByCountry(x):
  """Attach the group-wide Fatalities and ConfirmedCases totals to every row.

  Intended as the function passed to ``groupby(...).apply``; ``x`` is the
  frame of one group.

  Parameters
  ----------
  x : pandas.DataFrame
      Rows of one group. Must contain ``Fatalities`` and ``ConfirmedCases``.

  Returns
  -------
  pandas.DataFrame
      A new frame with the same rows and index as ``x`` plus the columns
      ``Country_Fatalities`` and ``Country_ConfirmedCases``, each holding the
      group's sum repeated on every row.
  """
  # Return a new frame instead of writing into `x`: pandas does not support
  # apply functions that mutate the group object they receive.
  return x.assign(Country_Fatalities=x.Fatalities.sum(),
                  Country_ConfirmedCases=x.ConfirmedCases.sum())
# group_keys=False keeps dfTrain's original row index. With the pandas >= 2.0
# default (group_keys=True) the group keys are prepended as index levels, so
# 'Country_Region' would be both an index level and a column and the next
# groupby on it would be ambiguous.
dfTrainAgg = dfTrain.groupby(["Country_Region","Date"], group_keys=False).apply(aggByCountry)

In [0]:
#
# Create Forecast features
#
def createShiftedFeature(df, lags=(1, 2, 4, 6), horizons=(1, 4, 8, 16, 32)):
  """Add lagged country totals (features) and future values (targets).

  Intended as the function passed to ``groupby(...).apply``; ``df`` is the
  frame of one group. Shifts are by row position, not by calendar day.
  NOTE(review): this presumes one row per day, sorted by Date, within the
  group -- confirm against the input data.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``Fatalities``, ``ConfirmedCases``, ``Country_Fatalities``
      and ``Country_ConfirmedCases``.
  lags : iterable of int, optional
      Backward shifts applied to the country totals; produces
      ``Cntry_Fatalities_<lag>`` and ``Cntry_Confirmed_<lag>``.
  horizons : iterable of int, optional
      Forward shifts applied to the per-group series; produces
      ``Fatalities_p<h>`` and ``Confirmed_p<h>``.

  Returns
  -------
  pandas.DataFrame
      A copy of ``df`` with the shifted columns appended. Rows that have no
      value at the requested offset hold NaN.
  """
  # Work on a copy: pandas does not support apply functions that mutate the
  # group object they receive.
  out = df.copy()
  series_specs = (
      # (name used in new columns, per-group column, country-total column)
      ('Fatalities', 'Fatalities', 'Country_Fatalities'),
      ('Confirmed', 'ConfirmedCases', 'Country_ConfirmedCases'),
  )
  for label, value_column, country_column in series_specs:
    for lag in lags:
      out['Cntry_%s_%d' % (label, lag)] = df[country_column].shift(lag)
    for horizon in horizons:
      # negative shift pulls the value from `horizon` rows later
      out['%s_p%d' % (label, horizon)] = df[value_column].shift(-horizon)
  return out

# group_keys=False keeps the original row index. With the pandas >= 2.0
# default the group keys are prepended as index levels, which makes
# 'Country_Region' / 'Province_State' ambiguous (index level and column) in
# the groupby below.
df2 = dfTrainAgg.groupby(["Country_Region","Province_State"], group_keys=False).apply(createShiftedFeature)
# Single categorical key identifying a province within its country.
df2['Country_Province'] = df2['Country_Region'] + df2['Province_State']
# Last row of every (country, province) group; used as the prediction input.
dfLast = df2.groupby(['Country_Region','Province_State']).tail(1).copy()

In [0]:
#
# Train and evaluate a random forest for each forecast target
#

# Target columns: each series shifted forward by 1, 4, 8, 16 and 32 rows.
model_forecasts = ['%s_p%d' % (series, step)
                   for series in ('Fatalities', 'Confirmed')
                   for step in (1, 4, 8, 16, 32)]

# Numeric inputs: current values, country totals, then the lagged country totals.
numeric_features = ['ConfirmedCases', 'Fatalities',
                    'Country_Fatalities', 'Country_ConfirmedCases']
numeric_features += ['Cntry_%s_%d' % (series, lag)
                     for series in ('Fatalities', 'Confirmed')
                     for lag in (1, 2, 4, 6)]

# Categorical inputs (one-hot encoded by the model pipeline).
categorical_features = ['Country_Region', 'Country_Province']

# Full ordered input column list: numeric first, categorical last.
features = [*numeric_features, *categorical_features]

def modelPrediction(model_forcast, test_size=0.1, random_state=None):
  """Fit a random-forest regression pipeline for one forecast target.

  Reads the module-level ``df2``, ``features``, ``numeric_features`` and
  ``categorical_features``. Rows of ``df2`` with NaN in any feature or in the
  target are dropped before the split. The hold-out score (R^2, as returned
  by the regressor's ``score``) is printed.

  NOTE(review): the hold-out is a random split, not a chronological one, so
  the printed score may be optimistic for a forecasting task -- confirm that
  this is acceptable.

  Parameters
  ----------
  model_forcast : str
      Name of the target column in ``df2`` (for example ``'Fatalities_p4'``).
  test_size : float, optional
      Share of rows held out for scoring; forwarded to ``train_test_split``.
  random_state : int or None, optional
      Seed forwarded to both ``train_test_split`` and
      ``RandomForestRegressor``. ``None`` (the default) keeps the original
      unseeded behaviour; pass an int for reproducible scores and models.

  Returns
  -------
  sklearn.pipeline.Pipeline
      The fitted preprocessing + regressor pipeline.
  """
  # Numeric columns: median imputation, then standardisation.
  numeric_transformer = Pipeline(steps=[
      ('imputer', SimpleImputer(strategy='median')),
      ('scaler', StandardScaler())])

  # Categorical columns: constant imputation, then one-hot encoding; unseen
  # categories at predict time are ignored rather than raising.
  categorical_transformer = Pipeline(steps=[
      ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
      ('onehot', OneHotEncoder(handle_unknown='ignore'))])

  preprocessor = ColumnTransformer(
      transformers=[
          ('num', numeric_transformer, numeric_features),
          ('cat', categorical_transformer, categorical_features)])

  # Append the regressor to the preprocessing pipeline. The step keeps its
  # historical name 'classifier' so existing references to it still resolve.
  clf = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestRegressor(random_state=random_state))])

  # Rows near the start/end of each series have NaN lags or targets.
  df3 = df2.dropna(subset=features + [model_forcast])

  X_train, X_test, y_train, y_test = train_test_split(
      df3[features], df3[model_forcast],
      test_size=test_size, random_state=random_state)

  clf.fit(X_train, y_train)
  print("model score: %.3f" % clf.score(X_test, y_test))
  return clf
  


In [0]:
# Start from the identifying columns of the last known row of every series,
# then add one prediction column per forecast target.
dfPredict = dfLast.loc[:, ['Country_Region','Province_State','Date']].copy()
lastKnownFeatures = dfLast[features]
for model_forecast in model_forecasts:
  print ('processing ... %s'%model_forecast)
  fittedModel = modelPrediction(model_forecast)
  dfPredict[model_forecast] = fittedModel.predict(lastKnownFeatures)

processing ... Fatalities_p1
model score: 0.991
processing ... Fatalities_p4
model score: 0.981
processing ... Fatalities_p8
model score: 0.988
processing ... Fatalities_p16
model score: 0.935
processing ... Fatalities_p32
model score: 0.854
processing ... Confirmed_p1
model score: 0.995
processing ... Confirmed_p4
model score: 0.996
processing ... Confirmed_p8
model score: 0.983
processing ... Confirmed_p16
model score: 0.992
processing ... Confirmed_p32
model score: 0.964


In [0]:
# Persist the wide prediction table (one column per target) without the index.
predictPath = PATH_DATA + 'predict.csv'
dfPredict.to_csv(predictPath, index=False)

In [0]:
#
# Transforming the results into rows
#
def aggByColumns(df):
  """Turn the first row's horizon columns into one row per horizon.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``Date`` plus ``Fatalities_p<h>`` and ``Confirmed_p<h>``
      for h in 1, 4, 8, 16, 32. Only the first row is read.

  Returns
  -------
  pandas.DataFrame
      Five rows with columns ``Fatalities``, ``ConfirmedCases`` and ``Date``,
      where ``Date`` is the first row's date plus h days.
  """
  day_offsets = (1, 4, 8, 16, 32)
  start = df.Date.iloc[0]

  fatalities = [df['Fatalities_p%d' % h].iloc[0] for h in day_offsets]
  confirmed = [df['Confirmed_p%d' % h].iloc[0] for h in day_offsets]
  dates = [start + np.timedelta64(h, 'D') for h in day_offsets]

  return pd.DataFrame({'Fatalities': fatalities,
                       'ConfirmedCases': confirmed,
                       'Date': dates})

# One group per (country, province); each group is expanded to one row per
# forecast horizon, and the group keys come back as ordinary columns.
predictionsBySeries = dfPredict.groupby(['Country_Region','Province_State'])
dfResult = predictionsBySeries.apply(aggByColumns)
dfResult = dfResult.reset_index()

In [0]:
#
# get the output for submission and merge the results
#
mergeKeys = ['Country_Region','Province_State','Date']

# Test rows, with the known train values attached where the dates overlap.
dfResultAll = dfTest.merge(dfTrain, how='left', left_on=mergeKeys, right_on=mergeKeys)

# Index both frames by the same key so update() can align them row by row.
dfResultAllM = dfResultAll.set_index(keys=mergeKeys)
dfResultM = dfResult.set_index(keys=mergeKeys)

# Overwrite with the model predictions wherever a prediction exists.
dfResultAllM.update(dfResultM[['ConfirmedCases','Fatalities']])
dfResultAllM = dfResultAllM.reset_index()

In [0]:
#
# Interpolate (and extrapolate) over the dates to fill the remaining NaN values
#
def _interpolateColumn(days, values):
  """Linearly inter-/extrapolate `values` over `days` and round the result.

  `days` and `values` are Series sharing one index. Returns a numpy array the
  same length as `days`.
  """
  known = values.dropna()
  if known.empty:
    # Nothing to anchor on: leave the whole column missing.
    return np.full(len(days), np.nan)
  if len(known) == 1:
    # interp1d needs at least two points; fall back to the single known value.
    return np.full(len(days), np.round(known.iloc[0]))
  f = interpolate.interp1d(days.loc[known.index], known, fill_value='extrapolate')
  return np.round(f(days))


def interpolation(df):
  """Fill gaps in ConfirmedCases and Fatalities by linear interpolation.

  Intended as the function passed to ``groupby(...).apply``; ``df`` is the
  frame of one group.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a datetime ``Date`` column plus ``ConfirmedCases`` and
      ``Fatalities``, where NaN marks the values to fill.

  Returns
  -------
  pandas.DataFrame
      A copy of ``df`` with two extra columns, ``ConfirmedCases_`` and
      ``Fatalities_``: the rounded linear interpolation over the known points,
      extrapolated linearly outside their range. A column with a single known
      value is filled with that value; a column with none stays NaN.
  """
  # Days elapsed since the first row of the group, as floats.
  x = (df.Date - df.Date.iloc[0]) / np.timedelta64(1,'D')

  # Work on a copy: pandas does not support apply functions that mutate the
  # group object they receive.
  out = df.copy()
  out['ConfirmedCases_'] = _interpolateColumn(x, df.ConfirmedCases)
  out['Fatalities_'] = _interpolateColumn(x, df.Fatalities)
  return out

# group_keys=False keeps the flat row index of dfResultAllM. With the
# pandas >= 2.0 default the group keys are prepended as extra index levels,
# which would then be written out as extra columns by to_csv.
submission = dfResultAllM.groupby(["Country_Region","Province_State"], group_keys=False).apply(interpolation)

ERROR! Session/line number was not unique in database. History logging moved to new session 64


In [0]:
# Drop the raw (gappy) value columns and the key columns, then give the
# interpolated columns the plain ConfirmedCases / Fatalities names.
LastSubmissionFile = submission.drop(['Country_Region','Province_State','Id','ConfirmedCases','Fatalities'], axis = 1)\
  .rename(columns= {'ConfirmedCases_': 'ConfirmedCases', 
                    'Fatalities_': 'Fatalities'})
# The output name was misspelled 'submissiion.csv'; it now matches the
# 'submission.csv' name used for the sample file in the archive.
# NOTE(review): the row index is still written as an unnamed first column and
# Date is kept -- confirm that this is the layout the consumer expects.
LastSubmissionFile[['Date','ForecastId','ConfirmedCases','Fatalities']].to_csv('%ssubmission.csv'%PATH_DATA)

print ('Success Finished')
  

Success Finished


Index(['Date', 'ForecastId', 'ConfirmedCases', 'Fatalities'], dtype='object')