In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from keras import Sequential
from keras.models import Model
from keras.layers import LSTM, Dense, Concatenate, Flatten, Input, GRU
from keras.layers.merge import concatenate
from keras.optimizers import Adam, RMSprop, Adadelta
from keras.losses import logcosh
from keras.layers.advanced_activations import LeakyReLU
from keras.constraints import non_neg
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from math import sqrt
from keras import regularizers
from keras import backend as K


Using TensorFlow backend.


In [2]:
# Population per region, keyed by the basename of that region's processed CSV
# file (the same key that `file.split("/")[-1]` yields in the cells below).
# getXY() receives the value as `scale` to express case counts per 100,000
# inhabitants, and ExportPredictions() uses it to convert predictions back to
# absolute counts.
# NOTE(review): the source and reference year of these figures are not
# recorded in this notebook -- confirm before reusing them.
# NOTE(review): the three groups separated by blank lines presumably match the
# "Mexico", "Colombia" and "Brazil" data folders, in that order -- verify.
population = {
    "Chiapas_2016-2017.csv": 5217908,
    "Colima_2016-2017.csv": 711235,
    "Guerrero_2016-2017.csv": 3533251,
    "Hidalgo_2016-2017.csv": 2858359,
    "NuevoLeon_2016-2017.csv": 5119504,
    "Oaxaca_2016-2017.csv": 3967889,
    "QuintanaRoo_2016-2017.csv": 1501562,
    "Tabasco_2016-2017.csv" : 2395272,
    "Veracruz_2016-2017.csv" : 8112505,
    "Yucatan_2016-2017.csv" : 2097175,
    
    "casanare_2016-2017.csv" : 356438,
    "cordoba_2016-2017.csv" : 1709603,
    "cundinamarca_2016-2017.csv" : 2680041,
    "huila_2016-2017.csv" : 1154804,
    "meta_2016-2017.csv" : 961292,
    "santander_2016-2017.csv" : 2061095,
    "santander_norte_2016-2017.csv" : 1355723,
    "tolima_2016-2017.csv" : 1408274,
    "valle_cauca_2016-2017.csv" : 4613377,
    
    "Alagoas_2016-2017.csv": 3375823,
    "Bahia_2016-2017.csv": 15344447,
    "Ceara_2016-2017.csv": 9020460,
    "Goias_2016-2017.csv": 6778772,
    "Maranhao_2016-2017.csv": 7000229,
    "MatoGrosso_2016-2017.csv": 3344544,
    "MinasGerais_2016-2017.csv": 21119536,
    "Para_2016-2017.csv": 8366628,
    "RioDeJaneiro_2016-2017.csv": 16718956,
    "SaoPaulo_2016-2017.csv": 45094866,
}

In [3]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a table of lagged inputs and future outputs.

    Each row of the result holds, side by side, the observations from
    ``n_in`` steps back up to ``n_out - 1`` steps ahead of that row's
    position in ``data``.

    Parameters
    ----------
    data : list, numpy array, pandas Series/DataFrame, or anything else
        accepted by ``pd.DataFrame``. One row per time step; one column per
        variable (1-D input is treated as a single variable).
    n_in : int
        Number of lagged steps to include (columns ``t-n_in`` ... ``t-1``).
    n_out : int
        Number of forecast steps to include (columns ``t`` ... ``t+n_out-1``).
    dropnan : bool
        When true, drop every row that contains a NaN. The shifting itself
        introduces NaNs in the first ``n_in`` and last ``n_out - 1`` rows.

    Returns
    -------
    pandas.DataFrame
        Columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and
        ``var<j>(t+<i>)`` with ``j`` counting variables from 1.
    """
    df = pd.DataFrame(data)
    # Count variables from the frame that is actually shifted. The previous
    # check (`type(data) is list` -> 1, otherwise `data.shape[1]`) mislabelled
    # a list of multi-variable rows and raised IndexError on 1-D arrays.
    n_vars = df.shape[1]

    cols, names = [], []
    # Input sequence (t-n_in, ..., t-1): positive shifts move past values down.
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names += ['var%d(t-%d)' % (j + 1, lag) for j in range(n_vars)]
    # Forecast sequence (t, t+1, ..., t+n_out-1): negative shifts pull future
    # values up.
    for step in range(n_out):
        cols.append(df.shift(-step))
        if step == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, step) for j in range(n_vars)]

    # Put it all together.
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [4]:
def saveModel(model, modelName):
    """Persist `model` as two sibling files derived from `modelName`.

    ``<modelName>.json`` receives the string returned by ``model.to_json()``
    and ``<modelName>.h5`` is handed to ``model.save_weights()``.
    """
    architecturePath, weightsPath = (
        "{}.{}".format(modelName, extension) for extension in ("json", "h5")
    )

    # Serialize the architecture before touching the filesystem.
    architecture = model.to_json()
    with open(architecturePath, "w") as handle:
        handle.write(architecture)

    # Serialize weights to HDF5.
    model.save_weights(weightsPath)

In [5]:
def createModel(modelName):
    """Build and compile the two-branch forecasting network.

    The network takes inputs of shape ``(4, 2)`` -- the same
    ``(n_weeks, n_features)`` layout that getXY() reshapes its inputs to --
    and produces a single value per sample.

    Architecture:
      * branch 1: ``LSTM(64)`` returning only its final output;
      * branch 2: ``Dense(64, relu, L2)`` applied to the raw input, then
        flattened;
      * both branches are concatenated and passed through two
        ``Dense(8, relu, L2)`` layers and a linear ``Dense(1)`` whose kernel
        is constrained to be non-negative.

    The model is compiled with mean absolute error as both loss and metric,
    using the Adam optimizer.

    Parameters
    ----------
    modelName : str
        Not used when building the network; kept so existing callers that
        pass a name keep working. (The file names previously derived from it
        here were never read.)

    Returns
    -------
    keras.models.Model
        The compiled, untrained model.
    """
    input_layer = Input(shape=(4, 2))

    # Recurrent branch: only the last step's output is kept.
    b1_out = LSTM(64, return_sequences=False)(input_layer)

    # Dense branch over the same input, flattened to a vector so it can be
    # concatenated with the LSTM output.
    b2_out = Dense(64, activation="relu", kernel_regularizer="l2")(input_layer)
    b2_out = Flatten()(b2_out)

    concatenated = concatenate([b1_out, b2_out])
    out = Dense(8, activation="relu", kernel_regularizer="l2")(concatenated)
    out = Dense(8, activation="relu", kernel_regularizer="l2")(out)
    out = Dense(1, activation="linear", kernel_constraint=non_neg(), name='output_layer')(out)

    model = Model([input_layer], out)
    model.compile(loss=["mae"], optimizer="adam", metrics=["mae"])

    return model

In [6]:
def getXY(dataset, scale):
    """Convert one region's table into model inputs and targets.

    Side effect (relied upon by ExportPredictions): ``dataset`` is modified
    in place -- ``Searches`` is divided by 100 and ``Cases`` is converted to
    cases per 100,000 using ``scale``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``Searches`` and ``Cases`` columns.
        NOTE(review): the slicing below assumes these are the only two
        columns and that ``Cases`` is the last one, so that the final
        reframed column is ``Cases`` at ``t+2`` -- confirm against the CSVs.
    scale : number
        Population used for the per-100,000 conversion.

    Returns
    -------
    x : numpy.ndarray, shape ``(rows, 4, 2)``
        The four previous time steps (``t-4`` ... ``t-1``) of both columns.
    y : numpy.ndarray, shape ``(rows,)``
        The last reframed column, i.e. the second variable at ``t+2``.
    """
    n_weeks = 4      # look-back window fed to the model
    n_features = 2   # variables per time step
    n_ahead = 3      # forecast columns generated: t, t+1, t+2

    # Vectorised rescaling; assigning back keeps the in-place side effect.
    dataset["Searches"] = dataset["Searches"] / 100
    dataset["Cases"] = dataset["Cases"] * 100000 / scale

    values = dataset.values.astype("float32")

    reframed = series_to_supervised(values, n_weeks, n_ahead)
    values = reframed.values
    print(reframed.columns)

    print("Reframed Shape: ", reframed.shape)
    n_obs = n_weeks * n_features

    # Inputs: the n_obs lagged columns. Target: the last column (t+2), which
    # lies three steps after the most recent input (t-1).
    x, y = values[:, :n_obs], values[:, -1]

    x = x.reshape((x.shape[0], n_weeks, n_features))  # reshape as 3-D
    return x, y

In [7]:
def formatFilename(filename):
    """Return `filename` with every occurrence of ".csv" removed."""
    pieces = filename.split(".csv")
    return "".join(pieces)

In [8]:
def getSortedFiles(files):
    """Order CSV paths by the ascending sum of their "Cases" column.

    Every path in `files` is read with ``pd.read_csv``; paths whose totals
    are equal keep their relative input order.
    """
    totals = [(pd.read_csv(path)["Cases"].sum(), path) for path in files]
    # list.sort is stable, so ties stay in input order.
    totals.sort(key=lambda pair: pair[0])
    return [path for _, path in totals]

In [9]:
def ExportPredictions(folder, files, model):
    """Write per-file prediction tables and an RMSE summary into `folder`.

    For every path in `files` the CSV is loaded, converted with getXY(),
    run through ``model.predict`` and saved to ``<folder>/<basename>.csv``
    with these columns added: ``LSTM-Prediction``, ``LSTMError``,
    ``Naive-Prediction`` and ``NaiveError`` (``Cases`` is renamed to
    ``Observed``). The naive baseline is the last value inside the input
    window, i.e. the observation three rows before the target.

    After all files are processed, ``<folder>/RMSE.csv`` is written with one
    row per file (``File``, ``NaiveRMSE``, ``LSTMRMSE``,
    ``LSTM-Naive-Ratio``) plus three columns that repeat the same scalar on
    every row: ``Average-Ratio`` (sum of LSTM RMSEs divided by sum of naive
    RMSEs), ``TotalLSTM-RMSE`` and ``TotalNaive-RMSE``.

    Parameters
    ----------
    folder : str
        Existing directory that receives the CSV files.
    files : iterable of str
        "/"-separated paths whose basename is a key of ``population``.
    model : object
        Anything with a ``predict(x)`` method returning one value per sample.
    """
    # These offsets mirror getXY(): a 4-step look-back and a target taken
    # from the last of 3 forecast columns (t+2).
    n_weeks = 4
    n_ahead = 3
    first_target = n_weeks + n_ahead - 1  # row position of the first target (6)

    # Collect summary rows and build the frame once; DataFrame.append in a
    # loop is quadratic and no longer exists in pandas >= 2.0.
    records = []

    for file in files:
        state = file.split("/")[-1]
        dataset = pd.read_csv(file, index_col=0)
        # getXY() rescales `dataset` in place (Cases -> per 100,000); the
        # targets it returns are not needed because errors are computed
        # from the rescaled frame below.
        x, _ = getXY(dataset, population[state])

        predictions = model.predict(x)

        # Rescale predictions from per-100,000 back to absolute counts and
        # flatten to one value per row.
        inv_yPred = np.asarray(predictions).reshape(-1) * population[state] / 100000

        # Undo getXY()'s in-place per-100,000 conversion of the observations.
        dataset["Cases"] *= (population[state] / 100000)
        # Last observed value of each input window: rows 3 .. len-4.
        naive = dataset["Cases"].values[n_weeks - 1:-n_ahead]
        # Keep only rows that have a target; copy so the column assignments
        # below act on an independent frame rather than a slice.
        dataset = dataset.iloc[first_target:].copy()

        dataset["LSTM-Prediction"] = inv_yPred
        dataset["LSTMError"] = dataset["LSTM-Prediction"] - dataset["Cases"]

        dataset["Naive-Prediction"] = naive
        dataset["NaiveError"] = dataset["Naive-Prediction"] - dataset["Cases"]

        naiveRMSE = (dataset["NaiveError"] ** 2).mean() ** 0.5
        LSTMRMSE = (dataset["LSTMError"] ** 2).mean() ** 0.5

        records.append({
            "File": state,
            "NaiveRMSE": naiveRMSE,
            "LSTMRMSE": LSTMRMSE,
        })

        dataset = dataset.rename(index=str, columns={"Cases": "Observed"})

        # NOTE(review): `state` still ends in ".csv", so the output is named
        # "<name>.csv.csv"; formatFilename() looks intended for this. Left
        # unchanged in case downstream readers depend on the current names.
        dataset.to_csv("{}/{}.csv".format(folder, state))

    comparisonCsv = pd.DataFrame(records, columns=["File", "NaiveRMSE", "LSTMRMSE"])
    comparisonCsv["LSTM-Naive-Ratio"] = comparisonCsv["LSTMRMSE"] / comparisonCsv["NaiveRMSE"]
    comparisonCsv["Average-Ratio"] = comparisonCsv["LSTMRMSE"].sum() / comparisonCsv["NaiveRMSE"].sum()
    comparisonCsv["TotalLSTM-RMSE"] = comparisonCsv["LSTMRMSE"].sum()
    comparisonCsv["TotalNaive-RMSE"] = comparisonCsv["NaiveRMSE"].sum()
    comparisonCsv.to_csv("{}/RMSE.csv".format(folder))

In [10]:
# Interactive prompt for the country that is held out for testing. The next
# cell compares this value against "Mexico", "Colombia" and "Brazil" and uses
# it in the data folder path, so it must match one of those spellings exactly.
testCountry = input("Country: ")

Country: Brazil


In [11]:
# Leave-one-country-out run: train on the two countries that were not chosen
# as `testCountry`, then export predictions for the training files and for the
# held-out country's files, and finally save the model.
countries = ["Mexico", "Colombia", "Brazil"]
if testCountry not in countries:
    # Without this check an unrecognised name would leave all three countries
    # in trainCountries and only fail later while listing a missing folder.
    raise ValueError(
        "Unknown country {!r}; expected one of {}".format(testCountry, countries)
    )
trainCountries = [x for x in countries if x != testCountry]

# All outputs of this run live under "<train country 1>-<train country 2>".
outputDir = "{}-{}".format(trainCountries[0], trainCountries[1])
trainDir = "{}/train".format(outputDir)
testDir = "{}/test".format(outputDir)

model = createModel(outputDir)

trainFiles = []
for country in trainCountries:
    folder = "../../data/{}/processed_data".format(country)
    for file in os.listdir(folder):
        trainFiles.append("{}/{}".format(folder, file))

testFiles = []
folder = "../../data/{}/processed_data".format(testCountry)
for file in os.listdir(folder):
    testFiles.append("{}/{}".format(folder, file))

# Train on one file at a time, in ascending order of total cases.
trainFiles = getSortedFiles(trainFiles)
with tf.Session() as sess:
    for file in trainFiles:
        dataset = pd.read_csv(file, index_col=0)
        state = file.split("/")[-1]
        x, y = getXY(dataset, population[state])
        # One batch per epoch (batch_size equals the number of samples).
        model.fit(x, y,
                  epochs=30,
                  batch_size=x.shape[0],
                  verbose=0,
                  shuffle=False)

    # makedirs creates outputDir together with the sub-folder and is a no-op
    # when the directory already exists.
    os.makedirs(trainDir, exist_ok=True)
    os.makedirs(testDir, exist_ok=True)

    # ExportPredictions already iterates over every file it is given, so it is
    # called once per split. Calling it inside a per-file loop repeated the
    # full export len(files) times and rewrote identical outputs. The guards
    # keep the earlier behaviour of exporting nothing for an empty list.
    if trainFiles:
        ExportPredictions(trainDir, trainFiles, model)
    if testFiles:
        ExportPredictions(testDir, testFiles, model)

    saveModel(model, "{}/Model".format(outputDir))

Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)'],
      dtype='object')
Reframed Shape:  (98, 14)
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)'],
      dtype='object')
Reframed Shape:  (98, 14)
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)'],
      dtype='object')
Reframed Shape:  (98, 14)
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)'],
      dtype='object')
Reframed Shape:  (98, 14)
Index(['

Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)'],
      dtype='object')
Reframed Shape:  (98, 14)
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)'],
      dtype='object')
Reframed Shape:  (98, 14)
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)'],
      dtype='object')
Reframed Shape:  (98, 14)
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)'],
      dtype='object')
Reframed Shape:  (98, 14)
Index(['

Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)'],
      dtype='object')
Reframed Shape:  (98, 14)
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)'],
      dtype='object')
Reframed Shape:  (98, 14)
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)'],
      dtype='object')
Reframed Shape:  (98, 14)
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)'],
      dtype='object')
Reframed Shape:  (98, 14)
Index(['

Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)'],
      dtype='object')
Reframed Shape:  (98, 14)
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)'],
      dtype='object')
Reframed Shape:  (98, 14)
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)'],
      dtype='object')
Reframed Shape:  (98, 14)
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)'],
      dtype='object')
Reframed Shape:  (98, 14)
Index(['