In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from keras import Sequential
from keras.models import Model
from keras.layers import LSTM, Dense, Concatenate, Flatten, Input
from keras.layers.merge import concatenate
from keras.optimizers import Adam
from keras.losses import logcosh
import tensorflow as tf
from keras.utils import plot_model
%matplotlib inline


Using TensorFlow backend.


In [2]:
#Population values
# Maps each processed-data CSV filename to the population figure that getXY()
# receives as `scale` (cases * 100000 / scale) and that the evaluation cell uses
# to scale predictions back (value * population / 100000).
# Keys must match the filenames in ../../data/<country>/processed_data exactly,
# because the lookup is population[file].
# NOTE(review): the source and reference year of these figures are not recorded
# in this notebook — TODO confirm.
population = {
    # Files listed when the notebook is run with country "Mexico".
    "Chiapas_2016-2017.csv": 5217908,
    "Colima_2016-2017.csv": 711235,
    "Guerrero_2016-2017.csv": 3533251,
    "Hidalgo_2016-2017.csv": 2858359,
    "NuevoLeon_2016-2017.csv": 5119504,
    "Oaxaca_2016-2017.csv": 3967889,
    "QuintanaRoo_2016-2017.csv": 1501562,
    "Tabasco_2016-2017.csv" : 2395272,
    "Veracruz_2016-2017.csv" : 8112505,
    "Yucatan_2016-2017.csv" : 2097175,
    
    # Presumably the "Colombia" data folder — verify against the directory listing.
    "casanare_2016-2017.csv" : 356438,
    "cordoba_2016-2017.csv" : 1709603,
    "cundinamarca_2016-2017.csv" : 2680041,
    "huila_2016-2017.csv" : 1154804,
    "meta_2016-2017.csv" : 961292,
    "santander_2016-2017.csv" : 2061095,
    "santander_norte_2016-2017.csv" : 1355723,
    "tolima_2016-2017.csv" : 1408274,
    "valle_cauca_2016-2017.csv" : 4613377,
    
    # Presumably the "Brazil" data folder — verify against the directory listing.
    "Alagoas_2016-2017.csv": 3375823,
    "Bahia_2016-2017.csv": 15344447,
    "Ceara_2016-2017.csv": 9020460,
    "Goias_2016-2017.csv": 6778772,
    "Maranhao_2016-2017.csv": 7000229,
    "MatoGrosso_2016-2017.csv": 3344544,
    "MinasGerais_2016-2017.csv": 21119536,
    "Para_2016-2017.csv": 8366628,
    "RioDeJaneiro_2016-2017.csv": 16718956,
    "SaoPaulo_2016-2017.csv": 45094866,
}

In [3]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Reframe a time series as a table of lagged and future columns.

    Args:
        data: anything ``pd.DataFrame`` accepts — a 2-D array, a list of rows,
            a flat list, a 1-D array or a Series. Each column is one variable.
        n_in: number of past steps to include (columns ``varK(t-n_in)`` ...
            ``varK(t-1)``).
        n_out: number of steps from ``t`` onwards to include (columns
            ``varK(t)``, ``varK(t+1)``, ...).
        dropnan: when true, rows made incomplete by the shifting are removed.

    Returns:
        A DataFrame whose columns are ordered oldest lag first, then ``t``,
        then the future steps; within each step the variables keep their
        original order.
    """
    df = pd.DataFrame(data)
    # Count the variables on the frame itself: this is right for lists of rows
    # and for 1-D inputs, where inspecting `data` directly either miscounts or
    # raises.
    n_vars = df.shape[1]

    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names += ['var%d(t-%d)' % (j + 1, lag) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for step in range(n_out):
        cols.append(df.shift(-step))
        if step == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, step) for j in range(n_vars)]

    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values introduced by the shifts
    if dropnan:
        agg = agg.dropna()
    return agg

In [4]:
def saveModel(model, modelName):
    """Write a model to disk as ``<modelName>.json`` plus ``<modelName>.h5``.

    The JSON file receives the string returned by ``model.to_json()``; the
    ``.h5`` path is handed to ``model.save_weights``.
    """
    architecturePath, weightsPath = (
        "{}.json".format(modelName),
        "{}.h5".format(modelName),
    )

    # Serialize the architecture before touching the filesystem.
    architecture = model.to_json()
    with open(architecturePath, "w") as handle:
        handle.write(architecture)

    # Serialize weights to HDF5
    model.save_weights(weightsPath)

In [5]:
def createModel(modelName):
    """Build and compile the two-branch forecasting network.

    Both branches read the same input of shape (4, 2), which is the shape
    getXY() reshapes its samples to:

      * recurrent branch: LSTM(512, returning sequences) followed by LSTM(256);
      * dense branch: Dense(256) applied to the input, then flattened.

    The branch outputs are concatenated, passed through a Dense(128) hidden
    layer and mapped to 4 linear outputs by the layer named ``output_layer``.

    Args:
        modelName: not used when building the network; kept so existing
            callers (``createModel(country)``) keep working.

    Returns:
        A compiled Keras ``Model`` (log-cosh loss, Adam optimizer, MAE metric).
    """
    input_layer = Input(shape=(4, 2))

    # Recurrent branch.
    b1_out = LSTM(512, activation="relu", return_sequences=True)(input_layer)
    b1_out = LSTM(256, activation="relu", return_sequences=False)(b1_out)

    # Dense branch on the same input.
    b2_out = Dense(256, activation="relu")(input_layer)
    b2_out = Flatten()(b2_out)

    concatenated = concatenate([b1_out, b2_out])
    out = Dense(128, activation="relu")(concatenated)
    # Feed the output layer from the hidden layer. Previously it was built from
    # `concatenated`, which left the Dense(128) layer out of the model entirely.
    out = Dense(4, activation='linear', name='output_layer')(out)

    model = Model([input_layer], out)
    model.compile(loss=[logcosh], optimizer="adam", metrics=["mae"])

    return model

In [6]:
def getXY(dataset, scale):
    """Turn one state's weekly frame into model inputs and targets.

    Side effect: ``dataset`` is modified in place — "Searches" is divided by
    100 and "Cases" becomes ``cases * 100000 / scale``. The evaluation cell
    relies on this when it multiplies "Cases" back by ``population / 100000``,
    so the mutation is intentional and must be kept.

    Args:
        dataset: DataFrame with "Searches" and "Cases" columns. The reshape
            below requires exactly two numeric columns.
            NOTE(review): the targets are taken from the second column (var2);
            this assumes "Cases" is the second column of the CSV — TODO confirm.
        scale: value used to normalise cases (the callers pass the population
            looked up for the file).

    Returns:
        ``(x, y)`` where ``x`` has shape (samples, 4, 2) holding the 4 previous
        weeks of both variables, and ``y`` has shape (samples, 4) holding var2
        at t, t+1, t+2 and t+3.
    """
    n_weeks = 4      # past weeks fed to the model
    n_features = 2   # variables per week
    n_ahead = 4      # weeks predicted, starting at t

    dataset[["Searches"]] /= 100
    # Vectorised form of the former row-by-row apply; same arithmetic per value.
    dataset[["Cases"]] = dataset[["Cases"]] * 100000 / scale

    values = dataset.values.astype("float32")

    reframed = series_to_supervised(values, n_weeks, n_ahead)
    print(reframed.columns)

    print("Reframed Shape: ", reframed.shape)
    values = reframed.values

    # The first n_obs columns are the lagged inputs; after them, every
    # n_features-th column starting at the last variable of step t is a target.
    n_obs = n_weeks * n_features
    x = values[:, :n_obs]
    y = values[:, n_obs + n_features - 1::n_features]

    x = x.reshape((x.shape[0], n_weeks, n_features)) # Reshape as 3-D
    return x, y

In [7]:
def formatFilename(filename):
    """Return ``filename`` with every occurrence of ".csv" removed."""
    pieces = filename.split(".csv")
    return "".join(pieces)

In [8]:
# Country whose states the model is trained on. The value is used verbatim in
# the data path ("../../data/<country>/processed_data") and as the output
# folder name, and it is compared against the `countries` list to pick the
# evaluation countries.
# Surrounding whitespace is stripped so a stray space does not produce a path
# that does not exist.
country = input("Country: ").strip()

Country: Mexico


In [9]:
np.random.seed(2018)

countries = ["Mexico", "Colombia", "Brazil"]
otherCountries = [x for x in countries if x != country]

# Forecast horizons (weeks ahead) written to the CSV and plotted.
horizons = (1, 2, 3, 4)
# Plot palette: observed cases, one colour per horizon, and the three-line
# palette used for the comparison against the naive forecast.
casesColor = '#2962FF'
horizonColors = {1: '#F44336', 2: '#D500F9', 3: '#09af00', 4: '#212121'}
naiveColors = ['#2962FF', '#09af00', "#212121"]


def horizonLabel(weeks):
    """Return the horizon as it appears in plot titles ("1 Week", "2 Weeks", ...)."""
    return "1 Week" if weeks == 1 else "{} Weeks".format(weeks)


def listCsvFiles(folder):
    """Return the .csv filenames in ``folder`` in sorted order.

    os.listdir gives no ordering guarantee; sorting keeps the training order,
    and therefore the fitted weights, repeatable across machines.
    """
    return sorted(name for name in os.listdir(folder) if name.endswith(".csv"))


def savePlot(frame, columns, colors, title, path):
    """Plot ``columns`` of ``frame`` as lines, save the figure to ``path`` and close it."""
    ax = frame[columns].plot(figsize=(10, 10), color=colors)
    ax.set_facecolor((0.9, 0.9, 0.9, 0.7))
    ax.set_title(title)
    ax.set_xlabel("Week")
    ax.set_ylabel("ZIKV Cases")
    ax.legend()
    ax.grid(linestyle='dashed', linewidth=1.5)
    fig = ax.get_figure()
    fig.savefig(path)
    # Close only this figure. Calling plt.clf() after closing would create a
    # new empty figure, which then shows up as stray notebook output.
    plt.close(fig)


model = createModel(country)
folder = "../../data/{}/processed_data".format(country)

with tf.Session():
    #Train: fit the same model on each state of the chosen country in turn.
    for file in listCsvFiles(folder):
        dataset = pd.read_csv("{}/{}".format(folder, file), index_col=0)
        print(file)
        x, y = getXY(dataset, population[file])
        model.fit(x, y,
                  epochs=200,
                  batch_size=x.shape[0],  # one batch per epoch
                  verbose=0,
                  shuffle=True)

    #Test: evaluate on every state of the other countries.
    for otherCountry in otherCountries:
        otherFolder = "../../data/{}/processed_data".format(otherCountry)

        for file in listCsvFiles(otherFolder):
            test_dataset = pd.read_csv("{}/{}".format(otherFolder, file), index_col=0)

            #Calculate Naive
            # The naive h-week forecast for a row is the case count observed h
            # rows earlier. Copy the raw counts first: getXY() rescales
            # test_dataset in place, and a view could be rescaled with it.
            rawCases = test_dataset["Cases"].values.copy()
            naive = {h: rawCases[4 - h:-(3 + h)] for h in horizons}

            # makedirs also creates the parent <country>/<otherCountry> folders
            # and is a no-op when the folder already exists.
            outFolder = "{}/{}/{}".format(country, otherCountry, file)
            os.makedirs(outFolder, exist_ok=True)

            formattedFilename = formatFilename(file)

            test_x, _ = getXY(test_dataset, population[file])
            predictions = model.predict(test_x)

            #Rescale predictions from cases per 100000 back to case counts
            inv_yPred = predictions * population[file] / 100000

            # Keep the rows that have a prediction: the first 4 are consumed as
            # history and the last 3 as future targets. Copy so the column
            # assignments below do not write into a slice of the original frame.
            test_dataset = test_dataset[4:-3].copy()
            # getXY() left "Cases" in per-100000 units; undo that.
            test_dataset["Cases"] *= (population[file] / 100000)
            for h in horizons:
                test_dataset["Week{}-Prediction".format(h)] = inv_yPred[:, h - 1]

            #Set naive
            for h in horizons:
                test_dataset["Naive-{}Week".format(h)] = naive[h]

            # Column h-1 of the prediction for row t targets week t+h-1; shift
            # it down so every prediction sits on the row of the week it targets.
            for h in horizons[1:]:
                predictionColumn = "Week{}-Prediction".format(h)
                test_dataset[predictionColumn] = test_dataset[predictionColumn].shift(h - 1)

            test_dataset = test_dataset.dropna()

            for h in horizons:
                test_dataset["Week{}-Error".format(h)] = (
                    test_dataset["Week{}-Prediction".format(h)] - test_dataset["Cases"]
                )

            test_dataset.to_csv("{}/{}".format(outFolder, file))

            #Plot each week vs cases
            for h in horizons:
                savePlot(
                    test_dataset,
                    ["Cases", "Week{}-Prediction".format(h)],
                    [casesColor, horizonColors[h]],
                    "LSTM Model -{}\n{} ahead prediction vs. Observed ZIKV cases".format(
                        formattedFilename, horizonLabel(h)),
                    "{}/{}-Week-{}.png".format(outFolder, h, formattedFilename),
                )

            #All
            savePlot(
                test_dataset,
                ["Cases"] + ["Week{}-Prediction".format(h) for h in horizons],
                [casesColor] + [horizonColors[h] for h in horizons],
                "LSTM Model -{}\n1-4 Weeks ahead prediction vs. Observed ZIKV cases".format(
                    formattedFilename),
                "{}/All-{}.png".format(outFolder, formattedFilename),
            )

            #Compare with Naive
            # The title now names the actual horizon; it used to say "1 Weeks"
            # on all four plots.
            for h in horizons:
                savePlot(
                    test_dataset,
                    ["Cases", "Week{}-Prediction".format(h), "Naive-{}Week".format(h)],
                    naiveColors,
                    "LSTM Model -{}\n{} ahead prediction vs. Observed ZIKV cases vs. Naive".format(
                        formattedFilename, horizonLabel(h)),
                    "{}/Naive-{}-Week-{}.png".format(outFolder, h, formattedFilename),
                )

    # Save while the session used for training is still open.
    # NOTE(review): with the TF1 Keras backend the weights presumably live in
    # the default session opened above; saving after it closes could read from
    # a different session — confirm.
    os.makedirs(country, exist_ok=True)
    saveModel(model, "{}/Model".format(country))


Chiapas_2016-2017.csv
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)', 'var1(t+3)',
       'var2(t+3)'],
      dtype='object')
Reframed Shape:  (97, 16)
Colima_2016-2017.csv
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)', 'var1(t+3)',
       'var2(t+3)'],
      dtype='object')
Reframed Shape:  (97, 16)
Guerrero_2016-2017.csv
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)', 'var1(t+3)',
       'var2(t+3)'],
      dtype='object')
Reframed Shape:  (97, 16)
Hidalgo_2016-2017.csv
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)

NAIVE [ 95 292   0 873 615 159 846 417  42 589 345  90 282 236 163 193 137 124
  89 122  79  89  56  23  48  43  26  26  32  18  22  17   3   9  14  52
  27   3  10  12   6   5  37   9  11  14   3   3   2  30   7   5   1   6
   6  13   2   4   4   5   2  -1   4   1   2   0   7   6   1   3   1   2
   7  -6   2   0   3  -6   4   1   4   0   1   3   3   0  -1  -5   6   1
   4   1   1   2  -2   0   2   6  -1  -1   2   2  -1   2]
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)', 'var1(t+3)',
       'var2(t+3)'],
      dtype='object')
Reframed Shape:  (97, 16)
NAIVE [   1   64    0  413  636  855  881  980 1243 1257 1044  952 1212 1525 1531
 1472 1247 1077 1164 1260  786  949  859  809  730  519  507  301  268  319
  198  216  144  110   80  188   25  106  108   72   34   72   91   46   41
   37  -29   96   30   85   26   20   12   15   34   44 

NAIVE [3520 3520 3520 3521 4297 4297 4298 4012 4012 4012 3539 3540 3260 3260 3262
 1879 1879 1879 1881  979  979  979  979  979  979 -464 -464 -464 -464 -464
 -464 1303 2606   76   76   66   66   66    0   33   34  102  103   63   64
   79   65   62   64   60   41   22    6    6    6    6   30   30   30   34
   34   35   25   25   25   25   27    3    3    3    3    5    5    5    5
    5    6  -10  -10  -10  -10  -10  -10    4    8   -3   -2   -5   -5   -5
    1    0    0   16   17    3    3   17   11    9    4   -4    1   15]
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)', 'var1(t+3)',
       'var2(t+3)'],
      dtype='object')
Reframed Shape:  (97, 16)


<matplotlib.figure.Figure at 0x153007a39e8>