In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from keras import Sequential
from keras.models import Model
from keras.layers import LSTM, Dense, Concatenate, Flatten, Input, GRU
from keras.layers.merge import concatenate
from keras.optimizers import Adam, RMSprop
from keras.losses import logcosh
from keras.layers.advanced_activations import LeakyReLU
from keras.constraints import non_neg
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from math import sqrt
from keras import regularizers
from keras import backend as K

Using TensorFlow backend.


In [2]:
# Population of each state, keyed by the file name of its processed CSV.
# The training/evaluation cell passes population[file] to getXY() as `scale`,
# where it is used to express case counts per 100,000 inhabitants, and uses it
# again to convert predictions back to absolute counts.
# NOTE(review): the source and reference year of these figures are not
# recorded in the notebook — confirm before reuse.
population = {
    # Presumably Mexican states (inferred from the names) — TODO confirm
    "Chiapas_2016-2017.csv": 5217908,
    "Colima_2016-2017.csv": 711235,
    "Guerrero_2016-2017.csv": 3533251,
    "Hidalgo_2016-2017.csv": 2858359,
    "NuevoLeon_2016-2017.csv": 5119504,
    "Oaxaca_2016-2017.csv": 3967889,
    "QuintanaRoo_2016-2017.csv": 1501562,
    "Tabasco_2016-2017.csv" : 2395272,
    "Veracruz_2016-2017.csv" : 8112505,
    "Yucatan_2016-2017.csv" : 2097175,
    
    # Presumably Colombian departments (inferred from the names) — TODO confirm
    "casanare_2016-2017.csv" : 356438,
    "cordoba_2016-2017.csv" : 1709603,
    "cundinamarca_2016-2017.csv" : 2680041,
    "huila_2016-2017.csv" : 1154804,
    "meta_2016-2017.csv" : 961292,
    "santander_2016-2017.csv" : 2061095,
    "santander_norte_2016-2017.csv" : 1355723,
    "tolima_2016-2017.csv" : 1408274,
    "valle_cauca_2016-2017.csv" : 4613377,
    
    # Brazilian states (these are the files listed when country == "Brazil")
    "Alagoas_2016-2017.csv": 3375823,
    "Bahia_2016-2017.csv": 15344447,
    "Ceara_2016-2017.csv": 9020460,
    "Goias_2016-2017.csv": 6778772,
    "Maranhao_2016-2017.csv": 7000229,
    "MatoGrosso_2016-2017.csv": 3344544,
    "MinasGerais_2016-2017.csv": 21119536,
    "Para_2016-2017.csv": 8366628,
    "RioDeJaneiro_2016-2017.csv": 16718956,
    "SaoPaulo_2016-2017.csv": 45094866,
}

In [3]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table of shifted columns.

    Each output row corresponds to one time step ``t`` and holds the values of
    every variable at ``t-n_in .. t-1`` followed by ``t .. t+n_out-1``.

    Parameters
    ----------
    data : list, nested list, numpy array or DataFrame
        Observations ordered in time. Anything ``pd.DataFrame`` accepts;
        1-D input is treated as a single variable.
    n_in : int
        Number of lagged steps (t-n_in ... t-1) to include.
    n_out : int
        Number of forward steps (t ... t+n_out-1) to include.
    dropnan : bool
        Drop the rows that contain NaN. Shifting leaves NaN in the first
        ``n_in`` and the last ``n_out - 1`` rows.

    Returns
    -------
    pandas.DataFrame
        Columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and
        ``var<j>(t+<i>)`` with ``j`` counting variables from 1.
    """
    df = pd.DataFrame(data)
    # Count variables on the frame itself: this is right for flat lists,
    # nested lists, 1-D arrays and 2-D arrays alike, whereas inspecting the
    # type/shape of `data` mislabels nested lists and fails on 1-D arrays.
    n_vars = df.shape[1]

    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names += ['var%d(t-%d)' % (j + 1, lag) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for step in range(n_out):
        cols.append(df.shift(-step))
        if step == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, step) for j in range(n_vars)]

    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop the rows made incomplete by shifting
    if dropnan:
        agg = agg.dropna()
    return agg

In [4]:
def saveModel(model, modelName):
    """Write a model to disk as ``<modelName>.json`` plus ``<modelName>.h5``.

    The architecture (``model.to_json()``) goes to the JSON file and the
    weights are written by ``model.save_weights`` to the HDF5 file.
    ``modelName`` is a path prefix without extension.
    """
    architecturePath, weightsPath = [
        template.format(modelName) for template in ("{}.json", "{}.h5")
    ]

    # Serialize before opening the file so a failure leaves nothing behind.
    architecture = model.to_json()
    with open(architecturePath, "w") as handle:
        handle.write(architecture)

    # Serialize weights to HDF5.
    model.save_weights(weightsPath)

In [5]:
def createModel(modelName):
    """Build and compile the two-branch forecasting network.

    The network takes windows of shape (4, 2) and feeds them to two branches:

    * a recurrent branch of two stacked 128-unit LSTM layers, the second of
      which returns only its final output;
    * a dense branch that applies a 128-unit ReLU layer to the same input and
      flattens the result.

    Both branches are concatenated, passed through a 32-unit ReLU layer and
    reduced to one linear output whose kernel is constrained to be
    non-negative.

    Parameters
    ----------
    modelName : str
        Not used when building the network; kept so existing calls such as
        ``createModel(country)`` continue to work.

    Returns
    -------
    keras.models.Model
        Compiled with MAE loss, the RMSprop optimizer and MSE as a metric.
    """
    input_layer = Input(shape=(4, 2))

    # Recurrent branch.
    b1_out = LSTM(128, activation="tanh", return_sequences=True)(input_layer)
    b1_out = LSTM(128, activation="tanh", return_sequences=False)(b1_out)

    # Dense branch on the raw window, flattened so it can be concatenated
    # with the LSTM output.
    b2_out = Dense(128, activation="relu")(input_layer)
    b2_out = Flatten()(b2_out)

    concatenated = concatenate([b1_out, b2_out])
    out = Dense(32, activation="relu")(concatenated)
    out = Dense(1, activation="linear", kernel_constraint=non_neg(), name='output_layer')(out)

    model = Model([input_layer], out)
    model.compile(loss=["mae"], optimizer="rmsprop", metrics=["mse"])

    return model

In [6]:
def getXY(dataset, scale):
    """Turn one state's weekly table into model inputs and targets.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns "Searches" and "Cases" and nothing else.
        NOTE: the frame is rescaled IN PLACE ("Searches" divided by 100,
        "Cases" converted to cases per 100,000 using ``scale``). The
        evaluation cell relies on this and multiplies "Cases" back afterwards.
    scale : number
        Divisor for the per-100,000 conversion (the state's population).

    Returns
    -------
    x : numpy.ndarray, shape (samples, 4, 2)
        For every sample, the two variables at steps t-4 ... t-1.
    y : numpy.ndarray, shape (samples,)
        The last column of the frame at step t+1, i.e. two steps after the
        most recent input step.
        NOTE(review): presumably this is "Cases" (column order Searches,
        Cases) — confirm against the CSV files.

    Raises
    ------
    ValueError
        If the frame does not have exactly two columns; the window slicing
        below would otherwise silently mix variables.
    """
    n_weeks = 4
    n_features = 2
    n_obs = n_weeks * n_features

    # In-place rescaling (see the NOTE in the docstring).
    dataset["Searches"] = dataset["Searches"] / 100
    dataset["Cases"] = dataset["Cases"] * 100000 / scale

    values = dataset.values.astype("float32")
    if values.shape[1] != n_features:
        raise ValueError(
            "getXY expects exactly {} columns, got {}".format(n_features, values.shape[1])
        )

    reframed = series_to_supervised(values, n_weeks, 2)
    print(reframed.columns)
    print("Reframed Shape: ", reframed.shape)

    values = reframed.values
    # Inputs: the n_weeks lagged steps. Target: last variable at t+1.
    x, y = values[:, :n_obs], values[:, -1]

    x = x.reshape((x.shape[0], n_weeks, n_features))  # Reshape as 3-D
    return x, y

In [7]:
def formatFilename(filename):
    """Return ``filename`` with every occurrence of ".csv" removed."""
    pieces = filename.split(".csv")
    return "".join(pieces)

In [8]:
# Country whose states are used for training, read interactively.
# The training cell builds the data folder path from this value and evaluates
# on the other entries of ["Mexico", "Colombia", "Brazil"], so it should be
# typed exactly as one of those names.
country = input("Country: ")

Country: Brazil


In [9]:
# Train one model on every state of `country`, then evaluate it on the states
# of the other countries against a naive persistence baseline. For each
# evaluated state a CSV and a PNG are written to <country>/<otherCountry>/,
# plus a Comparison.csv per evaluated country; the trained model is saved to
# <country>/Model.{json,h5}.
np.random.seed(2018)
countries = ["Mexico", "Colombia", "Brazil"]
otherCountries = [x for x in countries if x != country]

model = createModel(country)

# Train
folder = "../../data/{}/processed_data".format(country)
# Sorted because os.listdir returns entries in arbitrary order and the model
# is fitted file after file, so the order influences the final weights.
files = sorted(name for name in os.listdir(folder) if name.endswith(".csv"))

# Everything that touches the model (fit, predict, save) stays inside the
# session block.
with tf.Session():
    for file in files:
        dataset = pd.read_csv("{}/{}".format(folder, file), index_col=0)
        print(file)
        x, y = getXY(dataset, population[file])
        # One full-batch pass per epoch (batch size == number of samples).
        model.fit(x, y,
                  epochs=200,
                  batch_size=x.shape[0],
                  verbose=0,
                  shuffle=True)

    for otherCountry in otherCountries:
        otherFolder = "../../data/{}/processed_data".format(otherCountry)
        testFiles = sorted(name for name in os.listdir(otherFolder) if name.endswith(".csv"))

        # Create the output folder (and its parent) once per country;
        # os.mkdir would fail when the parent folder does not exist yet.
        outputFolder = "{}/{}".format(country, otherCountry)
        os.makedirs(outputFolder, exist_ok=True)

        # Collected as plain records and turned into a frame once at the end.
        comparisonRows = []

        for file in testFiles:
            test_dataset = pd.read_csv("{}/{}".format(otherFolder, file), index_col=0)

            # getXY rescales test_dataset in place (Cases -> per 100,000).
            test_x = getXY(test_dataset, population[file])[0]
            predictions = model.predict(test_x)

            # Rescale predictions and observations back to absolute counts.
            inv_yPred = predictions.ravel() * population[file] / 100000
            test_dataset["Cases"] *= (population[file] / 100000)

            # Alignment: sample k uses weeks k..k+3 as input and targets week
            # k+5 (the (t+1) column built by getXY). The matching observations
            # therefore start at row 5, and the naive forecast is the last
            # observed input week, k+3, i.e. two weeks before the target.
            naive = test_dataset["Cases"].values[3:-2]
            # Copy so the new columns are not written into a slice view.
            test_dataset = test_dataset[5:].copy()

            test_dataset["LSTM-Prediction"] = inv_yPred
            test_dataset["error"] = test_dataset["Cases"] - test_dataset["LSTM-Prediction"]

            # Naive
            test_dataset["Naive-Prediction"] = naive
            test_dataset["NaiveError"] = test_dataset["Naive-Prediction"] - test_dataset["Cases"]

            observed = test_dataset["Cases"].values
            naiveRMSE = sqrt(mean_squared_error(observed, test_dataset["Naive-Prediction"].values))
            LSTMRMSE = sqrt(mean_squared_error(observed, test_dataset["LSTM-Prediction"].values))

            # RMSE relative to the mean observed case count.
            meanCases = test_dataset["Cases"].mean()
            naiveRelativeRMSE = naiveRMSE / meanCases
            LSTMRelativeRMSE = LSTMRMSE / meanCases

            comparisonRows.append({
                "File": file,
                "NaiveRMSE": naiveRMSE,
                "LSTMRMSE": LSTMRMSE,
                "NaiveRelativeRMSE": naiveRelativeRMSE,
                "LSTMRelativeRMSE": LSTMRelativeRMSE,
            })

            test_dataset = test_dataset.rename(index=str, columns={"Cases": "Observed"})
            test_dataset.to_csv("{}/{}".format(outputFolder, file))

            ax = test_dataset[["Observed", "LSTM-Prediction", "Naive-Prediction"]].plot(figsize=(10, 10))
            ax.set_title("LSTM Model\n{}".format(file))
            ax.set_facecolor((0.9, 0.9, 0.9, 0.7))
            ax.set_xlabel("Date")
            ax.set_ylabel("ZIKV Cases")
            ax.legend()
            ax.grid(linestyle='dashed', linewidth=1.5)
            fig = ax.get_figure()
            fig.savefig("{}/{}.png".format(outputFolder, file))
            # Close each figure so they do not accumulate across the loop.
            plt.close(fig)

        comparisonCsv = pd.DataFrame(
            comparisonRows,
            columns=["File", "NaiveRMSE", "LSTMRMSE", "NaiveRelativeRMSE", "LSTMRelativeRMSE"],
        )
        # Values below 1 mean the LSTM beat the naive baseline on that file.
        comparisonCsv["LSTM-Naive-Ratio"] = comparisonCsv["LSTMRMSE"] / comparisonCsv["NaiveRMSE"]
        # The same overall mean is repeated on every row.
        comparisonCsv["Average-Ratio"] = comparisonCsv["LSTM-Naive-Ratio"].mean()

        comparisonCsv.to_csv("{}/Comparison.csv".format(outputFolder))

    os.makedirs(country, exist_ok=True)
    saveModel(model, "{}/Model".format(country))

Alagoas_2016-2017.csv
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)'],
      dtype='object')
Reframed Shape:  (99, 12)
Bahia_2016-2017.csv
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)'],
      dtype='object')
Reframed Shape:  (99, 12)
Ceara_2016-2017.csv
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)'],
      dtype='object')
Reframed Shape:  (99, 12)
Goias_2016-2017.csv
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)',
       'var1(t+1)', 'var2(t+1)'],
      dtype='object')
Reframed Shape:  (99, 12)
Maranhao_2016-2017.csv
Index([