In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from keras import Sequential
from keras.models import Model
from keras.layers import LSTM, Dense, Concatenate, Flatten, Input
from keras.layers.merge import concatenate
from keras.optimizers import Adam
from keras.losses import logcosh
import tensorflow as tf
from keras.constraints import non_neg

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

Using TensorFlow backend.


In [2]:
#Population values
population = {
    "Chiapas_2016-2017.csv": 5217908,
    "Colima_2016-2017.csv": 711235,
    "Guerrero_2016-2017.csv": 3533251,
    "Hidalgo_2016-2017.csv": 2858359,
    "NuevoLeon_2016-2017.csv": 5119504,
    "Oaxaca_2016-2017.csv": 3967889,
    "QuintanaRoo_2016-2017.csv": 1501562,
    "Tabasco_2016-2017.csv" : 2395272,
    "Veracruz_2016-2017.csv" : 8112505,
    "Yucatan_2016-2017.csv" : 2097175,
    
    "casanare_2016-2017.csv" : 356438,
    "cordoba_2016-2017.csv" : 1709603,
    "cundinamarca_2016-2017.csv" : 2680041,
    "huila_2016-2017.csv" : 1154804,
    "meta_2016-2017.csv" : 961292,
    "santander_2016-2017.csv" : 2061095,
    "santander_norte_2016-2017.csv" : 1355723,
    "tolima_2016-2017.csv" : 1408274,
    "valle_cauca_2016-2017.csv" : 4613377,
    
    "Alagoas_2016-2017.csv": 3375823,
    "Bahia_2016-2017.csv": 15344447,
    "Ceara_2016-2017.csv": 9020460,
    "Goias_2016-2017.csv": 6778772,
    "Maranhao_2016-2017.csv": 7000229,
    "MatoGrosso_2016-2017.csv": 3344544,
    "MinasGerais_2016-2017.csv": 21119536,
    "Para_2016-2017.csv": 8366628,
    "RioDeJaneiro_2016-2017.csv": 16718956,
    "SaoPaulo_2016-2017.csv": 45094866,
}

In [3]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [4]:
def getXY(dataset, scale):
    dataset[["Searches"]] /= 100
    dataset[["Cases"]] = dataset[["Cases"]].apply(lambda x: x*100000/scale, axis=1)
    
    values = dataset.values.astype("float32")
    
    n_weeks = 4
    n_features = 2

    reframed = series_to_supervised(values, n_weeks, 1)
    values = reframed.values
    print("Reframed Shape: ", reframed.shape)
    totalFeatures = reframed.shape[1]
    n_obs = n_weeks * n_features

    x,y = values[:, :8], values[:, -1] # Pick 4 previous weeks and predict 4 week ahead 

    x = x.reshape((x.shape[0], n_weeks, n_features)) # Reshape as 3-D
    return x, y 

In [5]:
def saveModel(model, modelName):
    jsonName = "{}.json".format(modelName)
    h5Name = "{}.h5".format(modelName)

    model_json = model.to_json()
    with open(jsonName, "w") as json_file:
        json_file.write(model_json)
    #seralize weights to HDF5
    model.save_weights(h5Name)

In [6]:
countryUnits = {
    "Brazil": [
        32,
        16,
        8,
        8
    ],
    "Colombia": [
        64,
        64,
        8,
        2
    ],
    #BEST
    "Mexico" : [
        32,
        64,
        16,
        8
    ]
    
}


def createModel(country):
    lstm, dense1, dense2, dense3 = countryUnits[country]
    
    input_layer = Input(shape=(4,2))
    b1_out = LSTM(lstm, return_sequences=False)(input_layer)
    
    b2_out = Dense(dense1, activation="relu", kernel_regularizer="l2")(input_layer)
    b2_out = Flatten()(b2_out)
    
    concatenated = concatenate([b1_out, b2_out])
    out = Dense(dense2, activation="relu", kernel_regularizer="l2")(concatenated)
    out = Dense(dense3, activation="relu", kernel_regularizer="l2")(out)
    out = Dense(1, activation="linear", kernel_constraint=non_neg(), name='output_layer')(out)
    
    model = Model([input_layer], out)
    model.compile(loss=["mae"], optimizer="adam", metrics=["mae"])

    return model

In [7]:
def formatFilename(filename):
    return filename.replace(".csv", "")

In [12]:
np.random.seed(2018)
for country in ["Mexico", "Brazil", "Colombia"]:
    folder = "../../data/{}/processed_data".format(country)
    files = os.listdir(folder)
    for file in files:
        
        with tf.Session() as sess:
            dataset = pd.read_csv("{}/{}".format(folder, file), index_col=0)
            state = file
            formattedFilename = formatFilename(file)
            
            if(not os.path.isdir("{}/{}".format(country, file))):
                os.mkdir("{}/{}".format(country, file))

            train = dataset[:52]
            test = dataset[48:] # Keep 4 previous values to be able to predict all 52 weeks of next year
            x, y = getXY(train, population[file])
            
            test_x, test_y = getXY(test, population[file])
            model = createModel(country)
            model.fit(x, y,
                    epochs = 30,
                    batch_size=x.shape[0],
                    verbose=0, 
                    shuffle=False
                     )
            
            predictions = []
            for i in range(len(test_x)):
                xShape = test_x[i].shape
                x = np.expand_dims(test_x[i], axis=0)
                y = np.zeros((1,), dtype=float)
                y[0] = np.array(test_y[i])
                predictions.append(model.predict(x))
                model.fit(x, y)
            
            
#             predictions = model.predict(test_x)
            #Transform to 1-D
            test_y = test_y.reshape((len(test_y), 1))
    
            #Rescale
            inv_yPred = np.apply_along_axis(lambda x: x * population[file] / 100000, 1, predictions)
            inv_y = np.apply_along_axis(lambda x: x * population[file] / 100000, 1, test_y)
            
            np.squeeze(inv_yPred, axis=2)
            
            test = test[4:] # drop the first 4 values used previously to be able to predict the full year
            test.drop(["Searches"], axis=1, inplace=True)
            test.drop(["Cases"], axis=1, inplace=True)
            test["Observed"] = inv_y
            test["Predicted"] = inv_yPred
            test["error"] = test["Predicted"] - test["Observed"]
            test.to_csv("{}/{}/{}".format(country, file, file))
            
            colors = ['#2962FF', '#212121']
            test[["Observed", "Predicted"]].plot(figsize=(10,10), color=colors)
            plt.title("LSTM Model\n{}".format(formattedFilename))
            plt.xlabel("Date")
            plt.ylabel("ZIKV Cases")
            ax = plt.gca()
            ax.set_facecolor((0.9, 0.9, 0.9, 0.7))
            plt.legend()
            plt.grid(linestyle='dashed', linewidth=1.5)
            fig = plt.gcf()
            fig.savefig("{}/{}/{}.png".format(country, file, formattedFilename))
            plt.grid()
            plt.close("all")
            
            test[["error"]].plot(figsize=(10,10))
            plt.title("LSTM Model Error\n{}".format(formattedFilename))
            plt.xlabel("Date")
            plt.ylabel("LSTM Error")
            ax = plt.gca()
            ax.set_facecolor((0.9, 0.9, 0.9, 0.7))
            plt.legend()
            plt.grid(linestyle='dashed', linewidth=1.5)
            fig = plt.gcf()
            fig.savefig("{}/{}/Error-{}.png".format(country, file, formattedFilename))
            plt.close("all")

            saveModel(model, "{}/{}/Model".format(country, file))

# Sound to alert me that the experiment has finished
import winsound
frequency = 2500  # Set Frequency To 2500 Hertz
duration = 1000  # Set Duration To 1000 ms == 1 second
winsound.Beep(frequency, duration)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


Reframed Shape:  (48, 10)
Reframed Shape:  (52, 10)
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


AxisError: axis 3 is out of bounds for array of dimension 3