In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from keras import Sequential
from keras.models import Model
from keras.layers import LSTM, Dense, Concatenate, Flatten, Input, GRU
from keras.layers.merge import concatenate
from keras.optimizers import Adam, RMSprop, Adadelta
from keras.constraints import non_neg
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from math import sqrt
from keras import regularizers
from keras import backend as K
import os
# Ask CUDA to enumerate devices by PCI bus ID.
# NOTE(review): both variables are set after `import tensorflow` above; confirm
# the backend has not already initialised its devices by this point.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# '-1' matches no device index; presumably this hides every GPU so that the
# notebook runs on CPU only — TODO confirm that this is the intent.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

Using TensorFlow backend.


In [2]:
# One record per state file: (CSV filename, country, population).
# The order of this table is significant: both dictionaries below inherit it,
# and the experiment loop iterates over ``population`` in insertion order.
_STATE_RECORDS = [
    # Colombia
    ("huila_2016-2017.csv", "Colombia", 1154804),
    ("santander_2016-2017.csv", "Colombia", 2061095),
    ("santander_norte_2016-2017.csv", "Colombia", 1355723),
    ("tolima_2016-2017.csv", "Colombia", 1408274),
    ("valle_cauca_2016-2017.csv", "Colombia", 4613377),
    # Brazil
    ("Bahia_2016-2017.csv", "Brazil", 15344447),
    ("MatoGrosso_2016-2017.csv", "Brazil", 3344544),
    ("MinasGerais_2016-2017.csv", "Brazil", 21119536),
    ("RioDeJaneiro_2016-2017.csv", "Brazil", 16718956),
    ("SaoPaulo_2016-2017.csv", "Brazil", 45094866),
]

# Population values, keyed by CSV filename.
population = {
    filename: inhabitants for filename, _country, inhabitants in _STATE_RECORDS
}

# Country of each state, keyed by CSV filename.
countries = {
    filename: country for filename, country, _inhabitants in _STATE_RECORDS
}

In [3]:
def root_mean_squared_error(y_true, y_pred):
    """Keras-backend RMSE: square root of the mean squared error over the last axis.

    The function name is significant: the training loop reads the recorded
    metric back from the fit history under the key "root_mean_squared_error".
    """
    squared_error = K.square(y_pred - y_true)
    mean_squared = K.mean(squared_error, axis=-1)
    return K.sqrt(mean_squared)

In [4]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table of lagged columns.

    Parameters
    ----------
    data : list, array or DataFrame
        Observations in time order, one row per time step. A flat list or
        1-D array is treated as a single variable; a list of rows or a 2-D
        array as one variable per column.
    n_in : int
        Number of lagged steps (t-n_in ... t-1) to include as inputs.
    n_out : int
        Number of forecast steps (t, t+1, ... t+n_out-1) to include.
    dropnan : bool
        When True, rows made incomplete by the shifting are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and ``var<j>(t+<i>)``,
        lag columns first (oldest lag first), then forecast columns.
    """
    df = pd.DataFrame(data)
    # Take the variable count from the frame itself. Checking only whether
    # ``data`` is a list mislabels a list of rows (too few names, so the column
    # assignment below fails), and ``data.shape[1]`` does not exist for 1-D arrays.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names += ['var%d(t-%d)' % (j + 1, lag) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for step in range(n_out):
        cols.append(df.shift(-step))
        if step == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, step) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

In [5]:
def saveModel(model, modelName):
    """Persist a model as two files: ``<modelName>.json`` and ``<modelName>.h5``.

    The JSON file receives whatever ``model.to_json()`` returns; the ``.h5``
    path is handed to ``model.save_weights``.
    """
    architecture_path, weights_path = (
        template.format(modelName) for template in ("{}.json", "{}.h5")
    )

    # Serialize the architecture before touching the filesystem.
    serialized = model.to_json()
    with open(architecture_path, "w") as out_file:
        out_file.write(serialized)

    # Serialize weights to HDF5.
    model.save_weights(weights_path)

In [6]:
def createModel(modelName):
    """Build and compile the two-branch dense network used for every experiment.

    Parameters
    ----------
    modelName : str
        Currently unused by the function body; kept in the signature so
        existing callers keep working.

    Returns
    -------
    A compiled Keras ``Model`` whose input has shape (4, 2) per sample — the
    same (n_weeks, n_features) layout that ``getXY`` reshapes its inputs to —
    and whose final layer has a single unit.
    """
    input_layer = Input(shape=(4, 2))

    # Branch 1: Dense(64) with no activation, flattened.
    b1_out = Dense(64)(input_layer)
    b1_out = Flatten()(b1_out)

    # Branch 2: Dense(32) with ReLU and L2 kernel regularisation, flattened.
    b2_out = Dense(32, activation="relu", kernel_regularizer="l2")(input_layer)
    b2_out = Flatten()(b2_out)

    # Merge both branches and pass them through two small ReLU layers.
    concatenated = concatenate([b1_out, b2_out])
    out = Dense(4, activation="relu", kernel_regularizer="l2")(concatenated)
    out = Dense(4, activation="relu", kernel_regularizer="l2")(out)
    # Linear output whose kernel weights are constrained to be non-negative.
    out = Dense(1, activation="linear", kernel_constraint=non_neg(), name='output_layer')(out)

    model = Model([input_layer], out)
    # Optimised on MAE; MAE and RMSE are also recorded as metrics.
    model.compile(loss=["mae"], optimizer="adam", metrics=["mae", root_mean_squared_error])

    return model

In [7]:
def getXY(dataset, scale):
    """Turn one state's table into supervised-learning arrays.

    ``Searches`` is divided by 100 and ``Cases`` is converted to cases per
    100,000 of ``scale`` (callers pass the state population). The scaled table
    is then framed with ``series_to_supervised`` using the 4 previous rows as
    input and the current row as target.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``Searches`` and ``Cases`` columns. It is not modified;
        the scaling is applied to a copy, so calling this twice on the same
        frame gives the same result.
        NOTE(review): the target is read from the LAST column of the frame and
        the inputs from the first 8 lagged columns, so the frame is assumed to
        hold exactly two numeric columns with ``Cases`` last — confirm against
        the CSV layout.
    scale : number
        Divisor used to normalise ``Cases``.

    Returns
    -------
    (x, y)
        ``x`` has shape (samples, 4, 2); ``y`` has shape (samples,) and holds
        the last variable at time t.
    """
    n_weeks = 4
    n_features = 2
    n_obs = n_weeks * n_features

    # Scale a copy rather than the caller's frame.
    scaled = dataset.copy()
    scaled["Searches"] = scaled["Searches"] / 100
    scaled["Cases"] = scaled["Cases"] * 100000 / scale

    values = scaled.values.astype("float32")

    reframed = series_to_supervised(values, n_weeks, 1)
    print(reframed.columns)

    print("Reframed Shape: ", reframed.shape)
    values = reframed.values

    # Inputs: the n_weeks lagged rows (all features).
    # Target: the last column at time t, i.e. one step after the newest input.
    x, y = values[:, :n_obs], values[:, -1]

    x = x.reshape((x.shape[0], n_weeks, n_features))  # Reshape as 3-D
    return x, y

In [8]:
def formatFilename(filename):
    """Return ``filename`` with every occurrence of ".csv" removed."""
    csv_marker, replacement = ".csv", ""
    stripped = filename.replace(csv_marker, replacement)
    return stripped

In [12]:
# Leave-one-state-out setup: every state is the held-out test state exactly
# once, and all remaining states form the training set for that experiment.
states = list(population.keys())
experiments = []
for i, test_state in enumerate(states):
    train_states = states[:i] + states[i + 1:]
    experiments.append({"test_state": test_state, "train_states": train_states})

for experiment in experiments:
    # NOTE(review): the folder name is the raw CSV filename, ".csv" included;
    # formatFilename is defined in this notebook but is not applied here —
    # confirm whether that is intended before changing the output paths.
    foldername = experiment["test_state"]
    model = createModel(foldername)

    # makedirs creates `foldername` together with the sub-folder and does
    # nothing if the directory is already there.
    for subfolder in ("train", "test"):
        os.makedirs("{}/{}".format(foldername, subfolder), exist_ok=True)

    # The same model object is fitted on one training state after another,
    # so each fit continues from the weights left by the previous state.
    for train_state in experiment["train_states"]:
        state_population = population[train_state]
        state_country = countries[train_state]
        data_folder = "../../data/{}/processed_data/{}".format(state_country, train_state)

        state_dataset = pd.read_csv(data_folder, index_col=0)

        x, y = getXY(state_dataset, state_population)

        # batch_size equals the sample count: one full-batch update per epoch.
        history = model.fit(x, y,
                            epochs=30,
                            batch_size=x.shape[0],
                            verbose=0,
                            shuffle=False)

        # Training RMSE curve for this state, saved under <foldername>/train/.
        fig, ax = plt.subplots()
        ax.set_title("Train - {}".format(train_state))
        ax.set_facecolor((0.9, 0.9, 0.9, 0.7))
        ax.plot(history.history["root_mean_squared_error"])
        ax.set_ylabel("rmse")
        ax.set_xlabel("epoch")
        ax.legend(['train'], loc="upper left")
        fig.savefig("{}/train/Loss-{}.png".format(foldername, train_state))
        # Close the figure so the loop does not accumulate open figures.
        plt.close(fig)
        

Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)'],
      dtype='object')
Reframed Shape:  (100, 10)
dict_keys(['loss', 'mean_absolute_error', 'root_mean_squared_error'])
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)'],
      dtype='object')
Reframed Shape:  (100, 10)
dict_keys(['loss', 'mean_absolute_error', 'root_mean_squared_error'])
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)'],
      dtype='object')
Reframed Shape:  (100, 10)
dict_keys(['loss', 'mean_absolute_error', 'root_mean_squared_error'])
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)'],
      dtype='object')
Reframed Shape:  (100, 10)
dict_keys(['loss', 'mean_absol

Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)'],
      dtype='object')
Reframed Shape:  (100, 10)
dict_keys(['loss', 'mean_absolute_error', 'root_mean_squared_error'])
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)'],
      dtype='object')
Reframed Shape:  (100, 10)
dict_keys(['loss', 'mean_absolute_error', 'root_mean_squared_error'])
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)'],
      dtype='object')
Reframed Shape:  (100, 10)
dict_keys(['loss', 'mean_absolute_error', 'root_mean_squared_error'])
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)'],
      dtype='object')
Reframed Shape:  (100, 10)
dict_keys(['loss', 'mean_absol

Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)'],
      dtype='object')
Reframed Shape:  (100, 10)
dict_keys(['loss', 'mean_absolute_error', 'root_mean_squared_error'])
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)'],
      dtype='object')
Reframed Shape:  (100, 10)
dict_keys(['loss', 'mean_absolute_error', 'root_mean_squared_error'])
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)'],
      dtype='object')
Reframed Shape:  (100, 10)
dict_keys(['loss', 'mean_absolute_error', 'root_mean_squared_error'])
Index(['var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)', 'var1(t-2)',
       'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)'],
      dtype='object')
Reframed Shape:  (100, 10)
dict_keys(['loss', 'mean_absol