In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from statsmodels.tsa.statespace.sarimax import SARIMAX
# from statsmodels.tsa.statespace.sarimax import ARIMA

  from pandas.core import datetools


In [5]:
#Population values
population = {
    "Chiapas_2016-2017.csv": 5217908,
    "Colima_2016-2017.csv": 711235,
    "Guerrero_2016-2017.csv": 3533251,
    "Hidalgo_2016-2017.csv": 2858359,
    "NuevoLeon_2016-2017.csv": 5119504,
    "Oaxaca_2016-2017.csv": 3967889,
    "QuintanaRoo_2016-2017.csv": 1501562,
    "Tabasco_2016-2017.csv" : 2395272,
    "Veracruz_2016-2017.csv" : 8112505,
    "Yucatan_2016-2017.csv" : 2097175,
    
    "casanare_2016-2017.csv" : 356438,
    "cordoba_2016-2017.csv" : 1709603,
    "cundinamarca_2016-2017.csv" : 2680041,
    "huila_2016-2017.csv" : 1154804,
    "meta_2016-2017.csv" : 961292,
    "santander_2016-2017.csv" : 2061095,
    "santander_norte_2016-2017.csv" : 1355723,
    "tolima_2016-2017.csv" : 1408274,
    "valle_cauca_2016-2017.csv" : 4613377,
    
    "Alagoas_2016-2017.csv": 3375823,
    "Bahia_2016-2017.csv": 15344447,
    "Ceara_2016-2017.csv": 9020460,
    "Goias_2016-2017.csv": 6778772,
    "Maranhao_2016-2017.csv": 7000229,
    "MatoGrosso_2016-2017.csv": 3344544,
    "MinasGerais_2016-2017.csv": 21119536,
    "Para_2016-2017.csv": 8366628,
    "RioDeJaneiro_2016-2017.csv": 16718956,
    "SaoPaulo_2016-2017.csv": 45094866,
}

In [6]:
ArimaxParameters = {
    "Brazil" : [4, 0, 1, 1, 0, 0, 1],
    "Colombia" : [1, 1, 0, 0, 0, 0, 52],
    "Mexico" : [1, 1, 0, 0, 0, 0, 52],
}

In [7]:
for country in ["Brazil", "Colombia", "Mexico"]:
    
    print("Country - {}".format(country))
    p, d, q, sp, sd, sq, seasonal = ArimaxParameters[country]
    
    folder = "../../data/{}/processed_data".format(country)
    files = os.listdir(folder)
    
    for file in files:
        
        print("File - {}".format(file))
        if(not os.path.exists("{}/{}".format(country, file))):
            os.mkdir("{}/{}".format(country, file))
        
        dataset = pd.read_csv("{}/{}".format(folder, file))
        dataset["Searches"] /= 100
        state = file
        dataset["Cases"] *= (100000/population[state]) # Transform to incidence per 100,000 habitants  
        
        predictions_df = pd.DataFrame()
        predictions_df["Observed Cases"] = dataset["Cases"][52:].values * (population[state] / 100000)
        
        predictionsArr= []
        for i in range(52,104):
            model= SARIMAX(dataset["Cases"][:i].values,
                exog=dataset["Searches"][:i].values,
                trend="n", order=(p,d,q), seasonal_order=(sp,sd,sq,seasonal))
            results = model.fit()
            
            predictions = results.predict(
                start=i,
                end=i,
                exog=dataset["Searches"][i-1:i].values.reshape((1,1))
            )
            predictionsArr.append(predictions[0]) #CHECK
        predictions_df["Predicted Cases"] = predictionsArr
        predictions_df["Predicted Cases"] *= (population[state] / 100000)
        
        predictions_df["error"] = predictions_df["Predicted Cases"]  - predictions_df["Observed Cases"]
        predictions_df.to_csv("{}/{}/Arimax-{}".format(country, file, file))
        
        #Save images
        plt.clf()

        cols = ['Observed Cases', 'Predicted Cases']
        colors = ['#2962FF', '#FF9800']

        predictions_df[cols].plot(
            figsize=(10, 10),
            grid=True,
            color=colors
        )

        plt.title("Arimax Model\n{}".format(file.replace(".csv", "").replace("_2016-"," ")))
        ax = plt.gca()
        ax.set_facecolor((0.9, 0.9, 0.9, 0.7))
        plt.xlabel("Date")
        plt.ylabel("ZIKV Cases")
        plt.grid(linestyle='dashed', linewidth=1.5)
        fig = plt.gcf()
        fig.savefig("{}/{}/Arimax-{}.png".format(country, file, file.replace(".csv", "")))
        plt.close("all")

Country - Brazil
File - Bahia_2016-2017.csv




File - MatoGrosso_2016-2017.csv




LinAlgError: Singular matrix