In [150]:
import pandas as pd
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing, SimpleExpSmoothing
from sklearn.metrics import mean_squared_error 
    
import sys
sys.path.append('..')
from utils import preprocess

In [12]:
#Notebook parameters, read by the cells below
#Indicator (column) whose values are forecast
target: str = 'SI.POV.DDAY'
#Year to forecast and compare against the real values
predict_year: int = 2010
#percent of input Indicators to use (set to 100 for full set of input features)
percent: int = 0

In [144]:
#Load the data from disk
#Forward slashes keep the relative path valid on Windows as well as on Linux/macOS
input_dir = '../data/'
data_input = "cleaned_data.pkl"
#NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source
data = pd.read_pickle(input_dir + data_input)

#Get rid of all columns except the target column for each country
data = data[[target]]

#Impute data
#This is done at this point as the impute function below works with data in a..
#particular format: dataframe with country/year hierarchical index and feature as column
data = preprocess.impute_data_interpolation(data, predict_year-1, 'linear')

#Reshape data so that every column holds the time series of one country (one row per year)
data = data.reset_index()
data = data.pivot(index='Year', columns='Country', values=target)

#Convert the year index to a timestamp (1 January of each year).
#The explicit format prevents integer years from being read as nanoseconds since the epoch
data.index = pd.to_datetime(data.index.astype(str), format='%Y')

#Years from 1972 up to and including predict_year-1 are the training data.
#predict_year is the year we want to forecast and compare with the real values.
#Both bounds are derived from predict_year so the split follows the configuration cell
training_start = pd.Timestamp(1972, 1, 1)
training_end = pd.Timestamp(predict_year - 1, 1, 1)
training_data = data.loc[training_start:training_end]

real_values = data.loc[pd.Timestamp(predict_year, 1, 1), :]

training_data.head(5)

Country,Argentina,Armenia,Australia,Austria,Bangladesh,Belarus,Belgium,Bulgaria,Canada,China,...,Thailand,Tunisia,Turkey,Ukraine,United Kingdom,United States,Uruguay,Viet Nam,West Bank and Gaza,Zambia
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1972-01-01,0.4,16.9,1.0,0.2,29.9,11.6,0.2,0.0,0.5,66.2,...,19.6,15.0,2.4,3.0,0.5,0.5,0.0,52.9,1.1,54.1
1973-01-01,0.4,16.9,1.0,0.2,29.9,11.6,0.2,0.0,0.5,66.2,...,19.6,15.0,2.4,3.0,0.5,0.5,0.0,52.9,1.1,54.1
1974-01-01,0.4,16.9,1.0,0.2,29.9,11.6,0.2,0.0,0.5,66.2,...,19.6,15.0,2.4,3.0,0.5,0.5,0.0,52.9,1.1,54.1
1975-01-01,0.4,16.9,1.0,0.2,29.9,11.6,0.2,0.0,0.5,66.2,...,19.6,15.0,2.4,3.0,0.5,0.5,0.0,52.9,1.1,54.1
1976-01-01,0.4,16.9,1.0,0.2,29.9,11.6,0.2,0.0,0.5,66.2,...,19.6,15.0,2.4,3.0,0.5,0.5,0.0,52.9,1.1,54.1


Set the frequency (`freq`) of the time series index using the offset alias 'AS', which stands for 'Year Begin' (recent pandas versions spell this alias 'YS'). More details can be found on [this](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html) page. StatsModels will infer the frequency from this attribute of the dataframe's index.

In [146]:
#Declare the index as yearly, anchored at the start of each year.
#'YS' is the year-start offset alias; the older spelling 'AS' is deprecated in recent pandas releases
training_data.index.freq = 'YS'

#### Holt's method exponential smoothing

The method used here is based on chapter 7 of [1]. One model is created for every country. Note that in the machine learning methods that I used elsewhere in this project, one model covers all countries.

Note that I don't set values for alpha or beta when fitting the model. This lets the optimizer select the ideal values.

[1] Hyndman, Rob J., and George Athanasopoulos. Forecasting: principles and practice. OTexts, 2014.

In [147]:
#One-step-ahead forecast per country, indexed by country name.
#An explicit float dtype avoids the ambiguous default dtype of an empty Series
sim_exp_results = pd.Series(index=training_data.columns, dtype=float)

#Fit one additive-trend (Holt) model per country. Iterate over the training columns so the
#loop and the result index are guaranteed to refer to the same set of countries
for country in training_data.columns:
    #No smoothing parameters are passed to fit(), so the optimizer chooses them
    model = ExponentialSmoothing(training_data[country], trend='add')
    simple_fitted = model.fit()
    #forecast(1) returns a length-1 container; store its single value rather than the container
    sim_exp_results[country] = np.asarray(simple_fitted.forecast(1))[0]

In [151]:
#Score the forecasts against the observed values of the forecast year.
#Both series are passed as plain arrays, so they are compared position by position
observed = real_values.values
forecast = sim_exp_results.values
mse = mean_squared_error(observed, forecast)
print("RMSE of Holt method exponential smoothing:", np.sqrt(mse))

RMSE of Holt method exponential smoothing: 4.797448047023713
