In [1]:
import sys
sys.path.insert(0, '../../../src/helper')
import zbp_visualizer

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

In [2]:
import warnings
# Install a blanket "ignore" filter: every warning raised after this cell runs is
# silenced for the rest of the kernel session.
# NOTE(review): this presumably also hides model-fitting warnings emitted while the
# per-ZIP ARIMA models are trained — consider narrowing the filter to specific
# categories if those diagnostics matter.
warnings.filterwarnings('ignore')

# Data

In [4]:
# CSV location, relative to this notebook's directory.
file_path = '../../../src/data/temp/zbp_totals_with_features.csv'
# Parse the `year` column as datetimes while reading.
data = pd.read_csv(filepath_or_buffer=file_path, parse_dates=['year'])
# Preview the first rows.
data.head(5)

Unnamed: 0,zip,emp_nf,emp,qp1_nf,qp1,ap_nf,ap,est,year,naics_11_pct,...,n50_99_pct,n100_249_pct,n250_499_pct,n500_999_pct,n1000_pct,median_hh_income,total_population_x,total_population_y,total_retirement,total_midcareer
0,91901,H,4141,H,36304,H,174786,391,2012-01-01,0.0,...,0.01023,0.005115,0.002558,0.0,0.002558,76496,17034.0,17034.0,2691.0,3452.0
1,91902,G,2265,G,19111,G,81569,349,2012-01-01,0.002865,...,0.017192,0.002865,0.0,0.0,0.0,86099,17659.0,17659.0,3485.0,4031.0
2,91905,G,19,S,0,H,748,9,2012-01-01,0.0,...,0.0,0.0,0.0,0.0,0.0,70000,1088.0,1088.0,101.0,225.0
3,91906,D,0,D,0,D,0,27,2012-01-01,0.0,...,0.0,0.0,0.037037,0.0,0.0,54135,3679.0,3679.0,395.0,836.0
4,91910,G,19799,G,200767,G,805325,1458,2012-01-01,0.000686,...,0.021262,0.013032,0.002743,0.0,0.001372,55875,73761.0,73761.0,9195.0,19426.0


# Model

In [5]:
class ARIMAForecast():
    """Fit one univariate ARIMA model per ZIP code and forecast the ``est`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows. Must contain the columns ``zip``, ``year`` (datetime
        values, since ``.year`` is read from them) and ``est``.
    n_lag_terms : int
        First element of the ``order`` tuple handed to ``ARIMA`` (the
        autoregressive order ``p``).
    diff_order : int
        Second element of ``order`` (the differencing order ``d``).
    window_size : int
        Third element of ``order`` (the moving-average order ``q``).

    Attributes
    ----------
    models : dict
        Maps each successfully fitted ZIP code to ``(fit_results, start_time)``,
        where ``start_time`` is the first timestamp of that ZIP code's series.
    failed_zips : dict
        Maps each ZIP code whose fit raised to the exception that was raised.
    """

    def __init__(self, data, n_lag_terms ,diff_order ,window_size):
        self.data = data
        self.models = {}
        self.failed_zips = {}
        self.n_lag_terms = n_lag_terms
        self.diff_order = diff_order
        self.window_size = window_size

    @staticmethod
    def _is_integer_label(value):
        """Return True for plain or NumPy integers (booleans excluded)."""
        return isinstance(value, (int, np.integer)) and not isinstance(value, bool)

    def train(self):
        """Fit an ARIMA model on the ``est`` series of every ZIP code in ``self.data``.

        Successful fits are stored in ``self.models``. A ZIP code whose fit
        raises is skipped (training stays best-effort), but the exception is
        kept in ``self.failed_zips`` so the failure is not silent. Both
        dictionaries are rebuilt from scratch on every call, so a re-run never
        mixes in stale fits.
        """
        self.models = {}
        self.failed_zips = {}
        order = (self.n_lag_terms, self.diff_order, self.window_size)

        for zip_code in self.data['zip'].unique():
            # Select this ZIP code's series; sort it so index[0] really is the
            # earliest observation even if the input rows are not ordered.
            curr_data = (
                self.data.loc[self.data['zip'] == zip_code, ['year', 'est']]
                .set_index('year')
                .sort_index()
            )
            start_time = curr_data.index[0]

            model = ARIMA(curr_data, order=order)
            try:
                results = model.fit()
            except Exception as exc:
                # `Exception` (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate; the error is recorded for inspection.
                self.failed_zips[zip_code] = exc
                continue
            self.models[zip_code] = (results, start_time)

    def forecast(self, year):
        """Predict ``est`` for every fitted ZIP code up to and including ``year``.

        Parameters
        ----------
        year : int
            Last calendar year to forecast.

        Returns
        -------
        pandas.DataFrame
            One row per (ZIP code, forecast year) with columns ``year``
            (timestamps), ``est`` (predicted value) and ``zip``.

        Raises
        ------
        ValueError
            If no fitted models are available.
        """
        if not self.models:
            # pd.concat([]) would raise an opaque ValueError; fail with the
            # same exception type but an actionable message instead.
            raise ValueError(
                'No fitted models available: call train() first and check '
                'failed_zips if every fit failed.'
            )

        # Last year seen in the training set; forecasting starts one step after
        # it so values inside the training range are never predicted.
        data_last_year = self.data['year'].max().year

        preds = []
        for zip_code, (results, start_time) in self.models.items():
            # Positions are counted in years from this ZIP code's first observation.
            # NOTE(review): this assumes one observation per consecutive year for
            # each ZIP code — confirm there are no gaps in the training data.
            first_step = data_last_year - start_time.year + 1
            last_step = year - start_time.year
            curr_pred = results.predict(first_step, last_step)

            # Reshape the prediction series into a (year, est, zip) frame. The value
            # column is named either 0 or 'predicted_mean' depending on the series name.
            curr_pred = (
                curr_pred.to_frame()
                .assign(zip=np.full(curr_pred.shape[0], zip_code))
                .reset_index()
                .rename(columns={'index': 'year', 0: 'est', 'predicted_mean': 'est'})
            )

            # Some predictions come back labelled with integer step numbers
            # rather than timestamps. Anchor the largest step to `year` and
            # convert every integer label to the matching timestamp.
            is_step = curr_pred['year'].apply(self._is_integer_label)
            if is_step.any():
                max_int = curr_pred.loc[is_step, 'year'].max()
                curr_pred['year'] = curr_pred['year'].apply(
                    lambda label: pd.Timestamp(str(year - max_int + label))
                    if self._is_integer_label(label) else label
                )
            preds.append(curr_pred)

        return pd.concat(preds, ignore_index=True)
            

# Train-test Split

In [6]:
# Last year (inclusive) that belongs to the training set.
end_year = 2020
# Rows dated on or before 1 January of `end_year` are used for training ...
data_train = data.loc[data['year'].le(pd.Timestamp(year=end_year, month=1, day=1))]
# ... and rows dated strictly after it are held out for evaluation.
data_test = data.loc[data['year'].gt(pd.Timestamp(year=end_year, month=1, day=1))]

# Evaluate

In [7]:
# Fit one ARIMA model per ZIP code with order (1, 1, 1).
model = ARIMAForecast(data_train, 1, 1, 1)
model.train()
# Forecast through the last year present in the held-out data.
forecast = model.forecast(data_test['year'].max().year)
# Default (inner) merge: only (zip, year) pairs that have both a prediction and a
# true value are scored.
preds_labels = forecast.merge(data_test, on=['zip', 'year'], suffixes=('_pred', '_true'))
# RMSE. The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and later removed; taking the square root explicitly gives the
# same value on every version.
np.sqrt(mean_squared_error(preds_labels['est_true'], preds_labels['est_pred']))

21.740434312894557

# Visualize Predictions

In [None]:
# Most recent year covered by the held-out data.
last_year = data_test['year'].max().year
# Keep only that year's predictions, reduced to the columns named in the call below.
preds_last_year = preds_labels.loc[
    preds_labels['year'].eq(pd.Timestamp(str(last_year))),
    ['zip', 'est_pred'],
]
zbp_visualizer.generate_zbp_chloropleth(
    preds_last_year, 'zip', 'est_pred', f'arima_{last_year}_preds'
)