In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

import pickle

import warnings

In [None]:
from statsmodels.tsa.arima.model import ARIMA

class ARIMAForecast():
    """Fit one ARIMA model per zip code and forecast the ``est`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows; the columns ``zip``, ``year`` and ``est`` are read.
    n_lag_terms, diff_order, window_size
        Passed to ``ARIMA`` as ``order=(n_lag_terms, diff_order, window_size)``.

    Attributes
    ----------
    models : dict
        Maps zip code -> ``(fitted results, first training year)`` for every
        zip code whose fit succeeded.
    failed : dict
        Maps zip code -> the exception raised by ``fit()`` for every zip code
        whose fit failed, so skipped series can be inspected afterwards.
    """

    # Column layout of the frame returned by ``forecast``.
    _FORECAST_COLUMNS = ['year', 'est', 'zip']

    def __init__(self, data, n_lag_terms ,diff_order ,window_size):
        self.data = data
        self.models = {}
        self.failed = {}
        self.n_lag_terms = n_lag_terms
        self.diff_order = diff_order
        self.window_size = window_size

    def train(self):
        """Fit an ARIMA model on each zip code's ``est`` series.

        Successful fits are stored in ``self.models``. A fit that raises is
        skipped (best effort, as before) but the exception is now kept in
        ``self.failed`` instead of being silently discarded.
        """
        order = (self.n_lag_terms, self.diff_order, self.window_size)
        for zip_code in self.data['zip'].unique():
            # Isolate this zip code's series, ordered by year so that the first
            # index entry really is the earliest training year.
            curr_data = (self.data[self.data['zip'] == zip_code][['year', 'est']]
                         .set_index('year')
                         .sort_index())
            start_time = curr_data.index[0]
            model = ARIMA(curr_data, order=order)
            try:
                results = model.fit()
            except Exception as exc:
                # `Exception` (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate while still tolerating bad series.
                self.failed[zip_code] = exc
                continue
            self.models[zip_code] = (results, start_time)

    def forecast(self, year):
        """Predict ``est`` for every trained zip code up to and including ``year``.

        Parameters
        ----------
        year
            Last year to forecast.

        Returns
        -------
        pandas.DataFrame
            Columns ``year``, ``est`` and ``zip``, one row per forecast step
            and zip code. Empty (with those columns) when no model was trained.
        """
        # Last year seen in the training set; forecasting starts one step after
        # it so that values inside the training range are not predicted.
        # NOTE(review): this is the maximum over all zip codes, so the offsets
        # below assume each series runs without gaps up to that year - confirm.
        data_last_year = self.data['year'].max()

        def _is_int(value):
            # Accept numpy integers as well as built-in ints.
            return isinstance(value, (int, np.integer))

        preds = []
        for zip_code, (results, start_time) in self.models.items():
            # Offsets are expressed relative to the first training year.
            curr_pred = results.predict(data_last_year - start_time + 1, year - start_time)
            # Reshape the predicted series into a (year, est, zip) frame.
            curr_pred = (curr_pred.to_frame()
                         .assign(zip=zip_code)
                         .reset_index()
                         .rename(columns={'index': 'year', 0: 'est', 'predicted_mean': 'est'}))
            # Some predictions come back labelled with an integer offset rather
            # than a timestamp; shift those so the largest offset maps to `year`.
            int_mask = curr_pred['year'].apply(_is_int)
            if int_mask.any():
                max_int = curr_pred.loc[int_mask, 'year'].max()
                curr_pred['year'] = curr_pred['year'].apply(
                    lambda x: year - max_int + x if _is_int(x) else x)
            preds.append(curr_pred)

        if not preds:
            # pd.concat raises on an empty list; return an empty frame instead.
            return pd.DataFrame(columns=self._FORECAST_COLUMNS)
        return pd.concat(preds, ignore_index=True)
            

# Data

In [None]:
# Read the ZBP totals-with-features CSV into `data`.
file_path = '../../src/data/temp/zbp_totals_with_features.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Read the lagged ZBP totals-with-features CSV into `lagged_data`.
file_path = '../../src/data/temp/lagged_zbp_totals_with_features.csv'
lagged_data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Keep only the rows whose year is 2020 or earlier, in both frames.
data = data.loc[data['year'].le(2020)]
lagged_data = lagged_data.loc[lagged_data['year'].le(2020)]

# Drop Categorical Flags

In [None]:
# Drop every column whose dtype is neither int64 nor float64, frame by frame.
non_numerical_cols = data.select_dtypes(exclude=['int64', 'float64']).columns
data = data.drop(non_numerical_cols, axis='columns')

non_numerical_cols = lagged_data.select_dtypes(exclude=['int64', 'float64']).columns
lagged_data = lagged_data.drop(non_numerical_cols, axis='columns')

# Train-Test Split

### Short-Term

In [None]:
# Short-term split: train on years up to and including `end_year`, test on later years.
end_year = 2020 - 1

short_data_train = data.loc[data['year'].le(end_year)]
short_data_test = data.loc[data['year'].gt(end_year)]

short_lagged_data_train = lagged_data.loc[lagged_data['year'].le(end_year)]
short_lagged_data_test = lagged_data.loc[lagged_data['year'].gt(end_year)]

### Long-Term

In [None]:
# Long-term split: train on years up to and including `end_year`, test on later years.
end_year = 2018 - 1

long_data_train = data.loc[data['year'].le(end_year)]
long_data_test = data.loc[data['year'].gt(end_year)]

long_lagged_data_train = lagged_data.loc[lagged_data['year'].le(end_year)]
long_lagged_data_test = lagged_data.loc[lagged_data['year'].gt(end_year)]

# Standardization

In [None]:
# Per-column mean and standard deviation, computed on the training rows only.
# The `zip` entries are overridden with mean 0 / std 1 so that standardizing
# leaves the zip column unchanged.
short_lagged_train_mean = short_lagged_data_train.mean()
short_lagged_train_std = short_lagged_data_train.std()
short_lagged_train_mean.loc['zip'] = 0
short_lagged_train_std.loc['zip'] = 1

long_lagged_train_mean = long_lagged_data_train.mean()
long_lagged_train_std = long_lagged_data_train.std()
long_lagged_train_mean.loc['zip'] = 0
long_lagged_train_std.loc['zip'] = 1

In [None]:
# Standardize train and test frames with the statistics of the matching training split.
short_lagged_data_train = short_lagged_data_train.sub(short_lagged_train_mean).div(short_lagged_train_std)
short_lagged_data_test = short_lagged_data_test.sub(short_lagged_train_mean).div(short_lagged_train_std)

long_lagged_data_train = long_lagged_data_train.sub(long_lagged_train_mean).div(long_lagged_train_std)
long_lagged_data_test = long_lagged_data_test.sub(long_lagged_train_mean).div(long_lagged_train_std)

# Forward Feature Selection

In [None]:
# from mlxtend.feature_selection import SequentialFeatureSelector

In [None]:
# X_train = data_train.drop(columns=['est'])
# y_train = data_train['est']
# X_test = data_test.drop(columns=['est'])
# y_test = data_test['est']

In [None]:
# ffs = SequentialFeatureSelector(LinearRegression(n_jobs=-1), k_features=11, forward=True, n_jobs=-1)
# ffs.fit(X_train, y_train)
# features = list(ffs.k_feature_names_)
# features

In [None]:
# pl, train_rmse, test_rmse = fit_eval(pl, data_train, data_test, features)
# print('train_rmse: ', train_rmse)
# print('test_rmse: ', test_rmse)

# Correlation Matrix

In [None]:
# data_ohe_train = preproc.fit_transform(data_train)
# feature_names = preproc.get_feature_names_out()
# feature_names = np.char.replace(feature_names.astype('str'), 'onehots__','')
# feature_names = np.char.replace(feature_names, 'remainder__','')

# data_ohe_train = pd.DataFrame(data_ohe_train, columns=feature_names)

# data_ohe_test = preproc.transform(data_test)
# data_ohe_test = pd.DataFrame(data_ohe_test, columns=feature_names)

In [None]:
# top_k = -3
# corr = data_ohe_train.corr()[['est']].sort_values(by='est', ascending=False)
# vmin = corr.min()
# vmax = corr.max()
# corr_thresh = corr.abs().sort_values('est', ascending=False).iloc[top_k+2]['est']
# corr = corr[corr['est'].abs() > corr_thresh]
# print(f'top {corr.shape[0]} features:')
# corr[2:].style.background_gradient(cmap='coolwarm', vmin=vmin, vmax=vmax)

# Models

In [None]:
def unstandardize_series(ser, mean, std):
    """Undo a z-score transform: scale ``ser`` by ``std``, then shift by ``mean``."""
    rescaled = ser * std
    return rescaled + mean

def fit_eval(model, data_train, data_test, included_feats, train_mean, train_std):
    """Fit ``model`` on the training split and score it on both splits.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    data_train, data_test : pandas.DataFrame
        Frames containing the feature columns and the target column ``est``.
    included_feats
        Column labels used as features.
    train_mean, train_std
        Mappings whose ``'est'`` entries are used to convert targets and
        predictions back to original units before scoring.

    Returns
    -------
    tuple
        ``(model, train_rmse, test_rmse)``: the fitted model and the RMSE on
        each split, measured in un-standardized units.
    """
    est_mean = train_mean['est']
    est_std = train_std['est']

    def _rmse_in_original_units(X, y):
        # Score in original units so the RMSE can be compared across models.
        y_true = unstandardize_series(y, est_mean, est_std)
        y_pred = unstandardize_series(model.predict(X), est_mean, est_std)
        # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
        # taking the square root of the MSE works on every version.
        return np.sqrt(mean_squared_error(y_true, y_pred))

    X_train = data_train[included_feats]
    y_train = data_train['est']
    X_test = data_test[included_feats]
    y_test = data_test['est']

    model.fit(X_train, y_train)

    train_rmse = _rmse_in_original_units(X_train, y_train)
    test_rmse = _rmse_in_original_units(X_test, y_test)

    return model, train_rmse, test_rmse

### Linear Regression

In [None]:
# One-hot encode `zip` (unknown categories are ignored), pass every other
# column through unchanged, then fit an ordinary linear regression.
preproc = ColumnTransformer(
    transformers=[('onehots', OneHotEncoder(handle_unknown='ignore'), ['zip'])],
    remainder='passthrough',
)
pl = Pipeline([('preproc', preproc), ('lr', LinearRegression(n_jobs=-1))])

In [None]:
# Short-term evaluation: every column except the target `est` is a feature.
pl, train_rmse, test_rmse = fit_eval(
    model=pl,
    data_train=short_lagged_data_train,
    data_test=short_lagged_data_test,
    included_feats=short_lagged_data_train.columns.drop(['est']),
    train_mean=short_lagged_train_mean,
    train_std=short_lagged_train_std,
)
print('train_rmse: ', train_rmse)
print('test_rmse: ', test_rmse)

In [None]:
# Long-term evaluation: every column except the target `est` is a feature.
pl, train_rmse, test_rmse = fit_eval(
    model=pl,
    data_train=long_lagged_data_train,
    data_test=long_lagged_data_test,
    included_feats=long_lagged_data_train.columns.drop(['est']),
    train_mean=long_lagged_train_mean,
    train_std=long_lagged_train_std,
)
print('train_rmse: ', train_rmse)
print('test_rmse: ', test_rmse)

### Lasso

In [None]:
# One-hot encode `zip` (unknown categories are ignored), pass every other
# column through unchanged, then fit a Lasso regression with alpha=0.007.
preproc = ColumnTransformer(
    transformers=[('onehots', OneHotEncoder(handle_unknown='ignore'), ['zip'])],
    remainder='passthrough',
)
pl = Pipeline([('preproc', preproc), ('lr', Lasso(alpha=0.007))])

In [None]:
# Short-term evaluation: every column except the target `est` is a feature.
pl, train_rmse, test_rmse = fit_eval(
    model=pl,
    data_train=short_lagged_data_train,
    data_test=short_lagged_data_test,
    included_feats=short_lagged_data_train.columns.drop(['est']),
    train_mean=short_lagged_train_mean,
    train_std=short_lagged_train_std,
)
print('train_rmse: ', train_rmse)
print('test_rmse: ', test_rmse)

In [None]:
# Long-term evaluation: every column except the target `est` is a feature.
pl, train_rmse, test_rmse = fit_eval(
    model=pl,
    data_train=long_lagged_data_train,
    data_test=long_lagged_data_test,
    included_feats=long_lagged_data_train.columns.drop(['est']),
    train_mean=long_lagged_train_mean,
    train_std=long_lagged_train_std,
)
print('train_rmse: ', train_rmse)
print('test_rmse: ', test_rmse)

### Random Forest

In [None]:
# One-hot encode `zip` (unknown categories are ignored), pass every other
# column through unchanged, then fit a seeded random forest regressor.
preproc = ColumnTransformer(
    transformers=[('onehots', OneHotEncoder(handle_unknown='ignore'), ['zip'])],
    remainder='passthrough',
)
pl = Pipeline([
    ('preproc', preproc),
    ('lr', RandomForestRegressor(n_estimators=50, max_depth=50, random_state=42, n_jobs=-1)),
])

In [None]:
# Short-term evaluation: every column except the target `est` is a feature.
pl, train_rmse, test_rmse = fit_eval(
    model=pl,
    data_train=short_lagged_data_train,
    data_test=short_lagged_data_test,
    included_feats=short_lagged_data_train.columns.drop(['est']),
    train_mean=short_lagged_train_mean,
    train_std=short_lagged_train_std,
)
print('train_rmse: ', train_rmse)
print('test_rmse: ', test_rmse)

In [None]:
# Long-term evaluation: every column except the target `est` is a feature.
pl, train_rmse, test_rmse = fit_eval(
    model=pl,
    data_train=long_lagged_data_train,
    data_test=long_lagged_data_test,
    included_feats=long_lagged_data_train.columns.drop(['est']),
    train_mean=long_lagged_train_mean,
    train_std=long_lagged_train_std,
)
print('train_rmse: ', train_rmse)
print('test_rmse: ', test_rmse)

### ARIMA

In [None]:
# Short-term ARIMA(1, 1, 1): one model per zip code, trained on the un-standardized data.
model = ARIMAForecast(short_data_train, 1, 1, 1)
with warnings.catch_warnings():
    # Silence the warnings raised while fitting the per-zip models.
    warnings.simplefilter("ignore")
    model.train()
forecast = model.forecast(short_data_test['year'].max())
# Inner merge: only (zip, year) pairs present in both the forecast and the
# test set are scored, so zip codes without a fitted model are left out.
preds_labels = forecast.merge(short_data_test, on=['zip', 'year'], suffixes=('_pred', '_true'))
# RMSE. `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6,
# so take the square root of the MSE explicitly.
np.sqrt(mean_squared_error(preds_labels['est_true'], preds_labels['est_pred']))

In [None]:
# Long-term ARIMA(1, 1, 1): one model per zip code, trained on the un-standardized data.
model = ARIMAForecast(long_data_train, 1, 1, 1)
with warnings.catch_warnings():
    # Silence the warnings raised while fitting the per-zip models.
    warnings.simplefilter("ignore")
    model.train()
forecast = model.forecast(long_data_test['year'].max())
# Inner merge: only (zip, year) pairs present in both the forecast and the
# test set are scored, so zip codes without a fitted model are left out.
preds_labels = forecast.merge(long_data_test, on=['zip', 'year'], suffixes=('_pred', '_true'))
# RMSE. `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6,
# so take the square root of the MSE explicitly.
np.sqrt(mean_squared_error(preds_labels['est_true'], preds_labels['est_pred']))