# Testing on States


In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import importlib
from seaborn import set_style
set_style=("whitegrid")

In [3]:
df = pd.read_csv("D:\Graduate Center Dropbox\Yuan Liu\Data Science Project\data\destination_State_d_2018.csv")
unique_states = df['Masked_DestinationState'].drop_duplicates()


  df = pd.read_csv("D:\Graduate Center Dropbox\Yuan Liu\Data Science Project\data\destination_State_d_2018.csv")


#### Explanation. 
Preprocessing the state data sets. This takes about 100 minutes. 

In [None]:
import os
import preprocessing
statedict={}
scalerdict = {}
rowsdict = {}
for state in unique_states:
    try:
        L=[]
        S=[]
        rowsdict[state]=0
        for i in range(2018,2023): #adjust here when we get more years of data
            if not os.path.exists('D:/Graduate Center Dropbox/Yuan Liu/Data Science Project/parquetfiles/'+str(i)+'states/state_'+state+'.parquet'):
                raise FileNotFoundError(f"{state} not found")
            else:
                df=pd.read_parquet('D:/Graduate Center Dropbox/Yuan Liu/Data Science Project/parquetfiles/'+str(i)+'states/state_'+state+'.parquet')
                data, scaler=preprocessing.get_processed_series(df)
                L.append(data)
                S.append(scaler)
                rowsdict[state]=rowsdict[state]+df.shape[0]
        scalerdict[state]=S
        statedict[state]=pd.concat(L, ignore_index=True)
        print(state)
    except FileNotFoundError as e:
        print(e)
        continue

In [50]:
for state in statedict:
    statedict[state].to_csv('state_timeseries/' +state+'.csv')

#### Explanation:
Earlier testing had shown that for the state datasets, the best performing model was usually arima or some combination of another model and arima. 

#### Results for some states. 5-fold test-train split and 14 day horizon: 
| Model                              | MSE      | MAE      | NMSE     |
|------------------------------------|----------|----------|----------|
| arima                              | 0.007985 | 0.069237 | 0.145986 |
| smoothing_arima_mult_60            | 0.011781 | 0.083090 | 0.215386 |
| expsmoothing_predict               | 0.026719 | 0.135345 | 0.488499 |
| prophet_arima_mult_60              | 0.043296 | 0.193694 | 0.791573 |
| constant_predict                   | 0.045169 | 0.185501 | 0.825808 |
| prophet_arima_withlockdown_mult_60 | 0.045963 | 0.200433 | 0.840332 |
| naive_predict                      | 0.054696 | 0.207830 | 1.000000 |
| prophet_predict                    | 0.066626 | 0.221163 | 1.218112 |
| prophet_predict_withlockdown       | 0.071463 | 0.230041 | 1.306538 |

---

| Model                              | MSE      | MAE      | NMSE     |
|------------------------------------|----------|----------|----------|
| smoothing_arima_mult_60            | 0.024166 | 0.121657 | 0.243903 |
| arima                              | 0.025288 | 0.124166 | 0.255231 |
| prophet_arima_mult_60              | 0.030775 | 0.135252 | 0.310610 |
| prophet_predict                    | 0.034370 | 0.145436 | 0.346894 |
| prophet_arima_withlockdown_mult_60 | 0.034663 | 0.142469 | 0.349848 |
| prophet_predict_withlockdown       | 0.042451 | 0.166660 | 0.428456 |
| constant_predict                   | 0.063579 | 0.228481 | 0.641695 |
| expsmoothing_predict               | 0.069580 | 0.219339 | 0.702263 |
| naive_predict                      | 0.099080 | 0.282994 | 1.000000 |

---

| Model                              | MSE      | MAE      | NMSE     |
|------------------------------------|----------|----------|----------|
| smoothing_arima_mult_60            | 0.013612 | 0.086667 | 0.202720 |
| prophet_arima_withlockdown_mult_60 | 0.014033 | 0.090110 | 0.208993 |
| prophet_arima_mult_60              | 0.014170 | 0.090703 | 0.211026 |
| prophet_predict_withlockdown       | 0.015100 | 0.086135 | 0.224886 |
| arima                              | 0.015156 | 0.092909 | 0.225717 |
| prophet_predict                    | 0.016409 | 0.094101 | 0.244378 |
| expsmoothing_predict               | 0.050661 | 0.180158 | 0.754489 |
| constant_predict                   | 0.066803 | 0.235438 | 0.994884 |
| naive_predict                      | 0.067146 | 0.232929 | 1.000000 |

---

| Model                              | MSE      | MAE      | NMSE     |
|------------------------------------|----------|----------|----------|
| arima                              | 0.009272 | 0.067246 | 0.120145 |
| smoothing_arima_mult_60            | 0.009972 | 0.070253 | 0.129212 |
| prophet_arima_withlockdown_mult_60 | 0.016693 | 0.101468 | 0.216302 |
| prophet_arima_mult_60              | 0.021541 | 0.118026 | 0.279119 |
| prophet_predict_withlockdown       | 0.036426 | 0.176712 | 0.471995 |
| prophet_predict                    | 0.042576 | 0.194689 | 0.551677 |
| constant_predict                   | 0.056545 | 0.187477 | 0.732683 |
| expsmoothing_predict               | 0.057328 | 0.186379 | 0.742821 |
| naive_predict                      | 0.077176 | 0.253326 | 1.000000 |


In [40]:
largest_states = dict(sorted(rowsdict.items(), key=lambda item: item[1], reverse=True))
largest_list=list(largest_states.keys())

In [None]:
import forecasting
from itertools import islice
params = {
    'type': ['mult'],
    'window': [180]
}
results_dict={}
for state in largest_list[:5]:
    print(state)
    if state in statedict:
        results_dict[state]=forecasting.ttsplit_predictions(statedict[state], 5, 14, extra_models=[], printprogress=True, do_arima=True, smoothing_params=params, fixed_residue_models=True)


In [48]:
from forecasting import holdout_values
for state in results_dict:
    print(forecasting.evaluate_predictions(holdout_values(statedict[state].fillna(0), 5,14), results_dict[state]).sort_values(by='NMSE'))


                                 Model       MSE       MAE      NMSE
3             smoothing_arima_mult_180  0.007861  0.070061  0.143725
4               prophet_arima_mult_180  0.007871  0.069551  0.143912
0                                arima  0.007985  0.069237  0.145986
5  prophet_arima_withlockdown_mult_180  0.009354  0.076441  0.171012
2                     constant_predict  0.045169  0.185501  0.825808
1                        naive_predict  0.054696  0.207830  1.000000
                                 Model       MSE       MAE      NMSE
4               prophet_arima_mult_180  0.025648  0.104347  0.213916
5  prophet_arima_withlockdown_mult_180  0.025719  0.104933  0.214502
3             smoothing_arima_mult_180  0.026052  0.105167  0.217283
0                                arima  0.028443  0.108505  0.237226
2                     constant_predict  0.073849  0.245204  0.615923
1                        naive_predict  0.119899  0.315722  1.000000
                                 M

In [144]:

import os
import dask.dataframe as dd
test_dict = {}
for state in largest_states:
    if os.path.exists('../parquetfiles/2023states/state_'+state+'.parquet'):
        df=dd.read_parquet('../parquetfiles/2023states/state_'+state+'.parquet')
        test_dict[state]=df

In [157]:
import loadingdata
for state in test_dict:
    test_dict[state]=loadingdata.filtered_df(test_dict[state], ['eTimes_03'])

In [None]:
import preprocessing
importlib.reload(preprocessing)
test_series={}
test_scalers={}
for state in test_dict:
    test_series[state], test_scalers[state]=preprocessing.get_processed_series(test_dict[state])

In [134]:
total_dict={}
for state in largest_states:
    if state in statedict and state in test_dict:
        total_dict[state]=pd.concat([statedict[state], test_dict[state]])

In [None]:
import forecasting
importlib.reload(forecasting)
params = {
    'type': ['mult'],
    'window': [120]
}

final_dict2={}
for county in total_dict:
    print(county)
    final_dict2[county]=forecasting.ttsplit_predictions(total_dict[county], 10, 14, fixed_residue_models=True,smoothing_params=params, printprogress=False, do_arima=True)


In [163]:
for state in final_dict2:
    print(state)
    scaler=test_scalers[state]
    av=holdout_values(total_dict[state].fillna(0), 10,14).reshape(-1,1)
    av=scaler.inverse_transform(av)
    preds={}
    for model in final_dict2[state]:
        preds[model]=scaler.inverse_transform(final_dict2[state][model].reshape(-1,1))

    print(forecasting.evaluate_predictions(av, preds).sort_values(by='MAE'))


GG20J
                                 Model           MSE         MAE      NMSE
5  prophet_arima_withlockdown_mult_120  2.000064e+05  329.198209  0.159648
3             smoothing_arima_mult_120  1.998163e+05  334.779607  0.159496
0                                arima  2.355673e+05  347.358173  0.188033
4               prophet_arima_mult_120  2.186270e+05  348.695276  0.174511
2                     constant_predict  3.855355e+05  495.712774  0.307740
1                        naive_predict  1.252795e+06  979.400394  1.000000
AO6O4
                                 Model            MSE         MAE      NMSE
5  prophet_arima_withlockdown_mult_120  164246.126316  244.735054  0.194815
4               prophet_arima_mult_120  164316.292349  245.102250  0.194898
3             smoothing_arima_mult_120  163913.673383  245.634838  0.194421
0                                arima  169831.032729  258.231570  0.201439
2                     constant_predict  532236.182190  584.996918  0.631294
1      

Prophet arima performs much better at the state level than at the county level. This may because the increase in datasize makes it possible for a more complicated model to do well. 

In [165]:
total_monthly={}
for state in total_dict:
    total_monthly[state]=preprocessing.convert_to_monthly(total_dict[state])

In [None]:
from forecasting import expsmoothing_predict, monthly_prophet_predict, monthly_prophet_predict_withlockdown
monthly_predictionsdict={}
params = {
    'type': ['mult'],
    'window': [6]
}

for state in total_monthly:
    monthly_predictionsdict[state]=forecasting.ttsplit_predictions(total_monthly[state], 1,12,extra_models=[monthly_prophet_predict_withlockdown, monthly_prophet_predict, expsmoothing_predict], smoothing_params=params, fixed_residue_models=True, residue_window_size=6, monthly=True)

In [179]:
for state in monthly_predictionsdict:
    print(state)
    scaler=test_scalers[state]
    av=holdout_values(total_monthly[state].fillna(0), 1,12).reshape(-1,1)
    av=scaler.inverse_transform(av)
    preds={}
    for model in monthly_predictionsdict[state]:
        preds[model]=scaler.inverse_transform(monthly_predictionsdict[state][model].reshape(-1,1))
    print(forecasting.evaluate_predictions(av, preds).sort_values(by='MAE'))


GG20J
                                  Model           MSE           MAE       NMSE
0                                 arima  7.225390e+07   7213.175074   0.168062
2                      constant_predict  7.361833e+07   7281.402224   0.171236
6                smoothing_arima_mult_6  2.068546e+08  12911.124765   0.481142
4               monthly_prophet_predict  2.703686e+08  13695.688089   0.628875
3  monthly_prophet_predict_withlockdown  3.009666e+08  14671.593499   0.700045
1                         naive_predict  4.299245e+08  19710.021249   1.000000
5                  expsmoothing_predict  8.679547e+08  28875.070633   2.018854
8     prophet_arima_withlockdown_mult_6  8.613867e+09  85713.034964  20.035767
7                  prophet_arima_mult_6  9.060580e+09  86791.259331  21.074818
AO6O4
                                  Model           MSE           MAE  \
0                                 arima  4.361848e+07   4832.744391   
1                         naive_predict  4.361848e+07   