In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import statsmodels.api as sm
import warnings
from sklearn.model_selection import train_test_split
from itertools import combinations
from scipy import stats
from datetime import datetime
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf
from sklearn.metrics import mean_absolute_error
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# Paths of the two rainfall tables read by this notebook.
file = 'rainfalldata.csv'
file2 = 'ncrainfalldata.csv'

# `rd` ends up indexed by its parsed `Date` column.
rd = pd.read_csv(file)
rd = rd.assign(Date=pd.to_datetime(rd.Date)).set_index('Date')

# `ncrd` is kept exactly as read from disk.
ncrd = pd.read_csv(file2)

In [2]:
from ipynb.fs.full.Data_Wrangling_CAP1 import exofind
# The first row of latlong.csv holds one comma-separated string per column;
# each string is split into a list of its parts.
latlong = pd.read_csv('latlong.csv')
latlongsplit = latlong.iloc[0].apply(str.split, sep=',')

# One row per column label of latlong.csv, minus the labels listed below.
latlongdf = pd.DataFrame(latlongsplit).drop([
    'Unnamed: 0',
    'Raleigh AP, NC',
    'Greensboro, NC',
    ' WILMINGTON 7 N, NC',
    'LUMBERTON, NC',
    'MYRTLE BEACH, SC',
    'CHARLOTTE DOUGLAS AIRPORT, NC',
    'GRNVL SPART INTL AP, SC',
    'PICKENS, SC',
    ' MT. MITCHELL, NC',
    ' Caesars Head Area, SC',
])


   Year  Month
0  1887      1
1  1888      1
2  1889      1
3  1890      1
4  1891      1




<class 'pandas.core.frame.DataFrame'>
Index: 472 entries, 1-1980 to 4-2019
Columns: 235 entries, Raleigh, NC to row_number
dtypes: float64(234), int64(1)
memory usage: 890.2+ KB




In [3]:
# If this cell does not work, comment it out and run the next cell instead.

# Split the column labels of `rd` by the state suffix at the end of each name.
locations = rd.columns
ncloc, valoc, scloc, galoc, tnloc = (
    locations[locations.str.endswith(state)]
    for state in ('NC', 'VA', 'SC', 'GA', 'TN')
)

# All non-NC labels, concatenated in the order VA, GA, SC, TN.
exoloc = valoc.append(galoc).append(scloc).append(tnloc)

# `exofind` comes from the Data_Wrangling_CAP1 notebook; its result is later
# iterated with `.items()` by `exogenous_var`.
exogen = exofind(latlongdf, ncloc, exoloc)

In [None]:
# This cell restores the exogen dictionary that was saved with %store in the Data_Wrangling_CAP1 Jupyter
# notebook (the notebook imported above), instead of recomputing it.

# %store -r exogen
# exogen

In [15]:
def sarima_model_creation(data, p, d, q, P, D, Q, m, exog=None):
    """Build a SARIMAX model for ``data``, fit it, and return the fit result.

    ``[p, d, q]`` is handed over as ``order`` and ``[P, D, Q, m]`` as
    ``seasonal_order``; ``exog`` is forwarded as the second positional
    argument.  Stationarity and invertibility are not enforced, and the
    model is created with ``initialization='approximate_diffuse'``.
    """
    model = sm.tsa.statespace.SARIMAX(
        data,
        exog,
        order=[p, d, q],
        seasonal_order=[P, D, Q, m],
        enforce_stationarity=False,
        enforce_invertibility=False,
        initialization='approximate_diffuse',
    )
    return model.fit()

In [16]:
def iteration_hyper(it):
    """Return every ``[AR, MA, SAR, SMA]`` list with each entry in ``range(it)``.

    AR varies slowest and SMA fastest, so the result has ``it ** 4`` entries
    starting at ``[0, 0, 0, 0]``.
    """
    orders = range(it)
    return [
        [ar, ma, sar, sma]
        for ar in orders
        for ma in orders
        for sar in orders
        for sma in orders
    ]


# Full grid with every order between 0 and 4 inclusive.
config = iteration_hyper(5)

In [17]:
def hyperparameter_find(training_data, comb, testing_data, search = False, exogtr = None, exogtest = None):
    """Score SARIMA configurations by walk-forward, one-step-ahead MAE.

    For each ``[AR, MA, SAR, SMA]`` entry of ``comb`` the model is refit once
    per test observation through ``sarima_model_creation`` (non-seasonal order
    ``(AR, 0, MA)``, seasonal order ``(SAR, 0, SMA, 12)``), one step is
    forecast, and the true observation is appended to the training window
    before the next refit.

    Parameters
    ----------
    training_data : pandas.Series
        Observations used for the first fit.
    comb : iterable
        Configurations to evaluate; each entry is indexed ``[0]`` to ``[3]``.
    testing_data : pandas.Series
        Hold-out observations, forecast one at a time in order.
    search : bool, default False
        Only the literal ``True`` switches on search mode, in which every
        configuration is printed together with its MAE.
    exogtr : pandas.DataFrame, optional
        Exogenous regressors for the training window.  When given, each
        forecast is fed the row 12 positions from the end of the regressors
        accumulated so far (the original note calls this "the data from the
        year before"), and row ``i`` of ``exogtest`` is appended afterwards.
    exogtest : pandas.DataFrame, optional
        Exogenous regressors for the test window; needed whenever ``exogtr``
        is given.

    Returns
    -------
    tuple of str or float
        In search mode, the labelled orders of the configuration with the
        lowest MAE (the first one wins ties).  Otherwise the MAE of the last
        configuration in ``comb``.

    Raises
    ------
    ValueError
        If ``comb`` is empty, or if search mode finds no configuration whose
        MAE compares below infinity.
    """
    # Start from +inf instead of an arbitrary cap so that the first scored
    # configuration is always accepted, whatever the scale of the data.
    leastmae = float('inf')
    best = None
    mae = None
    for com in comb:
        ar, ma, sar, sma = com[0], com[1], com[2], com[3]
        li_one_step = []
        # Every configuration starts again from the untouched training window.
        copytraining = training_data.copy()
        excopy = exogtr.copy() if exogtr is not None else None
        for i in range(len(testing_data)):
            if excopy is not None:
                mod_1 = sarima_model_creation(copytraining, ar, 0, ma, sar, 0, sma, 12, exog=excopy)
                # uses the data from the year before
                one_step_pred = mod_1.forecast(exog=excopy.iloc[[-12]])
                excopy = pd.concat([excopy, exogtest.iloc[[i]]])
            else:
                mod_1 = sarima_model_creation(copytraining, ar, 0, ma, sar, 0, sma, 12)
                one_step_pred = mod_1.forecast()
            # Take the single forecast value by position: a plain ``[0]`` on a
            # Series is a label lookup unless pandas falls back to positions,
            # and that fallback is deprecated.
            if hasattr(one_step_pred, 'iloc'):
                li_one_step.append(one_step_pred.iloc[0])
            else:
                li_one_step.append(one_step_pred[0])
            # Same reasoning: select the i-th test observation by position.
            copytraining = pd.concat([copytraining, testing_data.iloc[[i]]])
        mae = mean_absolute_error(testing_data, li_one_step)
        if search is True:
            if mae < leastmae:
                leastmae = mae
                best = (ar, ma, sar, sma)
            print(com,mae)
    if mae is None:
        raise ValueError('comb must contain at least one configuration')
    if search is True:
        if best is None:
            raise ValueError('no configuration produced a comparable MAE')
        H_AR, H_MA, H_SAR, H_SMA = best
        return('AR: '+ str(H_AR), 'MA: ' +str(H_MA), 'SAR: '+str(H_SAR), 'SMA: '+str(H_SMA))
    return(mae)

In [18]:
def exog_combinations(df, exoe):
    """List the column subsets of ``df`` that can be built from ``exoe``.

    The result is empty when ``exoe`` is empty.  Otherwise it starts with the
    selection of all of ``exoe``; when ``exoe`` has more than one entry this
    is followed by one single-column frame per entry and then, for every size
    from 2 up to ``len(exoe) - 1``, the selection of each combination of that
    size.
    """
    n_vars = len(exoe)
    if n_vars == 0:
        return []

    # The complete set always comes first.
    frames = [df.loc[:, exoe]]
    if n_vars == 1:
        return frames

    # Then every variable on its own ...
    frames.extend(df.loc[:, [name]] for name in exoe)
    # ... and every proper subset of two or more variables.
    for size in range(2, n_vars):
        frames.extend(df.loc[:, subset] for subset in combinations(exoe, size))
    return frames

In [19]:
def exogenous_var(data, exo_dict, best_comb):
    """Find, per target column, the exogenous column sets that lower the MAE.

    For every ``key`` of ``exo_dict`` the series ``data[key]`` is split
    80/20 without shuffling and scored with ``hyperparameter_find`` using no
    exogenous data.  Each frame returned by
    ``exog_combinations(data, exo_dict[key])`` is then split the same way and
    scored again; both scores are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the target columns and the exogenous columns.
    exo_dict : dict
        Maps a target column name to the candidate exogenous column names.
    best_comb : list
        Configurations forwarded unchanged to ``hyperparameter_find``.

    Returns
    -------
    dict
        ``{key: {column_tuple: mae}}`` for every key that has at least one
        exogenous set with an MAE strictly below the baseline.  Keys without
        any improvement are left out, so the result may be empty.
    """
    # Accumulate across keys: rebuilding the result for each key would keep
    # only the last improving one and leave nothing to return when no set
    # beats its baseline.
    improvements = {}
    for key, value in exo_dict.items():
        dat = data[key]
        l_exog = exog_combinations(data, value)
        tr, test = train_test_split(dat, test_size = 0.2, shuffle=False)
        keymae = hyperparameter_find(tr, best_comb, test)
        print('keymae of: '+ key +' = '+str(keymae))
        bettermae = {}
        for exog in l_exog:
            extr, extest = train_test_split(exog, test_size = 0.2, shuffle=False)
            exmae = hyperparameter_find(tr, best_comb, test, exogtr=extr, exogtest = extest)
            co = tuple(exog.columns)
            print('exmae = {}'.format(co) + ' '+ str(exmae))
            if exmae < keymae:
                bettermae[co] = exmae
        if bettermae:
            improvements[key] = bettermae
    return(improvements)

In [None]:
# A single configuration in [AR, MA, SAR, SMA] order, the order in which hyperparameter_find indexes it.
best_comb = [[4,3,3,4]]
# Silences every warning from here on, not only those raised while fitting the models.
warnings.filterwarnings("ignore")
# Scores every key of exogen against its exogenous combinations; the returned value is the cell output.
exogenous_var(rd, exogen, best_comb)

In [None]:
# If you need to stop the run and later resume where you left off, use the code below to build a
# sub-dictionary (sub_exogen) that contains only the remaining portion of the large exogen dictionary
# that I am using.

# I had to do this because my computer became very slow; restarting it reset the memory and let the
# code run at normal speed again.

# Use exogen.keys() to list all of the keys, find the place where you left off in the printout above,
# and copy and paste the rest of the keys into the "todokeys" tuple below. Then pass sub_exogen in place
# of exogen to the "exogenous_var" call above.


# todokeys = ('Henderson 2 NNW, NC', 'JACKSON SPRINGS 5 WNW, NC', 'Laurinburg, NC', 'Louisburg, NC', 'Roanoke Rapids, NC', 'Rougemont, NC', 'Murfreesboro, NC', 'Lumberton Area, NC', 'ELIZABETHTOWN, NC', 'LONGWOOD, NC', 'WHITEVILLE 7 NW, NC', ' WILLIAM O HUSKE L&D, NC', 'Asheville Area, NC', 'Charlotte Area, NC', 'Mount Mitchell Area, NC', 'ASHEVILLE AIRPORT, NC', 'BANNER ELK, NC', 'BEECH MOUNTAIN, NC', 'BRYSON CITY 4, NC', 'BLACK MOUNTAIN 2 W, NC', 'BREVARD, NC', 'BRIDGEWATER HYDRO, NC', 'CASAR, NC', 'CATAWBA 3 NNW, NC', 'CONCORD, NC', 'COWEETA EXP STATION, NC', 'CULLOWHEE, NC', 'FOREST CITY 8 W, NC', 'FRANKLIN, NC', 'GASTONIA, NC', 'GRANDFATHER MTN, NC', ' HENDERSONVILLE 1 NE, NC', 'Hickory FAA Airport, NC', ' HIGHLANDS, NC', 'HOT SPRINGS, NC', 'LAKE LURE 2, NC', 'LAKE TOXAWAY 2 SW, NC', 'LENOIR, NC', 'LINCOLNTON 4 W, NC', 'MARION, NC', 'MARSHALL, NC', 'MONROE 2 SE, NC', 'MORGANTON, NC')
# sub_exogen = {k: exogen[k] for k in todokeys}