In [1]:
import csv
import getpass
import json
import os
import warnings
from datetime import datetime
from itertools import combinations

import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm

# Resolve the repository root: when the notebook is started from a "src"
# directory the root is its parent, otherwise it is the working directory.
curr_dir = os.path.abspath('')
if os.path.basename(curr_dir) == "src":
    app_root = os.path.dirname(curr_dir)
else:
    app_root = curr_dir

# Choose where generated files live.
if getpass.getuser() != "rainfalld":
    destdir = os.path.join(app_root, 'data')   # non-docker: stay in repository
else:                                           # docker daemon user
    home = os.path.expanduser("~")
    destdir = home                              # /var/cache/rainfall-predictor


# Load both rainfall tables; `rd` is re-indexed by its parsed "Date" column.
file = os.path.join(app_root, 'data', 'rainfalldata.csv')
rd = pd.read_csv(file)
file2 = os.path.join(app_root, 'data', 'ncrainfalldata.csv')
ncrd = pd.read_csv(file2)
rd = rd.assign(Date=pd.to_datetime(rd.Date)).set_index('Date')

In [2]:
# this cell takes the stored exogen dictionary that is stored in the Data_Wrangling_CAP1 jupyter notebook
# that was imported above.
try:
    %store -r exogen
except NameError:
    f = open(os.path.join(destdir,"exogen.json"),"r")
    exogen = json.load(f)      # read from file, passed from Data_Wrangling
    f.close()


In [3]:
# Display the keys of `exogen` (the station names shown in the output below).
exogen.keys()

dict_keys(['Arcola, NC', 'Henderson 2 NNW, NC', 'Laurinburg, NC', 'Roanoke Rapids, NC', 'Murfreesboro, NC', 'Lumberton Area, NC', 'LONGWOOD, NC', 'WHITEVILLE 7 NW, NC', 'Charlotte Area, NC', 'Mount Mitchell Area, NC', 'ASHEVILLE AIRPORT, NC', 'BANNER ELK, NC', 'BEECH MOUNTAIN, NC', 'BRYSON CITY 4, NC', 'BREVARD, NC', 'CASAR, NC', 'COWEETA EXP STATION, NC', 'CULLOWHEE, NC', 'FOREST CITY 8 W, NC', 'FRANKLIN, NC', 'GASTONIA, NC', 'GRANDFATHER MTN, NC', ' HENDERSONVILLE 1 NE, NC', ' HIGHLANDS, NC', 'HOT SPRINGS, NC', 'LAKE LURE 2, NC', 'LAKE TOXAWAY 2 SW, NC', 'MARSHALL, NC', 'MONROE 2 SE, NC', ' MOUNT HOLLY 4 NE, NC', ' OCONALUFTEE, NC', 'PISGAH FOREST 3 NE, NC', 'ROBBINSVILLE AG 5 NE, NC', 'ROSMAN, NC', 'SHELBY 2 NW, NC', 'TAPOCO, NC', 'TRYON, NC', 'WAYNESVILLE 1 E, NC', 'Boone 1 SE, NC', 'DANBURY, NC', 'EDEN, NC', ' MOUNT AIRY 2 W, NC', 'REIDSVILLE 2 NW, NC', 'HAYESVILLE 1 NE, NC', 'MURPHY 4ESE, NC', ' KING, NC'])

In [14]:
def sarima_model_creation(data, p, d, q, P, D, Q, m, exog=None):
    """Build and fit a SARIMAX model, returning the fitted results object.

    Parameters
    ----------
    data : endogenous series handed to ``SARIMAX``.
    p, d, q : non-seasonal order terms.
    P, D, Q, m : seasonal order terms and the seasonal period.
    exog : optional exogenous regressors, forwarded to ``SARIMAX`` unchanged.

    The model is created with stationarity and invertibility enforcement
    switched off and with ``initialization='approximate_diffuse'``.
    """
    model_options = dict(
        order=[p, d, q],
        seasonal_order=[P, D, Q, m],
        enforce_stationarity=False,
        enforce_invertibility=False,
        initialization='approximate_diffuse',
    )
    unfitted = sm.tsa.statespace.SARIMAX(data, exog, **model_options)
    return unfitted.fit()

In [15]:
def model_creation_pred_one_step(train_data, test_data, exotrain = None, exotest=None):
    """Produce walk-forward, one-step-ahead SARIMA forecasts.

    A SARIMA(4,0,3)x(3,0,4,12) model is fitted on ``train_data`` and asked for
    a single forecast. The first remaining row of ``test_data`` (and, when
    exogenous data is used, the first remaining row of ``exotest``) is then
    appended to the training data and the fit/forecast step is repeated until
    the test data is exhausted.

    Parameters
    ----------
    train_data : pandas Series/DataFrame of observations used for the first fit.
    test_data : pandas Series/DataFrame consumed one row at a time.
    exotrain : optional pandas DataFrame of exogenous regressors aligned with
        ``train_data``. ``None`` disables exogenous regressors.
    exotest : exogenous rows aligned with ``test_data``; required when
        ``exotrain`` is given.

    Returns
    -------
    list
        Flat list of ``len(test_data) + 1`` forecasts. Entry ``i`` is the
        forecast made after ``i`` test rows were absorbed, so the first
        ``len(test_data)`` entries line up with ``test_data`` and the last one
        is a forecast for the step after the final test row.

    Raises
    ------
    ValueError
        If ``exotrain`` is given but ``exotest`` is missing or shorter than
        ``test_data``.
    """
    use_exog = exotrain is not None
    if use_exog and (exotest is None or len(exotest) < len(test_data)):
        raise ValueError("exotest must provide one row for every row of test_data")

    list_one_step = []
    # Iterative walk-forward: avoids one recursion frame per test row and
    # yields a flat list instead of nested lists.
    while True:
        if use_exog:
            mod = sarima_model_creation(train_data, 4, 0, 3, 3, 0, 4, 12, exog = exotrain)
            # passing prevMonth (december, for forecasting jan)
            # NOTE(review): the last *known* exogenous row stands in for the
            # unknown next-period values - confirm this is the intended setup.
            nextMonth = mod.forecast(exog = exotrain.iloc[[-1]])
        else:
            mod = sarima_model_creation(train_data, 4, 0, 3, 3, 0, 4, 12)
            nextMonth = mod.forecast()
        # Positional access; indexing a date-indexed Series with `[0]` relies
        # on a positional fallback for integer keys.
        list_one_step.append(np.asarray(nextMonth)[0])

        if len(test_data) < 1:
            break

        # Move the first test row into the training data for the next fit.
        # `.iloc[[0]]` selects the first ROW; `obj[[0]]` on a DataFrame would
        # look for a column labelled 0.
        train_data = pd.concat([train_data, test_data.iloc[[0]]])
        test_data = test_data.iloc[1:]
        if use_exog:
            exotrain = pd.concat([exotrain, exotest.iloc[[0]]])
            exotest = exotest.iloc[1:]

    return list_one_step


def _flatten_predictions(values):
    """Return the scalars of an arbitrarily nested list/tuple in order."""
    flat = []
    pending = [values]
    while pending:
        item = pending.pop()
        if isinstance(item, (list, tuple)):
            # Reverse so that popping from the end keeps the original order.
            pending.extend(reversed(item))
        else:
            flat.append(item)
    return flat


def billsFn(train_data, test_data, exotrain = None, exotest=None):
    """Return the mean absolute error of walk-forward forecasts on ``test_data``.

    Parameters
    ----------
    train_data, test_data : pandas objects with the training and hold-out
        observations. Copies are passed on, so the caller's data is not touched.
    exotrain, exotest : optional exogenous regressors aligned with
        ``train_data`` / ``test_data``. Used only when ``exotrain`` is given.

    Returns
    -------
    float
        ``mean_absolute_error`` between ``test_data`` and the forecasts made
        for the same periods.
    """
    tra = train_data.copy()
    tes = test_data.copy()
    if exotrain is not None:
        exotra = exotrain.copy()
        exotes = exotest.copy()
        preds = model_creation_pred_one_step(tra,tes,exotrain=exotra,exotest=exotes)
    else:
        preds = model_creation_pred_one_step(tra,tes)

    # The forecaster emits one value per fit, including a final forecast past
    # the end of the test data, and may return them nested. Flatten and keep
    # only the forecasts that have a matching observation.
    aligned = _flatten_predictions(preds)[:len(test_data)]
    # Score against this function's own `test_data` argument (the previous
    # `testing_data` was an undefined name).
    mae = mean_absolute_error(test_data, aligned)
    return(mae)




In [16]:
def exog_combinations(df, exoe):
    """List the column subsets of ``df`` to try as exogenous regressors.

    Parameters
    ----------
    df : DataFrame holding the candidate columns.
    exoe : sequence of column labels.

    Returns
    -------
    list of DataFrame
        Empty for an empty ``exoe``. Otherwise, in this order: all columns of
        ``exoe`` together; then (only if there is more than one column) each
        column on its own; then every combination of size 2 up to
        ``len(exoe) - 1``.
    """
    n_cols = len(exoe)
    if n_cols == 0:
        return []

    frames = [df.loc[:, exoe]]
    if n_cols == 1:
        return frames

    frames.extend(df.loc[:, [label]] for label in exoe)
    for size in range(2, n_cols):
        frames.extend(df.loc[:, combo] for combo in combinations(exoe, size))
    return frames


In [17]:
# Stations to process in this run; `sub_exogen` is the matching slice of `exogen`.
todokeys = (
    'HAYESVILLE 1 NE, NC',
    'MURPHY 4ESE, NC',
    ' KING, NC',
)
sub_exogen = dict((station, exogen[station]) for station in todokeys)

In [18]:
from collections import defaultdict
# Map every key of `sub_exogen` to the list of DataFrames returned by
# exog_combinations for that key's column names (columns are taken from `rd`).
l_o_dfs = defaultdict(list)
# NOTE: `key`, `value` and `lo_dfs2` stay defined as notebook globals after
# this loop finishes.
for key,value in tqdm(sub_exogen.items()):
    lo_dfs2 = exog_combinations(rd, value)
    l_o_dfs[key] = lo_dfs2
# l_o_dfs['ROBBINSVILLE AG 5 NE, NC']

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

In [19]:
def exogenous_var(data, ncloc, l_exoloc):
    """Check which exogenous column sets improve the forecast for one station.

    The column ``data[ncloc]`` is split chronologically (last 20% held out,
    no shuffling) and scored with ``billsFn`` without exogenous regressors.
    Every DataFrame in ``l_exoloc`` is then split the same way and scored as
    exogenous input. Each score is printed as it is computed.

    Parameters
    ----------
    data : DataFrame containing the target column ``ncloc``.
    ncloc : column label of the station being forecast.
    l_exoloc : iterable of DataFrames, each one a candidate set of exogenous
        columns.

    Returns
    -------
    dict
        ``{ncloc: {column_tuple: mae}}`` holding only the candidates whose MAE
        is strictly lower than the baseline MAE. The inner dict is empty when
        nothing beats the baseline or when ``l_exoloc`` is empty.
    """
    dat = data[ncloc]
    tr, test = train_test_split(dat, test_size = 0.2, shuffle=False)
    keymae = billsFn(tr,test)
    # Report under the function's own argument rather than a notebook global.
    print('keymae of: '+ ncloc +' = '+str(keymae))
    bettermae = {}
    for exog in tqdm(l_exoloc):
        extr, extest = train_test_split(exog, test_size = 0.2, shuffle=False)
        # billsFn's keyword is `exotrain` (not `exogtrain`).
        exmae = billsFn(tr,test, exotrain=extr, exotest = extest)
        co = tuple(exog.columns)
        print('exmae = {}'.format(co) + ' '+ str(exmae))
        if exmae < keymae:
            bettermae[co] = exmae
    # Return the collected improvements instead of the last combination tried.
    return {ncloc: bettermae}

In [None]:
# best_comb = [[4,3,3,4]]
# Suppress all Python warnings from here on (the filter is process-wide).
warnings.filterwarnings("ignore")
# Score every candidate exogenous DataFrame for each key of `l_o_dfs`.
# The return value of exogenous_var is discarded; results are only printed.
# NOTE(review): the loop variables are notebook globals. Do not rename `key`
# unless exogenous_var no longer refers to a global `key`.
for key,value in tqdm(l_o_dfs.items()):
    exogenous_var(rd, key, value)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

In [23]:
# exogenous_var(rd, sub_exogen, best_comb)

keymae of: Henderson 2 NNW, NC = 1.6002245784392526
exmae = ('SOUTH BOSTON, VA', 'John Kerr Dam, VA') 1.9436784695953795
exmae = ('SOUTH BOSTON, VA',) 1.905091464504384
exmae = ('John Kerr Dam, VA',) 1.9066707555352826
keymae of: JACKSON SPRINGS 5 WNW, NC = 1.880135880758885
exmae = ('CHESTERFIELD 3 E, SC', ' CHERAW, SC') 2.27692897709342
exmae = ('CHESTERFIELD 3 E, SC',) 2.685657575743084
exmae = (' CHERAW, SC',) 2.1676857485863783
keymae of: Laurinburg, NC = 1.648422257446495
exmae = (' DARLINGTON, SC', ' DILLON, SC', ' FLORENCE REGIONAL AIRPORT, SC', 'FLORENCE 8 NE, SC', 'CHESTERFIELD 3 E, SC', ' CHERAW, SC') 2.175538475973483
exmae = (' DARLINGTON, SC',) 2.048045487433798
exmae = (' DILLON, SC',) 1.8562542631849823
exmae = (' FLORENCE REGIONAL AIRPORT, SC',) 2.268586897110784
exmae = ('FLORENCE 8 NE, SC',) 2.0791528083270987
exmae = ('CHESTERFIELD 3 E, SC',) 2.256082996088929
exmae = (' CHERAW, SC',) 2.2657683522657046
exmae = (' DARLINGTON, SC', ' DILLON, SC') 2.0042636269567677
e

KeyboardInterrupt: 

In [21]:
# If you need to interrupt this run and resume later from where you left off, use the code below to
# build a sub-dictionary (sub_exogen) that contains only the remaining part of the large dictionary
# (exogen) that I am using.

# I had to do this because my computer became very slow, so I had to restart it to let the memory
# reset and get it running at normal speed again.

# Use exogen.keys() to list all of the keys, find the place in the printout above where the run stopped,
# and copy the remaining keys into the "todokeys" tuple below. Then use sub_exogen in place of exogen
# for the "exogenous_var" run above.


# todokeys = ('Henderson 2 NNW, NC', 'JACKSON SPRINGS 5 WNW, NC', 'Laurinburg, NC', 'Louisburg, NC', 'Roanoke Rapids, NC', 'Rougemont, NC', 'Murfreesboro, NC', 'Lumberton Area, NC', 'ELIZABETHTOWN, NC', 'LONGWOOD, NC', 'WHITEVILLE 7 NW, NC', ' WILLIAM O HUSKE L&D, NC', 'Asheville Area, NC', 'Charlotte Area, NC', 'Mount Mitchell Area, NC', 'ASHEVILLE AIRPORT, NC', 'BANNER ELK, NC', 'BEECH MOUNTAIN, NC', 'BRYSON CITY 4, NC', 'BLACK MOUNTAIN 2 W, NC', 'BREVARD, NC', 'BRIDGEWATER HYDRO, NC', 'CASAR, NC', 'CATAWBA 3 NNW, NC', 'CONCORD, NC', 'COWEETA EXP STATION, NC', 'CULLOWHEE, NC', 'FOREST CITY 8 W, NC', 'FRANKLIN, NC', 'GASTONIA, NC', 'GRANDFATHER MTN, NC', ' HENDERSONVILLE 1 NE, NC', 'Hickory FAA Airport, NC', ' HIGHLANDS, NC', 'HOT SPRINGS, NC', 'LAKE LURE 2, NC', 'LAKE TOXAWAY 2 SW, NC', 'LENOIR, NC', 'LINCOLNTON 4 W, NC', 'MARION, NC', 'MARSHALL, NC', 'MONROE 2 SE, NC', 'MORGANTON, NC')
# sub_exogen = {k: exogen[k] for k in todokeys}