In [1]:
import os
import getpass
import pandas as pd
import numpy as np
import csv
import statsmodels.api as sm
import warnings
import multiprocessing
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from itertools import combinations
from scipy import stats
from datetime import datetime
from sklearn.metrics import mean_absolute_error

# Locate the directory this code lives in. `__file__` is only defined when the code runs as a
# script/module; referencing it where it is undefined (e.g. in a notebook kernel) raises
# NameError, in which case the current working directory is used instead.
try:
    __file__
except NameError:  # only the "name is undefined" case is expected here
    curr_dir = os.path.abspath('')
else:
    curr_dir = os.path.dirname(os.path.abspath(__file__))

# When running from a directory named "src", the application root is its parent directory.
app_root = curr_dir if os.path.basename(curr_dir) != "src" else os.path.dirname(curr_dir)

# Choose `destdir`, the directory where exogen.json is looked up later in this notebook.
# The choice depends on the account that is running the code.
if getpass.getuser() == "rainfalld":  # docker daemon account
    home = os.path.expanduser("~")
    destdir = home                    # the daemon's home directory (/var/cache/rainfall-predictor)
else:
    destdir = os.path.join(app_root,'data')      # any other user: stay inside the repository's data/ folder


# Read the two rainfall CSV files from <app_root>/data.
file = os.path.join(app_root,'data','rainfalldata.csv')
rd = pd.read_csv(file)
file2 = os.path.join(app_root,'data','ncrainfalldata.csv')
ncrd = pd.read_csv(file2)
# Parse the Date column to datetimes and make it the index of `rd`
# (`ncrd` is left exactly as read from disk).
rd.Date = pd.to_datetime(rd.Date)
rd = rd.set_index('Date')

In [2]:
import json
# this cell takes the stored exogen dictionary that is stored in the Data_Wrangling_CAP1 jupyter notebook
# that was imported above.
try:
    %store -r exogen
except NameError:
    f = open(os.path.join(destdir,"exogen.json"),"r")
    exogen = json.load(f)      # read from file, passed from Data_Wrangling
    f.close()


In [3]:
exogen.keys()

dict_keys(['Arcola, NC', 'Henderson 2 NNW, NC', 'Laurinburg, NC', 'Roanoke Rapids, NC', 'Murfreesboro, NC', 'Lumberton Area, NC', 'LONGWOOD, NC', 'WHITEVILLE 7 NW, NC', 'Charlotte Area, NC', 'Mount Mitchell Area, NC', 'ASHEVILLE AIRPORT, NC', 'BANNER ELK, NC', 'BEECH MOUNTAIN, NC', 'BRYSON CITY 4, NC', 'BREVARD, NC', 'CASAR, NC', 'COWEETA EXP STATION, NC', 'CULLOWHEE, NC', 'FOREST CITY 8 W, NC', 'FRANKLIN, NC', 'GASTONIA, NC', 'GRANDFATHER MTN, NC', ' HENDERSONVILLE 1 NE, NC', ' HIGHLANDS, NC', 'HOT SPRINGS, NC', 'LAKE LURE 2, NC', 'LAKE TOXAWAY 2 SW, NC', 'MARSHALL, NC', 'MONROE 2 SE, NC', ' MOUNT HOLLY 4 NE, NC', ' OCONALUFTEE, NC', 'PISGAH FOREST 3 NE, NC', 'ROBBINSVILLE AG 5 NE, NC', 'ROSMAN, NC', 'SHELBY 2 NW, NC', 'TAPOCO, NC', 'TRYON, NC', 'WAYNESVILLE 1 E, NC', 'Boone 1 SE, NC', 'DANBURY, NC', 'EDEN, NC', ' MOUNT AIRY 2 W, NC', 'REIDSVILLE 2 NW, NC', 'HAYESVILLE 1 NE, NC', 'MURPHY 4ESE, NC', ' KING, NC'])

In [4]:
def sarima_model_creation(data, p, d, q, P, D, Q, m, exog=None):
    """Build and fit a SARIMAX model, returning the fitted results object.

    data      -- endogenous series handed to SARIMAX
    p, d, q   -- non-seasonal order
    P, D, Q, m -- seasonal order and season length
    exog      -- optional exogenous regressors, forwarded to SARIMAX unchanged

    Stationarity and invertibility are not enforced, and the state is initialised with the
    'approximate_diffuse' scheme.
    """
    model = sm.tsa.statespace.SARIMAX(
        data,
        exog,
        order=[p, d, q],
        seasonal_order=[P, D, Q, m],
        enforce_stationarity=False,
        enforce_invertibility=False,
        initialization='approximate_diffuse',
    )
    return model.fit()

In [5]:
# Function : walk-forward, one-step-ahead forecasts.
#
# Starting from train_data, forecast the next month, then move the first remaining test
# observation (and its exogenous row) into the training window and repeat until the test data
# is used up. One final forecast is made after the last test observation has been absorbed.
#
# @param train_data -- what I already believe is true.  Dec 2007 and before 80%
# @param test_data -- what I want to prove Jan 2008 and up 20%
# @param exotrain -- external data not included but could help predictions Dec 2007 and before
# @param exotest -- external data I want to prove (one row per test observation)
# @return -- flat list of len(test_data) + 1 forecasts: element i is the forecast for
#            test_data[i]; the last element is the forecast for the month after the test data
# @raises ValueError -- exotrain is given but exotest has fewer rows than test_data
def model_creation_pred_one_step(train_data, test_data, exotrain=None, exotest=None):
    if exotrain is not None and len(test_data) > 0:
        if exotest is None or len(exotest) < len(test_data):
            raise ValueError("exotest must provide one row for every test observation")

    list_one_step = []
    # Only local names are rebound below (concat / iloc build new objects), so the caller's
    # series and frames are never modified.
    while True:
        nextMonth = model_based_forecast(train_data, test_data, exotrain, exotest)
        # np.asarray makes the lookup positional whether forecast() returned a Series or an array
        list_one_step.append(np.asarray(nextMonth).ravel()[0])

        if len(test_data) == 0:
            break

        # increment data for next month's iteration; .iloc selects the first ROW by position
        # (plain [[0]] on a DataFrame would look for a column labelled 0)
        train_data = pd.concat([train_data, test_data.iloc[[0]]])
        test_data = test_data.iloc[1:]
        if exotrain is not None:
            exotrain = pd.concat([exotrain, exotest.iloc[[0]]])
            exotest = exotest.iloc[1:]

    return list_one_step

# Function : Make forecast from model
# Fits SARIMA(4,0,3)(3,0,4,12) on train_data (with exotrain as regressors when given).
# test_data and exotest are accepted for signature symmetry but are not used.
# @return -- a forecast of next month's rain amount (one-element result of forecast())
def model_based_forecast(train_data, test_data, exotrain=None, exotest=None):
    mod = sarima_model_creation(train_data, 4, 0, 3, 3, 0, 4, 12, exotrain)
    if exotrain is None:
        return mod.forecast()
    # If exog exists, the most recent known exogenous row (december, for forecasting jan) is
    # used as the regressor value for the forecast month. It must be passed as `exog=`;
    # the first positional argument of forecast() is `steps`.
    # NOTE(review): the model is fitted on same-month exog but forecast with last month's row.
    return mod.forecast(steps=1, exog=exotrain.iloc[[-1]])

# previously billsFn
# Function : mean absolute error of the walk-forward forecasts against test_data.
# @return -- MAE over the test observations
def maeFinder(train_data, test_data, exotrain=None, exotest=None):
    predictions = model_creation_pred_one_step(train_data, test_data, exotrain, exotest)
    # The trailing forecast is for the month after the test data and has no ground truth,
    # so only the first len(test_data) forecasts are scored.
    return mean_absolute_error(test_data, predictions[:len(test_data)])



In [6]:
def exog_combinations(df, exoe):
    """Return one DataFrame per candidate subset of the columns named in `exoe`.

    The result is ordered: the frame with all of `exoe` first, then one frame per single
    column, then every subset of size 2 up to len(exoe) - 1 in itertools.combinations order.
    An empty `exoe` yields an empty list.
    """
    n_cols = len(exoe)
    if n_cols == 0:
        return []

    frames = [df.loc[:, exoe]]            # full set of columns
    for size in range(1, n_cols):         # proper subsets, smallest first
        frames.extend(df.loc[:, list(subset)] for subset in combinations(exoe, size))
    return frames

exogen.keys()

dict_keys(['Arcola, NC', 'Henderson 2 NNW, NC', 'Laurinburg, NC', 'Roanoke Rapids, NC', 'Murfreesboro, NC', 'Lumberton Area, NC', 'LONGWOOD, NC', 'WHITEVILLE 7 NW, NC', 'Charlotte Area, NC', 'Mount Mitchell Area, NC', 'ASHEVILLE AIRPORT, NC', 'BANNER ELK, NC', 'BEECH MOUNTAIN, NC', 'BRYSON CITY 4, NC', 'BREVARD, NC', 'CASAR, NC', 'COWEETA EXP STATION, NC', 'CULLOWHEE, NC', 'FOREST CITY 8 W, NC', 'FRANKLIN, NC', 'GASTONIA, NC', 'GRANDFATHER MTN, NC', ' HENDERSONVILLE 1 NE, NC', ' HIGHLANDS, NC', 'HOT SPRINGS, NC', 'LAKE LURE 2, NC', 'LAKE TOXAWAY 2 SW, NC', 'MARSHALL, NC', 'MONROE 2 SE, NC', ' MOUNT HOLLY 4 NE, NC', ' OCONALUFTEE, NC', 'PISGAH FOREST 3 NE, NC', 'ROBBINSVILLE AG 5 NE, NC', 'ROSMAN, NC', 'SHELBY 2 NW, NC', 'TAPOCO, NC', 'TRYON, NC', 'WAYNESVILLE 1 E, NC', 'Boone 1 SE, NC', 'DANBURY, NC', 'EDEN, NC', ' MOUNT AIRY 2 W, NC', 'REIDSVILLE 2 NW, NC', 'HAYESVILLE 1 NE, NC', 'MURPHY 4ESE, NC', ' KING, NC'])

In [7]:
# Defining set of cities to evaluate.
# `todokeys` names the entries of `exogen` to process; `sub_exogen` is the matching
# sub-dictionary. Every name in `todokeys` must be a key of `exogen` (otherwise KeyError).
if getpass.getuser() == "rainfalld":       # docker daemon, automatically do all exogen
    todokeys = exogen.keys()
else:    # manual setting of dictionary elements to do (edit this tuple to resume a partial run)
    todokeys = ('Roanoke Rapids, NC', 'Murfreesboro, NC', 'Lumberton Area, NC', 'LONGWOOD, NC', 'WHITEVILLE 7 NW, NC', 'Charlotte Area, NC', 'Mount Mitchell Area, NC', 'ASHEVILLE AIRPORT, NC', 'BANNER ELK, NC', 'BEECH MOUNTAIN, NC', 'BRYSON CITY 4, NC', 'BREVARD, NC', 'CASAR, NC', 'COWEETA EXP STATION, NC', 'CULLOWHEE, NC', 'FOREST CITY 8 W, NC', 'FRANKLIN, NC', 'GASTONIA, NC', 'GRANDFATHER MTN, NC', ' HENDERSONVILLE 1 NE, NC', ' HIGHLANDS, NC', 'HOT SPRINGS, NC', 'LAKE LURE 2, NC', 'LAKE TOXAWAY 2 SW, NC', 'MARSHALL, NC', 'MONROE 2 SE, NC', ' MOUNT HOLLY 4 NE, NC', ' OCONALUFTEE, NC', 'PISGAH FOREST 3 NE, NC', 'ROBBINSVILLE AG 5 NE, NC', 'ROSMAN, NC', 'SHELBY 2 NW, NC', 'TAPOCO, NC', 'TRYON, NC', 'WAYNESVILLE 1 E, NC', 'Boone 1 SE, NC', 'DANBURY, NC', 'EDEN, NC', ' MOUNT AIRY 2 W, NC', 'REIDSVILLE 2 NW, NC', 'HAYESVILLE 1 NE, NC', 'MURPHY 4ESE, NC', ' KING, NC')

sub_exogen = {k: exogen[k] for k in todokeys}

In [8]:
from collections import defaultdict
# Map every station in `sub_exogen` to the list of candidate exogenous DataFrames cut from `rd`.
l_o_dfs = defaultdict(list)
for key,value in tqdm(sub_exogen.items()):
    # `value` is passed to exog_combinations as the column labels to select from `rd`
    lo_dfs2 = exog_combinations(rd, value)
    l_o_dfs[key] = lo_dfs2
# Display the candidates built for one station as a spot check.
l_o_dfs['LONGWOOD, NC']

HBox(children=(IntProgress(value=0, max=43), HTML(value='')))




[             LORIS 2 S, SC  Myrtle Beach Area, SC
 Date                                             
 1980-01-01        4.220000                 4.4375
 1980-02-01        2.100000                 2.0825
 1980-03-01        8.240000                 7.9225
 1980-04-01        1.400000                 1.7625
 1980-05-01        4.520000                 3.9300
 1980-06-01        4.670000                 3.4900
 1980-07-01        1.960000                 4.3800
 1980-08-01        2.270000                 2.2325
 1980-09-01        4.370000                 4.5650
 1980-10-01        1.550000                 1.9775
 1980-11-01        1.650000                 1.7225
 1980-12-01        3.120000                 3.2300
 1981-01-01        0.950000                 0.9850
 1981-02-01        2.650000                 2.5425
 1981-03-01        3.700000                 3.6425
 1981-04-01        0.780000                 1.1600
 1981-05-01        4.710000                 5.5375
 1981-06-01        3.940000    

In [9]:
def _exog_candidate_mae(task):
    """Pool worker: score one exogenous candidate.

    Defined at top level (not nested) so multiprocessing can pickle it by reference.
    `task` is a (tr, test, exog) tuple. Returns (columns, mae, error): `error` is None on
    success; on failure `mae` is None and `error` is the repr of the exception, so one bad
    candidate does not abort the whole sweep.
    """
    tr, test, exog = task
    co = tuple(exog.columns)
    try:
        extr, extest = train_test_split(exog, test_size=0.2, shuffle=False)
        return co, maeFinder(tr, test, extr, extest), None
    except Exception as err:  # deliberate best-effort: report the failure to the parent
        return co, None, repr(err)


def exogenous_var(data, ncloc, l_exoloc):
    """Find the exogenous candidates that beat the no-exog baseline for one station.

    data     -- DataFrame holding the target series in column `ncloc`
    ncloc    -- column label of the target station
    l_exoloc -- iterable of candidate exogenous DataFrames (same rows as `data`)

    The target is split 80/20 in order (no shuffling); the baseline MAE (`keymae`) comes from
    maeFinder without exogenous data. Each candidate is then scored in a worker process and
    its MAE printed.

    Returns a dict mapping the candidate's column tuple to its MAE, for every candidate whose
    MAE is strictly below the baseline. (Earlier versions discarded this and returned an
    empty tuple.)

    NOTE: workers need maeFinder / train_test_split in their globals. That holds when worker
    processes are forked from this process; with the "spawn" start method these functions must
    live in an importable module instead.
    """
    dat = data[ncloc]
    tr, test = train_test_split(dat, test_size=0.2, shuffle=False)
    keymae = maeFinder(tr, test)
    print('keymae of: ' + ncloc + ' = ' + str(keymae))

    bettermae = {}
    tasks = [(tr, test, exog) for exog in l_exoloc]
    process_limit = multiprocessing.cpu_count()

    # Results are gathered in the parent, so no lock or shared dict is required. imap keeps
    # the candidates' order and lets tqdm advance as each result arrives.
    with multiprocessing.Pool(process_limit) as pool:
        for co, exmae, error in tqdm(pool.imap(_exog_candidate_mae, tasks), total=len(tasks)):
            if error is not None:
                print('exmae failed for {}: {}'.format(co, error))
                continue
            print('exmae = {}'.format(co) + ' ' + str(exmae))
            if exmae < keymae:
                bettermae[co] = exmae

    return bettermae

    # for exog in tqdm(l_exoloc):
    #     extr, extest = train_test_split(exog, 0.2, False)
    #     exmae = maeFinder(tr, test, extr, extest)
    #     co = tuple(exog.columns)
    #     print('exmae = {}'.format(co) + ' '+ str(exmae))
    #     if exmae < keymae:
    #         bettermae[co] = exmae
    #         bettermae2 = {key: bettermae}
    # return(co)


In [None]:
# best_comb = [[4,3,3,4]]
# Suppress every warning from here on so the long model-fitting sweep does not flood the output.
warnings.filterwarnings("ignore")
# For each station, evaluate all of its candidate exogenous frames collected in `l_o_dfs`.
for key,value in tqdm(l_o_dfs.items()):
    exogenous_var(rd, key, value)

HBox(children=(IntProgress(value=0, max=43), HTML(value='')))

In [None]:
# exogenous_var(rd, sub_exogen, best_comb)

In [None]:
# if you need to pause the code from running and then start back up from where you left off you can use the
# following code to get a sub_dictionary that will be only a portion of the large dictionary (exogen) that I am
# using

# I had to do this because my computer started running really slow so I had to restart it in order to let the 
# memory reset and so it could start running at normal speed again.

# use exogen.keys() to list all of the keys then find the location where you left off from the printout above
# and then copy and paste the rest of the keys into the "todokeys" tuple below. Then call sub_exogen in place
# of exogen in the "exogenous_var" function above. 


# todokeys = ('Henderson 2 NNW, NC', 'JACKSON SPRINGS 5 WNW, NC', 'Laurinburg, NC', 'Louisburg, NC', 'Roanoke Rapids, NC', 'Rougemont, NC', 'Murfreesboro, NC', 'Lumberton Area, NC', 'ELIZABETHTOWN, NC', 'LONGWOOD, NC', 'WHITEVILLE 7 NW, NC', ' WILLIAM O HUSKE L&D, NC', 'Asheville Area, NC', 'Charlotte Area, NC', 'Mount Mitchell Area, NC', 'ASHEVILLE AIRPORT, NC', 'BANNER ELK, NC', 'BEECH MOUNTAIN, NC', 'BRYSON CITY 4, NC', 'BLACK MOUNTAIN 2 W, NC', 'BREVARD, NC', 'BRIDGEWATER HYDRO, NC', 'CASAR, NC', 'CATAWBA 3 NNW, NC', 'CONCORD, NC', 'COWEETA EXP STATION, NC', 'CULLOWHEE, NC', 'FOREST CITY 8 W, NC', 'FRANKLIN, NC', 'GASTONIA, NC', 'GRANDFATHER MTN, NC', ' HENDERSONVILLE 1 NE, NC', 'Hickory FAA Airport, NC', ' HIGHLANDS, NC', 'HOT SPRINGS, NC', 'LAKE LURE 2, NC', 'LAKE TOXAWAY 2 SW, NC', 'LENOIR, NC', 'LINCOLNTON 4 W, NC', 'MARION, NC', 'MARSHALL, NC', 'MONROE 2 SE, NC', 'MORGANTON, NC')
# sub_exogen = {k: exogen[k] for k in todokeys}