In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from dateutil.relativedelta import relativedelta
from scipy.special import expit
from scipy.optimize import fmin, minimize
import glob as glob
import matplotlib
matplotlib.style.use('ggplot')

# .py
%load_ext autoreload
%autoreload 2
import weather_alphas, weather_costs, model, model_01, validate, prediction

running model
getting cost from nweek ahead
getting bounded alphas ...
done getting alphas ...
running model


In [2]:
def split_data(data):
    train_mask = (data.index < "2011")
    train = data[train_mask]
    test_mask = (data.index > "2011")
    test = data[test_mask]
    return train, test

In [3]:
all_weather = pd.read_csv("weather01-16.csv")

In [4]:
cm_weather = all_weather[all_weather.stn_name == "Chiang Mai"]
cm_weather.index = pd.DatetimeIndex(cm_weather.date)
mask = (cm_weather.index > "2003") & (cm_weather.index < "2016")
cm_weather = cm_weather[mask]

provinces_df = pd.read_csv("all-dengues.csv")
provinces_df.index = pd.DatetimeIndex(provinces_df.date)
provinces_df = provinces_df.drop(['date','date.1'],axis=1)
cm_dengues = provinces_df[provinces_df['จังหวัด'] == 'ChiangMai'].resample('W').size()
cm_dengues_df = pd.DataFrame(cm_dengues,columns=['cases'])

In [5]:
cm_avg_weather = cm_weather[['avgrh','dday','meantemp']].resample('W').mean()
cm_avg_weather['rain'] = cm_weather[['rain']].resample('W').sum() # cumulative rainfall
cm_dengues_weather = pd.concat([cm_avg_weather,cm_dengues_df[:-52]],axis=1)

cm_dengues_weather_split = split_data(cm_dengues_weather)
cm_dengues_train, cm_dengues_test = cm_dengues_weather_split[0], cm_dengues_weather_split[1]

In [6]:
def make_line(start_week,real,predictions,title,x_axis,y_axis,
              real_legend='Real',predict_legend='Prediction'):
    plt.figure(figsize=(12,6))
    plt.plot(range(len(real)),real,'-r',label=real_legend)
    plt.plot(range(start_week,len(predictions)+start_week),predictions,'-k',label=predict_legend,alpha=0.4)
    plt.axvline(start_week)
    plt.text((start_week*2)+10,300,'Prediction Start',horizontalalignment='center',verticalalignment='center')
    plt.xlabel(x_axis)
    plt.ylabel(y_axis)
    plt.legend()
    plt.title(title)
    
def plot_all_predictions(LAG, real, province, nweeks_to_predict, all_predictions):
    
    plt.figure(figsize=(13,6))
    plt.plot(range(len(real)),real,label='real')
    graph_title = "%s's Dengue Prediction with Mean Temperature"%province
    plt.title(graph_title)
    for i in range(len(all_predictions)):
        cur_week_guess = nweeks_to_predict[i]
        start_predict_week = LAG+(cur_week_guess-1)
        predicted_cases = all_predictions[i]
        x_weeks = range(start_predict_week,len(predicted_cases)+start_predict_week)
        line_label = "%d week guess"%cur_week_guess
        plt.plot(x_weeks,predicted_cases,label=line_label,alpha=0.8)
    
    seperate_train_test(cm_dengues_df.cases,cm_dengues_train)
    plt.axvline(LAG)
    plt.text((LAG*2)+10,300,'Prediction Start',horizontalalignment='center',verticalalignment='center')
    plt.xlabel("# week starting 2003")
    plt.ylabel("Dengue cases")
    plt.legend()
    plt.savefig("16-lags-16-weeks-9-temp-wks-chiangmai-00.png")
    
def seperate_train_test(all_data,train_data):
    plt.axvspan(0, len(train_data), color='blue', alpha=0.1)
    plt.text((len(train_data)/2)-25,700,'Train Period',size=16)
    plt.axvline(len(train_data)+1,linewidth=0.5, color='green')
    plt.text(len(train_data)+20,700,'Test Period',size=16)
    plt.axvspan(len(train_data)+1, len(all_data), color='green', alpha=0.1)

In [19]:
import multiprocessing as mp
from multiprocessing import Process, Queue

# Define an output queue
output = Queue()

# def validate

# def compute_ret(temp):
#     for i in range(int(1e5)): pass
#     return temp/2.


ws_csv = np.array(pd.read_csv("best-for-21-lags-ws.csv").T)[0]
    
def validation(LAG, TEMPERATURE_WEEKS, RAIN_WEEKS, train, real, output):
                                    # get_alphas takes 4 args, if theres no input csv
    alphas = weather_alphas.get_alphas(LAG, TEMPERATURE_WEEKS, RAIN_WEEKS, train, ws_csv)
                                    # get_predictions takes 6 args
    predictions_to_validate = prediction.get_predictions(LAG, TEMPERATURE_WEEKS, RAIN_WEEKS, real, alphas.x, "CM")
                                    # get_validations takes 3 argsg
    validated_result = validate.get_validations(LAG, predictions_to_validate, real)
    output.put((LAG,validated_result,alphas.x)) # add in ws as well alphas.x


In [None]:
%%time
# Setup a list of processes that we want to run
train = cm_dengues_train
real = cm_dengues_weather
processes = [Process(target=validation, args=(weeks, 9, 10, train, real, output)) for weeks in range(19,28)]

# Run processes
for p in processes:
    p.start()

# Exit the completed processes
for p in processes:
    p.join()

# Get process results from the output queue
results = [output.get() for p in processes]

print(results)

43
44
bnds 43
bnds 44
prev_ws 45
ws_csv
ws_csv
45
prev_ws 43
46
bnds 45
bnds 46
ws_csv
ws_csv
prev_ws 46
prev_ws 44
47
48
50
bnds 50
bnds 48
51
49
ws_csv
ws_csv
bnds 47
prev_ws 50
prev_ws 48
bnds 49
bnds 51
ws_csv
ws_csv
prev_ws 51
prev_ws 47
ws_csv
prev_ws 49


In [10]:
def get_best(validations):
    idx = None
    best_so_far = validations[0][1][-1]
    print best_so_far
    for i in range(len(validations)):
        cur_validation = validations[i][1]
#         print cur_validation
        if cur_validation[0] <= best_so_far:
            best_so_far = cur_validation[0]
            idx = i
    return validations[idx]
        
    
best_param = get_best(results)

2299.5581899


In [11]:
best_param

(21,
 [166.85156662978127,
  292.08981300795017,
  564.36657359209823,
  1035.0747400172374,
  1527.0827702719544],
 array([  7.71015438e-01,   0.00000000e+00,   1.52370530e-03,
          0.00000000e+00,   5.31392493e-02,   1.35030649e-01,
          6.92165714e-05,   2.80015059e-05,   4.44525305e-06,
          2.68477229e-06,   1.99995288e-06,   2.12478072e-06,
          2.09810615e-06,   1.20135864e-06,   5.25325478e-07,
          8.53575139e-07,   2.25703514e-07,   2.91735322e-07,
          3.80337909e-07,   3.78992920e-08,   1.30146070e-08,
          5.33704240e-01,   1.30338530e+00,   1.14734996e+01,
          2.92601035e-01,   1.15981060e-01,   8.54652738e-02,
          1.13036071e-01,   5.54779729e-02,   6.25187499e-02,
         -3.73418462e-02,  -2.85982794e-01,  -3.79565438e-02,
         -1.56748221e-01,   3.43701332e-01,   3.41783500e-02,
          3.57185012e-02,  -2.67238368e-02,   1.74774433e-02,
          8.65917163e-03,   2.46997964e-02,   3.56501205e-02,
          1.7432

In [17]:
# pd.DataFrame(best_param[2]).to_csv("best-for-21-lags-ws.csv",index=False)
# np.array(pd.read_csv("best-for-21-lags-ws.csv").T)[0]

array([  7.71015438e-01,   0.00000000e+00,   1.52370530e-03,
         0.00000000e+00,   5.31392493e-02,   1.35030649e-01,
         6.92165714e-05,   2.80015059e-05,   4.44525305e-06,
         2.68477229e-06,   1.99995288e-06,   2.12478072e-06,
         2.09810615e-06,   1.20135864e-06,   5.25325478e-07,
         8.53575139e-07,   2.25703514e-07,   2.91735322e-07,
         3.80337909e-07,   3.78992920e-08,   1.30146070e-08,
         5.33704240e-01,   1.30338530e+00,   1.14734996e+01,
         2.92601035e-01,   1.15981060e-01,   8.54652738e-02,
         1.13036071e-01,   5.54779729e-02,   6.25187499e-02,
        -3.73418462e-02,  -2.85982794e-01,  -3.79565438e-02,
        -1.56748221e-01,   3.43701332e-01,   3.41783500e-02,
         3.57185012e-02,  -2.67238368e-02,   1.74774433e-02,
         8.65917163e-03,   2.46997964e-02,   3.56501205e-02,
         1.74323266e-02,  -3.03576168e-02,  -5.30104636e-02])

In [147]:
# # pd.DataFrame(best_param[2]).to_csv("best-for-18-lags-ws.csv",index=False)

# lg = [16,17,18,20,22]
# a = np.array(pd.read_csv("best-for-18-lags-ws.csv").T)[0]

# # for i in lg:
# #     if i == len(a):
# #         arr = a
# #         print len(arr)
# #     elif i < len(a):
# #         arr = a[:-(18-i)]
# #         print len(arr)
# #     else:
# #         mean_val = 1/float(i)
# #         arr = np.append(a,[mean_val]*(i-len(a)))
# #         print arr
        
# def ws_helper(LAG, ws, 18):
#     n_ws = len(ws)
#     if LAG == n_ws:
#         print "eq"
#         arr = ws[:LAG+1]
#     elif LAG < n_ws:
#         print "les"
#         arr = ws[:-(n_ws - LAG + 1)]
#     else:
#         print "else"
#         tail = (LAG - n_ws) - 1
#         mean_val = 1/(float(LAG)*tail)
#         arr = np.append(ws,[mean_val]*tail)
#     return arr

# len(ws_helper(20,a))

les


19

In [69]:
%%time
[compute_ret(temp.get_alphas(i,9,10)) for i in range(16,26)]

NameError: name 'compute_ret' is not defined