In [111]:
import papermill as pm
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LassoCV
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNetCV

In [45]:
notebook = r"D:\gitClones\nteract_models\optimize\projects\the_dalles\the_dalles_raw.ipynb"

In [52]:
nb = pm.read_notebook(notebook)
nb_df = nb.dataframe
raw_data = nb_df[nb_df['name']=='raw_data']['value'].values[0]

In [54]:
data = pd.DataFrame(raw_data)
data['index'] = pd.to_datetime(data['index'])
data.set_index('index', inplace = True)
#separate only data with spill.  A separate process will be optimized with no spill conditions
spill = data[(data['q_s']>0)]

In [120]:
train = spill['2016':]
test = spill[:'2015']

In [121]:
def make_interactions(df, response_column):
    x_columns = [x for x in train.columns if x not in  [response_column]]
    interactions = []
    for combo in range(2,len(x_columns)+1):
        interactions += [x for x in combinations(x_columns,combo)]
    return interactions

def min_max_scale(df,):
    data = df.copy()
    scaler = MinMaxScaler()
    scaler.fit(data)
    scaled = pd.DataFrame(columns = data.columns, data = scaler.transform(data))
    return scaled

def make_interaction_df(df, interactions):
    interaction_df = df.copy()
    for interaction in interactions:
        name = '-'.join(interaction)
        interaction_df[name] = 1
        for term in interaction:
            interaction_df[name] = interaction_df[name] * interaction_df[term]
    return interaction_df





In [122]:
interactions = make_interactions(spill, 'tdg_tw')
train_interaction = make_interaction_df(train, interactions)
test_interaction = make_interaction_df(test, interactions)

In [123]:
X = train_interaction.copy().drop('tdg_tw', axis = 1)
y = train_interaction['tdg_tw']
reg = LassoCV(cv=100, random_state=0, normalize = True).fit(X, y)



In [124]:
bool_matrix = reg.coef_ !=0

In [125]:
cols = X.columns[bool_matrix]

In [126]:
new_train = train_interaction[cols].pipe(min_max_scale)
new_test = test_interaction[cols].pipe(min_max_scale)
pca = PCA(n_components=.95,svd_solver = 'full')
pca.fit(new_train)
X = pca.transform(new_train)
X_test = pca.transform(new_test)

In [127]:
X_test

array([[ 0.44677853,  1.38437543, -0.31866461],
       [ 0.44647023,  1.37791469, -0.26102357],
       [ 0.43856417,  1.36499   , -0.25246552],
       ...,
       [-0.70603154, -0.14538906,  0.47149864],
       [-0.71024916, -0.14776726,  0.44727128],
       [-1.12568292,  0.04274386, -0.05066405]])

In [128]:
regr = ElasticNetCV(cv=50, random_state=0)
regr.fit(X,y)
y_hat = regr.predict(X_test)

In [129]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, reset_output
from bokeh.layouts import gridplot
from scripts.theme import theme
from bokeh.io import curdoc
output_notebook()

def qq(y,y_hat):
    y_min = y.min()
    y_max = y.max()
    p = figure()
    p.line([y_min,y_max], [y_min,y_max], line_width=2, color = 'black')
    p.circle(y, y_hat, size=1, alpha=0.5)
    p.xaxis.axis_label = "y (%)"
    p.yaxis.axis_label = "y hat (%)"
    return p


In [130]:
p = qq(y,y_hat)
show(p)



In [131]:
#svr
from sklearn.svm import SVR

svr = SVR(kernel = 'poly', degree = 3, C = .5, epsilon = .001)
svr.fit(x_train, y_train.values) 
y_hat = svr.predict(x_test)


NameError: name 'x_train' is not defined

In [20]:
sum(abs(y_test-y_hat))/len(y_hat)

1.2193437759929784

In [21]:
sum(y_test-y_hat)**2/len(y_hat)

811.4236012308397

In [22]:
p = qq(y_test,y_hat)
show(p)

In [23]:
# NN
from sklearn.neural_network import MLPRegressor

nn = MLPRegressor(max_iter=1000)
nn.fit(x_train,y_train)
y_hat = nn.predict(x_test)

In [24]:
sum(abs(y_test-y_hat))/len(y_hat)

1.9328557076546478

In [25]:
sum(y_test-y_hat)**2/len(y_hat)

2510.187366065561

In [26]:
p = qq(y_test,y_hat)
show(p)