In [None]:
import keras
import math
import pandas as pd
import numpy as np
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error as mse
import time
import pickle
pd.set_option('display.max_rows', 500)
import os
import tensorflow as tf
import torch


# define random seeds for Neural Networks
# Each library keeps its own random number generator, so each one is seeded separately.
# NOTE(review): torch is imported and seeded here, but no other torch call is visible
# in this notebook — confirm it is actually needed.
torch.manual_seed(0)
np.random.seed(0)
tf.random.set_seed(0)
# ignore warnings jupyter notebook
# NOTE(review): this filter silences every warning for the whole kernel, including
# deprecation warnings raised by pandas/keras — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

# OBIS FRAMEWORK

In [None]:
# Load the precomputed OBIS results; the path is relative to the notebook's working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only open
# files that come from a trusted source.
with open('../results/OBIS_results.pickle', 'rb') as f:
    results = pickle.load(f)

In [None]:
results

In [None]:
#used functions:

def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a (multivariate) time series as a supervised-learning table.

    Every output row contains the ``n_in`` previous observations of each
    variable followed by the current observation and the next ``n_out - 1``
    observations.

    Parameters
    ----------
    data : list, array or DataFrame
        Anything ``pd.DataFrame`` accepts; rows are consecutive time steps and
        columns are variables. A flat list / 1-D array is one variable.
    n_in : int
        Number of lagged steps (t-n_in ... t-1) used as inputs.
    n_out : int
        Number of steps (t ... t+n_out-1) used as outputs.
    dropnan : bool
        Drop the rows that contain NaN (the edges created by shifting, plus
        any row that was already incomplete in ``data``).

    Returns
    -------
    pd.DataFrame
        Columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and ``var<j>(t+<i>)``.
    """
    df = pd.DataFrame(data)
    # Count the variables on the constructed frame rather than on the raw
    # input: this also works for 1-D arrays / Series (no ``shape[1]``) and for
    # a list of rows (which has more than one column).
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg


def init_model(train_X, units=50, loss=None):
    """Build and compile a single-layer LSTM regressor sized to ``train_X``.

    Parameters
    ----------
    train_X : array with a 3-element ``shape``
        Only ``shape[1]`` and ``shape[2]`` are read; they become the LSTM
        ``input_shape`` (the caller reshapes to [samples, timesteps, features]).
    units : int
        Width of the LSTM layer. Defaults to 50.
    loss : keras loss or str, optional
        Loss passed to ``compile``. Defaults to
        ``keras.losses.MeanSquaredLogarithmicError()``.

        NOTE(review): the Keras documentation states that MSLE replaces
        non-positive values by epsilon. The series fed to this model are
        standardised with StandardScaler, so negative targets are presumably
        affected — consider passing ``loss='mse'`` and confirm the intent.

    Returns
    -------
    A compiled ``keras.models.Sequential`` (LSTM -> Dense(1)) using 'adam'.
    """
    if loss is None:
        loss = keras.losses.MeanSquaredLogarithmicError()
    model = keras.models.Sequential()
    model.add(keras.layers.LSTM(units, input_shape=(train_X.shape[1], train_X.shape[2])))
    model.add(keras.layers.Dense(1))
    model.compile(loss=loss, optimizer='adam')
    return model

def train_model(train_X,train_y,test_X,test_y,model):
    """Fit ``model`` on the training arrays with early stopping.

    ``test_X`` / ``test_y`` are handed to Keras as ``validation_data``; the
    run stops once ``val_loss`` has not improved for 20 epochs (at most 250
    epochs, batches of 64, no shuffling between epochs).

    Returns the Keras history object and the fitted model.
    """
    stopper = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)
    fit_options = dict(
        epochs=250,
        batch_size=64,
        validation_data=(test_X, test_y),
        verbose=0,
        callbacks=[stopper],
        shuffle=False,
    )
    history = model.fit(train_X, train_y, **fit_options)
    return history, model


In [None]:
# Intersection used as the forecasting target for each trajectory and direction
# (looked up as target_intersections[trajectory][direction]).
target_intersections={"T1":{"North":"K504", "South":"K198"},
                      "T2":{"North":"K703", "South":"K206"}}
# Correlation cut-offs: for each value, the intersections whose LOF correlation with
# the target is >= the cut-off are used as model inputs.
thresholds = [1,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0]
def _train_and_evaluate(frame, sequence_length=12, train_fraction=0.8):
    """Fit one LSTM on ``frame`` and score it on the chronologically last rows.

    ``frame`` must hold the target variable in its first column. The first
    ``train_fraction`` of the rows is the modelling set (itself split again by
    ``train_fraction`` into fit rows and early-stopping rows); the remaining
    rows are the hold-out set.

    Returns ``(model, val_y, preds)``; both arrays are in scaled units.
    """
    values = frame.values
    n_features = values.shape[1]
    n_obs = sequence_length * n_features
    holdout_start = math.ceil(len(values) * train_fraction)
    if len(values) - holdout_start <= sequence_length:
        raise ValueError(
            "Need more than {} hold-out rows to build one lagged sample, got {}".format(
                sequence_length, len(values) - holdout_start))

    # Fit the scaler on the modelling rows only, so the statistics of the
    # hold-out rows do not leak into training.
    scaler = StandardScaler().fit(values[:holdout_start])
    modelling = scaler.transform(values[:holdout_start])
    holdout = scaler.transform(values[holdout_start:])

    def to_xy(block):
        # Columns are [lagged steps ... | current step]; the target is the
        # first column of the current step, i.e. column -n_features.
        supervised = series_to_supervised(block, sequence_length, 1).values
        inputs = supervised[:, :n_obs].reshape(
            (supervised.shape[0], sequence_length, n_features))
        return inputs, supervised[:, -n_features]

    fit_X, fit_y = to_xy(modelling)
    n_train = math.ceil(len(modelling) * train_fraction)
    train_X, train_y = fit_X[:n_train], fit_y[:n_train]
    stop_X, stop_y = fit_X[n_train:], fit_y[n_train:]

    # Train exactly once per intersection set.
    model = init_model(train_X)
    _, model = train_model(train_X, train_y, stop_X, stop_y, model)

    val_X, val_y = to_xy(holdout)
    preds = model.predict(val_X).reshape(len(val_y))
    return model, val_y, preds


def build_LSTMs(results,targets=target_intersections,thesholds=thresholds):
    """Takes the entire results dict and builds LSTM models for each trajectory and direction.

    Parameters
    ----------
    results : dict
        ``results[trajectory][direction]`` must provide ``'lof_df'`` (frame
        whose column correlations select the inputs) and ``'raw'`` (mapping of
        intersection name to a frame with ``'timestamp'`` and ``'cars'``).
    targets : dict
        ``targets[trajectory][direction]`` is the intersection to forecast.
    thesholds : iterable of float
        Correlation cut-offs (parameter name kept as-is for compatibility).
        For each one, the intersections whose correlation with the target is
        ``>=`` the cut-off are used as inputs.

    Returns
    -------
    errors : dict
        ``errors[trajectory][direction][threshold]`` has the keys
        ``'intersections'``, ``'df'`` (Real / Predicted / SE per hold-out row)
        and ``'MSE'``.
    dfs : dict
        ``dfs["<trajectory>_<direction>"]`` is the merged raw data frame.

    Side effects: saves one model per threshold under
    ``./<trajectory>/<direction>/<threshold>/model`` and prints progress.
    """
    errors = {}
    dfs = {}
    for trajectory in results.keys():
        errors[trajectory] = {}
        print("Starting trajectory: {}".format(trajectory))
        for direction in results[trajectory]:
            errors[trajectory][direction] = {}
            print("Starting direction: {}".format(direction))
            lof = results[trajectory][direction]['lof_df'] # load LOF DF for cors

            # One column per intersection, aligned on the timestamp index.
            data = pd.DataFrame()
            for intersection_name in results[trajectory][direction]['raw']:
                intersection = results[trajectory][direction]['raw'][intersection_name]
                intersection = intersection.rename(columns={"cars": intersection_name})
                intersection = intersection.set_index(pd.DatetimeIndex(intersection['timestamp']))
                intersection = intersection.drop(columns=['timestamp'])
                data = pd.merge(data, intersection, left_index=True, right_index=True, how='outer')
            data = data.dropna()
            dfs["{}_{}".format(trajectory,direction)] = data

            target = targets[trajectory][direction]
            # The correlation matrix does not depend on the threshold: compute it once.
            target_corr = lof.corr()[target]
            # Models already fitted for this direction, keyed by the exact
            # intersection set, so identical sets reuse the matching model and
            # results instead of whatever happened to run last.
            fitted = {}

            for threshold in thesholds:
                entry = {}
                errors[trajectory][direction][threshold] = entry
                print("Threshold: {}".format(threshold))
                #grab relevant intersections:
                isct_inc = target_corr[target_corr >= threshold].index.tolist()
                key = tuple(isct_inc)
                if key in fitted:
                    print("same intersections, copy MSE")
                    # Copy the whole result (intersections, df, MSE) so every
                    # threshold entry has the same keys for downstream cells.
                    model, cached = fitted[key]
                    entry.update(cached)
                else:
                    print("Intersections included: {}".format(len(isct_inc)))
                    df = data[isct_inc]
                    #move target var to front of DF
                    df = df[[target] + [col for col in df.columns if col != target]]

                    model, val_y, preds = _train_and_evaluate(df)

                    entry['intersections'] = isct_inc
                    entry['df'] = pd.DataFrame({"Real":val_y,"Predicted":preds,'SE':(val_y-preds)**2})
                    entry['MSE'] = mse(preds,val_y)
                    fitted[key] = (model, entry)

                #save model
                # os.path.join keeps the directory layout portable across operating systems.
                path = os.path.join('.', str(trajectory), str(direction), str(threshold))
                os.makedirs(path, exist_ok=True)
                model.save(os.path.join(path, "model"))

                print(entry['MSE'])

    return errors,dfs #errors
lstm, data = build_LSTMs(results)


In [None]:
# Persist the per-model results and the merged data frames as a two-element list [lstm, data].
with open('LSTM_results_april.pickle', 'wb') as file:
    pickle.dump([lstm,data],file)  

In [None]:
def plot_results(errors):
    """Tabulate the MSE of every model per threshold.

    Parameters
    ----------
    errors : dict
        ``errors[trajectory][direction][threshold]['MSE']`` holds one score.

    Returns
    -------
    pd.DataFrame
        One row per trajectory/direction with a ``'model'`` label and one
        ``'T<threshold>'`` column per threshold, rounded to 3 decimals. A
        threshold missing for a model is NaN in that row.
    """
    rows = []
    for trajectory, directions in errors.items():
        for direction, per_threshold in directions.items():
            # A fresh dict per model: a shared one would carry the previous
            # model's values into thresholds this model does not have.
            # NOTE(review): with trajectory keys such as 'T1' the label reads
            # 'TT1_N'; the format is kept unchanged for compatibility.
            row = {'model': "T{}_{}".format(trajectory, direction[0])}
            for threshold, outcome in per_threshold.items():
                row["T{}".format(threshold)] = round(outcome['MSE'], 3)
            rows.append(row)
    # Build the frame once instead of appending row by row.
    return pd.DataFrame(rows)
# One row per model, one column per threshold, built from the build_LSTMs results.
plotting_results = plot_results(lstm)

In [None]:
plotting_results.describe()

In [None]:
# `test` is not defined at notebook scope, so the means are taken from
# plotting_results itself; pd.concat replaces the removed DataFrame.append.
pd.concat([plotting_results, plotting_results.describe().loc[['mean']]]) #add means

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd.concat([plotting_results, plotting_results.describe().loc[['mean']]]) #add means

In [None]:
def outlier_mse_calc(lstm=lstm,data=data,results=results,outliers=5,target_intersections=target_intersections):
    """Calculates MSE per model for the top ``outliers`` outliers in the test set.

    For trajectories 'T1' and 'T2', every direction and the thresholds 1 and
    0.4, the squared errors of the predictions that fall in the hour starting
    at each of the highest-LOF timestamps of the target intersection are
    averaged.

    Parameters
    ----------
    lstm : dict
        ``lstm[trajectory][direction][threshold]['df']`` must contain an
        ``'SE'`` column, one row per predicted time step.
    data : dict
        ``data["<trajectory>_<direction>"]`` is a frame whose index holds the
        timestamps; its last ``len(preds)`` rows are aligned with the predictions.
    results : dict
        ``results[trajectory][direction]['lof_df']`` holds the LOF scores.
    outliers : int
        Number of top-scoring LOF timestamps to evaluate.
    target_intersections : dict
        ``target_intersections[trajectory][direction]`` names the LOF column to rank by.

    Returns
    -------
    pd.DataFrame
        Columns 'Model', 'T 1.0', 'T 0.4' and 'T 0.0' (the last one is never
        filled here and stays NaN).
    """
    thresholds = [1,0.4]
    one_hour = pd.Timedelta(hours=1)
    rows = []
    for trajectory in ['T1','T2']:
        for direction in results[trajectory].keys():
            target = target_intersections[trajectory][direction]
            mses = []
            for threshold in thresholds:
                #get preds, rawdata & lof dfs:
                print(trajectory,direction,threshold)
                # Copy because a 'Time' column is added below.
                preds = lstm[trajectory][direction][threshold]['df'].copy(deep=True)
                rawdata = data['{}_{}'.format(trajectory, direction)].iloc[-len(preds):]
                lof = results[trajectory][direction]['lof_df']
                # Keep only LOF rows strictly after the first predicted timestamp.
                lof = lof[lof.index > rawdata.index[0]]

                # Rank by the target's LOF score and honour the `outliers` argument.
                top_outliers = lof.sort_values(by=target, ascending=False).iloc[:outliers].index

                #insert dates on lstm preds df:
                preds['Time'] = rawdata.index
                windows = [preds[(preds['Time'] >= start) & (preds['Time'] < start + one_hour)]
                           for start in top_outliers]
                # Without any outlier the mean is NaN instead of an error.
                outlier_df = pd.concat(windows) if windows else preds.iloc[0:0]
                mses.append(round(outlier_df['SE'].mean(),3))
            rows.append({'Model':"{}_{}".format(trajectory,direction),
                         'T 1.0':mses[0],'T 0.4':mses[1]})
    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows, columns=['Model','T 1.0','T 0.4','T 0.0'])

In [None]:
lstm['T1']['North'][1]

In [None]:
# Called without arguments: the function's defaults are the lstm, data and results
# objects that existed when outlier_mse_calc was defined.
mse_results = outlier_mse_calc()

In [None]:
mse_results

In [None]:
# Add a 'Mean' summary row. An existing 'Mean' row is dropped first, so re-running
# this cell neither stacks summary rows nor averages the old mean into the new one.
model_rows = mse_results[mse_results['Model'] != 'Mean']
mean_row = {'Model':'Mean',
            'T 1.0':round(pd.to_numeric(model_rows['T 1.0']).mean(),3),
            "T 0.4":round(pd.to_numeric(model_rows['T 0.4']).mean(),3)}
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
mse_results = pd.concat([model_rows, pd.DataFrame([mean_row])], ignore_index=True)

In [None]:
mse_results

In [None]:
mse_results

In [None]:
#save all extra stuff:
# Stored as one list in the order [lstm, data, results]; readers must unpack in that order.
all_stuff = [lstm,data,results]
with open('march30_save.pickle', 'wb') as f:
    pickle.dump(all_stuff,f)