In [276]:
import pandas as pd
import datetime
import tensorflow as tf
import keras
import seaborn
import numpy as np

from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

from matplotlib import pyplot

In [277]:
# Create fixed-window sequences for training and validation data
def create_sequences(X, window_size):
    seq_X = []
    seq_y = []
    for i in range(len(X) - window_size):
        seq_X.append(X[i:i+window_size])
        seq_y.append(X[i+window_size])
    return seq_X, seq_y

In [278]:
def reprocess(y, details):
    # print(details)
    mean = details[2][0]
    std = details[2][1]
    PF = details[2][2]
    PFtype = details[2][3]
    time = details[3]

    # print("mean:", mean, "std:", std, "PF", PF, "type", PFtype, "time", time)

    if(PFtype == 1):
        return ((y * std) + mean) * np.exp(PF[0] * time + PF[1])    
    # print("here")
    return (y * std + mean) * (PF[0] * np.square(time) + PF[1] * time + PF[2])


In [300]:
def preprocess(data, smoothing, alpha = None):

    # #Exponential smoothing
    if smoothing:
        data = data.ewm(alpha=alpha, adjust=False).mean()

    PFtype = -1
    PF1 = np.polyfit(np.linspace(0,len(data) - 1,num=len(data)), np.log(data), 1)
    PF2 = np.polyfit(np.linspace(0,len(data) - 1,num=len(data)),data, 2)
    error1 = data - (np.exp(PF1[0] * np.linspace(0, len(data) - 1, num=len(data)) + PF1[1]))
    error2 = data - (PF2[0] * np.square(np.linspace(0,len(data) - 1,num=len(data))) + PF2[1] * np.linspace(0, len(data) - 1, num=len(data)) + PF2[2])
    
    #Otto: dit is de keuze voro welke je preporcessed. je kan die plotjes un commenten om de fit te zien
    if(np.sum(np.square(error1)) < np.sum(np.square(error2))):
        PF = PF1
        preprocessed = data / (np.exp(PF[0] * np.linspace(0,len(data) - 1,num=len(data)) + PF[1]))
        PFtype = 1
    else:
        PF = PF2
        preprocessed = data / (PF2[0] * np.square(np.linspace(0,len(data) - 1,num=len(data))) + PF2[1] * np.linspace(0,len(data) - 1,num=len(data)) + PF2[2])
        PFtype = 2
    
    m = np.mean(preprocessed)
    s = np.std(preprocessed)
    preprocessed = (preprocessed - m)/s
    details = [m, s, PF, PFtype]
    
    return preprocessed, details

In [280]:
def smape_clean(y_true, y_pred):
    smape = 100 * np.mean(2*np.abs(y_pred - y_true) / (y_true + y_pred))
    return smape

In [281]:
def smape(model, validation):
    validation_x = []
    validation_y = []
    for val in validation:
        validation_x.append(val[0])
        validation_y.append(val[1])
    validation_x = np.array(validation_x)
    validation_y = np.array(validation_y)
    smape = 0
    prediction = model.predict(validation_x, verbose=0)
    # _, acc = model.evaluate(validation_x, validation_y, verbose = 0)

    # print("sse val is; ", np.sum(np.square(validation_y - prediction))/len(validation_y), "val is ", acc)
    for i in range(len(validation)):
        observation = validation[i]
        pred = prediction[i]
        #print(pred, observation[1], pred - observation[1])
        x_hat = reprocess(pred, observation)
        x = reprocess(observation[1], observation)
        #print(x_hat, x, x_hat - x)

        smape += 2*np.abs(x_hat-x)/(x+x_hat)

    smape /= len(validation)
    smape *=100

    return smape

In [304]:
def build_model(x_train, y_train, x_validation, y_validation, window_size, options): #x_validation, y_validation
    # Build the FFNN model
    model = keras.Sequential()
    model.add(keras.layers.Flatten(input_shape=(window_size, 1))) 
    model.add(keras.layers.Dense(options.layers[0], activation='sigmoid'))

    if len(options.layers) > 2:
        for i in range(1,len(options.layers)-1):
            model.add(keras.layers.Dense(options.layers[i], activation=options.activation))
                
    model.add(keras.layers.Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer='adam', loss='mse', metrics=['mse'])

    #early stopping and saving the best model SOURCE: https://machinelearningmastery.com/how-to-stop-training-deep-neural-networks-at-the-right-time-using-early-stopping/
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=50, min_delta=0.001)
    mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', verbose=0, save_best_only=True)
    # fit model

    # Train the model
    history = model.fit(x_train, y_train, validation_data=(x_validation, y_validation), batch_size=options.batchSize, verbose = 0, callbacks=[es, mc], epochs= 1000) #fit the model with early stop

    ##this is the best model
    saved_model = load_model("best_model.h5")

    _, train_acc = saved_model.evaluate(x_train, y_train, verbose = 0)
    _, test_acc = saved_model.evaluate(x_validation, y_validation, verbose = 0)

    print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))

    # # plot the different accuracies. maybe then dont do a early stop. 
    # pyplot.plot(history.history['loss'], label='train')
    # pyplot.plot(history.history['val_loss'], label='validation')
    # pyplot.legend()
    # pyplot.show()

    return saved_model

In [283]:
df = pd.read_excel("M3C.xls")
df = df.iloc[:146,6:26]

df_train = df.iloc[:,:14]
df_test = df.iloc[:,14:]

window_size = 3

In [284]:
def df_to_observations(data, window_size):
    observations = []
    details = []
    for index, row in data.iterrows():
        preprocessed, detail = preprocess(np.array(row))
        details.append(detail)
        for i in range(len(preprocessed) - window_size):
            observations.append([preprocessed[i:i+window_size],preprocessed[i+window_size], detail, i+window_size])

    return observations

In [285]:
def shuffle(data, v): 
    np.random.shuffle(np.array(data))

    train = data[:int(np.floor(len(data)*(1-v)))]
    validation = data[int(np.floor(len(data)*(1-v))):]

    return train, validation

In [286]:
def preprocess_and_split(data, fold, v, window_size, alpha):
    prep = []
    validationSet = []
    validation = data.iloc[fold-v:fold]
    train = data[~data.isin(validation)].dropna()

    for index, row in train.iterrows():
        p, details = preprocess(row, smoothing = 1, alpha = alpha)
        prep.append(p)

    train = np.array(prep)

    x_train = []
    y_train = []

    for i in range(len(train)):
        for j in range(train.shape[1] - window_size):
            x_train.append(train[i][j:j + window_size])
            y_train.append(train[i][j + window_size])

    x_train = np.array(x_train).reshape(len(x_train), window_size)
    y_train = np.array(y_train).reshape(len(y_train))

    for index, row in validation.iterrows():
        p, details = preprocess(row, smoothing = 0)
        prep.append(p)
        for i in range(len(p) - window_size):
            validationSet.append([p[i:i + window_size], p[i + window_size], details, i + window_size])

    x_validation = np.array([x[0] for x in validationSet]).reshape(len(validationSet), window_size)
    y_validation = np.array([x[1] for x in validationSet]).reshape(len(validationSet))

    return x_train, y_train, x_validation, y_validation, validationSet


In [287]:
def get_folds(rows, k):
    folds = []
    rows = np.array_split(np.arange(rows), k)
    for row in rows:
        folds.append(row[-1])
    return folds

In [297]:
def kfolds(df_train, k, options, window_size, val_frac, alpha):
    avgSmape = []
    train = df_train.sample(frac = 1)
    folds = get_folds(len(train), k) #Indicates the points to which data should be used in every fold

    validation_size = int(np.floor(val_frac*len(train)))

    for fold in folds:
        x_train = []
        y_train= []
        x_train, y_train, x_validation, y_validation, validationSet = preprocess_and_split(train, fold, validation_size, window_size, options.smoothingFactor) #takes df's returns np arrays


        model = build_model(x_train, y_train, x_validation, y_validation, window_size, options)

        validationPrediction = model.predict(x_validation, verbose=0)
        yHatReal = []
        yReal = []

        for i in range(len(y_validation)):
            yReal.append(reprocess(y_validation[i], validationSet[i]))
            yHatReal.append(reprocess(validationPrediction[i], validationSet[i]) )
        smapeVal = 0
        for i in range(len(yReal)):
            smapeVal += smape_clean(yReal[i], yHatReal[i])
        smapeVal /= len(yReal)
        print("smape equals", smapeVal, "with", fold, "time step as training")
        avgSmape.append(smapeVal)
    return np.mean(avgSmape), model

In [305]:
options = [[[3,3],'sigmoid','relu', 'sigmoid',16, 1, 0,0]]
options = pd.DataFrame(options)
options = options.set_axis(['layers', 'activation1', 'activation', 'activation2' 'batchSize', 'smoothingFactor', 'averageSmape', 'varianceSmape'], axis=1)
# print(options)
window_size = 3
val_frac = 0.1
smape, model = kfolds(df_train, 5, options.iloc[0], window_size, val_frac, options.iloc[0].smoothingFactor)
print(smape)


Train: 0.788, Test: 0.733
smape equals 12.460819688737514 with 29 time step as training
Train: 0.778, Test: 0.779
smape equals 6.524412001087878 with 58 time step as training
Train: 0.803, Test: 0.788
smape equals 8.535477329066701 with 87 time step as training
Train: 0.752, Test: 0.789
smape equals 7.671258583447182 with 116 time step as training
Train: 0.790, Test: 0.651
smape equals 15.208235313127863 with 145 time step as training
10.080040583093428


In [None]:
lays = [[1],[2],[2,2],[3,3],[2,5]]
batchSizes = [8,16,32]
acts1 = ['sigmoid', 'relu','linear']
activationFunctions = ['sigmoid', 'relu']
acts2 = ['sigmoid', 'relu','linear']
smoothingfactors = [0.9,1]
window_size = 3
val_frac = 0.1

In [308]:
lays = [[3,3]]
batchSizes = [16]
acts1 = ['sigmoid', 'relu','linear']
activationFunctions = ['sigmoid', 'relu']
acts2 = ['sigmoid', 'relu','linear']
smoothingfactors = [0.9,1]
window_size = 3
val_frac = 0.1

options = []

for layer in lays:
    for act1 in acts1:
        for act2 in acts2:
            for activation in activationFunctions:
                for smooth in smoothingfactors:
                    for batchSize in batchSizes:
                        options.append([layer, act1, activation, act2, batchSize, smooth, 0, 0])


options = pd.DataFrame(options)
options = options.set_axis(['layers', 'activation1', 'activation', 'activation2', 'batchSize', 'smoothingFactor', 'averageSmape', 'varianceSmape'], axis=1)

# print(options.iloc[0].layers)

for i in range(len(options)):
    smape_avg=[]
    for j in range(1):
        smp, model = kfolds(df_train, 5, options.iloc[0], 2, window_size, val_frac)
        smape_avg.append(smp)

    options.iat[i,3] = np.mean(smape_avg)
    options.iat[i,4] = np.std(smape_avg)


op = pd.DataFrame(options)
res = op.sort_values(by="averageSmape")
print(res)

Train: 0.800, Test: 0.712
smape equals 9.856578386244037 with 29 time step as training
Train: 0.802, Test: 0.721
smape equals 9.531836632992242 with 58 time step as training
Train: 0.827, Test: 0.709
smape equals 10.282762073831162 with 87 time step as training
Train: 0.793, Test: 0.721
smape equals 10.4800494499943 with 116 time step as training


KeyboardInterrupt: 

In [291]:
op = pd.DataFrame(options)
res = op.sort_values(by="averageSmape")
print(res)

res = options

   layers activation  batchSize  smoothingFactor  averageSmape  varianceSmape
0  [3, 3]       relu         16                1             0              0


In [292]:
x_train = []
y_train= []
val_frac = 0.1
validation_size = int(np.floor(val_frac*len(df_train)))
x_train, y_train, x_validation, y_validation, validationSet = preprocess_and_split(df_train, len(df_train), validation_size, window_size, res.iloc[0]['smoothingFactor']) #takes df's returns np arrays

model = build_model(x_train, y_train, x_validation, y_validation, window_size, res.iloc[0])

Train: 0.663, Test: 0.535


In [None]:
print(op.sort_values(by="averageSmape"))

   layers activation  batchSize  smoothingFactor  averageSmape  varianceSmape
0  [2, 2]       relu         16                1             0              0


In [306]:
##TESTING

predictions = pd.DataFrame()
observations = []
window_size = 3

df_full = pd.DataFrame()
df_full = df_train
df_full = df_full.drop(df_full.columns[14:], axis=1)

num_predictions = 6

# Make predictions using autoregressive approach
for pred in range(num_predictions):

    PF = []
    for index, row in df_full.iterrows():
        # print(row)
        if pred == 0:
            preprocessed, details = preprocess(row, smoothing = 1, alpha = res.iloc[0].smoothingFactor)
            PF.append(details[2:])
            observations.append([preprocessed[11+pred:14+pred],0, details, 14+pred]) #y is unknown and first time point to predict is 15(or 14?)`
            # df_full.loc[index,:] = preprocessed
        else:
            observations.append([row[11+pred:14+pred],0, details, 14+pred])

    # Reshape the input for prediction
    x = []
    for i in (range(len(observations))):
        x.append(observations[i][0])
    x = np.array(x).reshape(len(x),window_size)
    
    # Make the prediction
    prediction = model.predict(x)

    print(prediction.shape)

    y_u = []
    for i in range(len(prediction)):
        y_u.append(reprocess(prediction[i], observations[i]))
        # y_u.append(prediction[i])

    # print(pd.DataFrame(y_u).shape)
    predictions[15+pred] = pd.DataFrame(y_u)
    df_full[15+pred] = pd.DataFrame(y_u)

smapes = pd.DataFrame(columns=[i for i in range(num_predictions)])

for i in range(predictions.shape[0]):
    smape_row = []
    for j in range(num_predictions):
        smape_row.append(smape_clean(predictions.iloc[i, j], df_test.iloc[i, j]))
    smapes.loc[i] = smape_row

print(smapes)

smape_avgs = []
for i in range(num_predictions):
    smape_avgs.append(np.mean(smapes.iloc[:,i]))
print(smape_avgs)

(146, 1)
(292, 1)
(438, 1)
(584, 1)
(730, 1)
(876, 1)
             0          1          2          3          4          5
0     0.180527  13.681183  24.603234  37.541540  44.096648  52.126508
1    31.817585  46.799337  37.168746  36.997012  37.210670  39.879151
2    18.595027  34.270973  28.866578  31.503552  29.735731  16.531106
3     6.114288  17.667179  25.446289  14.746162  20.926489  13.414364
4    20.660160  13.779500   1.171077   6.277023   1.298368  12.144159
..         ...        ...        ...        ...        ...        ...
141   4.308522  11.924643   7.486099   5.044625  10.316634  11.865892
142  10.559050   4.979802  16.155487   7.800020   9.695947   2.917132
143  10.205582  22.056392   7.622262  24.178258  41.144204  57.523887
144   7.467979  17.214770  25.822511  38.044059  53.258172  76.461142
145  17.069596  17.199331   0.583792   7.876261  51.848553  66.608493

[146 rows x 6 columns]
[8.809862595456883, 12.677327454959855, 23.582137438359435, 24.360603024955584, 28

In [None]:
# y = []
# y_hat = []
# details = []
# offset = 7
# for i in range(0,10):
#     y.append(observations[i+offset][1])
#     y_hat.append(y[i] + 0.4)
#     details.append(observations[i+offset])

# repY = []
# repY_hat = []
# smape = 0
# for i in range(10):
#     repY.append(reprocess(y[i], details[i]))
#     repY_hat.append(reprocess(y_hat[i], details[i]))
#     smape += smape_clean(repY[i], repY_hat[i])

# smape /= len(repY)
# print(smape)
# pyplot.plot(repY, label='original')
# pyplot.plot(repY_hat, label='altered')
# pyplot.plot(df_train.iloc[1,3:10], label = 'og')
# pyplot.legend()
# pyplot.show()