In [3]:
import pandas as pd
import datetime
import tensorflow as tf
import keras
import seaborn
import numpy as np

from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

from matplotlib import pyplot

In [4]:
def reprocess(y, details):
    # print(details)
    mean = details[2][0]
    std = details[2][1]
    PF = details[2][2]
    PFtype = details[2][3]
    time = details[3]

    # print("mean:", mean, "std:", std, "PF", PF, "type", PFtype, "time", time)

    if(PFtype == 1):
        return ((y * std) + mean) * np.exp(PF[0] * time + PF[1])    
    # print("here")
    return (y * std + mean) * (PF[0] * np.square(time) + PF[1] * time + PF[2])


In [5]:
def preprocess(data, smoothing, alpha = None):

    # #Exponential smoothing
    if smoothing:
        data = data.ewm(alpha=alpha, adjust=False).mean()

    PFtype = -1
    PF1 = np.polyfit(np.linspace(0,len(data) - 1,num=len(data)), np.log(data), 1)
    PF2 = np.polyfit(np.linspace(0,len(data) - 1,num=len(data)),data, 2)
    error1 = data - (np.exp(PF1[0] * np.linspace(0, len(data) - 1, num=len(data)) + PF1[1]))
    error2 = data - (PF2[0] * np.square(np.linspace(0,len(data) - 1,num=len(data))) + PF2[1] * np.linspace(0, len(data) - 1, num=len(data)) + PF2[2])
    
    #Otto: dit is de keuze voro welke je preporcessed. je kan die plotjes un commenten om de fit te zien
    if(np.sum(np.square(error1)) < np.sum(np.square(error2))):
        PF = PF1
        preprocessed = data / (np.exp(PF[0] * np.linspace(0,len(data) - 1,num=len(data)) + PF[1]))
        PFtype = 1
    else:
        PF = PF2
        preprocessed = data / (PF2[0] * np.square(np.linspace(0,len(data) - 1,num=len(data))) + PF2[1] * np.linspace(0,len(data) - 1,num=len(data)) + PF2[2])
        PFtype = 2
    
    m = np.mean(preprocessed)
    s = np.std(preprocessed)
    preprocessed = (preprocessed - m)/s
    details = [m, s, PF, PFtype]
    
    return preprocessed, details

In [6]:
def smape_clean(y_true, y_pred):
    smape = 100 * np.mean(2*np.abs(y_pred - y_true) / (y_true + y_pred))
    return smape

In [7]:
def build_model(x_train, y_train, x_validation, y_validation, window_size, options): #x_validation, y_validation
    # Build the FFNN model
    model = keras.Sequential()
    model.add(keras.layers.Flatten(input_shape=(window_size, 1))) 
    model.add(keras.layers.Dense(options.layers[0], 
                                activation=options.activation1,
                                kernel_initializer=tf.initializers.HeNormal(), 
                                kernel_regularizer=tf.keras.regularizers.l2(options.regularization)))

    if len(options.layers) > 2:
        for i in range(1,len(options.layers)-1):
            model.add(keras.layers.Dense(options.layers[i], 
                                        activation=options.activation,
                                        kernel_initializer=tf.initializers.HeNormal(), 
                                        kernel_regularizer=tf.keras.regularizers.l2(options.regularization)))
                
    model.add(keras.layers.Dense(1, activation=options.activation2))

    # Compile the model
    model.compile(optimizer='adam', loss='mse', metrics=['mse'])

    #early stopping and saving the best model SOURCE: https://machinelearningmastery.com/how-to-stop-training-deep-neural-networks-at-the-right-time-using-early-stopping/
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=50, min_delta=0.001)
    mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', verbose=0, save_best_only=True)
    # fit model

    # Train the model
    history = model.fit(x_train, y_train, validation_data=(x_validation, y_validation), batch_size=options.batchSize, verbose = 0, callbacks=[es, mc], epochs= 1000) #fit the model with early stop

    ##this is the best model
    saved_model = load_model("best_model.h5")

    _, train_acc = saved_model.evaluate(x_train, y_train, verbose = 0)
    _, test_acc = saved_model.evaluate(x_validation, y_validation, verbose = 0)

    print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))

    # # plot the different accuracies. maybe then dont do a early stop. 
    # pyplot.plot(history.history['loss'], label='train')
    # pyplot.plot(history.history['val_loss'], label='validation')
    # pyplot.legend()
    # pyplot.show()

    return saved_model

In [8]:
df = pd.read_excel("M3C.xls")
df = df.iloc[:146,6:26]

df_train = df.iloc[:,:14]
df_test = df.iloc[:,14:]

window_size = 3

In [9]:
def preprocess_and_split(data, fold, v, window_size, alpha):
    prep = []
    validationSet = []
    validation = data.iloc[fold-v:fold]
    train = data[~data.isin(validation)].dropna()

    for index, row in train.iterrows():
        p, details = preprocess(row, smoothing = 1, alpha = alpha)
        prep.append(p)

    train = np.array(prep)

    x_train = []
    y_train = []

    for i in range(len(train)):
        for j in range(train.shape[1] - window_size):
            x_train.append(train[i][j:j + window_size])
            y_train.append(train[i][j + window_size])

    x_train = np.array(x_train).reshape(len(x_train), window_size)
    y_train = np.array(y_train).reshape(len(y_train))

    for index, row in validation.iterrows():
        p, details = preprocess(row, smoothing = 0)
        prep.append(p)
        for i in range(len(p) - window_size):
            validationSet.append([p[i:i + window_size], p[i + window_size], details, i + window_size])

    x_validation = np.array([x[0] for x in validationSet]).reshape(len(validationSet), window_size)
    y_validation = np.array([x[1] for x in validationSet]).reshape(len(validationSet))

    return x_train, y_train, x_validation, y_validation, validationSet


In [10]:
def get_folds(rows, k):
    folds = []
    rows = np.array_split(np.arange(rows), k)
    for row in rows:
        folds.append(row[-1])
    return folds

In [11]:
def kfolds(df_train, k, options, window_size, val_frac, alpha):
    avgSmape = []
    train = df_train.sample(frac = 1)
    folds = get_folds(len(train), k) #Indicates the points to which data should be used in every fold

    validation_size = int(np.floor(val_frac*len(train)))

    for fold in folds:
        x_train = []
        y_train= []
        x_train, y_train, x_validation, y_validation, validationSet = preprocess_and_split(train, fold, validation_size, window_size, options.smoothingFactor) #takes df's returns np arrays


        model = build_model(x_train, y_train, x_validation, y_validation, window_size, options)

        validationPrediction = model.predict(x_validation, verbose=0)
        yHatReal = []
        yReal = []

        for i in range(len(y_validation)):
            yReal.append(reprocess(y_validation[i], validationSet[i]))
            yHatReal.append(reprocess(validationPrediction[i], validationSet[i]) )
        smapeVal = 0
        for i in range(len(yReal)):
            smapeVal += smape_clean(yReal[i], yHatReal[i])
        smapeVal /= len(yReal)
        print("smape equals", smapeVal, "with", fold, "time step as training")
        avgSmape.append(smapeVal)
    return np.mean(avgSmape), model

In [13]:
options = [[[3,3],'sigmoid','relu', 'sigmoid', 1e-4, 16, 1, 0,0]]
options = pd.DataFrame(options)
options = options.set_axis(['layers', 'activation1', 'activation', 'activation2', 'regularization', 'batchSize', 'smoothingFactor', 'averageSmape', 'varianceSmape'], axis=1)
# print(options)
window_size = 3
val_frac = 0.1
smape, model = kfolds(df_train, 5, options.iloc[0], window_size, val_frac, options.iloc[0].smoothingFactor)
print(smape)


Train: 0.776, Test: 0.826
smape equals 7.606298726523755 with 29 time step as training
Train: 0.771, Test: 0.637
smape equals 12.743873720911596 with 58 time step as training
Train: 0.777, Test: 0.734
smape equals 10.67999713092607 with 87 time step as training
Train: 0.797, Test: 0.710
smape equals 10.691417434537033 with 116 time step as training
Train: 0.784, Test: 0.697
smape equals 11.701616141400844 with 145 time step as training
10.68464063085986


In [None]:
lays = [[1],[2],[2,2],[3,3],[2,5]]
batchSizes = [8,16,32]
acts1 = ['sigmoid', 'relu','linear']
activationFunctions = ['sigmoid', 'relu']
acts2 = ['sigmoid', 'relu','linear']
smoothingfactors = [0.9,1]
window_size = 3
val_frac = 0.1

In [None]:
lays = [[3,3]]
batchSizes = [16]
acts1 = ['sigmoid', 'relu','linear']
activationFunctions = ['sigmoid', 'relu']
acts2 = ['sigmoid', 'relu','linear']
regs = [1e-4]
smoothingfactors = [1]
window_size = 3
val_frac = 0.1

options = []

for layer in lays:
    for act1 in acts1:
        for act2 in acts2:
            for activation in activationFunctions:
                for reg in regs:
                    for smooth in smoothingfactors:
                        for batchSize in batchSizes:
                            options.append([layer, act1, activation, act2, reg, batchSize, smooth, 0, 0])


options = pd.DataFrame(options)
options = options.set_axis(['layers', 'activation1', 'activation', 'activation2','regularization', 'batchSize', 'smoothingFactor', 'averageSmape', 'varianceSmape'], axis=1)

# print(options.iloc[0].layers)

for i in range(len(options)):
    smape_avg=[]
    for j in range(1):
        smp, model = kfolds(df_train, 5, options.iloc[0], 2, window_size, val_frac)
        smape_avg.append(smp)

    options.iat[i,3] = np.mean(smape_avg)
    options.iat[i,4] = np.std(smape_avg)


op = pd.DataFrame(options)
res = op.sort_values(by="averageSmape")
print(res)

Train: 0.775, Test: 0.731
smape equals 9.53191671018765 with 29 time step as training
Train: 0.769, Test: 0.728
smape equals 9.495957032190901 with 58 time step as training


: 

: 

In [16]:
op = pd.DataFrame(options)
res = op.sort_values(by="averageSmape")
print(res)

res = options

   layers activation1 activation activation2  regularization  batchSize  \
0  [3, 3]     sigmoid       relu     sigmoid          0.0001         16   

   smoothingFactor  averageSmape  varianceSmape  
0                1             0              0  


In [None]:
x_train = []
y_train= []
val_frac = 0.1
validation_size = int(np.floor(val_frac*len(df_train)))
x_train, y_train, x_validation, y_validation, validationSet = preprocess_and_split(df_train, len(df_train), validation_size, window_size, res.iloc[0]['smoothingFactor']) #takes df's returns np arrays

model = build_model(x_train, y_train, x_validation, y_validation, window_size, res.iloc[0])

Train: 0.663, Test: 0.535


In [None]:
print(op.sort_values(by="averageSmape"))

   layers activation  batchSize  smoothingFactor  averageSmape  varianceSmape
0  [2, 2]       relu         16                1             0              0


In [17]:
##TESTING

predictions = pd.DataFrame()
observations = []
window_size = 3

df_full = pd.DataFrame()
df_full = df_train
df_full = df_full.drop(df_full.columns[14:], axis=1)

num_predictions = 6

# Make predictions using autoregressive approach
for pred in range(num_predictions):

    PF = []
    for index, row in df_full.iterrows():
        # print(row)
        if pred == 0:
            preprocessed, details = preprocess(row, smoothing = 1, alpha = res.iloc[0].smoothingFactor)
            PF.append(details[2:])
            observations.append([preprocessed[11+pred:14+pred],0, details, 14+pred]) #y is unknown and first time point to predict is 15(or 14?)`
            # df_full.loc[index,:] = preprocessed
        else:
            observations.append([row[11+pred:14+pred],0, details, 14+pred])

    # Reshape the input for prediction
    x = []
    for i in (range(len(observations))):
        x.append(observations[i][0])
    x = np.array(x).reshape(len(x),window_size)
    
    # Make the prediction
    prediction = model.predict(x)

    print(prediction.shape)

    y_u = []
    for i in range(len(prediction)):
        y_u.append(reprocess(prediction[i], observations[i]))
        # y_u.append(prediction[i])

    # print(pd.DataFrame(y_u).shape)
    predictions[15+pred] = pd.DataFrame(y_u)
    df_full[15+pred] = pd.DataFrame(y_u)

smapes = pd.DataFrame(columns=[i for i in range(num_predictions)])

for i in range(predictions.shape[0]):
    smape_row = []
    for j in range(num_predictions):
        smape_row.append(smape_clean(predictions.iloc[i, j], df_test.iloc[i, j]))
    smapes.loc[i] = smape_row

print(smapes)

smape_avgs = []
for i in range(num_predictions):
    smape_avgs.append(np.mean(smapes.iloc[:,i]))
print(smape_avgs)

(146, 1)
(292, 1)
(438, 1)
(584, 1)
(730, 1)
(876, 1)
             0          1          2          3          4          5
0     0.170762  13.671464  24.593617  37.532119  44.087357  52.117407
1    31.907790  46.886814  37.258096  37.086392  37.300013  39.968017
2    18.666267  34.340722  28.936940  31.573629  29.806002  16.602476
3     6.072573  17.625750  25.405210  14.704635  20.885191  13.372797
4    20.878399  13.999055   1.391686   6.056615   1.518975  12.363949
..         ...        ...        ...        ...        ...        ...
141   4.261588  11.877855   7.439210   4.997700  10.269803  11.819102
142  10.521921   4.942593  16.118498   7.762845   9.658802   2.954356
143   9.997748  21.850536   7.830317  23.972911  40.944614  57.332700
144   7.308143  17.055890  25.665108  37.889772  53.109436  76.324439
145  17.033094  17.162834   0.620561   7.912973  51.882849  66.641181

[146 rows x 6 columns]
[8.689954186237703, 12.275550270085102, 23.422265272628053, 24.230145733031186, 28

In [14]:
# y = []
# y_hat = []
# details = []
# offset = 7
# for i in range(0,10):
#     y.append(observations[i+offset][1])
#     y_hat.append(y[i] + 0.4)
#     details.append(observations[i+offset])

# repY = []
# repY_hat = []
# smape = 0
# for i in range(10):
#     repY.append(reprocess(y[i], details[i]))
#     repY_hat.append(reprocess(y_hat[i], details[i]))
#     smape += smape_clean(repY[i], repY_hat[i])

# smape /= len(repY)
# print(smape)
# pyplot.plot(repY, label='original')
# pyplot.plot(repY_hat, label='altered')
# pyplot.plot(df_train.iloc[1,3:10], label = 'og')
# pyplot.legend()
# pyplot.show()