In [9]:
# importing helper functions
from data import *

In [10]:
# standard library
import pickle
from pathlib import Path

# import preprocessing tools
import numpy as np
import pandas as pd
from sklearn import preprocessing

# import learning/evaluation
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit

In [23]:
# dataset creation
def make_target(df, h):
    """Split a sensor frame into a feature frame ``X`` and a target series ``y``.

    The horizon code ``h`` selects both the target column and how many rows
    ahead the target lies:

    * ``h == 1024``  -> column ``"pc_ma_1D"``, 24 rows ahead
    * ``h > 100``    -> column ``"pc_ma_6H"``, ``h - 100`` rows ahead
    * ``h >= 1``     -> column ``"pc"``, ``h`` rows ahead
    * otherwise      -> same-row target: ``0.25 -> "pc15"``,
      ``0.5 -> "pc30"``, ``0.75 -> "pc45"``

    The columns ``pc15``, ``pc30`` and ``pc45`` are always removed from ``X``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target columns listed above. It is NOT modified;
        ``X`` is built from a copy without the three ``pc15/30/45`` columns.
        For look-ahead horizons the index must support time shifting.
        NOTE(review): the positional row offset equals a time offset only if
        rows are exactly one hour apart - confirm against ``make_final``.
    h : int or float
        Horizon code as described above.

    Returns
    -------
    (X, y) : tuple of (pandas.DataFrame, pandas.Series)
        For look-ahead horizons both have ``len(df) - steps`` rows, and row
        ``i`` of ``X`` is paired with the target value found ``steps`` rows
        later; ``y``'s index is moved back by ``steps`` hours.

    Raises
    ------
    KeyError
        If a required column is missing, or if ``h < 1`` is not one of
        0.25, 0.5, 0.75.
    """
    leak_columns = ['pc15', 'pc30', 'pc45']

    if h == 1024:
        target_column, steps = "pc_ma_1D", 24
    elif h > 100:
        target_column, steps = "pc_ma_6H", h - 100
    elif h >= 1:
        target_column, steps = "pc", h
    else:
        # Sub-hour horizons: the target sits in the same row, no shifting.
        y = {
            0.25: df["pc15"],
            0.5: df["pc30"],
            0.75: df["pc45"]
        }[h]
        return df.drop(columns=leak_columns), y

    # Shift only the target column. A Timedelta is used instead of the '1H'
    # alias, which newer pandas releases deprecate.
    shifted = df[target_column].shift(periods=-steps, freq=pd.Timedelta(hours=1))

    # Drop the first `steps` targets and the last `steps` feature rows so the
    # remaining rows pair up positionally.
    y = shifted.iloc[steps:]
    X = df.drop(columns=leak_columns).iloc[:-steps]
    return X, y

# Load the two inputs handed to make_final() for every sensor below:
# static_df comes from the load_static() helper (not defined in this
# notebook - presumably star-imported from `data`), weather_df is a pickled
# object read from disk.
static_df = load_static()
weather_df = pd.read_pickle(
    filepath_or_buffer='data/weather/weather.pkl',
)

In [24]:
# Sensor ids included in the experiment.
sensors = [1, 24, 35, 43, 68, 103, 105, 106, 108, 109, 110, 116, 117, 122, 124, 144, 145, 157, 168, 177,
    179, 223, 225, 236, 239, 245, 249, 253, 257, 259, 262, 285, 286, 290, 293, 301, 303, 312, 358, 359, 362,
    365, 367, 377, 378, 379, 382, 384, 385, 388, 389, 391, 392, 394, 395]
# horizons = [0.25, 0.5, 0.75, 1, 3, 6, 12, 24, 36]
# Horizon codes as interpreted by make_target: 100 + k -> "pc_ma_6H" k rows
# ahead, 1024 -> "pc_ma_1D" 24 rows ahead.
horizons = [106, 112, 118, 124, 1024]

SEED = 42       # fixed seed so the stochastic estimators give repeatable scores
N_SPLITS = 50   # number of expanding-window folds per (sensor, horizon)

# (label, zero-argument factory) pairs. Factories replace the previous
# eval()-on-a-string construction; each call returns a fresh estimator.
models = [
    ('LinReg', LinearRegression),
    ('DT', lambda: DecisionTreeRegressor(random_state=SEED)),
    ('RF', lambda: RandomForestRegressor(n_jobs=10, random_state=SEED)),
    ('GBT', lambda: GradientBoostingRegressor(random_state=SEED))
    #('MLP', lambda: MLPRegressor(hidden_layer_sizes=(10,), random_state=SEED))
]

# The splitter holds no per-dataset state, so one instance serves all runs.
tscv = TimeSeriesSplit(n_splits=N_SPLITS)

# One record per (sensor, horizon, model); "mape" and "r2" hold one score per fold.
result = []
for s in sensors:
    for h in horizons:
        # The dataset depends only on sensor and horizon, so it is built once
        # here and shared by every model instead of being rebuilt per model.
        # load sensor - last parameter is id
        final_df = make_final(static_df, weather_df, s)

        # make dataset
        X_df, y_df = make_target(final_df, h)

        selected_features = get_all_candidates(s)
        X = X_df[selected_features].values.astype(float)
        y = y_df.values.astype(float)

        # Materialise the folds once so all models are scored on identical splits.
        folds = list(tscv.split(X))

        for name, make_model in models:
            output = "s{}h{}-{}".format(s, h, name)
            print(output)

            reg = make_model()

            partial_result = { "sensor": s, "horizon": h, "model": name, "mape": [], "r2": []}
            for train_index, test_index in folds:
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                # fit() retrains from scratch, so reusing `reg` across folds is safe.
                reg.fit(X_train, y_train)
                y_pred = reg.predict(X_test)

                partial_result["mape"].append(mean_absolute_percentage_error(y_test, y_pred))
                partial_result["r2"].append(r2_score(y_test, y_pred))

            result.append(partial_result)



s1h106-LinReg
s1h106-DT
s1h106-RF
s1h106-GBT
s1h112-LinReg
s1h112-DT
s1h112-RF
s1h112-GBT
s1h118-LinReg
s1h118-DT
s1h118-RF
s1h118-GBT
s1h124-LinReg
s1h124-DT
s1h124-RF
s1h124-GBT
s1h1024-LinReg
s1h1024-DT
s1h1024-RF
s1h1024-GBT
s24h106-LinReg
s24h106-DT
s24h106-RF
s24h106-GBT
s24h112-LinReg
s24h112-DT
s24h112-RF
s24h112-GBT
s24h118-LinReg
s24h118-DT
s24h118-RF
s24h118-GBT
s24h124-LinReg
s24h124-DT
s24h124-RF
s24h124-GBT
s24h1024-LinReg
s24h1024-DT
s24h1024-RF
s24h1024-GBT
s35h106-LinReg
s35h106-DT
s35h106-RF
s35h106-GBT
s35h112-LinReg
s35h112-DT
s35h112-RF
s35h112-GBT
s35h118-LinReg
s35h118-DT
s35h118-RF
s35h118-GBT
s35h124-LinReg
s35h124-DT
s35h124-RF
s35h124-GBT
s35h1024-LinReg
s35h1024-DT
s35h1024-RF
s35h1024-GBT
s43h106-LinReg
s43h106-DT
s43h106-RF
s43h106-GBT
s43h112-LinReg
s43h112-DT
s43h112-RF
s43h112-GBT
s43h118-LinReg
s43h118-DT
s43h118-RF
s43h118-GBT
s43h124-LinReg
s43h124-DT
s43h124-RF
s43h124-GBT
s43h1024-LinReg
s43h1024-DT
s43h1024-RF
s43h1024-GBT
s68h106-LinReg
s68h106-D

s262h1024-GBT
s285h106-LinReg
s285h106-DT
s285h106-RF
s285h106-GBT
s285h112-LinReg
s285h112-DT
s285h112-RF
s285h112-GBT
s285h118-LinReg
s285h118-DT
s285h118-RF
s285h118-GBT
s285h124-LinReg
s285h124-DT
s285h124-RF
s285h124-GBT
s285h1024-LinReg
s285h1024-DT
s285h1024-RF
s285h1024-GBT
s286h106-LinReg
s286h106-DT
s286h106-RF
s286h106-GBT
s286h112-LinReg
s286h112-DT
s286h112-RF
s286h112-GBT
s286h118-LinReg
s286h118-DT
s286h118-RF
s286h118-GBT
s286h124-LinReg
s286h124-DT
s286h124-RF
s286h124-GBT
s286h1024-LinReg
s286h1024-DT
s286h1024-RF
s286h1024-GBT
s290h106-LinReg
s290h106-DT
s290h106-RF
s290h106-GBT
s290h112-LinReg
s290h112-DT
s290h112-RF
s290h112-GBT
s290h118-LinReg
s290h118-DT
s290h118-RF
s290h118-GBT
s290h124-LinReg
s290h124-DT
s290h124-RF
s290h124-GBT
s290h1024-LinReg
s290h1024-DT
s290h1024-RF
s290h1024-GBT
s293h106-LinReg
s293h106-DT
s293h106-RF
s293h106-GBT
s293h112-LinReg
s293h112-DT
s293h112-RF
s293h112-GBT
s293h118-LinReg
s293h118-DT
s293h118-RF
s293h118-GBT
s293h124-LinReg
s293

In [25]:
# save results: one dict per (sensor, horizon, model) with per-fold "mape"/"r2" lists
results_path = Path("results/sklearn_general_additional.pkl")
# Create the output directory if needed so a long run is not lost to a
# missing folder at write time.
results_path.parent.mkdir(parents=True, exist_ok=True)
with results_path.open("wb") as f:
    pickle.dump(result, f)

In [16]:
# Show the column labels of the last frame built by the experiment loop
# (relies on `final_df` still being in the kernel from that cell).
final_df.columns.tolist()

['temperature0',
 'dewPoint0',
 'humidity0',
 'pressure0',
 'windSpeed0',
 'windBearing0',
 'cloudCover0',
 'visibility0',
 'temperature1',
 'dewPoint1',
 'humidity1',
 'pressure1',
 'windSpeed1',
 'windBearing1',
 'cloudCover1',
 'visibility1',
 'temperature2',
 'dewPoint2',
 'humidity2',
 'pressure2',
 'windSpeed2',
 'windBearing2',
 'cloudCover2',
 'visibility2',
 'temperature3',
 'dewPoint3',
 'humidity3',
 'pressure3',
 'windSpeed3',
 'windBearing3',
 'cloudCover3',
 'visibility3',
 'temperature4',
 'dewPoint4',
 'humidity4',
 'pressure4',
 'windSpeed4',
 'windBearing4',
 'cloudCover4',
 'visibility4',
 'temperature5',
 'dewPoint5',
 'humidity5',
 'pressure5',
 'windSpeed5',
 'windBearing5',
 'visibility5',
 'temperature6',
 'dewPoint6',
 'humidity6',
 'pressure6',
 'windSpeed6',
 'windBearing6',
 'visibility6',
 'temperature7',
 'dewPoint7',
 'humidity7',
 'pressure7',
 'windSpeed7',
 'windBearing7',
 'cloudCover7',
 'visibility7',
 'temperature8',
 'dewPoint8',
 'humidity8',
 'p