# Napelemek termelésének előrejelzése gépi tanulási algoritmusok segítségével
## A feladat
`count félrevezet; cov,corr INF-et ad; quantile-t nem tudom használni`

In [3]:
import math
import pandas as pd
import numpy as np

# Input location and column layout of the training data.
PATH_TO_DATA = '../solar/train15.csv'
DATE_FORMAT = '%Y%m%d %H:%M'
INDEX_COLUMN = "TIMESTAMP"
TARGET_COLUMN = "POWER"

# Horizons expressed as a number of rows.
# NOTE(review): the values assume one row per hour - confirm against the data.
ONE_DAY = 24
ONE_YEAR = 365 * ONE_DAY
ONE_WEEK = ONE_DAY * 7
PREDICT_INTERVAL = ONE_DAY*5


def dateparse(x):
    """Parse timestamp text laid out as DATE_FORMAT.

    Accepts a single string or a whole Series. Built on ``pd.to_datetime``
    because the ``pd.datetime`` alias was deprecated in pandas 1.0 and
    removed in pandas 2.0.
    """
    return pd.to_datetime(x, format=DATE_FORMAT)


df = pd.read_csv(PATH_TO_DATA)
# Parse the whole column in one vectorised call.
df[INDEX_COLUMN] = dateparse(df[INDEX_COLUMN])

In [4]:
# Add explanatory variables (predictors) to the model
columns_to_keep = [INDEX_COLUMN, 'ZONEID', 'VAR78', 'VAR79', 'VAR134', 'VAR157', 'VAR164',
       'VAR165', 'VAR166', 'VAR167']

df_features = pd.read_csv('../solar/predictors15.csv')
# Parse the timestamp column in one vectorised call rather than one Python
# call per row.
df_features[INDEX_COLUMN] = pd.to_datetime(df_features[INDEX_COLUMN], format=DATE_FORMAT)
df_features = df_features[columns_to_keep]

# Left join: every training row is kept even when no predictor row matches
# its (timestamp, zone) pair.
df = df.merge(df_features, how='left', on=[INDEX_COLUMN, 'ZONEID'])
# Disabled experiment: wind speed as the magnitude of (VAR165, VAR166).
#df["WINDSPEED"] = df[["VAR165","VAR166"]].apply(lambda x: math.sqrt(pow(x[0],2) + pow(x[1],2)) , axis=1) 
# Recorded scores - presumably "wo" = without and "wi" = with the feature:
#wo: 0.7892619896086219 wi:0.7886526090798553

In [5]:
# Calendar features as plain integers, read through the vectorised .dt
# accessor instead of a Python lambda per row.
df["MONTH"] = df[INDEX_COLUMN].dt.month
df["HOUR"] = df[INDEX_COLUMN].dt.hour
# Disabled alternative: one boolean column per month and per hour.
# for i in np.arange(1,13):
#     df["MONTH"+str(i)]= df["TIMESTAMP"].apply(lambda x: x.month == i)
# for i in np.arange(24):
#     df["HOUR"+str(i)] = df["TIMESTAMP"].apply(lambda x: x.hour == i)

In [None]:
# Convert the zone id into categorical indicator columns (one boolean
# column per zone), using a vectorised comparison instead of a per-row lambda.
for zone in (1, 2, 3):
    df["ZONE_" + str(zone)] = df["ZONEID"] == zone
df = df.drop("ZONEID", axis=1)

# Make the timestamp the index
df = df.set_index(INDEX_COLUMN)

In [None]:
# Keep ONE_YEAR rows for training plus PREDICT_INTERVAL rows to predict.
# The rolling windows are later computed over this whole span.
# .copy() detaches the window from the full frame, because add_rolling
# edits the frame it receives in place.
df = df.iloc[:ONE_YEAR + PREDICT_INTERVAL].copy()
# Snapshot the true targets now: add_rolling later overwrites the last
# PREDICT_INTERVAL values of the target column, so without copy() the
# reference values would be lost.
y_test = df[TARGET_COLUMN].iloc[ONE_YEAR:ONE_YEAR + PREDICT_INTERVAL].copy()

In [None]:
def add_rolling(df, column, horizon=None, n_windows=None):
    """Add lagged rolling-window statistics of ``column`` to ``df``.

    The last ``horizon`` values of ``column`` are first overwritten with
    NaN (presumably so that values from the period being predicted cannot
    feed any statistic). Then, for every window length ``i`` in
    ``range(horizon, horizon + n_windows)``, nine statistics are computed
    over a rolling window of ``i`` rows and shifted down by ``i`` rows.
    Each result is stored as ``ROLLING_<STAT>_<column>_<i>``.

    Args:
        df: Frame to extend. It is modified in place.
        column: Name of the column the statistics are computed from.
        horizon: Number of trailing rows to blank, and the smallest window
            length. Defaults to the module-level ``PREDICT_INTERVAL``.
        n_windows: How many consecutive window lengths to generate.
            Defaults to the module-level ``ONE_WEEK``.

    Returns:
        The same ``df`` object that was passed in.
    """
    if horizon is None:
        horizon = PREDICT_INTERVAL
    if n_windows is None:
        n_windows = ONE_WEEK

    if horizon > 0:
        # Guarded because a ``-0:`` slice would select every row. If the
        # frame has fewer than ``horizon`` rows, all of them are blanked.
        df.iloc[-horizon:, df.columns.get_loc(column)] = np.nan

    # Order matters: it fixes the order of the generated columns.
    stats = ("mean", "min", "max", "sum", "median", "std", "var", "skew", "kurt")
    values = df[column]
    for i in range(horizon, horizon + n_windows):
        # One rolling object per window length, shared by all statistics.
        window = values.rolling(window=i)
        for stat in stats:
            name = "ROLLING_" + stat.upper() + "_" + column + "_" + str(i)
            df[name] = getattr(window, stat)().shift(i)
    return df

In [None]:
# Add the lagged rolling-window statistics of the target column.
# add_rolling also overwrites the last PREDICT_INTERVAL target values in df.
df = add_rolling(df,TARGET_COLUMN)

In [None]:
# Predictor columns that also receive lagged rolling-window statistics.
feature_columns = [
    'VAR78', 'VAR79', 'VAR134', 'VAR157',
    'VAR164', 'VAR165', 'VAR166', 'VAR167',
]
# add_rolling returns the frame it was given, so rebinding df keeps the
# columns added for every predictor so far.
for column in feature_columns:
    df = add_rolling(df, column)

In [None]:
from xgboost import XGBRegressor  # had to be installed separately

# Train on the first ONE_YEAR rows. The inputs are every column except the
# target and the raw predictor columns listed in feature_columns.
X_train = df[:ONE_YEAR].drop([TARGET_COLUMN] + feature_columns, axis=1)
y_train = df[:ONE_YEAR][TARGET_COLUMN]

model = XGBRegressor(nthread=4)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    mean_squared_log_error,
    median_absolute_error,
    r2_score,
)

# Same column selection as for training, taken from the rows that follow
# the training window.
X_test = df[ONE_YEAR:ONE_YEAR+PREDICT_INTERVAL].drop(TARGET_COLUMN,axis=1).drop(feature_columns, axis=1)
y_pred = model.predict(X_test)

y_true = y_test.values
# scikit-learn metrics take (y_true, y_pred). Explained variance is not
# symmetric in its arguments, so the order matters here.
print(explained_variance_score(y_true,y_pred))
print(median_absolute_error(y_true,y_pred))
# NOTE(review): mean_squared_log_error rejects negative values; confirm the
# model cannot predict below zero for this target.
print(mean_squared_log_error(y_true,y_pred))

In [None]:
from bokeh.io import output_notebook
from bokeh.plotting import figure, show

# Render the plot inline in the notebook.
output_notebook()

# Actual versus predicted values over the prediction interval, plotted
# against the row offset within that interval.
# NOTE(review): Bokeh 1.4 deprecated the `legend` keyword in favour of
# `legend_label` - switch once the installed Bokeh version is confirmed.
p = figure()
p.line(
    np.arange(PREDICT_INTERVAL),
    y_true,
    legend="real",
)
p.line(
    np.arange(PREDICT_INTERVAL),
    y_pred,
    legend="predicter",
    line_color="orange",
)
show(p)

In [None]:
# from bokeh.plotting import figure, output_notebook, show
# output_notebook()
# for column in df:
#     print(column)
#     p = figure(plot_width=400, plot_height=400)
#     p.line(df.index[:24*5],df[column][:24*5])
#     show(p)

In [None]:
# The model without the extra predictor features improved as soon as the
# month and hour variables were added as numeric columns.
# Open question: how would it behave if they were encoded as categorical?