In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date, datetime, timedelta
import data_preperation as prep
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

In [99]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

In [48]:
# Load the measurement history (restricted to the model's input features)
# and the forecast table from the project's data-preparation module.
df0 = prep.prepare_measurement().loc[
    :,
    ['speed', 'cos_wind_dir', 'sin_wind_dir', 'temp', 'radiation', 'precip', 'season', 'day'],
]
forecast = prep.prepare_forecast()

read csv semester csv files from 2015s2 to 2020s1
smooth wind direction
generate seasonality categorical feature
generate day/night categorical feature
reading forecast data
smooth wind direction


In [3]:
# data_merge, data, forecast = prepare_data_with_forecast(data, keep_only_last=False)
def get_past_n_steps(df, steps_in):
    """Build a lagged feature table holding the last `steps_in` + 1 observations per row.

    Every column `c` of `df` appears as `c_t-<steps_in>` (oldest) through
    `c_t-0` (newest), in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Source observations; its index must support adding a `timedelta`.
    steps_in : int
        Number of past steps to place alongside the newest observation.

    Returns
    -------
    pandas.DataFrame
        Lagged columns plus a `datetime_now` column. The index is a
        `DatetimeIndex` named `datetime_now`, equal to the original index
        advanced by `steps_in` hours. This equals the timestamp of the
        `_t-0` observation only when consecutive rows are one hour apart.
        The last `steps_in` rows contain NaN in the newer-lag columns
        because there is nothing left to shift in.
    """
    # The unshifted frame is the oldest lag.
    lagged = df.add_suffix('_t-' + str(steps_in))
    for offset in range(1, steps_in + 1):
        lag = steps_in - offset
        # A negative shift pulls later rows *up*: row r receives the values of row r + offset.
        newer = df.shift(periods=-offset, axis=0).add_suffix('_t-' + str(lag))
        lagged = lagged.join(newer, how='inner')
    # Re-label each row with the time of its newest observation.
    lagged['datetime_now'] = lagged.index.to_series() + timedelta(hours=steps_in)
    lagged = lagged.set_index(pd.DatetimeIndex(lagged['datetime_now']))
    return lagged

In [12]:
def join_forecast(df, forecast, predict):
    """Attach forecast columns to each feature row, matched on the predicted time.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing a `datetime_now` column. It is NOT modified;
        the function works on a copy.
    forecast : pandas.DataFrame
        Forecast table with an `f_period` column.
        NOTE(review): presumably indexed by the timestamp being forecast,
        since it is joined on index against `datetime_predict` - confirm.
    predict : int
        Prediction horizon in hours.

    Returns
    -------
    pandas.DataFrame
        `df` with an added `datetime_predict` column (`datetime_now` plus
        `predict` hours), indexed by that column, left-joined with the
        forecast columns (each suffixed `_forecast`). Rows with no matching
        forecast get NaN in the forecast columns.
    """
    # Discard forecasts whose forecast period is shorter than the prediction horizon.
    forecast = forecast.loc[forecast['f_period'] >= predict]
    # keep_last_forecast lives in the project-local `prep` module.
    # NOTE(review): presumably keeps one (the latest) forecast per target time - confirm.
    forecast = prep.keep_last_forecast(forecast)
    forecast = forecast.add_suffix('_forecast')

    # Work on a copy so the caller's frame keeps its columns and index.
    df = df.copy()
    df['datetime_predict'] = df['datetime_now'] + timedelta(hours=predict)
    df = df.set_index(df['datetime_predict'])

    # Left join: keep every feature row, even when no forecast is available.
    df_out = df.join(forecast, how='left')
    return df_out




In [75]:
def get_y(x_df, predict, param = 'speed_t-0'):
    """Split a feature table into inputs and the target `predict` rows ahead.

    The target for input row `i` is the value of column `param` at row
    `i + predict`, so alignment is purely positional (it assumes rows are
    evenly spaced in time with no gaps).

    Parameters
    ----------
    x_df : pandas.DataFrame
        Feature table containing the column `param`.
    predict : int
        Non-negative number of rows between an input row and its target.
    param : str
        Name of the target column.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The first `len(x_df) - predict` rows of `x_df` and the last
        `len(x_df) - predict` values of `x_df[param]`. Both are empty when
        `predict >= len(x_df)`.

    Raises
    ------
    ValueError
        If `predict` is negative.
    """
    if predict < 0:
        raise ValueError("predict must be non-negative, got {}".format(predict))
    n_rows = len(x_df)
    # Targets: drop the first `predict` values.
    y_df = x_df[param].iloc[predict:]
    # Inputs: drop the last `predict` rows. An explicit end position is used
    # because `x_df[:-predict]` would return an empty frame for predict == 0.
    x_df = x_df.iloc[:max(n_rows - predict, 0)]
    return x_df, y_df

def df_to_array(X_df, Y_df):
    """Convert the feature frame and target series to NumPy arrays.

    Neither input is modified; all work is done on new objects.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Feature table containing `datetime_now` and `datetime_predict`
        columns (both are removed before conversion).
    Y_df : pandas.Series
        Target values.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        `X` - forward-filled features, without the two timestamp columns and
        without any column that still contains NaN after forward filling
        (i.e. columns that start with NaN).
        `Y` - forward-filled target values; leading NaN, if any, are kept.

    Raises
    ------
    KeyError
        If either timestamp column is missing from `X_df`.
    """
    # Timestamps are bookkeeping only and must not become model inputs.
    features = X_df.drop(['datetime_now', 'datetime_predict'], axis=1)
    # Forward fill: reuse the last known value (e.g. the latest forecast).
    features = features.ffill()
    target = Y_df.ffill()
    # Forward filling cannot fill leading gaps; drop the affected *columns*
    # so the number of rows stays equal to the number of targets.
    features = features.dropna(axis=1)
    X = features.values
    Y = target.values
    return X, Y

def prepare_x_y(past_period, predict_period, historical_df, forecast_df, param = 'speed_t-0'):
    """Run the full feature pipeline and return model-ready arrays.

    Steps: build lagged features from `historical_df`, join the forecast for
    the predicted time, derive the target `predict_period` rows ahead, then
    convert both to arrays.

    Parameters
    ----------
    past_period : int
        Number of past steps to include as lagged features.
    predict_period : int
        Prediction horizon (hours for the forecast join, rows for the target).
    historical_df : pandas.DataFrame
        Measurement history.
    forecast_df : pandas.DataFrame
        Forecast table.
    param : str
        Name of the lagged column used as the target.

    Returns
    -------
    tuple
        `(x, y)` as returned by `df_to_array`.
    """
    lagged_df = get_past_n_steps(historical_df, past_period)
    features_df = join_forecast(lagged_df, forecast_df, predict_period)
    features_df, target = get_y(features_df, predict_period, param)
    x, y = df_to_array(features_df, target)
    return x, y

In [76]:
def get_x_y(steps_in, steps_out, param = 'speed_t-0'):
    """Load measurements and forecasts from `prep` and build the (x, y) arrays.

    Parameters
    ----------
    steps_in : int
        Number of past steps used as lagged input features.
    steps_out : int
        Prediction horizon.
    param : str
        Name of the lagged column used as the target.

    Returns
    -------
    tuple
        `(x, y)` as produced by `prepare_x_y`.
    """
    # Only these measurement columns are used as model features.
    feature_columns = [
        'speed', 'cos_wind_dir', 'sin_wind_dir', 'temp',
        'radiation', 'precip', 'season', 'day',
    ]
    measurements = prep.prepare_measurement()[feature_columns]
    forecast = prep.prepare_forecast()
    return prepare_x_y(steps_in, steps_out, measurements, forecast, param)

In [89]:
# Smallest configuration: one past step as input, one step ahead as target.
x, y = get_x_y(steps_in=1, steps_out=1)

read csv semester csv files from 2015s2 to 2020s1
smooth wind direction
generate seasonality categorical feature
generate day/night categorical feature
reading forecast data
smooth wind direction


In [88]:
# Sanity check: truthy only if every element of x is non-zero.
np.all(x)

0.0

In [93]:
def scale_data_X_y(steps_in, steps_out, test_size = 0.2, param = 'speed_t-0'):
    """Build the dataset, split it without shuffling and min-max scale it.

    Both scalers are fitted on the training portion only and then applied to
    the test portion.

    Parameters
    ----------
    steps_in : int
        Number of past steps used as lagged input features.
    steps_out : int
        Prediction horizon.
    test_size : float
        Passed to `train_test_split`; with `shuffle=False` the test set is
        taken from the end of the data.
    param : str
        Name of the lagged column used as the target.

    Returns
    -------
    tuple
        `(X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled,
        X_train, X_test, y_train, y_test, scaler_X, scaler_y)`.
    """
    features, target = get_x_y(steps_in, steps_out, param)
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, shuffle=False
    )

    scaler_X = MinMaxScaler(feature_range=(0, 1))
    scaler_y = MinMaxScaler(feature_range=(0, 1))

    X_train_scaled = scaler_X.fit_transform(X_train)
    X_test_scaled = scaler_X.transform(X_test)

    # The scaler works on 2-D input: lift the 1-D target to a single column,
    # scale it, then flatten it back to 1-D.
    y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).reshape(-1)
    y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1)).reshape(-1)

    return (
        X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled,
        X_train, X_test, y_train, y_test,
        scaler_X, scaler_y,
    )

In [104]:
# 24 past steps in, 24 steps ahead out. Only the unscaled splits are kept;
# the scaled arrays and the scalers are discarded.
(
    _, _, _, _,
    X_train, X_test, y_train, y_test,
    _, _,
) = scale_data_X_y(steps_in=24, steps_out=24)

read csv semester csv files from 2015s2 to 2020s1
smooth wind direction
generate seasonality categorical feature
generate day/night categorical feature
reading forecast data
smooth wind direction


In [105]:
# Fit a gradient-boosted regressor on the unscaled training data and
# predict the held-out test rows.
xg_speed = XGBRegressor(max_depth=5)
xg_speed.fit(X=X_train, y=y_train)
y_hat_speed = xg_speed.predict(X=X_test)



In [106]:
# Error of the speed model on the held-out test rows.
mse_speed = mean_squared_error(y_test, y_hat_speed)
mae_speed = mean_absolute_error(y_test, y_hat_speed)
print("MSE speed is: ", mse_speed)
print("MAE speed is: ", mae_speed)

MSE speed is:  0.8166834458382017
MAE speed is:  0.6939324695869339
