In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from xgboost import XGBRegressor
from datetime import timedelta
import pickle
import warnings
warnings.filterwarnings('ignore')

#import local functions
from utils import utils_scenario as utils, data_preparation as prep, data_process as proc

In [2]:
from train_XGB import train_xgb

In [5]:
#get data
measurement=prep.prepare_measurement()
forecast = prep.prepare_forecast()
#keep useful columns

measurement= measurement[['speed', 'cos_wind_dir', 'sin_wind_dir', 'temp', 'radiation', 'precip','season']]
print('measurement features used to construct x_df:', measurement.columns.to_list())

#prediction steps
t_list= [2] #args.t_list # np.arange(1,49,1)
steps_in = 48 #args.steps_in #48
# 
#run three models per prediction step
for t in t_list:
    #run model and save model
    train_xgb(measurement, forecast, steps_in, t)

read csv semester csv files from 2015s2 to 2020s1
smooth wind direction
generate seasonality categorical feature
generate am/pm categorical feature
reading forecast data
smooth wind direction
measurement features used to construct x_df: ['speed', 'cos_wind_dir', 'sin_wind_dir', 'temp', 'radiation', 'precip', 'season']


In [2]:
# def run_xgb(steps_in, steps_out):
#     #Parameter list:
#     param_list =['speed','cos_wind_dir','sin_wind_dir']

#     predict = pd.DataFrame(columns={'speed','cos_wind_dir','sin_wind_dir'})
#     true = pd.DataFrame(columns={'speed','cos_wind_dir','sin_wind_dir'})
#     baseline = pd.DataFrame(columns={'speed','cos_wind_dir','sin_wind_dir'})

#     for param in param_list:
        
#         #train on the entire data 
#         x_df, y_df, x, y = proc.prepare_x_y(measurement, forecast, steps_in, steps_out, param)
#         xgb = XGBRegressor(max_depth = 5)
#         xgb.fit(x, y)

#         #save model into a pickle file 
#         pickle.dump(xgb, open('trained_models/'+str(param)+'_t_'+str(steps_out), 'wb')) 
#     return 

In [6]:
# #get test data (last 49 time steps of measurement data)

# #get the last 49 measurement data 
# measurement=prep.prepare_measurement()
# measurement= measurement[['speed', 'cos_wind_dir', 'sin_wind_dir', 'temp', 'radiation', 'precip','season']]
# df = measurement.tail(49)
# df.to_csv('data/test_measurement.csv')

# forecast = prep.prepare_forecast()
# df1 = prep.keep_last_forecast(forecast)
# df1 = df1.tail(49)
# df1.to_csv('data/test_forecast.csv')

In [7]:
#read test data 
measurement = pd.read_csv("data/test_measurement.csv")
forecast = pd.read_csv("data/test_forecast.csv")
measurement['datetime'] = pd.to_datetime(measurement['datetime'], format = '%Y-%m-%d %H:%M:%S')#change to datetime format 
forecast['f_date']= pd.to_datetime(forecast['f_date'], format = '%Y-%m-%d %H:%M:%S')#change to datetime format 

In [8]:
def prepare_x_test(measurement, forecast, past_n_steps, pred_period):
    
    #concat past n steps from measurement 
    df = measurement.set_index('datetime')
    df=proc.get_past_n_steps(df, past_n_steps)

    #calculate forecast_time
    df['forecast_time'] = df['present_time']+timedelta(hours=pred_period)

    #join forecast according to forecast time 
    forecast = forecast.set_index('f_date') 
    forecast = forecast.add_suffix('_forecast')
    df = pd.merge(df, forecast, how = 'left', left_on = 'forecast_time', right_on ='f_date')
    #add cos day
    df = proc.smooth_day_hour(df)
    #fill missing forecasts as 0
    df.fillna(value=0, inplace=True) 
    #keep first row 
    df = df.head(1)
    #drop timestamp columns
    df_out = df.drop(['present_time','forecast_time'], axis=1)
    return df_out

# test_df = prepare_x_test(measurement, forecast, past_steps, predict )


In [184]:
result  = pd.DataFrame(columns=['past_n_steps','pred_period','speed', 'cos_wind_dir','sin_wind_dir']) 
pred_list = [2]
past_n_steps = 48
for pred in pred_list: 
    #prepare data to be the same format as training data 
    x_test = prepare_x_test(measurement, forecast, past_n_steps, pred)
    x_test= np.array(x_test) #change to array 
    
    #read 3 models for speed, cos_wind, sin_wind
    xgb1= pickle.load(open('trained_models/speed_t_'+str(steps_out), 'rb'))
    xgb2 = pickle.load(open('trained_models/cos_wind_dir_t_'+str(steps_out), 'rb'))
    xgb3 = pickle.load(open('trained_models/sin_wind_dir_t_'+str(steps_out), 'rb'))
    
    #predict 
    speed = xgb1.predict(x_test)[0]
    cos_wind = xgb2.predict(x_test)[0]
    sin_wind = xgb3.predict(x_test)[0]
    
    #record accuracy
    result = result.append({'past_n_steps': str(past_n_steps),
                            'pred_period': str(pred),
                            'speed':round(speed,3),
                            'cos_wind_dir':cos_wind,
                            'sin_wind_dir':sin_wind}, ignore_index=True)    
    
#convert cos and sin to wind_dir:
result['wind_dir'] = result.apply(lambda row: utils.get_angle_in_degree(row['cos_wind_dir'],row['sin_wind_dir']),axis = 1)
result.to_csv('results/test_prediction.csv') 

In [129]:
x_test.update(test)

In [146]:
# # pd.concat([df1, forecast])
# # forecast
# df1
# x_df
x.shape

(42469, 351)

In [180]:
# concatenate past_n_steps data
past_n_steps = 48
df1 = get_past_n_steps(measurement, past_n_steps)
df1 = df1.dropna(axis=0, how = 'any') #drop empty rows 

predict=2
#calculate forecast_time
df1['forecast_time'] = df1['present_time']+ timedelta(hours=predict)

forecast = forecast.loc[forecast['forecast_time']==df1['forecast_time'][0]] #match forecast date 
forecast.drop_duplicates(subset = 'forecast_time', keep = 'last', inplace=True) #keep last forecast
forecast = forecast[['speed','temp','rad','precip','cos_wind_dir','sin_wind_dir']] #keep only useful columns 
forecast = forecast.add_suffix('_forecast')

df_out = pd.concat((df1, forecast), axis=1) 
df_out.fillna(value=0, inplace=True) #fill missing forecasts as 0
df_out

Unnamed: 0,speed_t-48,temp_t-48,radiation_t-48,precip_t-48,cos_wind_dir_t-48,sin_wind_dir_t-48,speed_t-47,temp_t-47,radiation_t-47,precip_t-47,...,cos_wind_dir_t-0,sin_wind_dir_t-0,present_time,forecast_time,speed_forecast,temp_forecast,rad_forecast,precip_forecast,cos_wind_dir_forecast,sin_wind_dir_forecast
1970-01-01 00:00:00.000000027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2020-05-06 14:00:00.000000000,3.428333,21.831667,764.216667,0.0,0.046986,-0.998896,4.593333,21.888333,632.866667,0.0,...,-0.920888,-0.389828,2020-05-06 14:00:00,2020-05-06 16:00:00,0.0,0.0,0.0,0.0,0.0,0.0


In [169]:
forecast

Unnamed: 0,speed_forecast,temp_forecast,rad_forecast,precip_forecast,cos_wind_dir_forecast,sin_wind_dir_forecast
27,8.374967,27.349969,696.266113,0.0,0.962573,0.271023


In [136]:
# forecast.loc[forecast['f_date']==df1['forecast_time'][0]]
df1['forecast_time'][0]

Timestamp('2020-05-06 16:00:00')

In [138]:
forecast['f_date']

dtype('O')

In [None]:
# def prepare_test_df(measurement, forecast, past_n_steps, pred_period, param='speed'):

    

    # add forecast data
    x_df = join_forecast(df1, forecast, pred_period)

    #smooth day and hour
    x_df = smooth_day_hour(x_df)

    # define y accordingly
    index = x_df['forecast_time']
    df2 = measurement[param]
    y_df = pd.merge(df2, index, left_on = 'datetime', right_on='forecast_time' )

    #fillna: use last measurements
    x_df.fillna(method='ffill', inplace=True)
    y_df.fillna(method='ffill', inplace=True)

    #dropna
    x_df.dropna(axis=1, inplace=True)
    y_df.dropna(axis=1, inplace=True)

    #select intersection of the forecast times:
    index2= y_df['forecast_time']
    x_df = pd.merge(x_df, index2, left_on = 'forecast_time', right_on='forecast_time')

    #set index:
    x_df.reset_index()

    #change df to array, drop datetime columns
    x, y = df_to_array(x_df, y_df)
    return x_df, y_df, x, y



In [52]:
def smooth_wind_dir(df):
    df['cos_wind_dir'] = np.cos(2 * np.pi * df['wind_dir'] / 360)
    df['sin_wind_dir'] = np.sin(2 * np.pi * df['wind_dir'] / 360)
    print('smooth wind direction')
    df.drop(columns=['wind_dir'], inplace=True)
    return df

# data_merge, data, forecast = prepare_data_with_forecast(data, keep_only_last=False)
def get_past_n_steps(df, steps_in):
    #rename column to most remote data
    df_out = df.copy().add_suffix('_t-'+str(steps_in))
    #t-i remote data
    for i in range(1, steps_in+1):
        df_temp = df.copy().add_suffix('_t-'+str(steps_in-i)) #rename column
        df_temp= df_temp.shift(periods=-i, axis=0) #shift down i row
        df_out=df_out.join(df_temp, how = 'inner')#join
    #shift index to present time (+steps_in)
    df_out['present_time']=df_out.index.to_series()+timedelta(hours=steps_in)
    df_out.set_index(pd.DatetimeIndex(df_out['present_time']), inplace=True)
    return df_out

def keep_last_forecast (df):
    df.sort_values(by=['f_date','p_date'], inplace=True)
    df.drop_duplicates(subset = 'f_date', keep = 'last', inplace=True)
    df.set_index('f_date', inplace=True)
    df.drop(['p_date','f_period'], axis=1, inplace=True)
    return df

speed_t-48                       3.42833
temp_t-48                        21.8317
radiation_t-48                   764.217
precip_t-48                            0
cos_wind_dir_t-48              0.0469857
sin_wind_dir_t-48              -0.998896
speed_t-47                       4.59333
temp_t-47                        21.8883
radiation_t-47                   632.867
precip_t-47                            0
cos_wind_dir_t-47               -0.22559
sin_wind_dir_t-47              -0.974222
speed_t-46                          5.98
temp_t-46                        21.2267
radiation_t-46                   468.517
precip_t-46                            0
cos_wind_dir_t-46              -0.425638
sin_wind_dir_t-46              -0.904894
speed_t-45                       5.26167
temp_t-45                        21.4517
radiation_t-45                     278.5
precip_t-45                            0
cos_wind_dir_t-45              -0.466739
sin_wind_dir_t-45              -0.884395
speed_t-44      