In [12]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from datetime import timedelta
import pickle

from utils import utils_scenario as utils, data_preparation as prep, data_process as proc

In [13]:
# Load the latest processed measurement and forecast tables, converting their
# timestamp columns from text into pandas datetimes.
measurement = (
    pd.read_csv('./data/processed/last_measurement.csv')
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime'],
                                                  format='%Y-%m-%d %H:%M:%S'))
)
forecast = (
    pd.read_csv('./data/processed/last_forecast.csv')
    .assign(f_date=lambda frame: pd.to_datetime(frame['f_date'],
                                                format='%Y-%m-%d %H:%M:%S'))
)
# NOTE: 'p_date' is left exactly as read from the CSV; its conversion is disabled:
# forecast['p_date']= pd.to_datetime(forecast['p_date'],format='%Y-%m-%d %H:%M:%S')

In [14]:
#prepare processed data into same format as training df 
def prepare_x_test(measurement, forecast, past_n_steps, pred_period):
    """Assemble the most recent feature row for a given prediction horizon.

    Parameters
    ----------
    measurement : pandas.DataFrame
        Processed measurements; must contain a 'datetime' column, which is
        used as the index before the lagged features are built.
    forecast : pandas.DataFrame
        Processed forecasts; must contain an 'f_date' column, which is used
        as the join key. Every forecast column gets a '_forecast' suffix.
    past_n_steps : int
        Number of past steps handed to ``proc.get_past_n_steps``; the same
        number of trailing rows is discarded before the last row is taken.
    pred_period : int
        Prediction horizon in hours, added to 'present_time' to obtain
        'forecast_time'.

    Returns
    -------
    pandas.DataFrame
        The last remaining row (``iloc[-1:]``) after the steps above, with
        missing values replaced by 0.

    Notes
    -----
    Assumes ``proc.get_past_n_steps`` returns a frame that has a
    'present_time' column -- TODO confirm against utils.data_process.
    """
    # Lagged measurement features (the 'am' feature is deliberately kept;
    # dropping it was disabled in the original code).
    lagged = proc.get_past_n_steps(measurement.set_index('datetime'), past_n_steps)

    # Timestamp the forecast must match: present time shifted by the horizon.
    lagged['forecast_time'] = lagged['present_time'] + timedelta(hours=pred_period)

    # Attach the forecast valid at 'forecast_time' (left join keeps every
    # measurement row even when no forecast exists for it).
    forecast_by_time = forecast.set_index('f_date').add_suffix('_forecast')
    features = lagged.merge(forecast_by_time,
                            how='left',
                            left_on='forecast_time',
                            right_on='f_date')

    # Add the cyclical day/hour features, then zero-fill any gaps.
    features = proc.smooth_day_hour(features)
    features = features.fillna(value=0)

    # Drop the trailing `past_n_steps` rows, then keep only the newest one.
    usable_rows = features.iloc[:-past_n_steps]
    return usable_rows.iloc[-1:]

## Check if any columns are missing:

In [16]:
# #to change: make an assert error
# df= prepare_x_test(measurement, forecast, 48, 6)
# #load model columns 
# model_cols = pickle.load(open('model_cols_past_48.pkl', 'rb'))
# print('columns missing:', model_cols.difference(df.columns))
# #it's okay if am feature is missing - it's not included in the model

In [17]:
# Build the feature row for a 6-hour-ahead forecast here, so that this cell
# runs on a fresh kernel: `df` is otherwise only assigned in commented-out code.
df = prepare_x_test(measurement, forecast, 48, 6)
df

Unnamed: 0,speed_t-48,cos_wind_dir_t-48,sin_wind_dir_t-48,temp_t-48,radiation_t-48,precip_t-48,season_t-48,speed_t-47,cos_wind_dir_t-47,sin_wind_dir_t-47,...,present_time,forecast_time,speed_forecast,temp_forecast,rad_forecast,precip_forecast,cos_wind_dir_forecast,sin_wind_dir_forecast,cos_day,cos_hour
280,5.545,0.596844,-0.766326,28.508333,590.366667,0.0,3,4.923333,0.654128,-0.71467,...,2020-07-14 16:00:00,2020-07-14 22:00:00,4.6,23.1,0.0,0.0,0.75471,0.656059,0.9711,-0.5


In [30]:
#predict next 48 hours
def _load_model(prefix, horizon):
    """Unpickle the model stored at '../trained_models/<prefix><horizon>'.

    The file is opened in a ``with`` block so its handle is always closed.
    NOTE(review): unpickling can execute arbitrary code -- only load model
    files that come from a trusted source.
    """
    with open('../trained_models/' + prefix + str(horizon), 'rb') as model_file:
        return pickle.load(model_file)


predict_list = np.arange(1, 49, 1)  # horizons 1..48 hours ahead
records = []  # one dict per horizon; turned into a DataFrame once, after the loop
for pred in predict_list:
    # int(): `pred` is a numpy integer, prepare_x_test feeds it to timedelta
    x_test = prepare_x_test(measurement, forecast, 48, int(pred))
    #drop timestamp columns and change to array format
    f_date = x_test['forecast_time'].iloc[0]
    p_date = x_test['present_time'].iloc[0]
    x_test = x_test.drop(['present_time', 'forecast_time'], axis=1)

    #check missing features (the speed model is loaded once and reused below)
    xgb1 = _load_model('speed_t_', pred)
    model_features = xgb1.get_booster().feature_names
    missing_features = set(model_features).difference(x_test.columns)
    if missing_features:
        print('model not run because some features are missing:', missing_features)
        break
    #re-order features to the order the model was trained with
    x_test = x_test[model_features]

    #read the remaining 2 models: cos_wind, sin_wind
    xgb2 = _load_model('cos_wind_dir_t_', pred)
    xgb3 = _load_model('sin_wind_dir_t_', pred)

    #predict
    speed = xgb1.predict(x_test)[-1]
    cos_wind = xgb2.predict(x_test)[-1]
    sin_wind = xgb3.predict(x_test)[-1]

    #calculate wind_dir
    wind_dir = utils.get_angle_in_degree(cos_wind, sin_wind)

    #record result
    records.append({'forecast_time': f_date,
                    'present_time': p_date,
                    'speed': speed,
                    'cos_wind_dir': cos_wind,
                    'sin_wind_dir': sin_wind,
                    'wind_dir': wind_dir})

# Build the frame in one go (DataFrame.append was removed in pandas 2.0 and was
# quadratic). Column order matches the table recorded in this notebook's output.
result = pd.DataFrame(records,
                      columns=['cos_wind_dir', 'forecast_time', 'present_time',
                               'sin_wind_dir', 'speed', 'wind_dir'])


In [31]:
# Display the forecast table assembled by the prediction loop.
result

Unnamed: 0,cos_wind_dir,forecast_time,present_time,sin_wind_dir,speed,wind_dir
0,0.84402,2020-07-14 17:00:00,2020-07-14 16:00:00,-0.441228,4.952895,212.432948
1,0.897209,2020-07-14 18:00:00,2020-07-14 16:00:00,-0.243845,4.951971,206.206432
2,0.887204,2020-07-14 19:00:00,2020-07-14 16:00:00,0.056082,4.772141,27.476031
3,0.867661,2020-07-14 20:00:00,2020-07-14 16:00:00,0.329359,5.035214,29.812082
4,0.796441,2020-07-14 21:00:00,2020-07-14 16:00:00,0.604113,5.064035,37.208429
5,0.718218,2020-07-14 22:00:00,2020-07-14 16:00:00,0.618791,5.090146,44.092426
6,0.683517,2020-07-14 23:00:00,2020-07-14 16:00:00,0.638492,4.246524,46.88094
7,0.489306,2020-07-15 00:00:00,2020-07-14 16:00:00,0.763767,3.881823,60.705
8,0.688048,2020-07-15 01:00:00,2020-07-14 16:00:00,0.839634,3.733252,46.524233
9,0.691381,2020-07-15 02:00:00,2020-07-14 16:00:00,0.697644,4.212215,46.260479


Hi Karim,

Please find the updated notebook attached.\
I have made several changes to the code, and I will outline the changes made here: \
- crop out 7 hours of prediction data instead of 6
- when taking the hourly average, we now average to the 'right' instead of the 'left', so that each hourly value reflects the data from the past hour
- xgb models are trained with column names. In addition, I've added several lines of code that compare model_features with testing_features; if any features are missing, the code stops and reports which ones.

All models have been retrained to reflect the above amendments. I have pushed several directories to GitHub, and also sent the same files in a zip folder via Teams, including:
- trained_models: containing all the models in pickle files
- results: testing results (80% training / 20% testing data split) on the data set we have, to check performance after making the above 3 amendments. I have tested on max_depth=5, with n_estimators = 100 and 200 respectively. Results can be found in "xgboost_accuracy_in_48_depth_5_estim_100.csv" and "xgboost_accuracy_in_48_depth_5_estim_200.csv". I've compared these results with the previous ones; they are now slightly worse (~3-4% decrease in accuracy), I think due to 1) cropping out 1 more hour of forecast and 2) shifting the hour to 1 hour later. Nevertheless, they are still good and consistent, with accuracy at 83% for 3 hour prediction and 80% for 48 hour prediction.

Please let me know if you have any problems with running the code, and good luck with the implementation process! 


Best,
Cynthia 