In [1]:
####################
### Dependencies ###
####################

import os 
import pickle
import pandas as pd
import numpy as np
import sys
sys.path.append('../')
from utils.util_functions import *
from datetime import timedelta

In [2]:
#################
### Load Data ###
#################

# Measurements: read the latest processed measurements and turn the
# 'datetime' strings into pandas timestamps, one value at a time.
measurement_out = pd.read_csv('../data/processed/last_measurement.csv')
measurement_out['datetime'] = measurement_out['datetime'].map(pd.to_datetime)

# Forecasts: same treatment for every date-like column of the forecast file.
# NOTE(review): a vectorised pd.to_datetime(column) would be faster, but it is
# not guaranteed to parse mixed timestamp formats the same way - confirm the
# CSV format before switching.
forecast = pd.read_csv('../data/processed/last_forecast.csv')
for date_column in ('f_date', 'p_date', 'file_creation_date'):
    forecast[date_column] = forecast[date_column].map(pd.to_datetime)

In [3]:
#######################
### Data Processing ###
#######################

### Data Merge ###

# Save a copy of measurements to score results
Y_real = measurement_out.copy()

# Lagged measurements stacked side by side: the current row (suffix '_t-0')
# followed by lags 1..48 (suffixes '_t-1' ... '_t-48').
# Every lagged frame is built first and concatenated once; growing df_out with
# one pd.concat per lag re-copies the accumulated frame on every iteration.
lagged_frames = [Y_real.add_suffix('_t-0')]
lagged_frames += [Y_real.add_suffix('_t-' + str(i)).shift(i) for i in range(1, 49)]
df_out = pd.concat(lagged_frames, axis=1)
del lagged_frames
# Drop every row holding at least one NaN; this includes the leading rows
# whose lags are undefined after the shift.
df_out = df_out.dropna(how='any')
#display(df_out.head(1))

# Join measurements & forecast: each measurement row is matched with all
# forecast rows whose forecasted date (f_date) equals the measurement time.
df_joined = df_out.merge(forecast.add_suffix('_forecast'),
                         how='left',
                         left_on='datetime_t-0',
                         right_on='f_date_forecast')

# Keep only forecasts whose file already existed at measurement time
# (file_creation_date <= datetime_t-0).
df_joined = df_joined.loc[df_joined['datetime_t-0'] >= df_joined['file_creation_date_forecast']]

# Compute f_period from the measurement time and the forecast p_date
# (get_f_period comes from utils.util_functions).
df_joined['f_period'] = df_joined[['datetime_t-0','p_date_forecast']] \
                         .apply(lambda row : get_f_period(row['datetime_t-0'],row['p_date_forecast']),axis=1)

# Sanity check on the file_creation_date filter.
# NOTE(review): this only requires that at least ONE row has f_period > 7,
# while the original comment talked about cropping out f_period > 7 - confirm
# the intended condition (.any() vs .all() vs negation) before relying on it.
assert((df_joined.f_period > 7).any())

# Keep last forecast: for every measurement time retain only the rows with the
# smallest f_period (the merge joins on the shared columns 'datetime_t-0' and
# 'f_period').
df_joined = df_joined.groupby('datetime_t-0')['f_period'].min().reset_index() \
             .merge(df_joined,how='left')

# Cyclical time features.
# NOTE(review): .dt.day is the day of the MONTH (1-31) yet it is divided by
# 365; left unchanged because the trained models presumably expect this exact
# feature - confirm against the training notebook.
df_joined['cos_day'] = np.cos(2 * np.pi * df_joined['datetime_t-0'].dt.day / 365)
df_joined['cos_hour'] =  np.cos(2 * np.pi * df_joined['datetime_t-0'].dt.hour / 24)
#display(df_joined.head(1))

In [4]:
#############################
### New model adjustments ###
#############################

# Extra columns required by the updated models.
# The integer scenario is derived row by row from the forecast wind speed and
# the cosine / sine of the forecast wind direction (get_int_scenario comes
# from utils.util_functions).
df_joined['scenario_forecast'] = df_joined.apply(
    lambda rec: get_int_scenario(
        rec['speed_forecast'],
        rec['cos_wind_dir_forecast'],
        rec['sin_wind_dir_forecast'],
    ),
    axis=1,
)

# Binary flag: 1 when the scenario is strictly greater than 3, otherwise 0.
df_joined['dangerous_forecast'] = df_joined['scenario_forecast'].gt(3).map(int)

# Give f_period the same '_forecast' suffix as the other forecast columns.
df_joined = df_joined.rename({'f_period': 'f_period_forecast'}, axis='columns')

### Make predictions
- Load all models (regression & classification)
- Merge predictions : model_feature_lag (prediction date is shifted to align with measurement t-0)

In [5]:
# To render results on app we need a branch here for datetime shift ###
# (timedelta is already imported in the dependencies cell.)

# Load needed columns for all models
columns_names = list(pd.read_csv('../models_09072021/column_names.csv')['0'])

# Loop lists
model_names = ['xgb','dt','mlp','rf']
features = ['speed','cos_wind_dir','sin_wind_dir','scenario','dangerous']
pred_periods = ['1','2','3']

# One trained model per (model, target feature, horizon in hours), keyed
# '<model>_<feature>_<horizon>'.
# NOTE: pickle can execute arbitrary code while loading - only load model
# files that come from a trusted source.
models = dict()
for model_name in model_names:
    for feature in features:
        for pred_period in pred_periods:
            x = '_'.join([model_name,feature,pred_period])
            # Load model; the context manager closes the file handle, which
            # a bare pickle.load(open(...)) would leave open.
            with open('../models_09072021/trained_models/' + x + '.pkl','rb') as model_file:
                models[x] = pickle.load(model_file)
            df_temp = df_joined[['datetime_t-0']].copy()
            # Shift date forward by the horizon so that, after the merge below,
            # a prediction sits on the row of the time it was made FOR.
            df_temp['datetime_t-0'] = df_temp['datetime_t-0'] + timedelta(hours=int(pred_period))
            # Predict from the current rows (assigned positionally: df_temp
            # has the same row order as df_joined).
            df_temp[x] = models[x].predict(df_joined[columns_names])
            # Save: left merge on the only shared column, 'datetime_t-0'; rows
            # without a matching shifted prediction get NaN in column x.
            df_joined = df_joined.merge(df_temp,how='left')
            del df_temp

### Compute ensemble models columns
 - Proba for scenario (6 columns) and binary (2 columns) classifications

In [6]:
# Class-probability columns used as inputs of the ensemble models.
# Each spec is (target feature, base models to use, labels used as column suffix):
#   - scenario classification : 6 columns '<model>_scenario_<h>_p1' ... '_p6'
#   - binary classification   : 2 columns '<model>_dangerous_<h>_p0' / '_p1'
# NOTE(review): assumes predict_proba returns its columns in the same order as
# these labels - confirm against the classes_ of the trained models.
proba_specs = [
    ('scenario', ['xgb','dt','mlp'], range(1,7)),
    ('dangerous', ['xgb','dt','mlp','rf'], range(0,2)),
]

for feature, proba_model_names, class_labels in proba_specs:
    for model_name in proba_model_names:
        for pred_period in pred_periods:
            x = '_'.join([model_name,feature,pred_period])
            proba_columns = [x + '_p' + str(i) for i in class_labels]
            # Shift date forward by the horizon (same alignment as the point
            # predictions).
            df_temp = df_joined[['datetime_t-0']].copy()
            df_temp['datetime_t-0'] = df_temp['datetime_t-0'] + timedelta(hours=int(pred_period))
            # Probabilities, one column per class. The frame is built on
            # df_temp's own index so the assignment is row-for-row and never
            # depends on df_joined carrying a default 0..n-1 index.
            df_temp[proba_columns] = pd.DataFrame(models[x].predict_proba(df_joined[columns_names]),
                                                  columns=proba_columns,
                                                  index=df_temp.index)
            # Save: left merge on the shared 'datetime_t-0' column
            df_joined = df_joined.merge(df_temp,how='left')
            del df_temp
del proba_specs

In [7]:
### Keep only complete rows
# Discard every row with at least one missing value. The shifted predictions
# were attached with left merges, so rows without a matching earlier
# prediction carry NaN in those columns. The single most recent row is
# selected later, after the ensemble predictions.
df_joined = df_joined.dropna(axis=0, how='any')

### Ensemble Models
 - Load selected columns and trained models
 - Predict & save 

In [8]:
# Columns selected for the ensemble models, indexed by prediction period
# ('1', '2', '3'). pickle is already imported in the dependencies cell.
# NOTE: pickle can execute arbitrary code while loading - only load files
# that come from a trusted source.
with open('../models_09072021/ensemble_models/selected_columns.p','rb') as columns_file:
    select_columns = pickle.load(columns_file)

model_names = ['lr_scenario','lr_dangerous']
pred_periods = ['1','2','3']

ensemble_models = dict()
# Scenario models are handled first, then the binary ones, so the new columns
# are appended to df_joined in the same order as before.
for model_name in model_names:
    for pred_period in pred_periods:
        x = '_'.join([model_name,pred_period])
        # The context manager closes the model file, which a bare
        # pickle.load(open(...)) would leave open.
        with open('../models_09072021/ensemble_models/' + x + '.p','rb') as model_file:
            ensemble_models[x] = pickle.load(model_file)
        # Predicted class
        df_joined[x] = ensemble_models[x].predict(df_joined[select_columns[pred_period]])
        if model_name == 'lr_dangerous':
            # Binary model only: also keep the probability of each class
            temp_p = ensemble_models[x].predict_proba(df_joined[select_columns[pred_period]])
            df_joined[x + '_p0'] = temp_p[:,0]
            df_joined[x + '_p1'] = temp_p[:,1]
            del temp_p

In [9]:
# Keep only the final row of df_joined and renumber it as row 0.
df_joined = df_joined.iloc[-1:].reset_index(drop=True)

In [10]:
# Init regressions results: three rows sharing the same present_time (the
# single row kept in df_joined), one per horizon of +1h, +2h and +3h.
df_result = pd.DataFrame([df_joined['datetime_t-0'][0],
                          df_joined['datetime_t-0'][0],
                          df_joined['datetime_t-0'][0]],columns=['present_time'])
df_result['datetime'] = [df_joined['datetime_t-0'][0] + timedelta(hours=int(pred_period)) for pred_period in (1,2,3)]
# NOTE(review): the model columns of df_joined were merged on a datetime
# shifted by the horizon, so the values copied below are aligned with
# present_time - confirm they are meant to be reported for present_time + h.

# Raw forecast columns, prefixed with 'numtech_' to tell them apart from the
# model outputs.
forecast_for_results = forecast[['f_date','p_date','speed','cos_wind_dir','sin_wind_dir','scenario_legacy']] \
                       .rename(columns={'scenario_legacy' : 'scenario'}).add_prefix('numtech_').copy()

# Compute f_period
forecast_for_results['f_period'] = forecast_for_results.apply(lambda row : get_f_period(row['numtech_f_date'],row['numtech_p_date']),axis=1)

df_result = df_result.merge(forecast_for_results,
                how='left',
                left_on='datetime',
                right_on='numtech_f_date')

# Keep, for every target datetime, the forecast row with the smallest f_period
df_result = df_result.loc[df_result.groupby("datetime")["f_period"].idxmin()]

df_result['numtech_binary'] = df_result['numtech_scenario'].map(get_str_binary)

# Helper columns are no longer needed (plain list of labels, no in-place mutation)
df_result = df_result.drop(columns=['numtech_f_date','numtech_p_date','f_period'])


# Loop lists
model_names = ['xgb', 'dt','mlp','rf']
features = ['speed','cos_wind_dir','sin_wind_dir']

# Regression results: the three horizon columns of the single df_joined row
# become the three rows of df_result (positional assignment).
for m in model_names:
    for f in features:
        df_result[m + '_' + f] = df_joined[[m + '_' + f + '_1',m + '_' + f + '_2',m + '_' + f + '_3']].values[0]

# Compute wind dir (degrees) from the cosine / sine pair of every source
for model_name in model_names + ['numtech']:
    df_result[model_name + '_wind_dir']= df_result.apply(
                                                lambda row : get_angle_in_degree(row[model_name + '_cos_wind_dir'],
                                                                                 row[model_name + '_sin_wind_dir']),
                                                axis=1
                                            )
# Scenario & binary classification ('dangerous' is exposed as 'binary')
model_names = ['xgb', 'dt','mlp','rf','lr']
for m in model_names:
    for f in ('scenario','dangerous'):
        df_result[m + '_' + f] = df_joined[[m + '_' + f + '_1',m + '_' + f + '_2',m + '_' + f + '_3']].values[0]
    df_result = df_result.rename(columns={m + '_dangerous': m + '_binary'})

# Scenario from int to str.
# Series.map is applied column by column: it is element-wise like
# DataFrame.applymap but is not deprecated in recent pandas versions.
sc_list = ['xgb_scenario', 'dt_scenario', 'mlp_scenario','rf_scenario', 'lr_scenario']
for scenario_column in sc_list:
    df_result[scenario_column] = df_result[scenario_column].map(scenario_int_to_str)
# Binary from int to str
binary_list = ['xgb_binary', 'dt_binary', 'mlp_binary', 'rf_binary','lr_binary']
for binary_column in binary_list:
    df_result[binary_column] = df_result[binary_column].map(binary_int_to_str)
# Dangerous binary probability (column '_p1' of every model, one value per horizon)
model_names = ['xgb', 'dt','mlp','rf','lr']
for m in model_names:
    df_result[m + '_binary_p1'] = df_joined[[m + '_dangerous_1_p1',m + '_dangerous_2_p1',m + '_dangerous_3_p1']].values[0]




In [12]:
# Persist the result table as CSV, without writing the index column.
df_result.to_csv(path_or_buf='../data/processed/last_reg_results.csv', index=False)