### Dependencies

In [1]:
def get_f_period(p_date,f_date):
    """Return the number of whole hours elapsed from ``f_date`` to ``p_date``.

    Both arguments are converted with ``pd.to_datetime`` before being
    subtracted. The result is assembled from the ``days`` and ``seconds``
    components of the difference, so a partial hour is rounded down
    (towards minus infinity when ``p_date`` is earlier than ``f_date``).
    """
    elapsed = pd.to_datetime(p_date) - pd.to_datetime(f_date)
    hours_from_days = 24 * elapsed.days
    hours_from_seconds = elapsed.seconds // 3600
    return hours_from_days + hours_from_seconds

import os 
import pickle
import pandas as pd
import numpy as np

### Load Data 
- Last Measurements (from 01/03/2021)
- Last 5 months of Forecasts

In [2]:
# Measurements
measurement_out = pd.read_csv('../data/processed/last_measurement_from_202001.csv')
# Parse the whole column with a single vectorized call instead of one
# Python-level pd.to_datetime call per row
measurement_out['datetime'] = pd.to_datetime(measurement_out['datetime'])

In [3]:
# Overwrite every negative temperature reading with the fixed value 20
# NOTE(review): presumably negative readings are sensor errors and 20 is a
# neutral placeholder — confirm the intended replacement value
measurement_out.loc[measurement_out['temp'].lt(0), 'temp'] = 20

In [4]:
# Forecasts
forecast = pd.read_csv('../data/processed/last_forecast_from_202001.csv')
# Parse each timestamp column with one vectorized call per column instead of
# one Python-level pd.to_datetime call per row
for date_column in ('f_date', 'p_date', 'file_creation_date'):
    forecast[date_column] = pd.to_datetime(forecast[date_column])

### Data Processing
- Shift measurements to get 49 time steps (t-0 plus 48 lags)
- Data Merge 
- Keep last forecasts

In [5]:
### Data Merge ###

# Save a copy of measurements to score results
Y_real = measurement_out.copy()

# Horizontal stack of the measurements at t-0 and at lags 1..48 (49 time steps).
# The frames are collected first and concatenated once: concatenating inside
# the loop re-copies the growing frame on every iteration.
# NOTE(review): shift() moves rows, not timestamps — this assumes one row per
# regular time step, sorted by time; confirm against the input file.
lagged_frames = [Y_real.add_suffix('_t-0')]
lagged_frames += [Y_real.add_suffix('_t-' + str(lag)).shift(lag) for lag in range(1, 49)]
# Drops the first 48 rows (incomplete history) and any row with a missing value
df_out = pd.concat(lagged_frames, axis=1).dropna(how='any')
del lagged_frames

# join measurements & forecast: every forecast row whose f_date equals the
# measurement time is attached (left join keeps measurements without forecast)
df_joined = df_out.merge(forecast.add_suffix('_forecast'),
                         how='left',
                         left_on='datetime_t-0',
                         right_on='f_date_forecast')

# filter out forecast files created after the measurement time
# (original note: "same as crop out f_period > 7"); rows without any forecast
# compare False and are dropped as well
file_available = df_joined['datetime_t-0'] >= df_joined['file_creation_date_forecast']
df_joined = df_joined.loc[file_available]

# Compute f_period: whole hours from the forecast's p_date to the measurement
# time. Same arithmetic as get_f_period, applied to whole columns at once
# instead of one Python call per row.
lead_time = df_joined['datetime_t-0'] - df_joined['p_date_forecast']
df_joined['f_period'] = lead_time.dt.days * 24 + lead_time.dt.seconds // 3600

# sanity check on the file_creation_date_forecast filter
# NOTE(review): this only requires that at least one row has f_period > 7;
# if the intent is that *every* remaining row satisfies a bound, .any() is
# too weak — confirm the intended condition before tightening it
assert (df_joined.f_period > 7).any()

# keep last forecast: the smallest f_period available for each measurement time
latest_period = df_joined.groupby('datetime_t-0')['f_period'].min().reset_index()
df_joined = latest_period.merge(df_joined, how='left')

# compute cos day and hour
# NOTE(review): .dt.day is the day of the month (1-31) although it is divided
# by 365; .dt.dayofyear may have been intended. Left unchanged because the
# pre-trained models loaded later were presumably fitted on this exact feature.
df_joined['cos_day'] = np.cos(2 * np.pi * df_joined['datetime_t-0'].dt.day / 365)
df_joined['cos_hour'] = np.cos(2 * np.pi * df_joined['datetime_t-0'].dt.hour / 24)

In [6]:
# Keep only the rows measured in June, July or August 2021
# (the month is read from the first 7 characters of the timestamp's string form)
kept_months = ('2021-06','2021-07','2021-08')
in_kept_months = df_joined['datetime_t-0'].map(lambda stamp: str(stamp)[:7] in kept_months)
df_joined = df_joined.loc[in_kept_months, :]

### New models adjustments
- Compute forecast scenario & Dangerous
- Rename f_period -> f_period_forecast

In [7]:
# Compute needed columns for updated models
def get_int_scenario(speed, cos, sin):
    """Map a wind speed and direction (cos, sin) to an integer scenario code.

    The ``is_S*`` predicates are tried in a fixed order and the first one that
    returns a truthy value decides the code:
    S1 -> 1, S2 -> 2, S2b -> 3, S3 -> 4, S3b -> 5, S4 -> 6.
    ``np.nan`` is returned when no predicate matches.

    NOTE(review): the ``is_S*`` predicates are presumably provided by the
    ``utils.ui`` star import — confirm.
    """
    ordered_checks = (
        (is_S1, 1),
        (is_S2, 2),
        (is_S2b, 3),
        (is_S3, 4),
        (is_S3b, 5),
        (is_S4, 6),
    )
    for matches, code in ordered_checks:
        if matches(speed, cos, sin):
            return code
    return np.nan

import sys
sys.path.append('../')
from utils.ui import *

# Scenario code implied by the forecast wind speed and direction, row by row
df_joined['scenario_forecast'] = df_joined.apply(
    lambda forecast_row: get_int_scenario(
        forecast_row['speed_forecast'],
        forecast_row['cos_wind_dir_forecast'],
        forecast_row['sin_wind_dir_forecast'],
    ),
    axis=1,
)

# Scenario codes above 3 are flagged 1 (dangerous), everything else 0;
# a NaN scenario compares False and therefore becomes 0
df_joined['dangerous_forecast'] = df_joined['scenario_forecast'].gt(3).map(int)

# Give the forecast lead time the same "_forecast" suffix as the other forecast columns
df_joined = df_joined.rename(columns={'f_period': 'f_period_forecast'})

### Make predictions
- Load all models (regression & classification)
- Merge predictions : model_feature_lag (prediction date is shifted to align with measurement t-0)

In [8]:
# To render results on app we need a branch here for datetime shift ###
from datetime import timedelta

# Load needed columns for all models
columns_names = list(pd.read_csv('../models_09072021/column_names.csv')['0'])

# Loop lists
model_names = ['xgb','dt','mlp','rf']
features = ['speed','cos_wind_dir','sin_wind_dir','scenario','dangerous']
pred_periods = ['1','2','3']


# One model per (algorithm, target feature, horizon in hours); every model's
# prediction is appended to df_joined as a column named
# <algorithm>_<feature>_<horizon>.
models = dict()
for model_name in model_names:
    for feature in features:
        for pred_period in pred_periods:
            x = '_'.join([model_name,feature,pred_period])
            # Load model; the context manager closes the file handle right away.
            # NOTE: pickle executes arbitrary code on load — only load trusted files.
            with open('../models_09072021/trained_models/' + x + '.pkl','rb') as model_file:
                models[x] = pickle.load(model_file)
            df_temp = df_joined[['datetime_t-0']].copy()
            # Shift date: a prediction computed from the row at time t is stored
            # against t + horizon, so that after the merge it sits on the row
            # whose measurement it is meant to predict
            df_temp['datetime_t-0'] = df_temp['datetime_t-0'] + timedelta(hours=int(pred_period))
            # Predict (positional: one prediction per row of df_joined)
            df_temp[x] = models[x].predict(df_joined[columns_names])
            # Save: left merge on the only shared column, 'datetime_t-0';
            # rows with no prediction made `horizon` hours earlier get NaN
            df_joined = df_joined.merge(df_temp,how='left')
            del df_temp

### Compute ensemble models columns
 - Proba for scenario (6 columns) and binary (2 columns) classifications

In [9]:
def _merge_shifted_probabilities(frame, model, column_prefix, class_labels, horizon_hours, feature_columns):
    """Append the class probabilities predicted by ``model`` to ``frame``.

    One column ``<column_prefix>_p<label>`` is created per entry of
    ``class_labels``, in the column order returned by ``model.predict_proba``.
    The probabilities computed from the row at time t are stored against
    t + ``horizon_hours`` and left-merged back on 'datetime_t-0'.

    The probability frame is built on ``frame``'s own index, so the values are
    attached positionally to the right rows whatever index ``frame`` carries.

    NOTE(review): assumes predict_proba returns exactly len(class_labels)
    columns ordered like class_labels — confirm against each model's classes_.
    """
    proba_columns = [column_prefix + '_p' + str(label) for label in class_labels]
    shifted = pd.DataFrame(model.predict_proba(frame[feature_columns]),
                           columns=proba_columns,
                           index=frame.index)
    shifted.insert(0, 'datetime_t-0', frame['datetime_t-0'] + timedelta(hours=horizon_hours))
    return frame.merge(shifted, how='left')


# Scenario classification: 6 probability columns (_p1 .. _p6) per model and horizon
for model_name in ['xgb','dt','mlp']:
    feature = 'scenario'
    for pred_period in pred_periods:
        x = '_'.join([model_name,feature,pred_period])
        print(x)
        df_joined = _merge_shifted_probabilities(df_joined, models[x], x,
                                                 range(1,7), int(pred_period), columns_names)

# Binary classification: 2 probability columns (_p0, _p1) per model and horizon
for model_name in ['xgb','dt','mlp','rf']:
    feature = 'dangerous'
    for pred_period in pred_periods:
        x = '_'.join([model_name,feature,pred_period])
        df_joined = _merge_shifted_probabilities(df_joined, models[x], x,
                                                 range(0,2), int(pred_period), columns_names)

xgb_scenario_1
xgb_scenario_2
xgb_scenario_3
dt_scenario_1
dt_scenario_2
dt_scenario_3
mlp_scenario_1
mlp_scenario_2
mlp_scenario_3


### Drop missing values
 - At least the first 3 rows are dropped because of the time shift

In [10]:
# Drop every row that still holds a missing value in any column, then show the resulting size
df_joined = df_joined.dropna(axis=0, how='any')
df_joined.shape

(2138, 545)

### Ensemble Models
 - Load selected columns and trained models
 - Predict & save 

In [11]:
import pickle

# Columns fed to the ensemble models, looked up below by horizon ('1', '2', '3').
# NOTE: pickle executes arbitrary code on load — only load trusted files.
with open('../models_09072021/ensemble_models/selected_columns.p','rb') as columns_file:
    select_columns = pickle.load(columns_file)

model_names = ['lr_scenario','lr_dangerous']
pred_periods = ['1','2','3']

# One ensemble model per (target, horizon); its prediction is stored in
# df_joined under the model's own name, e.g. 'lr_scenario_1'
ensemble_models = dict()
for model_name in model_names:
    for pred_period in pred_periods:
        x = '_'.join([model_name,pred_period])
        # The context manager closes the file handle as soon as the model is loaded
        with open('../models_09072021/ensemble_models/' + x + '.p','rb') as model_file:
            ensemble_models[x] = pickle.load(model_file)
        df_joined[x] = ensemble_models[x].predict(df_joined[select_columns[pred_period]])

### Confusion Matrix

In [12]:
from sklearn.metrics import confusion_matrix

# Loop lists
model_names = ['xgb','dt','mlp','rf']
features = ['speed','cos_wind_dir','sin_wind_dir','scenario','dangerous']
pred_periods = ['1','2','3']


def _scenario_from_columns(frame, speed_col, cos_col, sin_col):
    """Return the scenario code of every row of ``frame``.

    ``get_int_scenario`` is applied row by row to the three named columns
    (wind speed, cosine and sine of the wind direction).
    """
    return frame.apply(lambda row : get_int_scenario(row[speed_col],
                                                     row[cos_col],
                                                     row[sin_col]),
                       axis=1)


def _confusion_row(frame, kind, name, period, predicted_column):
    """Build one result row comparing ``predicted_column`` with 'dangerous_t-0'.

    Returns ``[kind, name, period, cm00, cm01, cm10, cm11]`` where ``cm`` is the
    confusion matrix with the real value on the rows and the prediction on the
    columns. ``labels=[0, 1]`` forces a 2x2 matrix even when only one class
    occurs in the data; without it the matrix collapses to 1x1 and indexing
    ``cm[0][1]`` raises IndexError.
    """
    cm = confusion_matrix(frame['dangerous_t-0'], frame[predicted_column], labels=[0, 1])
    return [kind, name, period, cm[0][0], cm[0][1], cm[1][0], cm[1][1]]


# Real scenario & dangerous #
df_joined['scenario_t-0'] = _scenario_from_columns(df_joined,
                                                   'speed_t-0',
                                                   'cos_wind_dir_t-0',
                                                   'sin_wind_dir_t-0')
df_joined['dangerous_t-0'] = (df_joined['scenario_t-0'] > 3).map(int)


# Regression scenario & dangerous: derive the scenario from the predicted
# speed / cos / sin of each regression model, then flag codes above 3
for model_name in model_names:
    for pred_period in pred_periods:

        input_speed = model_name + '_speed_' + pred_period
        input_cos = model_name + '_cos_wind_dir_' + pred_period
        input_sin = model_name + '_sin_wind_dir_' + pred_period
        output_scenario = model_name + '_regression_scenario_' + pred_period
        output_dangerous = model_name + '_regression_dangerous_' + pred_period

        df_joined[output_scenario] = _scenario_from_columns(df_joined, input_speed, input_cos, input_sin)
        df_joined[output_dangerous] = (df_joined[output_scenario] > 3).map(int)


# One list per confusion matrix; turned into a DataFrame once at the end
result_rows = []

for model_name in model_names:
    for pred_period in pred_periods:
        # regression
        result_rows.append(_confusion_row(df_joined, 'regression', model_name, pred_period,
                                          model_name + '_regression_dangerous_' + pred_period))
        # scenario: dangerous flag derived from the predicted scenario class
        df_joined[model_name + '_sc_dangerous_' + pred_period] = (df_joined[model_name + '_scenario_' + pred_period] > 3).map(int)
        result_rows.append(_confusion_row(df_joined, 'sc classification', model_name, pred_period,
                                          model_name + '_sc_dangerous_' + pred_period))
        # binary
        result_rows.append(_confusion_row(df_joined, 'binary classification', model_name, pred_period,
                                          model_name + '_dangerous_' + pred_period))

# Ensemble models
for pred_period in pred_periods:
    # Binary classification
    result_rows.append(_confusion_row(df_joined, 'binary classification', 'lr ensemble', pred_period,
                                      'lr_dangerous_' + pred_period))
    # Compute dangerous from scenario classification
    df_joined['lr_sc_dangerous_' + pred_period] = (df_joined['lr_scenario_' + pred_period] > 3).map(int)
    result_rows.append(_confusion_row(df_joined, 'Scenario classification', 'lr ensemble', pred_period,
                                      'lr_sc_dangerous_' + pred_period))

# Baseline: the dangerous flag derived directly from the raw forecast
result_rows.append(_confusion_row(df_joined, '', 'Numtech', '', 'dangerous_forecast'))

# Column meaning, in confusion-matrix terms (0 = safe, 1 = dangerous):
# real 0/pred 0, real 0/pred 1, real 1/pred 0, real 1/pred 1
df_result = pd.DataFrame(result_rows,columns=['Type','Name','Period','Correct Safe','Wrong Dangerous','Wrong Safe','Correct Dangerous'])


In [13]:
# Write the summary table to Excel, rows ordered by ascending "Correct Dangerous" count
(
    df_result
    .sort_values(by='Correct Dangerous')
    .to_excel('ensemble_results.xlsx', index=False)
)