## 2_build_baselines

Notebook that builds baseline models to serve as reference points when iterating on a new model.

In [None]:
import random

import pandas as pd
import scipy
import numpy as np
from supervised.automl import AutoML
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
import wandb

evaluation_metrics = []

In [None]:
# Load the train and test sets, converting the `date` column to datetimes
dfp_train = pd.read_csv('./data/rtu/model_train_data.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

dfp_test = pd.read_csv('./data/rtu/model_test_data.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

In [None]:
# Define the columns and features
# Every city contributes the same four weather measures, in this order.
_weather_cities = (
    'bordeaux', 'lille', 'paris', 'rennes', 'nantes', 'toulouse',
    'marseille', 'lyon', 'nice', 'strasbourg', 'montpellier',
)
_weather_measures = ('t2m_min_{}', 't2m_{}', 't2m_max_{}', 'prectot_{}')

columns_weather = [
    measure.format(city)
    for city in _weather_cities
    for measure in _weather_measures
]
# The weighted aggregates come after the per-city columns.
columns_weather.extend([
    'weighted_t2m',
    'weighted_t2m_min',
    'weighted_t2m_max',
    'weighted_prectot',
])

columns_features = ['weekday', 'month', 'week_number']
columns_features = columns_features + columns_weather

In [None]:
# Name of the column to predict
column_target = 'daily_electrical_consumption'

# Feature matrices and target vectors for the train and test sets
X_train = dfp_train[columns_features]
y_train = dfp_train[column_target]
X_test = dfp_test[columns_features]
y_test = dfp_test[column_target]

In [None]:
# First baseline: one value per test row, drawn uniformly at random between the
# smallest and the largest target seen in the train set.
# The bounds are computed once (not once per prediction) and cast to float:
# `random.randrange` only accepts integers, so it fails on a float target.
target_min, target_max = float(y_train.min()), float(y_train.max())
predictions = [random.uniform(target_min, target_max) for _ in range(len(y_test))]
# RMSE as sqrt(MSE): the `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
evaluation_metrics.append({'model' : 'random', 'rmse' : rmse})
print('RMSE on the test-set:', rmse)

In [None]:
# Second baseline, still random but knowledge based: the draw is restricted to the
# [min, max] target observed in the train set for the same (month, weekday) pair.
dfp_tmp = dfp_train.groupby(['month', 'weekday']).agg({column_target : ['min', 'max']})
dfp_tmp.columns = ['min', 'max']
dfp_tmp.reset_index(inplace=True)

# Build the lookup key column by column. A row-wise `apply(axis=1)` upcasts every
# value of the row to a common dtype, so integer month/weekday next to float
# min/max would give keys such as "1.0-0.0" that never match the "1-0" keys
# requested at prediction time (silently falling back to the global range).
dfp_tmp['key'] = [
    f"{month}-{weekday}"
    for month, weekday in zip(dfp_tmp['month'], dfp_tmp['weekday'])
]
dfp_tmp.set_index('key', drop=True, inplace=True)

# {"<month>-<weekday>": {'min': ..., 'max': ...}}
dict_knowledge = dfp_tmp[['min', 'max']].to_dict(orient='index')
# Fallback entry used when a (month, weekday) pair is absent from the train set
dict_knowledge['x-x'] = {'min' : y_train.min(), 'max' : y_train.max()}

def get_randomish_consumption(month, weekday, dict_knowledge):
    """Draw a random value inside the range known for a (month, weekday) pair.

    Parameters
    ----------
    month, weekday
        Values used to build the ``"<month>-<weekday>"`` lookup key.
    dict_knowledge : dict
        Maps a key to ``{'min': ..., 'max': ...}``. The ``'x-x'`` entry is
        used as a fallback when the key is unknown.

    Returns
    -------
    float
        A value drawn uniformly between the ``'min'`` and ``'max'`` bounds.

    Raises
    ------
    KeyError
        If the key is unknown and ``dict_knowledge`` has no ``'x-x'`` entry.
    """
    bounds = dict_knowledge.get(f"{month}-{weekday}")
    if bounds is None:
        bounds = dict_knowledge['x-x']
    # `random.uniform` is used instead of `random.randrange` because the latter
    # raises on an empty range (min == max, e.g. a group with a single
    # observation) and rejects non-integer bounds.
    return random.uniform(bounds['min'], bounds['max'])
   
# One draw per test row. Iterating column-wise (instead of `iterrows`) keeps the
# original dtypes of month/weekday, hence stable lookup keys.
predictions = [
    get_randomish_consumption(month, weekday, dict_knowledge)
    for month, weekday in zip(dfp_test['month'], dfp_test['weekday'])
]
# RMSE as sqrt(MSE): the `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
evaluation_metrics.append({'model' : 'randomish', 'rmse' : rmse})
print('RMSE on the test-set:', rmse)

In [None]:
%%time
# Third baseline: mljar-supervised AutoML with its default settings
automl = AutoML() # available modes: Explain, Perform, Compete
automl.fit(X_train, y_train)
predictions = automl.predict(X_test)
# RMSE as sqrt(MSE): the `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
evaluation_metrics.append({'model' : 'mljar-bm', 'rmse' : rmse})
print('RMSE on the test-set:', rmse)

In [None]:
def ptg_model(x, a, b, x0):
    """Evaluate a piecewise model: linear below ``x0``, constant from ``x0`` on.

    Parameters
    ----------
    x : scalar or array-like
        Point(s) at which the model is evaluated.
    a, b : float
        Slope and intercept of the linear part (``a * x + b`` for ``x < x0``).
    x0 : float
        Breakpoint; for ``x >= x0`` the model returns ``a * x0 + b``.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``x`` (0-d for a scalar input).
    """
    # `np.piecewise` returns an array of x's dtype: integer input would be
    # silently truncated, and a plain list cannot be compared with `x0`.
    x = np.asarray(x, dtype=float)
    plateau = a * x0 + b
    return np.piecewise(x, [x < x0, x >= x0], [lambda values: a * values + b, plateau])

def get_model_ptg(x, y, bounds=None):
    """Fit ``ptg_model`` on the observations and return its parameters.

    Parameters
    ----------
    x : array-like
        Explanatory values passed to ``ptg_model``.
    y : array-like
        Observed target values, one per element of ``x``.
    bounds : tuple of (lower, upper), optional
        Lower and upper limits for ``(a, b, x0)``, forwarded to
        ``scipy.optimize.curve_fit``. Defaults to
        ``([-200000, 1000000, 0], [-50000, 3000000, 20])``.

    Returns
    -------
    tuple
        The fitted ``(a, b, x0)``.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.optimize` is loaded on every SciPy version.
    from scipy.optimize import curve_fit

    if bounds is None:
        bounds = (
            [-200000, 1000000, 0],   # a_min, b_min, x0_min
            [-50000, 3000000, 20],   # a_max, b_max, x0_max
        )
    # The covariance estimate returned by curve_fit is not used here.
    popt, _ = curve_fit(ptg_model, x, y, bounds=bounds)
    a, b, x0 = popt
    return a, b, x0

# Piecewise baseline fitted on the `weighted_t2m` feature
a, b, x0 = get_model_ptg(X_train['weighted_t2m'].tolist(), y_train)
# Evaluate the model on the whole test column at once rather than row by row
predictions = ptg_model(dfp_test['weighted_t2m'].to_numpy(dtype=float), a, b, x0)
# RMSE as sqrt(MSE): the `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
evaluation_metrics.append({'model' : 'ptg-wt2m', 'rmse' : rmse})
print('RMSE on the test-set:', rmse)

In [None]:
# Piecewise baseline fitted on the `weighted_t2m_min` feature
a, b, x0 = get_model_ptg(X_train['weighted_t2m_min'].tolist(), y_train)
# Evaluate the model on the whole test column at once rather than row by row
predictions = ptg_model(dfp_test['weighted_t2m_min'].to_numpy(dtype=float), a, b, x0)
# RMSE as sqrt(MSE): the `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
evaluation_metrics.append({'model' : 'ptg-wt2mmin', 'rmse' : rmse})
print('RMSE on the test-set:', rmse)

In [None]:
# Piecewise baseline fitted on the `weighted_t2m_max` feature
a, b, x0 = get_model_ptg(X_train['weighted_t2m_max'].tolist(), y_train)
# Evaluate the model on the whole test column at once rather than row by row
predictions = ptg_model(dfp_test['weighted_t2m_max'].to_numpy(dtype=float), a, b, x0)
# RMSE as sqrt(MSE): the `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
evaluation_metrics.append({'model' : 'ptg-wt2mmax', 'rmse' : rmse})
print('RMSE on the test-set:', rmse)

In [None]:
# Gather the scores collected so far into a frame, one row per baseline
dfp_evaluation_metrics = pd.DataFrame(data=evaluation_metrics)

In [None]:
# Push the baseline comparison to Weights & Biases as a bar chart
wandb.init(project='french_electrical_consumption', entity='jmdaignan')

# One [model, rmse] pair per baseline, in the order of the metrics frame
data = dfp_evaluation_metrics[['model', 'rmse']].values.tolist()
table = wandb.Table(data=data, columns=["model", "rmse"])

bar_chart = wandb.plot.bar(table, "model", "rmse", title="Comparison baseline models")
wandb.log({"comparison_baseline" : bar_chart})