<a href="https://colab.research.google.com/github/jfvandem/dev_test_days/blob/master/intro_deep_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dev test days 2019 - Meylan

basé sur https://www.kaggle.com/c/ashrae-energy-prediction/data

https://www.tensorflow.org/tutorials/keras/regression
https://www.tensorflow.org/tutorials/structured_data/time_series
https://www.tensorflow.org/tutorials/structured_data/time_series#part_2_forecast_a_multivariate_time_series


# Clone github

In [0]:
# Remove any previous checkout so the clone below always starts from a clean directory.
!rm -rf dev_test_days
# Fetch the workshop repository into ./dev_test_days.
!git clone https://github.com/jfvandem/dev_test_days

Cloning into 'dev_test_days'...
remote: Enumerating objects: 66, done.[K
remote: Counting objects: 100% (66/66), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 148 (delta 35), reused 0 (delta 0), pack-reused 82[K
Receiving objects: 100% (148/148), 131.91 MiB | 44.39 MiB/s, done.
Resolving deltas: 100% (75/75), done.


In [0]:
# Request TensorFlow 2.x through the Colab line magic; any exception raised by
# the magic is deliberately ignored (best effort).
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

# Import

In [0]:
import gc
import glob
import io
import itertools
import joblib
import math
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import os
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import re
import requests
import seaborn as sns
import tensorflow as tf
import time
import warnings
import zipfile

from datetime import datetime,timedelta
from dateutil.relativedelta import relativedelta
from math import sqrt
from matplotlib.collections import PatchCollection
from nltk.tokenize import word_tokenize
from os import path

from scipy import stats
from scipy.stats import skew

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

from tensorflow import keras
from tensorflow import feature_column
from tensorflow.keras import layers

warnings.filterwarnings("ignore")

In [0]:
# Show which TensorFlow version is actually in use.
print(tf.__version__)

In [0]:
# Single seed shared by every random number generator used in the notebook.
RANDOM_SEED = 42

# Seed NumPy as well as TensorFlow: scikit-learn and pandas fall back on the
# global NumPy RNG, so seeding TensorFlow alone does not cover them.
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [0]:
# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Chargement des données

In [0]:
# List the contents of the cloned repository.
!ls dev_test_days

In [0]:
# Directory (relative to the working directory) holding the dataset files of the cloned repository.
data_dir='dev_test_days/dataset/'

In [0]:
# Load the zipped, pickled training DataFrame.
# NOTE(review): read_pickle deserialises a pickle, which can execute arbitrary
# code; only load files that come from a trusted source.
df = pd.read_pickle(data_dir+'train_full_df.pkl.zip',compression='zip')
df.shape
df.head()

In [0]:
# Load the saved label encoder and show the classes it knows.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code;
# only load files that come from a trusted source.
le=joblib.load(data_dir+'label_encoder.pkl')
le.classes_

In [0]:
# Map the encoded primary_use values back to their original labels with the loaded encoder.
df['primary_use_label']=le.inverse_transform(df['primary_use'])

In [0]:
# Number of columns, then their names.
len(df.columns)
df.columns

# Échantillonnage
Nous sélectionnons les bâtiments pour que l'apprentissage du modèle soit réalisé dans un temps raisonnable. Nous conservons uniquement les mesures réalisées sur le compteur d'électricité.

In [0]:
# Distinct primary-use labels present in the data.
df['primary_use_label'].drop_duplicates()

In [0]:
df.shape
# Keep office buildings, the electricity meter (code 0), non-zero readings,
# and rows whose primary_use is known.
is_office = df['primary_use_label'] == 'Office'
is_electricity = df['meter'] == 0
has_reading = df['meter_reading'] != 0
has_primary_use = df['primary_use'].notna()
df = df[is_office & is_electricity & has_reading & has_primary_use]
df.shape

In [0]:
# Preview the filtered rows.
df.head()

In [0]:
# Rank buildings by their number of non-null readings.
readings_per_building = df.groupby('building_id')['meter_reading'].count()
# NOTE: despite the variable name, head(1) keeps only the single building
# with the most readings.
top5_building_id = (
    readings_per_building
    .sort_values(ascending=False)
    .reset_index()
    .head(1)['building_id']
)
top5_building_id

In [0]:
df.shape
# Restrict the data to the selected building id(s).
in_selection = df['building_id'].isin(top5_building_id)
df = df[in_selection]
df.shape

In [0]:
# Row count per meter code remaining after filtering.
df['meter'].value_counts()

In [0]:
# First and last timestamps of the retained readings.
timestamps = df['timestamp']
print(timestamps.min())
print(timestamps.max())

In [0]:
# Frequency pandas infers from the distinct timestamps.
mean_by_timestamp = df.groupby('timestamp')['meter_reading'].mean()
print('fréquence des mesures : {}'.format(mean_by_timestamp.index.inferred_freq))

# Description des colonnes


* air_temperature : température de l'air (degré Celsius)
* building_id : foreign key pour les données sur les immeubles (building_metadata)
* cloud_coverage : couverture nuageuse (oktas)
* dew_temperature : température de rosée (degré Celsius)
* floor_count : nombre d'étages dans l'immeuble
* meter : code du type de compteur {0: electricity, 1: chilledwater, 2: steam, 3: hotwater}. Tous les immeubles n'ont pas tous les types de compteur
* meter_reading : Consommation d'énergie en kWh (avec erreurs de mesure possibles) (la "**target**")
* precip_depth_1_hr : hauteur de précipitation en 1h (millimètre)
* primary_use : indicateur de l'activité des occupants dans l'immeuble selon la définition EnergyStar
* sea_level_pressure : pression au niveau de la mer (Millibar/hectopascals)
* site_id : foreign key pour les données météo
* square_feet : surface au plancher de l'immeuble (pied carré)
* timestamp : date à laquelle la mesure a été prise. Le timestamp a été "éclaté" en plusieurs champs (heure, jour, année, jour dans la semaine...)
* wind_direction : direction du vent (0 à 360 degré)
* wind_speed : vitesse du vent (m/s)
* year_built : année de construction de l'immeuble



In [0]:
# Missing values per column (imputation was done during preprocessing, with -1 or the mean)
df.isna().sum()

# Data Visualisation

In [0]:

# Mean meter reading per building, resampled to daily means.
d = (
    df.set_index('timestamp')
      .groupby(['building_id'])
      .resample('D')['meter_reading']
      .mean()
      .reset_index()
)

plt.figure(figsize=(10,5))
ax = sns.lineplot(x='timestamp', y='meter_reading', data=d,
                  style='building_id', palette='ch:2.5,.25')
ax.set_title('consommation d\'électricité moyenne quotidienne par batiment')

# Slanted, right-aligned tick labels on the x axis.
plt.xticks(rotation=45, horizontalalignment='right',
           fontweight='light', fontsize='x-large');



# Features

In [0]:
# Model inputs, grouped by origin; the order defines the column order of the
# feature matrix.
feature_cols = (
    # building metadata
    ['primary_use', 'square_feet', 'year_built', 'floor_count']
    # weather measurements
    + ['air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed']
    # calendar fields derived from the timestamp
    + ['month_datetime', 'weekofyear_datetime', 'dayofyear_datetime',
       'hour_datetime', 'day_week', 'day_month_datetime',
       'week_month_datetime']
)
# Column the model learns to predict.
target_col = 'meter_reading'

# Train / test split

In [0]:
# Chronological split: months 1-10 for training, November-December for testing.
# Take explicit copies so that columns added to either partition later on
# are written to an independent frame rather than to a slice of df.
month = df['timestamp'].dt.month
train_df = df[month < 11].copy()
test_df = df[month >= 11].copy()

train_df.shape
test_df.shape

# Normalisation des données

cf. https://jovianlin.io/why-is-normalization-important-in-neural-networks/

In [0]:
# Summary statistics of the training features, one row per feature.
train_stats = train_df[feature_cols].describe().T
train_stats

In [0]:

# Feature matrix and target for each partition.
X_train = train_df[feature_cols]
y_train = train_df[target_col]

X_test = test_df[feature_cols]
y_test = test_df[target_col]

# Standardise each feature (column) to zero mean / unit variance.
# preprocessing.Normalizer is not suitable here: it rescales every *row* to
# unit norm and learns nothing in fit(), so large-valued columns swamp the
# others. StandardScaler learns per-column statistics on the training set
# only, and those same statistics are then applied to the test set.
normalizer = preprocessing.StandardScaler()

X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.transform(X_test)


In [0]:
# Preview the raw (untransformed) training features.
X_train.head()

In [0]:
# Preview the transformed training features (wrapped in a DataFrame for display).
pd.DataFrame(X_train_norm).head()

In [0]:
# Summary statistics of the transformed training features, one row per column.
pd.DataFrame(X_train_norm).describe().transpose()

À noter : la même normalisation devra être appliquée à toute nouvelle donnée soumise au modèle par la suite.

# Apprentissage

In [0]:
def build_model(feature_keys_len, learning_rate=0.01):
  """Build and compile a small fully connected regression network.

  Args:
    feature_keys_len: number of input features (width of the input layer).
    learning_rate: learning rate given to the RMSprop optimizer. Defaults to
      0.01, the value the optimizer was configured with originally.

  Returns:
    A compiled keras.Sequential model with two 64-unit ReLU hidden layers and
    a single linear output, trained on mean squared error and reporting mean
    absolute error and mean squared error as metrics.
  """
  model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=[feature_keys_len]),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
  ])
  optimizer = keras.optimizers.RMSprop(learning_rate)
  # Pass the configured optimizer instance: the string 'rmsprop' would build a
  # default RMSprop and silently ignore the learning rate chosen above.
  model.compile(loss='mean_squared_error',
                optimizer=optimizer,
                metrics=['mean_absolute_error', 'mean_squared_error'])

  return model

# One input unit per feature column.
model = build_model(len(feature_cols))

In [0]:
# Print the layer-by-layer summary of the model.
model.summary()

In [0]:
BATCH_SIZE = 16
EPOCHS = 200  # upper bound; early stopping usually ends training sooner

class PrintDot(keras.callbacks.Callback):
  """Show training progress: one dot per epoch, a '|' line every 50 epochs."""

  def on_epoch_end(self, epoch, logs=None):
    if epoch % 50 == 0: print('|')
    print('.', end='')

# Stop once the *validation* loss has not improved for 10 epochs. Monitoring
# a training metric would ignore the held-out split created by
# validation_split below and would not detect over-fitting.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

history = model.fit(
  X_train_norm, y_train,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=0.2,  # hold out part of the training data for validation
  verbose=0,
  callbacks=[early_stop, PrintDot()])


In [0]:
# Per-epoch training log as a DataFrame, with the epoch number as a column.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

In [0]:
def plot_history(history):
  """Plot training and validation curves for MAE and MSE, one figure each.

  Args:
    history: the History object returned by model.fit; its `history` dict
      must hold the training metrics and their 'val_'-prefixed counterparts.

  Raises:
    KeyError: if neither the short nor the long name of a metric is present.
  """
  hist = pd.DataFrame(history.history)
  hist['epoch'] = history.epoch

  def metric_column(*candidates):
    # Keras names the history column after the string given to compile(), so
    # the same metric may appear as 'mae' or as 'mean_absolute_error'.
    for name in candidates:
      if name in hist.columns:
        return name
    raise KeyError('none of {} found in history (available: {})'.format(
        candidates, list(hist.columns)))

  panels = [
    ('Mean Abs Error', metric_column('mae', 'mean_absolute_error')),
    ('Mean Square Error', metric_column('mse', 'mean_squared_error')),
  ]

  for ylabel, metric in panels:
    plt.figure()
    plt.xlabel('Epoch')
    plt.ylabel(ylabel)
    plt.plot(hist['epoch'], hist[metric],
             label='Train Error')
    plt.plot(hist['epoch'], hist['val_' + metric],
             label='Val Error')
    plt.legend()
  plt.show()


plot_history(history)

In [0]:
# evaluate() returns the loss followed by the compiled metrics, in order.
test_metrics = model.evaluate(X_test_norm, y_test, verbose=2)
loss, mae, mse = test_metrics

print(f"Testing set Mean Abs Error: {mae:5.2f}")


# Prédiction

In [0]:
# Predict on the transformed test features and flatten the output to a 1-D array.
y_pred = model.predict(X_test_norm).flatten()

In [0]:
# Attach the predictions as a new column. assign() returns a new frame, which
# avoids writing into what may be a slice of df (chained assignment) and keeps
# the cell safe to re-run.
test_df = test_df.assign(pred=y_pred)

In [0]:
# Building ids present in the test set.
test_df['building_id'].drop_duplicates()

In [0]:
# Time-indexed copy of the test set, used for plotting.
data = test_df.set_index('timestamp')

building_list = test_df['building_id'].unique().tolist()

# One figure per building: actual readings against predictions over time.
for b in building_list:
  plt.figure(figsize=(15,5))
  building_rows = data.loc[data['building_id'] == b, [target_col, 'pred']]
  sns.lineplot(data=building_rows).set_title(b)

