# Librairies nécessaires

In [1]:
import commentjson
import os
#os.chdir('/mnt/batch/tasks/shared/LS_root/mounts/clusters/pythonnb/code/Users/david.mouquet/modeling')


import missingno as msno
import sys
import importlib
import pandas as pd
import math
import datetime
import numpy as np

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf
from tensorflow.keras import layers


from sklearn.metrics import r2_score
from onnx.onnx_pb import StringStringEntryProto

sys.path.append("../SRC/")
import Utilitaires as utils
importlib.reload(utils)
import RapportModelisation as modelreport
import importFromUV as preprocdata
importlib.reload(preprocdata)
import keras_tuner as kt
from keras.layers import Input, Embedding, LSTM, Dense, concatenate

# Fonctions annexes

In [2]:
import matplotlib.pyplot as plt

def plot_loss(history):
  """Plot the training and validation loss curves of a fit history.

  Args:
    history: object whose ``history`` attribute is a mapping holding the
      ``'loss'`` and ``'val_loss'`` sequences to draw.
  """
  curves = history.history
  # Each curve is labelled with its own key so the legend matches the data.
  for key in ('loss', 'val_loss'):
    plt.plot(curves[key], label=key)
  plt.xlabel('Epoch')
  plt.ylabel('Error')
  plt.legend()
  plt.grid(True)


def plot_model_mesure(mesure,model):
  """Overlay a measured series and a modelled series on one Plotly line chart.

  Args:
    mesure: series-like object exposing ``.index``; drawn as the "Mesure" trace.
    model: series-like object exposing ``.index``; drawn as the "Model" trace.
  """
  import plotly.graph_objects as go

  fig = go.Figure()

  # Trace order is kept: measurement first, model second.
  for courbe, nom in ((mesure, "Mesure"), (model, "Model")):
    fig.add_trace(go.Scatter(x=courbe.index, y=courbe, mode='lines', name=nom))

  mise_en_page = dict(title='Comparaison Modele/Mesure',
                      width=800,
                      height=600,
                      xaxis_title='Date',
                      yaxis_title='IPE')
  fig.update_layout(**mise_en_page)

  fig.show()











# Lecture des données

In [None]:
file_model_param   = "models_param/DK-Cokerie.json"

model_id           = "B6_Conso_Energ_Gaz_Hour"
site               = "DK-Cokerie"

###############################################################

# Load every model definition from the parameter file, then keep the entry
# selected by model_id.
with open(file_model_param, encoding='utf-8') as file:
    dico_model_all = commentjson.load(file)

dico_model = dico_model_all[model_id]



#pkl_model_name      = "resu/models/"+model_id+"_"+ dico_model['type_model'] + "_" +dico_model['freq']+".pkl"

nom_model_registre  = dico_model['nom_model_registre']
freq                = dico_model['freq']
uv_mangling         = dico_model['mangling']
nom_data_store      = dico_model['data_store']

# Root folder of the exported ONNX models. The historical location is a
# user-specific absolute path, so it can now be overridden with the ONNX_REP
# environment variable; when the variable is unset the original path is used.
onnx_rep = os.environ.get(
    "ONNX_REP",
    "C:/Users/33623/Dropbox (Ultiwatt)/D - ULTIVISION INDUSTRIES/4-ARCELORMITTAL/2- DEPLOIEMENT AMF/2-Projet LOT 2/10-Models/onnx",
)
# Forward slashes are kept on purpose so the resulting string is unchanged.
onnx_model_name     = f"{onnx_rep}/{site}/{uv_mangling}.{model_id}.onnx"


# The parameter file stores the reference period as 'dd/mm/YYYY HH:MM:SS';
# both bounds are converted to ISO-8601 strings before being handed to the loader.
ref_periode_debut, ref_periode_fin = (
    datetime.datetime.strptime(dico_model[cle], '%d/%m/%Y %H:%M:%S').isoformat()
    for cle in ('ref_periode_debut', 'ref_periode_fin')
)


# Unused features are loaded as well (load_unused_feature=True); they are
# filtered out further down in the notebook.
data, clean_report = preprocdata.Charger_Preparer_Data(
    ref_periode_debut=ref_periode_debut,
    ref_periode_fin=ref_periode_fin,
    ipe_tag=dico_model["tag_modelise"],
    dico_du_model=dico_model,
    use_seuil_min=True,
    use_seuil_max=True,
    clean_data=False,
    concat_after=True,
    load_unused_feature=True,
    zscore=3,
)

## Suppression des périodes avec données aberrantes

In [12]:
# Rows whose timestamp falls inside this closed interval are removed from data.
periode_ab = ["2019-06-01 06:00:00","2019-06-30 06:00:00"]
debut_ab, fin_ab = periode_ab
dans_periode = (data.index >= debut_ab) & (data.index <= fin_ab)
index_list = data.index[dans_periode].tolist()
data.drop(index=index_list, inplace=True)

## Recherche valeurs manquantes dans les colonnes

### Pourcentage de valeurs manquantes

In [None]:
def num_missing(x):
    """Return the percentage of missing values in ``x``, rounded to 0.1.

    Args:
        x: pandas Series (any object exposing ``isnull()`` and ``len()``
           whose ``isnull().sum()`` is a scalar).

    Returns:
        The share of null entries as a percentage rounded to one decimal,
        or NaN when ``x`` is empty (the ratio is undefined in that case).
    """
    n_total = len(x)
    if n_total == 0:
        # Avoid a ZeroDivisionError on empty columns.
        return float("nan")
    # Vectorised count instead of a Python-level sum over every element.
    n_missing = x.isnull().sum()
    return round(100 * n_missing / n_total, 1)

# Column-wise report: DataFrame.apply works along axis 0 by default, so
# num_missing receives one column at a time.
print("Valeurs manquantes par colonne:")
df_missing = data.apply(num_missing)
print(df_missing)

## Histogrammes

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Units in the order [modelled tag] + every factor of the model dictionary,
# used or not. An earlier assignment restricted to used factors was
# immediately overwritten by this one, so it has been dropped.
# NOTE(review): the pairing below assumes the numeric columns of data follow
# that same order - confirm against the loader.
list_unit = [dico_model['tag_unit']] + [f['unit'] for f in dico_model['facteurs'].values()]


var_numerique = [v for v in data.columns if data[v].dtypes != 'object']

# Subplot titles: "<column> <unit>". zip stops at the shorter list, so columns
# without a matching unit get no title.
var_numerique_unit = [v+' '+u for v,u in zip(var_numerique, list_unit)]

data_num = data[var_numerique]

n_var_num = len(var_numerique)
# Two histograms per row; with an odd count the last row only fills column 1.
nrows = (n_var_num + 1) // 2

fig = make_subplots(rows=nrows, cols=2,subplot_titles=tuple(var_numerique_unit))

# Fill the grid row by row, left to right (Plotly rows/cols are 1-based).
for position, variable in enumerate(var_numerique):
    fig.add_trace(go.Histogram(x=data_num[variable]),
                  row=position // 2 + 1, col=position % 2 + 1)


fig.update_annotations(font_size=12)
fig.update_layout(
    title_text="Histogramme des variables numériques",
    autosize=False,
    width=1000,
    height=nrows*300)

#for i, unit in enumerate(list_unit): 
#    fig['layout']['xaxis{}'.format(i+1)]['title']=unit



fig.show()





## Box plots

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Only referenced by the commented-out axis-title snippet at the end of this cell.
list_unit = [dico_model['tag_unit']] + [f['unit'] for f in dico_model['facteurs'].values() if f['used']]

var_numerique = [v for v in data.columns if data[v].dtypes != 'object']
data_num = data[var_numerique]

n_var_num = len(var_numerique)
# Two box plots per row; with an odd count the last row only fills column 1.
nrows = (n_var_num + 1) // 2

fig = make_subplots(rows=nrows, cols=2,subplot_titles=tuple(var_numerique))

# Fill the grid row by row, left to right (Plotly rows/cols are 1-based).
for position, variable in enumerate(var_numerique):
    fig.add_trace(
        go.Box(y=data_num[variable], boxpoints='outliers'),  # only outliers shown as points
        row=position // 2 + 1,
        col=position % 2 + 1,
    )


fig.update_annotations(font_size=12)
fig.update_layout(
    title_text="Box plot des variables numériques",
    autosize=False,
    width=1000,
    height=nrows*300)


#for i, unit in enumerate(list_unit): 
#    fig['layout']['yaxis{}'.format(i+1)]['title']=unit
#    fig['layout']['xaxis{}'.format(i+1)]['title']=''


fig.show()

# Coefficient de corrélation

In [6]:
# Correlation coefficients computed by the project helper; presumably each
# factor against the modelled tag - confirm in Utilitaires.Compute_Corr_Coef.
df_num_corr = utils.Compute_Corr_Coef(
    data=data,
    dico_model=dico_model,
)

## Suppression des features non utilisées

In [7]:
# The parameter file is read again here, presumably so that edits to the
# 'used' flags are picked up without re-running the loading cell.
with open(file_model_param, encoding='utf-8') as file:
    dico_model_all = commentjson.load(file)

dico_model = dico_model_all[model_id]

# Target column first, followed by the factors flagged as used.
features_kept = [facteur['nom'] for facteur in dico_model['facteurs'].values() if facteur['used']]
colonnes_gardees = [dico_model['tag_name'], *features_kept]

data = data[colonnes_gardees]

# Preprocessing

### Découpage des données en train et test

In [8]:
# Rows containing any missing value are discarded before splitting.
data = data.dropna()

# Reproducible 80/20 random split: the test set is every row not sampled
# into the training set.
train_dataset = data.sample(frac=0.8, random_state=0)
test_dataset = data.drop(index=train_dataset.index)

# Work on copies so popping the target leaves train_dataset/test_dataset intact.
cible = dico_model["tag_name"]
train_features, test_features = train_dataset.copy(), test_dataset.copy()

train_labels = train_features.pop(cible)
test_labels = test_features.pop(cible)

### Normalisation des facteurs numériques

In [9]:
# One scalar Keras input per feature column: object-typed columns become
# tf.string inputs, every other column a float32 input.
inputs = [
  tf.keras.Input(
      shape=(1,),
      name=name,
      dtype=tf.string if column.dtype == object else tf.float32,
  )
  for name, column in train_features.items()
]


In [10]:
# Index the float32 inputs by name; string inputs are handled in the next cell.
numeric_inputs = {
    tensor.name: tensor
    for tensor in inputs
    if tensor.dtype == tf.float32
}

# Concatenate the numeric inputs and normalise them with a layer adapted on
# the training features only.
x = layers.Concatenate()(list(numeric_inputs.values()))
norm = layers.Normalization()
norm.adapt(np.array(train_features[list(numeric_inputs)]))
all_numeric_inputs = norm(x)

# First entry of the list of preprocessed blocks fed to the model.
preprocessed_inputs = [all_numeric_inputs]


In [None]:
# Encode every non-numeric (string) input: map each string to an integer
# index with a vocabulary taken from the training data, then expand that index
# into an indicator vector.
# NOTE(review): this cell appends to preprocessed_inputs, so running it twice
# duplicates the categorical blocks - re-run the previous cell first.
# The loop variable is no longer called `input`, which shadowed the builtin
# for the rest of the kernel session.
for cat_input in inputs:
  if cat_input.dtype == tf.float32:
    continue

  lookup = layers.StringLookup(vocabulary=np.unique(train_features[cat_input.name]))
  # vocabulary_size()/num_tokens replace the deprecated vocab_size()/max_tokens.
  one_hot = layers.CategoryEncoding(num_tokens=lookup.vocabulary_size())

  x = lookup(cat_input)
  x = one_hot(x)
  preprocessed_inputs.append(x)

In [12]:
# Display the preprocessed blocks: the normalised numeric tensor first, then
# one encoded tensor per string input.
preprocessed_inputs

[<KerasTensor: shape=(None, 8) dtype=float32 (created by layer 'normalization')>,
 <KerasTensor: shape=(None, 3) dtype=float32 (created by layer 'category_encoding')>]

# Modèle

In [15]:
# Baseline network: all preprocessed blocks concatenated, one hidden layer of
# 64 ReLU units, and a single-unit output layer named 'target'.
preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)
first_layer = layers.Dense(64, activation='relu')(preprocessed_inputs_cat)
terminal_layer = layers.Dense(1, name='target')(first_layer)

main_model = tf.keras.models.Model(
    inputs=inputs,
    outputs=terminal_layer,
    name='model',
)

# Trained on mean absolute error with Adam at a learning rate of 0.001.
main_model.compile(
    loss='mean_absolute_error',
    optimizer=tf.keras.optimizers.Adam(0.001),
)


In [None]:
# Print the layer-by-layer summary of the baseline model.
main_model.summary()

In [18]:
# The model has one named input per column, so each split is passed as a
# {column name: numpy array} mapping.
features_dict, features_dict_test = (
    {name: np.array(column) for name, column in frame.items()}
    for frame in (train_features, test_features)
)

In [None]:
# NOTE(review): the test split is also used as the validation set here.
history = main_model.fit(
    x=features_dict,
    y=train_labels,
    epochs=600,
    validation_data=(features_dict_test, test_labels),
)

## Analyse des résultats

In [None]:
# Plot the loss and val_loss curves recorded by the fit above.
plot_loss(history)

In [None]:


# Predict on both splits and re-attach each split's index to the flattened output.
df_train_predictions = pd.Series(
    main_model.predict(features_dict).flatten(),
    index=train_features.index,
)
df_test_predictions = pd.Series(
    main_model.predict(features_dict_test).flatten(),
    index=test_features.index,
)

# Rebuild the full prediction and measurement series, sorted by index.
df_pred = pd.concat([df_train_predictions, df_test_predictions]).sort_index()
df_mesure = pd.concat([train_labels, test_labels]).sort_index()


# R2 on the training split, then on the test split.
r2_train = r2_score(train_labels.values, df_train_predictions)
r2_test = r2_score(test_labels.values, df_test_predictions)
print(r2_train, r2_test)


plot_model_mesure(df_mesure,df_pred)

## Réglage Hyperparamètres

In [20]:
def model_builder(hp):
  """Build and compile one candidate network for the hyperparameter search.

  The network reuses the notebook-level ``inputs`` and ``preprocessed_inputs``
  and has a single hidden layer followed by a one-unit output named 'target'.

  Args:
    hp: hyperparameter container providing ``Int`` and ``Choice``; it is asked
      for 'units' (32 to 64, step 10), 'learning_rate' (1e-2, 1e-3 or 1e-4)
      and 'activation' ("relu" or "tanh"), in that order.

  Returns:
    The compiled model, using mean squared error as both loss and metric.
  """
  n_units = hp.Int('units', min_value=32, max_value=64, step=10)
  lr = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
  activation = hp.Choice("activation", ["relu", "tanh"])

  merged = layers.Concatenate(name="concate_input")(preprocessed_inputs)
  hidden = layers.Dense(units=n_units, activation=activation, name="first_layer")(merged)
  output = layers.Dense(1, name='target')(hidden)

  model = tf.keras.models.Model(inputs=inputs, outputs=output)
  model.compile(
      loss="mean_squared_error",
      metrics=["mean_squared_error"],
      optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
  )
  return model

In [None]:
import shutil

# Remove the tuner working directory left by a previous run.
# On a fresh run the directory does not exist yet; that case is not an error,
# so only FileNotFoundError is ignored (other failures still surface).
try:
    shutil.rmtree('my_dir')
except FileNotFoundError:
    pass

In [None]:
# Hyperband search over model_builder's hyperparameters, driven by the
# validation mean squared error; trial results are stored under my_dir/intro_to_kt.
tuner = kt.Hyperband(
    model_builder,
    objective="val_mean_squared_error",
    max_epochs=400,
    factor=3,
    directory='my_dir',
    project_name='intro_to_kt',
)

# Early-stopping callback monitoring val_loss with a patience of 5 epochs.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# NOTE(review): the test split is used as validation data for the search.
tuner.search(
    x=features_dict,
    y=train_labels,
    epochs=400,
    validation_data=(features_dict_test, test_labels),
    callbacks=[stop_early],
)

In [None]:
# Print the tuner's summary of the search results.
tuner.results_summary()

In [None]:
# First model returned by the tuner's get_best_models().
best_model = tuner.get_best_models()[0]

# Predict on both splits and re-attach each split's index to the flattened output.
df_train_predictions = pd.Series(
    best_model.predict(features_dict).flatten(),
    index=train_features.index,
)
df_test_predictions = pd.Series(
    best_model.predict(features_dict_test).flatten(),
    index=test_features.index,
)

# Rebuild the full prediction and measurement series, sorted by index.
df_pred = pd.concat([df_train_predictions, df_test_predictions]).sort_index()
df_mesure = pd.concat([train_labels, test_labels]).sort_index()


# R2 on the training split, then on the test split.
r2_train = r2_score(train_labels.values, df_train_predictions)
r2_test = r2_score(test_labels.values, df_test_predictions)
print(r2_train, r2_test)


plot_model_mesure(df_mesure,df_pred)

# Export vers ONNX

In [28]:
# Display the correlation coefficients computed earlier; they are passed to the model report below.
df_num_corr

{'Humidite_PAC': -0.0447015855944182,
 'Perc_Matiere_volatile_PAC': -0.1135628220447223,
 'Cokerie_Production_de_Coke': 0.8907968874530585,
 'Taux_Cendre': 0.14641037781061178,
 'Temps_Cuisson': -0.8386313651090204}

In [None]:
importlib.reload(modelreport)

model_type = "réseau de neuronnes"

# The reported reference period is the first and last index value of the
# data actually used for modelling.
format_date = '%Y-%m-%d %H:%M:%S'
periode_debut, periode_fin = (
    datetime.datetime.strftime(data.index[position], format_date)
    for position in (0, -1)
)

modelreport_json = modelreport.BuildModelReport(
    model_type=model_type,
    ref_periode_debut=periode_debut,
    ref_periode_fin=periode_fin,
    clean_report=clean_report,
    description='',
    test_data_set=test_dataset,
    train_data_set=train_dataset,
    fitted_model=best_model,
    df_num_corr=df_num_corr,
    dico_model=dico_model,
    data=data,
)

In [32]:
import tf2onnx

# Convert the tuned Keras model to ONNX; the second returned value is not used here.
(model_onnx, storage) = tf2onnx.convert.from_keras(best_model)
# Embed the JSON model report in the ONNX metadata under the "ReportModel" key.
model_onnx.metadata_props.append(StringStringEntryProto(key="ReportModel", value = modelreport_json))

# Make sure the output folder exists so the write below does not fail for a
# site that has never been exported. The guard skips the empty dirname of a
# bare file name.
onnx_dir = os.path.dirname(onnx_model_name)
if onnx_dir:
    os.makedirs(onnx_dir, exist_ok=True)

with open(onnx_model_name, "wb") as f:
    f.write(model_onnx.SerializeToString())

In [33]:
# Display the path the ONNX model was written to.
onnx_model_name

'C:/Users/33623/Dropbox (Ultiwatt)/D - ULTIVISION INDUSTRIES/4-ARCELORMITTAL/2- DEPLOIEMENT AMF/2-Projet LOT 2/10-Models/onnx/DK-Cokerie/Tech.DK.CK.CK_B6_B7.Cokerie_Conso_Energ_Gaz_Inv.onnx'

In [34]:
import json

# Parse the serialised model report back into Python objects for inspection.
json_rep = json.loads(modelreport_json)

In [35]:
# Display the 'uv_formula' entry of the parsed model report.
json_rep['uv_formula']

{'formula': '[model]  .Arg("Humidite_PAC", [tag_33643], 7.467802861111114, 12.0081104109589) .Arg("Perc_Matiere_volatile_PAC", [tag_33644], 22.106071805555555, 24.78209244755245) .Arg("Cokerie_Production_de_Coke", [tag_33562], 1823.76, 4344.84) .Arg("Taux_Cendre", [tag_34371], 7.4, 11.1) .Arg("Temps_Cuisson", [tag_37912], 19.239522988505744, 38.66184444444445) .Outputs("variable_out1")'}