# Import

In [1]:
import pandas as pd
import numpy as np
from tensorflow import keras
from sklearn.model_selection import train_test_split

# Exploration

In [2]:
df = pd.read_excel(io="data/historique_interventions_IA.xlsx", sheet_name="liste inter CATEGORISEES LIGHT")

In [3]:
df.info()
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599 entries, 0 to 598
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Description    599 non-null    object
 1   Activity text  599 non-null    object
 2   S-ens          598 non-null    object
 3   CAT            256 non-null    object
dtypes: object(4)
memory usage: 18.8+ KB


Unnamed: 0,Description,Activity text,S-ens,CAT
0,Surcharge convoyeur TX124,Surcharge convoyeur TX124,convoyeur principal,SURCHARGE
1,casse entree etoile entrée.,investigation,etoile entrée,
2,Mauvais bouchonnage,Mauvais bouchonnage,poste bouchonnage,BOUCHONNAGE
3,presence bavure sur un format,bavure,format,
4,Mauvais bouchonnage,Mauvais bouchonnage,poste bouchonnage,BOUCHONNAGE
5,pas d amener bouchon suite doublons,controle et reglage,poste bouchonnage,ACHEMINEMENT BOUCHONS
6,PB TREMIE BOUCHON,PB TREMIE BOUCHON,tremie,
7,PB CELLULE FLACON COUCHE,PB CELLULE,cellules,FLACONS COUCHES
8,instabilité balance IPC Gross,Investigation,IPC,BALANCE
9,Mauvaise detection presence bouchon,Mauvaise detection presence bouchon,poste bouchonnage,DETECTION BOUCHON


In [4]:
df.columns

Index(['Description', 'Activity text', 'S-ens', 'CAT'], dtype='object')

In [5]:
df.columns = [title.lower().replace(" ", "_") for title in df.columns]
df.columns

Index(['description', 'activity_text', 's-ens', 'cat'], dtype='object')

In [6]:
df['s-ens'].unique()

array(['convoyeur principal ', 'etoile entrée', 'poste bouchonnage',
       'format', 'tremie ', 'cellules ', 'IPC', 'HMI', 'bol bouchon',
       'table tournante', 'communication', 'vis sans fin', 'climet',
       'poste remplissage', 'roue entrée', 'ipc', 'pesée',
       'servo commande', 'hmi', 'autom', 'barre ionisante',
       'convoyeur entrée', 'tiroir', 'jog', 'tapis entrée',
       'roue sortie ', 'porte', 'soufflet', 'bi', 'courroie', 'EV',
       'CUVE BOUCHONS', nan, 'BI'], dtype=object)

In [7]:
df['s-ens_cleaned'] = df['s-ens'].str.lower()
df = df[['description','s-ens_cleaned']]
df.dropna(inplace=True)
df.columns = ['description', "labels"]

In [8]:
count = df.groupby('labels').count().reset_index()


nb_min_to_keep = 10 

result = count[count['description'] > nb_min_to_keep]
result['labels']
nb_targets = result['labels'].nunique()
display(count)

Unnamed: 0,labels,description
0,autom,13
1,barre ionisante,3
2,bi,5
3,bol bouchon,25
4,cellules,61
5,climet,6
6,communication,18
7,convoyeur entrée,9
8,convoyeur principal,30
9,courroie,1


In [9]:
# mask_divers = df['labels'].isin(result['labels'])
df_cleaned = df[df['labels'].isin(result['labels'])]

# Modelisation

In [10]:
from sklearn.preprocessing import LabelEncoder

# Création de l'encodeur
le = LabelEncoder()

# Entraînement de l'encodeur et transformation des labels
df_cleaned['labels_encoded'] = le.fit_transform(df_cleaned['labels'])

y = df_cleaned['labels_encoded']

X_train_nlp, X_test_nlp, y_train_nlp, y_test_nlp = train_test_split(np.array(df_cleaned['description']), y,
                                                   test_size=0.2,
                                                   random_state = 0,
                                                   stratify = y,
                                                   shuffle=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['labels_encoded'] = le.fit_transform(df_cleaned['labels'])


In [11]:
y

0       4
2       7
4       7
5       7
6      12
       ..
592     5
593     5
594     5
596     7
597     7
Name: labels_encoded, Length: 521, dtype: int64

In [12]:
df['labels'].isin(result['labels'])

0       True
1      False
2       True
3      False
4       True
       ...  
594     True
595    False
596     True
597     True
598    False
Name: labels, Length: 598, dtype: bool

In [13]:
import tensorflow as tf
from transformers import TFCamembertModel, CamembertTokenizer, CamembertConfig,TFBertModel
from tensorflow.keras.layers import Input, Dense 
from tensorflow.keras.models import Model

# 1. BERT from Hugging Face
tokenizer = CamembertTokenizer.from_pretrained("jplu/tf-camembert-base")
bert_model = TFCamembertModel.from_pretrained("jplu/tf-camembert-base")
for layer in bert_model.layers:
    layer.trainable = False

max_seq_length = 32

text_input = Input(shape=(max_seq_length,), dtype=tf.int32, name="text_input")
# attention_mask = Input(shape=(max_seq_length,), name='attention_mask_cam', dtype='int32')
embedding_layer = bert_model(text_input)[0]
bert_output = embedding_layer[:, 0, :]
output = Dense(nb_targets, activation="softmax")(bert_output)

model = Model(inputs=text_input, outputs=output)

model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])


def encode_texts(texts, tokenizer, max_seq_length):
    input_ids = []
    for text in texts:
        encoded = tokenizer(text, return_tensors="tf", truncation=True, padding="max_length", max_length=max_seq_length)
        input_ids.append(encoded["input_ids"][0])
    return np.array(input_ids)

X_train_nlp_encoded = encode_texts(X_train_nlp, tokenizer, max_seq_length=max_seq_length)
X_test_nlp_encoded = encode_texts(X_test_nlp, tokenizer, max_seq_length=max_seq_length)

  from .autonotebook import tqdm as notebook_tqdm
Some layers from the model checkpoint at jplu/tf-camembert-base were not used when initializing TFCamembertModel: ['lm_head']
- This IS expected if you are initializing TFCamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFCamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFCamembertModel were initialized from the model checkpoint at jplu/tf-camembert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCamembertModel for predictions without further training.


In [14]:
early = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [15]:
X_train_nlp.shape

(416,)

In [16]:
y_train_nlp.shape

(416,)

In [17]:
# Entraînement du modèle
history_db = model.fit(
	x=X_train_nlp_encoded, y=y_train_nlp,  batch_size=32, epochs=1, verbose='auto',
    callbacks=[early], validation_split=0.1, shuffle=True,
    class_weight=None, sample_weight=None, initial_epoch=0, steps_per_epoch=None,
    validation_steps=None, validation_batch_size=32, validation_freq=1, max_queue_size=10, workers=1, use_multiprocessing=False)



In [18]:
test1 = 'defaut de com ipc'
test2 = 'blabla hmi cassé'

In [19]:
to_test = encode_texts([test1, test2], tokenizer, max_seq_length=max_seq_length)
to_test.shape
proba = model.predict(to_test)
indexes = np.argmax(proba, axis=1)



In [20]:
le.inverse_transform(indexes)

array(['hmi', 'hmi'], dtype=object)

In [21]:
model.save_weights('weights')

In [22]:
model = keras.models.load_model('test.keras')

OSError: No file or directory found at test.keras

In [None]:
model_test = tf.saved_model.load('trained_model')



In [None]:
model_test.predict(to_test)
indexes = np.argmax(proba, axis=1)
le.inverse_transform(indexes)

AttributeError: '_UserObject' object has no attribute 'predict'