In [253]:
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import FrenchStemmer
nltk.download('stopwords')
nltk.download('punkt')
import seaborn as sns
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to /home/lahad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/lahad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Séparation de l'ensemble d'entraînement en 2 parties : entraînement et validation

In [244]:
# Read the full labelled dataset from disk.
data = pd.read_csv("../data/train.csv")

# Hold out 20% of the rows for validation; the fixed seed keeps the split stable across runs.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

# Write both partitions to their own CSV file (without the index column).
for _split_frame, _split_path in (
    (train_data, "../data/train_split.csv"),
    (val_data, "../data/validation_split.csv"),
):
    _split_frame.to_csv(_split_path, index=False)


### Séparer les données en attributs et étiquettes


In [245]:
def _with_text_column(frame):
    """Return a new two-column frame holding the model input and its label.

    The ``text`` column is the ``titre`` and ``recette`` columns joined by a
    single space. Missing values in either column are replaced by an empty
    string first: concatenating NaN with a string yields NaN, and a NaN
    ``text`` cannot be processed by string methods such as ``.lower()``.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain the columns ``titre``, ``recette`` and ``type``.

    Returns
    -------
    pd.DataFrame
        Frame with the columns ``text`` and ``type`` (same index as
        ``frame``); ``frame`` itself is not modified.
    """
    combined = frame['titre'].fillna('') + " " + frame['recette'].fillna('')
    return frame.assign(text=combined)[['text', 'type']]


# Load each split and reduce it to the (text, type) pair used by every model below.
train_data = _with_text_column(pd.read_csv("../data/train_split.csv"))
val_data = _with_text_column(pd.read_csv("../data/validation_split.csv"))
test_data = _with_text_column(pd.read_csv("../data/test.csv"))

# Split each frame into features (raw text) and labels (recipe type).
X_train, y_train = train_data['text'], train_data['type']
X_val, y_val = val_data['text'], val_data['type']
X_test, y_test = test_data['text'], test_data['type']

### Nettoyage des données (conversion en minuscules, suppression des caractères non alphabétiques, tokenisation, suppression des mots vides (stopwords), racinisation (stemming))

In [246]:
from nltk.tokenize import RegexpTokenizer
from gensim.utils import simple_preprocess

# Shared resources for clean_text: French stopword set and Snowball French stemmer.
stopwords_fr = set(stopwords.words('french'))
stemmer = FrenchStemmer()
# NOTE(review): `tokenizer` is not used by clean_text; kept in case another cell relies on it.
tokenizer = RegexpTokenizer(r'\w+')

def clean_text(text):
    """Normalise one raw text into a space-separated string of French stems.

    Steps, in order: lowercase, expand the elided forms ``l'`` / ``d'`` to
    ``le `` / ``de ``, tokenise with gensim's ``simple_preprocess``, drop the
    tokens found in ``stopwords_fr``, stem the remaining tokens with
    ``stemmer`` and join them with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example ``None`` or the float NaN
        produced by a missing CSV cell) is treated as an empty document
        instead of raising ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        The cleaned text; an empty string when no token survives.
    """
    # Missing values are not strings; map them to an empty document.
    if not isinstance(text, str):
        return ''

    # Lowercase conversion
    text = text.lower()

    # Expanding contractions
    text = re.sub(r"l'", "le ", text)
    text = re.sub(r"d'", "de ", text)

    # Tokenization. min_len=1 keeps one-character tokens; max_len is left at
    # gensim's default (15 per the gensim documentation), so longer tokens are
    # still discarded.
    words = simple_preprocess(text, min_len=1)

    # Removing stopwords, then stemming what is left
    words = [stemmer.stem(word) for word in words if word not in stopwords_fr]

    # Joining words
    return ' '.join(words)

# Clean every split. The input is read from the `text` column of each *_data
# frame instead of from X_* itself, so re-running this cell always starts from
# the raw text and never cleans an already-cleaned series a second time.
X_train = train_data['text'].apply(clean_text)
X_val = val_data['text'].apply(clean_text)
X_test = test_data['text'].apply(clean_text)


## Baseline

In [247]:

# Baseline classifier: the 'stratified' strategy draws each prediction at random
# according to the class frequencies seen in y_train (the features are ignored).
# random_state is fixed so that the reported baseline scores are reproducible.
dummy = DummyClassifier(strategy='stratified', random_state=42)
dummy.fit(X_train, y_train)

# Predict on the validation split
y_pred_val = dummy.predict(X_val)

# Per-class report for the validation split
print("Classification Report for Validation Data:")
print(classification_report(y_val, y_pred_val))

# Predict on the test split
y_pred_test = dummy.predict(X_test)

# Per-class report for the test split
print("Classification Report for Test Data:")
print(classification_report(y_test, y_pred_test))


Classification Report for Validation Data:
                precision    recall  f1-score   support

       Dessert       0.30      0.30      0.30       726
        Entrée       0.24      0.23      0.24       611
Plat principal       0.46      0.46      0.46      1158

      accuracy                           0.36      2495
     macro avg       0.33      0.33      0.33      2495
  weighted avg       0.36      0.36      0.36      2495

Classification Report for Test Data:
                precision    recall  f1-score   support

       Dessert       0.29      0.29      0.29       407
        Entrée       0.28      0.28      0.28       337
Plat principal       0.47      0.46      0.46       644

      accuracy                           0.37      1388
     macro avg       0.34      0.34      0.34      1388
  weighted avg       0.37      0.37      0.37      1388



### Run2: TF-IDF + SVM

In [248]:
# TF-IDF features: the vectorizer is fitted on the training texts only and
# then applied unchanged to the validation and test texts.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = tfidf.transform(X_val), tfidf.transform(X_test)

# SVM with a Gaussian (RBF) kernel, trained on the TF-IDF features.
svm_rbf = SVC(kernel='rbf', C=10, random_state=42)
svm_rbf.fit(X_train_tfidf, y_train)

# ---- Validation split: predict, score, report ----
y_pred_val = svm_rbf.predict(X_val_tfidf)

accuracy_val = accuracy_score(y_val, y_pred_val)
# Precision, recall and F1 are all averaged with the 'weighted' scheme.
precision_val, recall_val, f1_val = (
    scorer(y_val, y_pred_val, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)
conf_matrix_val = confusion_matrix(y_val, y_pred_val)
class_report_val = classification_report(y_val, y_pred_val)

# One print call with a newline separator emits one item per line.
print(
    "Evaluation Metrics sur les donnees de validationl:",
    f"Accuracy: {accuracy_val}",
    f"Precision: {precision_val}",
    f"Recall: {recall_val}",
    f"F1 Score: {f1_val}",
    "Confusion Matrix:",
    conf_matrix_val,
    "Classification Report:",
    class_report_val,
    sep="\n",
)

# ---- Test split: predict, score, report ----
y_pred_test = svm_rbf.predict(X_test_tfidf)

accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test, recall_test, f1_test = (
    scorer(y_test, y_pred_test, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
class_report_test = classification_report(y_test, y_pred_test)

print(
    "\nEvaluation Metrics sur les donnees de Test:",
    f"Accuracy: {accuracy_test}",
    f"Precision: {precision_test}",
    f"Recall: {recall_test}",
    f"F1 Score: {f1_test}",
    "Confusion Matrix:",
    conf_matrix_test,
    "Classification Report:",
    class_report_test,
    sep="\n",
)


Evaluation Metrics sur les donnees de validationl:
Accuracy: 0.8717434869739479
Precision: 0.869176410114016
Recall: 0.8717434869739479
F1 Score: 0.8693536118470676
Confusion Matrix:
[[ 722    1    3]
 [   8  418  185]
 [   7  116 1035]]
Classification Report:
                precision    recall  f1-score   support

       Dessert       0.98      0.99      0.99       726
        Entrée       0.78      0.68      0.73       611
Plat principal       0.85      0.89      0.87      1158

      accuracy                           0.87      2495
     macro avg       0.87      0.86      0.86      2495
  weighted avg       0.87      0.87      0.87      2495


Evaluation Metrics sur les donnees de Test:
Accuracy: 0.8811239193083573
Precision: 0.8795279172786075
Recall: 0.8811239193083573
F1 Score: 0.8801761658134365
Confusion Matrix:
[[405   1   1]
 [  3 250  84]
 [  6  70 568]]
Classification Report:
                precision    recall  f1-score   support

       Dessert       0.98      1.00     

### Run3: Word2Vec + SVM

In [249]:
# Word2Vec model (skip-gram, sg=1) trained on the whitespace-tokenised training texts only.
corpus = [sentence.split() for sentence in X_train]
# seed=42 together with workers=1 removes the run-to-run variation of the
# embeddings (random initialisation and thread-scheduling order), so the scores
# reported below can be reproduced. Per the gensim documentation, reproducibility
# across interpreter launches additionally requires a fixed PYTHONHASHSEED.
word2vec_model = Word2Vec(
    corpus,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
    seed=42,
    workers=1,
)

# Calcul des vecteurs de mots moyens pour chaque document
def document_vector(word2vec_model, doc):
    """Return the mean Word2Vec vector of a tokenised document.

    Tokens that are not keys of ``word2vec_model.wv.key_to_index`` are
    ignored. If none of the tokens is known (or ``doc`` is empty), a zero
    vector of length ``word2vec_model.vector_size`` is returned.
    """
    vocabulary = word2vec_model.wv.key_to_index
    known_tokens = [token for token in doc if token in vocabulary]

    # Guard clause: nothing to average.
    if not known_tokens:
        return np.zeros(word2vec_model.vector_size)

    token_vectors = word2vec_model.wv[known_tokens]
    return np.mean(token_vectors, axis=0)

# Represent every document of the three splits (train, validation, test) by
# the mean of its Word2Vec word vectors.
X_train_word2vec, X_val_word2vec, X_test_word2vec = (
    np.array([document_vector(word2vec_model, doc.split()) for doc in texts])
    for texts in (X_train, X_val, X_test)
)

# RBF-kernel SVM trained on the averaged embeddings.
svm_classifier = SVC(kernel='rbf', C=10, random_state=42)
svm_classifier.fit(X_train_word2vec, y_train)

# Predict both held-out splits before scoring either of them.
y_pred_val = svm_classifier.predict(X_val_word2vec)
y_pred_test = svm_classifier.predict(X_test_word2vec)

# Validation metrics (precision, recall and F1 use 'weighted' averaging).
accuracy_val = accuracy_score(y_val, y_pred_val)
precision_val, recall_val, f1_val = (
    metric(y_val, y_pred_val, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
conf_matrix_val = confusion_matrix(y_val, y_pred_val)
class_report_val = classification_report(y_val, y_pred_val)

# Test metrics, computed the same way.
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test, recall_test, f1_test = (
    metric(y_test, y_pred_test, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
class_report_test = classification_report(y_test, y_pred_test)

# Reports: one item per line, validation first, then test.
print(
    "Evaluation Metrics sur les donnees de  Validation :",
    f"Accuracy: {accuracy_val}",
    f"Precision: {precision_val}",
    f"Recall: {recall_val}",
    f"F1 Score: {f1_val}",
    "Confusion Matrix:",
    conf_matrix_val,
    "Classification Report:",
    class_report_val,
    sep="\n",
)

print(
    "\nEvaluation Metrics sur les donnees de Test:",
    f"Accuracy: {accuracy_test}",
    f"Precision: {precision_test}",
    f"Recall: {recall_test}",
    f"F1 Score: {f1_test}",
    "Confusion Matrix:",
    conf_matrix_test,
    "Classification Report:",
    class_report_test,
    sep="\n",
)


Evaluation Metrics sur les donnees de  Validation :
Accuracy: 0.8569138276553107
Precision: 0.8536574850386751
Recall: 0.8569138276553107
F1 Score: 0.8516822730995288
Confusion Matrix:
[[ 724    1    1]
 [  10  369  232]
 [   7  106 1045]]
Classification Report:
                precision    recall  f1-score   support

       Dessert       0.98      1.00      0.99       726
        Entrée       0.78      0.60      0.68       611
Plat principal       0.82      0.90      0.86      1158

      accuracy                           0.86      2495
     macro avg       0.86      0.83      0.84      2495
  weighted avg       0.85      0.86      0.85      2495


Evaluation Metrics sur les donnees de Test:
Accuracy: 0.8710374639769453
Precision: 0.8684543996490107
Recall: 0.8710374639769453
F1 Score: 0.8681430322144671
Confusion Matrix:
[[404   2   1]
 [  4 225 108]
 [  4  60 580]]
Classification Report:
                precision    recall  f1-score   support

       Dessert       0.98      0.99   

### Run4: CountVectorizer + SVM

In [250]:
# Bag-of-words counts: the vocabulary is learned from the training texts only
# and reused as-is for the validation and test texts.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_val_counts, X_test_counts = vectorizer.transform(X_val), vectorizer.transform(X_test)

# RBF-kernel SVM trained on the count vectors.
svm_classifier = SVC(kernel="rbf", C=10, random_state=42)
svm_classifier.fit(X_train_counts, y_train)

# ---- Validation split: predict, score, report ----
y_pred_val = svm_classifier.predict(X_val_counts)

accuracy_val = accuracy_score(y_val, y_pred_val)
# Precision, recall and F1 are all averaged with the 'weighted' scheme.
precision_val, recall_val, f1_val = (
    metric(y_val, y_pred_val, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
conf_matrix_val = confusion_matrix(y_val, y_pred_val)
class_report_val = classification_report(y_val, y_pred_val)

print(
    "Evaluation Metrics sur les donnees de Validation :",
    f"Accuracy: {accuracy_val}",
    f"Precision: {precision_val}",
    f"Recall: {recall_val}",
    f"F1 Score: {f1_val}",
    "Confusion Matrix:",
    conf_matrix_val,
    "Classification Report:",
    class_report_val,
    sep="\n",
)

# ---- Test split: predict, score, report ----
y_pred_test = svm_classifier.predict(X_test_counts)

accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test, recall_test, f1_test = (
    metric(y_test, y_pred_test, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
class_report_test = classification_report(y_test, y_pred_test)

print(
    "\nEvaluation Metrics sur les donnees de Tests:",
    f"Accuracy: {accuracy_test}",
    f"Precision: {precision_test}",
    f"Recall: {recall_test}",
    f"F1 Score: {f1_test}",
    "Confusion Matrix:",
    conf_matrix_test,
    "Classification Report:",
    class_report_test,
    sep="\n",
)


Evaluation Metrics sur les donnees de Validation :
Accuracy: 0.8605210420841684
Precision: 0.8573821927823406
Recall: 0.8605210420841684
F1 Score: 0.8577882055848638
Confusion Matrix:
[[ 721    0    5]
 [  11  403  197]
 [   6  129 1023]]
Classification Report:
                precision    recall  f1-score   support

       Dessert       0.98      0.99      0.98       726
        Entrée       0.76      0.66      0.71       611
Plat principal       0.84      0.88      0.86      1158

      accuracy                           0.86      2495
     macro avg       0.86      0.85      0.85      2495
  weighted avg       0.86      0.86      0.86      2495


Evaluation Metrics sur les donnees de Tests:
Accuracy: 0.8717579250720461
Precision: 0.869753610749032
Recall: 0.8717579250720461
F1 Score: 0.8705807387946785
Confusion Matrix:
[[405   1   1]
 [  4 244  89]
 [  8  75 561]]
Classification Report:
                precision    recall  f1-score   support

       Dessert       0.97      1.00    

In [251]:
# from sklearn.model_selection import GridSearchCV

# parameters = {'kernel':('linear', 'rbf'), 'C':[1,10]}
# svc = SVC(random_state=42)
# clf = GridSearchCV(svc, parameters)
# clf.fit(X_train_tfidf, y_train)
# #Print the best parameters
# print("Best parameters: ", clf.best_params_)
# # Use the best estimator to make predictions
# y_pred = clf.best_estimator_.predict(X_test_tfidf)

# # Evaluation
# print(classification_report(y_test, y_pred))

In [252]:
# # Récupérer les documents mal classifiés
# misclassified_indices = (y_pred_test != y_test)
# misclassified_documents = test_data[misclassified_indices]

# print("\n### Y-a-t-il des régularités dans les documents bien/mal classifiés ?")
# print("Documents mal classifiés:")
# print(misclassified_documents)

# # Matrice de confusion
# print("\n### Où est-ce que l'approche se trompe ? (Matrice de confusion)")
# print("Matrice de confusion :")
# print(conf_matrix_test)

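# # Affichage des descripteurs les plus décisifs
# # (remarque : `coef_` n'est disponible que pour un SVC à noyau linéaire ; avec kernel='rbf', le test hasattr ci-dessous est faux et rien n'est affiché)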
# if hasattr(svm_rbf, 'coef_'):
#     print("\n### Si votre méthode le permet: quels sont les descripteurs les plus décisifs ?")
#     coef_dict = {}
#     feature_names = tfidf.get_feature_names_out()
#     for i, class_label in enumerate(svm_rbf.classes_):
#         coef_dict[class_label] = dict(zip(feature_names, svm_rbf.coef_[i]))
#     for class_label, coef in coef_dict.items():
#         print(f"\nClasse : {class_label}")
#         top_features = sorted(coef, key=lambda x: coef[x], reverse=True)[:10]
#         print("Descripteurs les plus décisifs :", top_features)


