## Pré-processing général

In [1]:
import pandas as pd
import numpy as np
import re
import spacy
from sklearn.preprocessing import normalize
import fasttext
from tqdm import tqdm
import pickle

In [2]:
import psutil

print("=== CPU ===")
cpu_physical = psutil.cpu_count(logical=False)
cpu_logical = psutil.cpu_count(logical=True)
cpu_freq = psutil.cpu_freq().max
print(f"Cœurs physiques : {cpu_physical}")
print(f"Cœurs logiques : {cpu_logical}")
print(f"Fréquence max : {cpu_freq:.2f} MHz")

print("\n=== RAM ===")
ram = psutil.virtual_memory()
print(f"RAM totale : {ram.total / (1024**3):.2f} GB")
print(f"RAM disponible : {ram.available / (1024**3):.2f} GB")

print("\n=== GPU ===")
try:
    import torch
    if torch.cuda.is_available():
        num_gpus = torch.cuda.device_count()
        print(f"Nombre de GPU : {num_gpus}")
        for i in range(num_gpus):
            name = torch.cuda.get_device_name(i)
            total_mem = torch.cuda.get_device_properties(i).total_memory / 1e9
            print(f"GPU {i} : {name} | Mem totale : {total_mem:.2f} GB")
    else:
        print("Aucun GPU disponible")
except ImportError:
    print("Torch non installé, impossible de vérifier le GPU")



=== CPU ===
Cœurs physiques : 8
Cœurs logiques : 16
Fréquence max : 0.00 MHz

=== RAM ===
RAM totale : 15.27 GB
RAM disponible : 14.06 GB

=== GPU ===
Nombre de GPU : 1
GPU 0 : NVIDIA GeForce RTX 4070 Laptop GPU | Mem totale : 8.59 GB


In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
df = pd.read_csv('../data/raw/reviews_trust.csv')
df

Unnamed: 0,Commentaire,star,date,client,reponse,source,company,ville,maj,date_commande,ecart
0,"Bonjour , Ca doit faire 5 ans environ que je s...",1,2021-06-20 00:00:00+00:00,AUDREY Du 62,,TrustPilot,ShowRoom,,,,
1,Vente lacoste article manquant photo prise sur...,1,2021-06-20 00:00:00+00:00,Nanasky De Verteuil,,TrustPilot,ShowRoom,,,,
2,"Vente Lacoste Honteuse , article erroné , arti...",1,2021-06-19 00:00:00+00:00,Vanessa L,,TrustPilot,ShowRoom,,,,
3,J'ai commandé des mules de la marque Moosefiel...,2,2021-06-19 00:00:00+00:00,Valery PERRAULT,"Bonjour , Je suis sincèrement navré d'apprendr...",TrustPilot,ShowRoom,,,,
4,Commande téléphone etat A+ . Livraison d un vi...,1,2021-06-19 00:00:00+00:00,JULIE DRINGENBERG,"Bonjour Julie , Je suis sincèrement désolé de ...",TrustPilot,ShowRoom,,,,
...,...,...,...,...,...,...,...,...,...,...,...
19858,Pas de probleme,4,2015-10-07 00:00:00+00:00,Dark dav,,TrustPilot,VeePee,,,,
19859,Cliente depuis 2008 sans encombre jusqu ' à ju...,1,2015-10-06 00:00:00+00:00,Bertho,,TrustPilot,VeePee,,,,
19860,Je ne sais pas si VP cherche à vendre ou à fai...,1,2015-10-21 00:00:00+00:00,Anne laure,,TrustPilot,VeePee,,,,
19861,Je suis client sur ce site depuis plusieurs an...,5,2015-10-02 00:00:00+00:00,Thomas GUILLAUME,,TrustPilot,VeePee,,,,


In [6]:
# Suppression des doublons et des commentaires vides ou trop courts
df = df.dropna(subset=["Commentaire"])
df = df.drop_duplicates(subset=['Commentaire', 'client'], keep='first')

def count_words(text):
    text = re.sub(r'[^\w\s]', '', text)
    return len(text.split())
df = df[df['Commentaire'].apply(count_words) > 3]

# df = df[df['Commentaire'].apply(lambda x: len(x.split())) >= 3]
df.shape

(15859, 11)

In [7]:
# Nettoyage du texte avant la vectorisation
def clean_text(text):
    text = text.lower().strip()

    # Suppression des éléments inutiles à l'aide de regex
    text = re.sub(r"(http|www)\S+", "", text) # URLs
    text = re.sub(r"\S+@\S+", "", text) # emails
    text = re.sub(r"(@|#)\w+", "", text) # mentions @ et #
    text = re.sub(r"\s+", " ", text).strip() # espaces multiples, sauts de lignes, tabulations

    return text

df["clean_comment"] = df["Commentaire"].apply(clean_text)

In [8]:
# Suppression des commentaires pas en français
# langue 1 = fr -> c'est bien du français
# langue 1 != fr et langue 2 = fr -> si langue 2 > 0.15 et langue 1 < 0.86 -> français

model = fasttext.load_model('../models/language_detection/lid.176.ftz')
texts = df["clean_comment"].tolist()

labels, probs = model.predict(texts, k=2)

mask = []
for lbls, pbs in zip(labels, probs):
    lang1 = lbls[0] if len(lbls) > 0 else None
    lang2 = lbls[1] if len(lbls) > 1 else None
    prob1 = pbs[0] if len(pbs) > 0 else 0.0
    prob2 = pbs[1] if len(pbs) > 1 else 0.0

    # Cas 1 : langue 1 = français
    if lang1 == "__label__fr":
        mask.append(True)
    # Cas 2 : langue 2 = français avec seuils
    elif lang2 == "__label__fr" and prob1 < 0.86 and prob2 > 0.15:
        mask.append(True)
    else:
        mask.append(False)

df = df[mask].reset_index(drop=True)

In [9]:
df.shape

(15077, 12)

In [10]:
# Pour regarder les seuils correspondant à chaque langue

# labels, probs = model.predict(texts, k=3)

# def get_label(lbls, probs, idx):
#     if idx < len(lbls):
#         return lbls[idx].replace("__label__", ""), float(probs[idx])
#     else:
#         return None, 0.0

# lang_1, prob_1 = zip(*[get_label(lbls, pb, 0) for lbls, pb in zip(labels, probs)])
# lang_2, prob_2 = zip(*[get_label(lbls, pb, 1) for lbls, pb in zip(labels, probs)])
# lang_3, prob_3 = zip(*[get_label(lbls, pb, 2) for lbls, pb in zip(labels, probs)])

# df["lang_1"], df["prob_1"] = lang_1, prob_1
# df["lang_2"], df["prob_2"] = lang_2, prob_2
# df["lang_3"], df["prob_3"] = lang_3, prob_3

# df.to_excel("../outputs/detect_lang.xlsx", index=False)

In [11]:
#!python -m spacy download fr_core_news_md
nlp = spacy.load("fr_core_news_md")

# Traitement Spacy -> lemmatisation, suppression des stopwords, des tokens de moins de 3 caractères et des tokens qui ne contiennent pas de lettres

def preprocess(doc):
    # doc = nlp(text)
    tokens = [
        token.lemma_ for token in doc
        if token.is_alpha and not token.is_stop and len(token) > 2
    ]
    return tokens

Collecting fr-core-news-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.8.0/fr_core_news_md-3.8.0-py3-none-any.whl (45.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_md')


## TF-IDF

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Application du preprocessing
docs = list(tqdm(nlp.pipe(df["clean_comment"], batch_size=64, n_process=1), total=len(df))) # à adapter selon les coeurs du CPU et la RAM de la machine
df["clean_comment_tfidf"] = [" ".join(preprocess(doc)) for doc in docs]

100%|█████████████████████████████████████████████████████| 15077/15077 [01:42<00:00, 146.91it/s]


In [14]:
# Vectorisation TF-IDF
# Suppression des tokens qui apparaissent dans plus de 95% des documents et dans moins de 5 documents
vectorizer = TfidfVectorizer(max_df=0.95, min_df=5)
tfidf_embeddings = vectorizer.fit_transform(df["clean_comment_tfidf"])
tfidf_embeddings = normalize(tfidf_embeddings, norm='l2')

In [15]:
tfidf_embeddings.shape

(15077, 3349)

In [16]:
# with open("../data/embeddings/emb_tfidf.pkl", "wb") as f:
#   pickle.dump(tfidf_embeddings, f)

In [17]:
feature_names = vectorizer.get_feature_names_out()
tfidf_means = tfidf_embeddings.mean(axis=0).A1  # moyenne par colonne
top_indices = tfidf_means.argsort()[::-1][:20]

print("Mots les plus importants (TF-IDF moyen) :")
for i in top_indices:
    print(feature_names[i], tfidf_means[i])

Mots les plus importants (TF-IDF moyen) :
commande 0.05054856412900579
livraison 0.04935130901497308
produit 0.0424129195436405
article 0.03519112359887456
recevoir 0.032132954811432246
bon 0.03133007772437427
bien 0.03049317746912926
qualité 0.02986965830413731
colis 0.02604152968238578
conforme 0.025675625375853335
rapide 0.023663429268749755
service 0.023208041469008508
site 0.02264409802849971
commander 0.022066366041785038
client 0.0220480786063209
être 0.021064123665642236
délai 0.020383574882915426
rien 0.020177907865057996
satisfait 0.019592053033456694
long 0.019398823312061376


## Sentence embeddings

In [18]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


#### Sentence-BERT multilingue


In [19]:
# Vectorisation (modèle au choix, il faudra en tester plusieurs)
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2") # modèle multilingue de type sentence-bert

# pour utiliser le GPU en locale
# if torch.cuda.is_available():
#     embedding_model = embedding_model.to('cuda')

sentences = df["clean_comment"].tolist()
sbert_embeddings = embedding_model.encode(sentences)
sbert_embeddings = normalize(sbert_embeddings, norm='l2')
sbert_embeddings.shape

(15077, 384)

In [None]:
# with open("../data/embeddings/emb_sbert_multi.pkl", "wb") as f:
#   pickle.dump(sbert_embeddings, f)

#### CamemBERT (BERT français)



In [20]:
# Vectorisation (modèle au choix, il faudra en tester plusieurs)
embedding_model = SentenceTransformer('camembert-base') # modèle français de type bert

# pour utiliser le GPU en locale
# if torch.cuda.is_available():
#     embedding_model = embedding_model.to('cuda')

sentences = df["clean_comment"].tolist()
sbert_embeddings = embedding_model.encode(sentences)
sbert_embeddings = normalize(sbert_embeddings, norm='l2')
sbert_embeddings.shape

No sentence-transformers model found with name camembert-base. Creating a new one with mean pooling.


(15077, 768)

In [21]:
# with open("../data/embeddings/emb_sbert_fr.pkl", "wb") as f:
#   pickle.dump(sbert_embeddings, f)

#### Sentence-BERT français (ne fonctionne pas)

In [22]:
# # Vectorisation (modèle au choix, il faudra en tester plusieurs)
# embedding_model = SentenceTransformer("Lajavaness/sentence-camembert-large") # modèle français de type sentence-bert

# pour utiliser le GPU en locale
# if torch.cuda.is_available():
#     embedding_model = embedding_model.to('cuda')

# sentences = df["clean_comment"].tolist()
# sbert_embeddings = embedding_model.encode(sentences)
# sbert_embeddings = normalize(sbert_embeddings, norm='l2')
# sbert_embeddings.shape

#### BERTopic (Topic modeling)


In [23]:
# BERTopic

# Création de BERTopic
topic_model = BERTopic(embedding_model=embedding_model, language="french")
topics, probs = topic_model.fit_transform(df["clean_comment"])

# Visualisation des thèmes
print(topic_model.get_topic_info())
print(topic_model.get_topic(0))  # mots du thème 0

   Topic  Count           Name                                 Representation  \
0      0   9681  0_de_et_la_le  [de, et, la, le, je, pas, un, très, les, est]   
1      1   5396  1_de_le_et_je    [de, le, et, je, la, un, pas, que, est, ai]   

                                 Representative_Docs  
0  [j ’ ai commandé 2 tapis sur shoroomprive , il...  
1  [2 fois vêtements non conforme : taille pour u...  
[('de', np.float64(0.1066725604959604)), ('et', np.float64(0.07632593950548544)), ('la', np.float64(0.07278120445709092)), ('le', np.float64(0.06896547591261862)), ('je', np.float64(0.06572863358527252)), ('pas', np.float64(0.05661326087719123)), ('un', np.float64(0.05398633396788755)), ('très', np.float64(0.050848486029881325)), ('les', np.float64(0.05050912132983464)), ('est', np.float64(0.0500404712010009))]


## Word embeddings

In [24]:
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.fasttext import load_facebook_model

In [25]:
docs = list(tqdm(nlp.pipe(df["clean_comment"], batch_size=128, n_process=8), total=len(df))) # à adapter selon les coeurs du CPU et la RAM de la machine
df["clean_tokens_word2vec"] = [preprocess(doc) for doc in docs]
sentences = df["clean_tokens_word2vec"].tolist()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

#### Modèle Word2Vec entraîné avec nos données

In [26]:
model_w2v_custom = Word2Vec(sentences, vector_size=300, sg=1)

def get_sentence_vector(tokens, model):
    vecs = [model.wv[word] for word in tokens if word in model.wv]
    if len(vecs) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vecs, axis=0)

w2v_custom_embeddings = np.array([get_sentence_vector(tokens, model_w2v_custom) for tokens in sentences])
w2v_custom_embeddings = normalize(w2v_custom_embeddings, norm='l2')

w2v_custom_embeddings.shape

(15077, 300)

In [27]:
# with open("../data/embeddings/emb_w2v_custom.pkl", "wb") as f:
#   pickle.dump(w2v_custom_embeddings, f)

#### Modèle Word2Vec déjà existant
https://fauconnier.github.io/#data

In [28]:
# Modèle au choix
model_path = "../models/embeddings/frWac_no_postag_no_phrase_500_skip_cut100.bin"
# model_path = "../models/embeddings/frWac_postag_no_phrase_700_skip_cut50.bin"
# model_path = "../models/embeddings/frWac_postag_no_phrase_1000_skip_cut100.bin"

model_w2v_pretrained = KeyedVectors.load_word2vec_format(model_path, binary=True)

def get_sentence_vector(tokens, model):
    vecs = [model[word] for word in tokens if word in model.key_to_index]
    if len(vecs) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vecs, axis=0)

w2v_pretrained_embeddings = np.array([get_sentence_vector(tokens, model_w2v_pretrained) for tokens in sentences])
w2v_pretrained_embeddings = normalize(w2v_pretrained_embeddings, norm='l2')

w2v_pretrained_embeddings.shape

(15077, 500)

In [29]:
# with open("../data/embeddings/emb_w2v_pretrained.pkl", "wb") as f:
#   pickle.dump(w2v_pretrained_embeddings, f)

#### Modèle Fasttext avec uniquement les vecteurs de mots

In [30]:
model_path = "../models/embeddings/fasttext_fr.kv"
model_ft_words = KeyedVectors.load(model_path, mmap='r')

def get_sentence_vector(tokens, model):
    vecs = [model[word] for word in tokens if word in model]
    if not vecs:
        return np.zeros(model.vector_size)
    return np.mean(vecs, axis=0)

ft_words_embeddings = np.array([get_sentence_vector(tokens, model_ft_words) for tokens in sentences])
ft_words_embeddings = normalize(ft_words_embeddings, norm='l2')

ft_words_embeddings.shape

(15077, 300)

In [31]:
# with open("../data/embeddings/emb_ft_words.pkl", "wb") as f:
#   pickle.dump(ft_words_embeddings, f)

#### Modèle FastText
https://fasttext.cc/docs/en/crawl-vectors.html

In [34]:
model_path = "../models/embeddings/cc.fr.300.bin"
model_ft_full = fasttext.load_model(model_path)

def get_sentence_vector(tokens, model):
    vecs = [model.get_word_vector(word) for word in tokens]
    if not vecs:
        return np.zeros(model.get_dimension())
    return np.mean(vecs, axis=0)

ft_full_embeddings = np.array([get_sentence_vector(tokens, model_ft_full) for tokens in sentences])
ft_full_embeddings = normalize(ft_full_embeddings, norm='l2')

ft_full_embeddings.shape

(15077, 300)

In [35]:
# with open("../data/embeddings/emb_ft_full.pkl", "wb") as f:
#   pickle.dump(ft_full_embeddings, f)

## Enregistrement final

In [36]:
# with open("../data/processed/dataset_final.pkl", "wb") as f:
#   pickle.dump(df, f)