# Prétraitement du texte

## Importation des librairies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import gensim
from gensim.models import Word2Vec
from sklearn import preprocessing
import gensim.downloader as api

## Chargement des données 

In [2]:
# Keep dataframe previews compact: at most 10 rows, 10 columns and
# 50 characters per cell.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_colwidth', 50)

# Load the text dataset from the project-relative CSV file.
# NOTE(review): presumably produced by an earlier cleaning step — confirm.
df = pd.read_csv('./csv/text_clean.csv')

In [3]:
# Preview the first three rows of the loaded dataframe.
df.head(3)

Unnamed: 0.1,Unnamed: 0,uniq_id,product_name_clean,description_clean,main_category,main_category_num,description_name
0,0,55b85ea15a1536d46b7190ad6fff8ce7,elegance polyester multicolor abstract eyelet ...,key feature elegance polyester multicolor abst...,Home Furnishing,4,key feature elegance polyester multicolor abst...
1,1,7b72c92c2f6c40268628ec5f14c6d590,sathiyas cotton bath towel,specification cotton bath towel bath towel red...,Baby Care,0,specification cotton bath towel bath towel red...
2,2,64d5d4a258243731dc7bbb1eef49ad74,eurospa cotton terry face towel set,key feature cotton terry face towel set size s...,Baby Care,0,key feature cotton terry face towel set size s...


# Extraction des features

### Word2Vec

In [4]:
# Tokenise every product text on whitespace to build the Word2Vec training
# corpus. Missing texts (e.g. an empty cleaned string that is read back from
# the CSV as NaN) become empty token lists instead of raising on `.split()`.
sentences = [
    text.split()
    for text in df['description_name'].fillna('').astype(str)
]

In [12]:
def vectorize(list_of_docs, model, model_wv):
    """Embed each tokenised document as the mean of its known word vectors.

    Parameters
    ----------
    list_of_docs : iterable of iterable of str
        One token sequence per document.
    model : object
        Any object exposing ``vector_size``; it is only used to size the
        zero vector returned for documents without any known token.
    model_wv : mapping-like
        Token-to-vector lookup supporting ``token in model_wv`` and
        ``model_wv[token]``.

    Returns
    -------
    list of numpy.ndarray
        One 1-D vector per document, in input order. A document with no
        token present in ``model_wv`` maps to a zero vector of length
        ``model.vector_size``. An empty ``list_of_docs`` yields ``[]``.
    """
    features = []

    for tokens in list_of_docs:
        # The membership test already filters out unknown tokens, so the
        # lookup itself needs no KeyError handling.
        vectors = [model_wv[token] for token in tokens if token in model_wv]
        if vectors:
            features.append(np.asarray(vectors).mean(axis=0))
        else:
            # Only allocate the fallback when it is actually needed.
            features.append(np.zeros(model.vector_size))

    # With no document there is no features[0] to measure; fall back to the
    # size announced by the model instead of raising IndexError.
    vector_size = len(features[0]) if features else model.vector_size
    print(f'Nombre de vecteurs : {len(features)}')
    print(f'Taille du vecteur : {vector_size}')
    return features


In [13]:
# Train a Word2Vec model from scratch on the sentences built from the
# dataframe, then embed every document with the freshly trained vectors.
local_model = Word2Vec(sentences=sentences, vector_size=300, workers=1)
local_model_vectors = vectorize(sentences, local_model, local_model.wv)

Nombre de vecteurs : 1050
Taille du vecteur : 300


In [7]:
# Load the pre-trained 'word2vec-google-news-300' model through gensim's
# downloader API.
# NOTE(review): presumably fetched over the network and cached on first use —
# confirm before running offline.
pretrain_model = api.load('word2vec-google-news-300')

In [8]:
# Embed every document with the pre-trained vectors. The same object is passed
# as both `model` and `model_wv`: vectorize reads `vector_size` from the former
# and looks tokens up in the latter.
pretrain_model_vectors = vectorize(sentences, model=pretrain_model, model_wv=pretrain_model)

Nombre de vecteurs : 1050
Taille du vecteur : 300


In [14]:
# Attach both sets of document embeddings to the dataframe (one vector per
# row) and preview the result.
df = df.assign(
    local_vectors_means=local_model_vectors,
    pretrain_vectors_means=pretrain_model_vectors,
)
df.head(2)

Unnamed: 0.1,Unnamed: 0,uniq_id,product_name_clean,description_clean,main_category,main_category_num,description_name,local_vectors_means,pretrain_vectors_means
0,0,55b85ea15a1536d46b7190ad6fff8ce7,elegance polyester multicolor abstract eyelet ...,key feature elegance polyester multicolor abst...,Home Furnishing,4,key feature elegance polyester multicolor abst...,"[-0.09874754, 0.19643843, 0.07696229, 0.096146...","[0.048451997, 0.052741077, -0.017014096, 0.035..."
1,1,7b72c92c2f6c40268628ec5f14c6d590,sathiyas cotton bath towel,specification cotton bath towel bath towel red...,Baby Care,0,specification cotton bath towel bath towel red...,"[-0.11512422, 0.22598363, 0.06695108, 0.094727...","[0.018274853, 0.06604099, 0.012273788, 0.05909..."


In [15]:
def print_classification_accuracy(vectors_columns, model_name, labels=None,
                                  test_size=0.2, random_state=42):
    """Fit an SVC on document vectors and print its hold-out accuracy.

    Parameters
    ----------
    vectors_columns : pandas.Series or sequence
        One embedding vector per document.
        NOTE(review): assumes every element is a 1-D vector of the same
        length so they stack into a 2-D array — confirm for new inputs.
    model_name : str
        Label inserted in the printed message.
    labels : array-like, optional
        Target class per document. Defaults to ``df.main_category_num``
        from the notebook-level dataframe, which keeps existing calls working.
    test_size : float, default 0.2
        Proportion of the data held out for evaluation.
    random_state : int, default 42
        Seed for the train/test split.

    Returns
    -------
    None
        The accuracy is printed as a percentage rounded to two decimals.
    """
    if labels is None:
        # Backward-compatible default: the numeric category column of the
        # notebook-level dataframe.
        labels = df.main_category_num

    train_data, test_data, train_labels, test_labels = train_test_split(
        vectors_columns, labels, test_size=test_size, random_state=random_state
    )
    # Stack the per-document vectors into arrays; list() accepts a Series as
    # well as a plain list, where `.tolist()` only existed on the former.
    train_data = np.array(list(train_data))
    test_data = np.array(list(test_data))

    classifier = SVC()
    classifier.fit(train_data, train_labels)
    predictions = classifier.predict(test_data)
    accuracy = accuracy_score(test_labels, predictions)
    print(f"Accuracy with {model_name} : {round((accuracy * 100),2)}%")

In [16]:
# Compare both embeddings with the same classifier and the same split:
# first the locally trained model, then the pre-trained one.
print_classification_accuracy(df.local_vectors_means, model_name='word2vec')
print_classification_accuracy(
    df.pretrain_vectors_means,
    model_name='word2vec-google-news-300',
)

Accuracy with word2vec : 45.71%
Accuracy with word2vec-google-news-300 : 93.81%


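Le modèle pré-entraîné donne de bien meilleurs résultats (93,81 % d'accuracy) que le modèle entraîné sur nos données (45,71 %), probablement parce que notre corpus (1 050 descriptions) est trop petit pour apprendre de bons vecteurs de mots.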