# Prétraitement du texte

## Importation des librairies

In [1]:
import pandas as pd
import nltk
import string
from nltk.stem import WordNetLemmatizer
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import gensim
from gensim.models import Word2Vec
from sklearn import preprocessing
import gensim.downloader as api

## Chargement des données 

In [2]:
# Keep DataFrame previews compact: at most 10 rows, 10 columns and
# 50 characters per cell.
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 10)
pd.set_option("display.max_colwidth", 50)

# Read the text dataset from the CSV file stored next to the notebook.
df = pd.read_csv('./csv/text_clean.csv')

In [3]:
# Preview the first three rows of the loaded frame.
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,uniq_id,product_name_clean,description_clean,main_category,main_category_num,description_name
0,0,55b85ea15a1536d46b7190ad6fff8ce7,elegance polyester multicolor abstract eyelet ...,key feature elegance polyester multicolor abst...,Home Furnishing,4,key feature elegance polyester multicolor abst...
1,1,7b72c92c2f6c40268628ec5f14c6d590,sathiyas cotton bath towel,specification cotton bath towel bath towel red...,Baby Care,0,specification cotton bath towel bath towel red...
2,2,64d5d4a258243731dc7bbb1eef49ad74,eurospa cotton terry face towel set,key feature cotton terry face towel set size s...,Baby Care,0,key feature cotton terry face towel set size s...


In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050 entries, 0 to 1049
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          1050 non-null   int64 
 1   uniq_id             1050 non-null   object
 2   product_name_clean  1050 non-null   object
 3   description_clean   1050 non-null   object
 4   main_category       1050 non-null   object
 5   main_category_num   1050 non-null   int64 
 6   description_name    1050 non-null   object
dtypes: int64(2), object(5)
memory usage: 57.5+ KB


# Feature extraction

### Word2Vec

In [5]:
# Prepare the Word2Vec training data: split every entry of the
# 'description_name' column on whitespace to get one token list per row.
sentences = list(map(str.split, df['description_name']))

In [50]:
# Train a Word2Vec model on the tokenised sentences of the dataframe,
# producing 100-dimensional vectors with a single worker thread.
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    workers=1,
)

In [7]:
def vectorize(list_of_docs, model):
    """Turn tokenised documents into one averaged embedding per document.

    Args:
        list_of_docs: Iterable of documents, each an iterable of tokens.
        model: Word embedding. Either an object exposing its lookup table as
            ``model.wv`` (e.g. a trained gensim ``Word2Vec``) or the lookup
            table itself (e.g. pretrained keyed vectors). It must expose
            ``vector_size``, and the lookup table must support ``in`` and
            ``[]`` on tokens.

    Returns:
        List with one vector per document, in input order: the mean of the
        vectors of the tokens known to the embedding, or a zero vector of
        length ``model.vector_size`` when the document has no known token.
    """
    # Fall back to the object itself when it has no ``.wv`` attribute, so the
    # same function serves trained models and bare keyed vectors.
    keyed_vectors = getattr(model, "wv", model)

    features = []
    for tokens in list_of_docs:
        # The membership test already filters unknown tokens, so the lookup
        # needs no KeyError handling.
        known = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
        if known:
            features.append(np.mean(known, axis=0))
        else:
            # A fresh array per document avoids aliasing between rows.
            features.append(np.zeros(model.vector_size))
    return features
    
# Embed every document with the model trained on this dataset, then show
# (number of documents, vector length) as a sanity check.
vectorized_docs = vectorize(sentences, model)
(len(vectorized_docs), len(vectorized_docs[0]))

(1050, 100)

In [8]:
# Store the per-document averaged vectors in a new 'vectors_means' column
# (this mutates df in place), then preview the first two rows.
df['vectors_means'] = vectorized_docs
df.head(2)

Unnamed: 0.1,Unnamed: 0,uniq_id,product_name_clean,description_clean,main_category,main_category_num,description_name,vectors_means
0,0,55b85ea15a1536d46b7190ad6fff8ce7,elegance polyester multicolor abstract eyelet ...,key feature elegance polyester multicolor abst...,Home Furnishing,4,key feature elegance polyester multicolor abst...,"[-0.10184959, -0.039871898, -0.061881114, 0.12..."
1,1,7b72c92c2f6c40268628ec5f14c6d590,sathiyas cotton bath towel,specification cotton bath towel bath towel red...,Baby Care,0,specification cotton bath towel bath towel red...,"[-0.08345177, -0.14484063, -0.07083861, 0.1442..."


In [9]:
# Display the column holding the averaged vectors.
df.vectors_means

0       [-0.10184959, -0.039871898, -0.061881114, 0.12...
1       [-0.08345177, -0.14484063, -0.07083861, 0.1442...
2       [-0.140494, -0.02810181, -0.04000086, 0.143009...
3       [-0.04486203, -0.15910017, -0.073009774, 0.116...
4       [-0.08563257, -0.10234223, -0.065558225, 0.135...
                              ...                        
1045    [-0.1534988, 0.076541804, -0.016842853, 0.1115...
1046    [-0.19656068, 0.094294995, -0.021363508, 0.148...
1047    [-0.5309979, 0.5764248, 0.014826372, 0.3146439...
1048    [-0.48244742, 0.52018255, 0.006087722, 0.28895...
1049    [-0.541187, 0.5855924, 0.013153279, 0.3191772,...
Name: vectors_means, Length: 1050, dtype: object

In [51]:
# Load a pretrained embedding ('word2vec-google-news-300') through the gensim
# downloader API.
# NOTE(review): this is presumably a large download on first use — consider
# timing or caching this cell; confirm against the gensim-data catalogue.
wv = api.load('word2vec-google-news-300')

In [18]:
def vectorize(list_of_docs, model):
    """Turn tokenised documents into one averaged embedding per document.

    Args:
        list_of_docs: Iterable of documents, each an iterable of tokens.
        model: Word embedding. Either the lookup table itself (e.g. pretrained
            keyed vectors) or an object exposing it as ``model.wv`` (e.g. a
            trained gensim ``Word2Vec``). It must expose ``vector_size``, and
            the lookup table must support ``in`` and ``[]`` on tokens.

    Returns:
        List with one vector per document, in input order: the mean of the
        vectors of the tokens known to the embedding, or a zero vector of
        length ``model.vector_size`` when the document has no known token.
    """
    # Look tokens up in the embedding that was passed in, not in a notebook
    # global; fall back to the object itself when it has no ``.wv``.
    keyed_vectors = getattr(model, "wv", model)

    features = []
    for tokens in list_of_docs:
        # The membership test already filters unknown tokens, so the lookup
        # needs no KeyError handling.
        known = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
        if known:
            features.append(np.mean(known, axis=0))
        else:
            # A fresh array per document avoids aliasing between rows.
            features.append(np.zeros(model.vector_size))
    return features
    
# Embed every document with the pretrained vectors, then show
# (number of documents, vector length) as a sanity check.
vectorized_docs = vectorize(sentences, wv)
(len(vectorized_docs), len(vectorized_docs[0]))

(1050, 300)

In [25]:
# Store the current contents of vectorized_docs in a new 'pre_vectors_means'
# column (this mutates df in place), then display that column.
df['pre_vectors_means'] = vectorized_docs
df['pre_vectors_means']

0       [0.048451997, 0.052741077, -0.017014096, 0.035...
1       [0.018274853, 0.06604099, 0.012273788, 0.05909...
2       [0.020904202, 0.09313281, 0.00014671043, 0.059...
3       [-0.0043165246, 0.051694907, -0.045088943, 0.0...
4       [0.010901001, 0.048768923, -0.0025513915, 0.07...
                              ...                        
1045    [0.013211982, 0.055592205, 0.007732161, 0.0511...
1046    [-0.00069127965, 0.053532355, 0.018493427, 0.0...
1047    [0.055739265, 0.06684657, -0.038131714, 0.1533...
1048    [0.11247452, 0.09372877, 0.0034365447, 0.21565...
1049    [0.07366333, 0.099365234, -0.0085723875, 0.187...
Name: pre_vectors_means, Length: 1050, dtype: object

In [45]:
def classification_accuracy(X, model_name, labels=None):
    """Fit an SVC on document vectors and report its hold-out accuracy.

    The data is split 80/20 with ``random_state=42``, a default ``SVC`` is
    fitted on the training part and scored on the test part.

    Args:
        X: pandas Series whose values are equal-length document vectors.
        model_name: Label of the embedding, used only in the printed message.
        labels: Target class per document, aligned with ``X``. Defaults to
            the notebook-global ``df.main_category_num``.

    Returns:
        The accuracy on the test split as a fraction between 0 and 1, so the
        caller can keep it (the percentage is also printed).
    """
    if labels is None:
        labels = df.main_category_num

    train_data, test_data, train_labels, test_labels = train_test_split(
        X, labels, test_size=0.2, random_state=42
    )
    # Each Series element is one vector; stack them into 2-D arrays of shape
    # (n_samples, n_features) as expected by the classifier.
    train_data = np.array(train_data.tolist())
    test_data = np.array(test_data.tolist())

    classifier = SVC()
    classifier.fit(train_data, train_labels)
    predictions = classifier.predict(test_data)

    accuracy = accuracy_score(test_labels, predictions)
    print(f"Accuracy with {model_name} : {round((accuracy * 100),2)}%")
    return accuracy

In [48]:
# Evaluate the document vectors built from the pretrained embedding.
# The trailing semicolon keeps the cell output limited to the printed line.
classification_accuracy(df['pre_vectors_means'], model_name='word2vec-google-news-300');

Accuracy with word2vec-google-news-300 : 93.81%


In [49]:
# Evaluate the document vectors built from the Word2Vec model trained on this dataset.
acc = classification_accuracy(df['vectors_means'], model_name='word2vec')

Accuracy with word2vec : 52.86%


Le modèle pré-entraîné donne de meilleurs résultats que le modèle entraîné sur nos données.