# Prétraitement du texte

## Importation des librairies

In [106]:
import pandas as pd
import nltk
import string
from nltk.stem import WordNetLemmatizer
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import gensim
from gensim.models import Word2Vec
from sklearn import preprocessing
import gensim.downloader as api
from seaborn import countplot

In [107]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/justinelv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/justinelv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /Users/justinelv/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/justinelv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Chargement des données 

In [108]:
pd.options.display.max_rows = 10
pd.options.display.max_columns = 10
pd.options.display.max_colwidth = 50

df = pd.read_csv('./csv/text_clean.csv')

In [109]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,uniq_id,product_name_clean,description_clean,main_category,main_category_num,description_name
0,0,55b85ea15a1536d46b7190ad6fff8ce7,elegance polyester multicolor abstract eyelet ...,key feature elegance polyester multicolor abst...,Home Furnishing,4,key feature elegance polyester multicolor abst...
1,1,7b72c92c2f6c40268628ec5f14c6d590,sathiyas cotton bath towel,specification cotton bath towel bath towel red...,Baby Care,0,specification cotton bath towel bath towel red...
2,2,64d5d4a258243731dc7bbb1eef49ad74,eurospa cotton terry face towel set,key feature cotton terry face towel set size s...,Baby Care,0,key feature cotton terry face towel set size s...


In [110]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050 entries, 0 to 1049
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          1050 non-null   int64 
 1   uniq_id             1050 non-null   object
 2   product_name_clean  1050 non-null   object
 3   description_clean   1050 non-null   object
 4   main_category       1050 non-null   object
 5   main_category_num   1050 non-null   int64 
 6   description_name    1050 non-null   object
dtypes: int64(2), object(5)
memory usage: 57.5+ KB


# Features exctractions 

## Bag of words vs TF-IDF (test avec les trois colonnes : description, product_name, description+product_name)

In [111]:
vectorizers = [CountVectorizer(),TfidfVectorizer()]
columns = [df['description_clean'],df['product_name_clean'],df['description_name']]

def creat_text_vectors(vectorizer, columns):
    
    vect = vectorizer
    bow = vectorizer.fit_transform(columns)
    bow = pd.DataFrame(bow.toarray(),columns = vectorizer.get_feature_names_out())
    
    return bow

all_df = []

for vectorizer in vectorizers :
    for column in columns:
        all_df.append(creat_text_vectors(vectorizer, column))
        

In [112]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Définir les noms de colonnes et de vectorizers
column_names = ['description_clean', 'product_name_clean', 'description_name']
vectorizer_names = ['CountVectorizer', 'TfidfVectorizer']

# Définir les noms de vos dataframes
df_names = []
for col in column_names:
    for vec in vectorizer_names:
        df_names.append(f"{vec}_{col}")

# Définir les combinaisons de colonnes et de vectorizers
combinations = [(CountVectorizer(), df['description_clean']),
                (CountVectorizer(), df['product_name_clean']),
                (CountVectorizer(), df['description_name']),
                (TfidfVectorizer(), df['description_clean']),
                (TfidfVectorizer(), df['product_name_clean']),
                (TfidfVectorizer(), df['description_name'])]

# Créer les dataframes et les assigner à des variables avec des noms explicites
for i, (vectorizer, column) in enumerate(combinations):
    name = df_names[i]
    bow = vectorizer.fit_transform(column)
    bow = pd.DataFrame(bow.toarray(), columns = vectorizer.get_feature_names_out())
    bow.columns = [f"{name}_{col}" for col in bow.columns]
    globals()[name] = bow

# Faire une liste avec les dataframes créés
df_list = ['CountVectorizer_description_clean','CountVectorizer_product_name_clean','CountVectorizer_description_name','TfidfVectorizer_description_clean','TfidfVectorizer_product_name_clean','TfidfVectorizer_description_name']

### Test CountVectorizer vs TfidfVectorizer

In [120]:
def classification_accuracy(i):
    train_data, test_data, train_labels, test_labels = train_test_split(eval(df_names[i]), df.main_category_num, test_size=0.2, random_state=42)
    classifier = SVC()
    classifier.fit(train_data,train_labels)
    predictions = classifier.predict(test_data)
    accuracy = accuracy_score(test_labels, predictions)
    print(f"Accuracy with {df_names[i]} {round((accuracy * 100),2)}%")

In [121]:
for i in range(len(df_names)):
    classification_accuracy(i)

Accuracy with CountVectorizer_description_clean 80.48%
Accuracy with TfidfVectorizer_description_clean 89.52%
Accuracy with CountVectorizer_product_name_clean 88.1%
Accuracy with TfidfVectorizer_product_name_clean 92.38%
Accuracy with CountVectorizer_description_name 91.43%
Accuracy with TfidfVectorizer_description_name 94.29%


La meilleur accuracy pour la classification est avec la méthode Tf-idf avec la colonne qui contient les description et le product name.