# Prétraitement du texte

## Importation des librairies

In [141]:
import pandas as pd
import nltk
import string
from nltk.stem import WordNetLemmatizer
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import gensim
from gensim.models import Word2Vec
from sklearn import preprocessing
import gensim.downloader as api

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/justinelv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/justinelv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /Users/justinelv/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/justinelv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Chargement des données 

In [104]:
pd.options.display.max_rows = 10
pd.options.display.max_columns = 10
pd.options.display.max_colwidth = 50

df = pd.read_csv('cleaned.csv')

In [105]:
df.head(3)

Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,...,description,product_rating,overall_rating,brand,product_specifications
0,55b85ea15a1536d46b7190ad6fff8ce7,2016-04-30 03:22:56 +0000,http://www.flipkart.com/elegance-polyester-mul...,Elegance Polyester Multicolor Abstract Eyelet ...,"[""Home Furnishing >> Curtains & Accessories >>...",...,Key Features of Elegance Polyester Multicolor ...,No rating available,No rating available,Elegance,"{""product_specification""=>[{""key""=>""Brand"", ""v..."
1,7b72c92c2f6c40268628ec5f14c6d590,2016-04-30 03:22:56 +0000,http://www.flipkart.com/sathiyas-cotton-bath-t...,Sathiyas Cotton Bath Towel,"[""Baby Care >> Baby Bath & Skin >> Baby Bath T...",...,Specifications of Sathiyas Cotton Bath Towel (...,No rating available,No rating available,Sathiyas,"{""product_specification""=>[{""key""=>""Machine Wa..."
2,64d5d4a258243731dc7bbb1eef49ad74,2016-04-30 03:22:56 +0000,http://www.flipkart.com/eurospa-cotton-terry-f...,Eurospa Cotton Terry Face Towel Set,"[""Baby Care >> Baby Bath & Skin >> Baby Bath T...",...,Key Features of Eurospa Cotton Terry Face Towe...,No rating available,No rating available,Eurospa,"{""product_specification""=>[{""key""=>""Material"",..."


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050 entries, 0 to 1049
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   uniq_id                  1050 non-null   object 
 1   crawl_timestamp          1050 non-null   object 
 2   product_url              1050 non-null   object 
 3   product_name             1050 non-null   object 
 4   product_category_tree    1050 non-null   object 
 5   pid                      1050 non-null   object 
 6   retail_price             1049 non-null   float64
 7   discounted_price         1049 non-null   float64
 8   image                    1050 non-null   object 
 9   is_FK_Advantage_product  1050 non-null   bool   
 10  description              1050 non-null   object 
 11  product_rating           1050 non-null   object 
 12  overall_rating           1050 non-null   object 
 13  brand                    712 non-null    object 
 14  product_specifications  

## Sélection des colonnes utiles

In [106]:
df_process = df[['uniq_id','product_name','description','product_category_tree']].copy()
df_process.head(1)

Unnamed: 0,uniq_id,product_name,description,product_category_tree
0,55b85ea15a1536d46b7190ad6fff8ce7,Elegance Polyester Multicolor Abstract Eyelet ...,Key Features of Elegance Polyester Multicolor ...,"[""Home Furnishing >> Curtains & Accessories >>..."


## Nettoyage des données textes

### Fonction pour nettoyer les données 

In [107]:
# pour enlever la ponctuation
ponctuations = string.punctuation + '�' + '°' + '…' + '±' + '•' + '’' + '–' + '“' + '”' 

In [108]:
# stopwords, ce sont les mots qui apparaissent très fréquemment mais qui n’apporte pas de sens à la phrase (comme « de », « le », « une »)
stopwords = nltk.corpus.stopwords.words('english')
stopwords_custom = stopwords
stopwords_custom.append('yes')

In [109]:
# mots (words) qui proviennent d’un dictionnaire anglais (directement intégré à la librairie nltk)
words = set(nltk.corpus.words.words())

In [110]:
# un lemmatizer, cette objet nous permet de préserver la racine des mots de tel sorte que deux mots ayant une même souche seront considérés comme un seul et même mot 
lemmatizer = WordNetLemmatizer()

In [111]:
def text_prepocessing_description(df):
    
    preprocess_list = []
    
    for sentence in df :
        
        sentence_w_punct = "".join([i.lower() for i in sentence if i not in ponctuations])

        sentence_w_num = ''.join(i for i in sentence_w_punct if not i.isdigit())

        tokenize_sentence = nltk.tokenize.word_tokenize(sentence_w_num)

        words_w_stopwords = [i for i in tokenize_sentence if i not in stopwords_custom]

        words_lemmatize = (lemmatizer.lemmatize(w) for w in words_w_stopwords)

        sentence_clean = ' '.join(w for w in words_lemmatize if w.lower() in words or not w.isalpha())
        
        remove_one_two_letters_word = re.sub(r'\b[a-zA-Z]{1,2}(?=\s|$)','',sentence_clean)
        
        preprocess_list.append(remove_one_two_letters_word)
        
    return preprocess_list 

In [112]:
def text_prepocessing_product_name(df):
    
    preprocess_list = []
    
    for sentence in df :
        
        sentence_w_punct = "".join([i.lower() for i in sentence if i not in ponctuations])

        sentence_w_num = ''.join(i for i in sentence_w_punct if not i.isdigit())

        tokenize_sentence = nltk.tokenize.word_tokenize(sentence_w_num)

        words_w_stopwords = [i for i in tokenize_sentence if i not in stopwords_custom]

        words_lemmatize = (lemmatizer.lemmatize(w) for w in words_w_stopwords)

        sentence_clean = ' '.join(w for w in words_lemmatize)
        
        remove_one_two_letters_word = re.sub(r'\b[a-zA-Z]{1,2}(?=\s|$)','',sentence_clean)
        
        preprocess_list.append(remove_one_two_letters_word)
        
    return preprocess_list 

### Nettoyage de la colonne description

In [113]:
description_clean = text_prepocessing_description(df_process['description'])

In [130]:
df_process['description_clean'] = description_clean
df_process['description_clean'][0]

'key feature elegance polyester multicolor abstract eyelet door curtain floral polyester multicolor abstract eyelet door curtain height pack price  curtain look curtain made high quality polyester feature eyelet style stitch metal make room environment romantic curtain ant wrinkle anti shrinkage elegant home bright modernistic appeal design attention sure steal heart contemporary eyelet valance curtain slide smoothly draw apart first thing morning welcome bright sun ray want wish good morning whole world draw close evening create special moment joyous beauty given soothing print bring home elegant curtain softly filter light room get right amount elegance polyester multicolor abstract eyelet door curtain height pack general brand elegance designed door type eyelet model name abstract polyester door curtain set model  duster color multicolor dimension length box number content sale package pack sale package curtain body design material polyester'

In [117]:
#df_process.description_clean.to_csv('description_clean.csv', index = False)

### Nettoyage de la colonne product_name

In [118]:
product_name_clean = text_prepocessing_product_name(df_process['product_name'])

In [129]:
df_process['product_name_clean'] = product_name_clean
df_process['product_name_clean'][0]

'elegance polyester multicolor abstract eyelet door curtain'

In [120]:
#df_process.product_name_clean.to_csv('product_name_clean.csv', index = False)

### Nettoyage des catégories

In [132]:
# définir une fonction lambda pour extraire la catégorie principale
extract_category = lambda x: x.split(">>")[0].strip().strip(']["')

# appliquer la fonction à la colonne 'product_category_tree' du DataFrame
df_process['main_category'] = df_process['product_category_tree'].apply(extract_category)
df_process['main_category'][0]

'Home Furnishing'

In [133]:
# définir une fonction lambda pour extraire la sous-catégorie
extract_sub_category = lambda x: x.split(">>")[1].strip(' "').strip(' "]\'')

# appliquer la fonction à la colonne 'product_category_tree' du DataFrame
df_process['sub_category'] = df_process['product_category_tree'].apply(extract_sub_category)
df_process['sub_category'][0]

'Curtains & Accessories'

In [135]:
main_category = df_process.groupby('main_category')['uniq_id'].nunique()
main_category

main_category
Baby Care                     150
Beauty and Personal Care      150
Computers                     150
Home Decor & Festive Needs    150
Home Furnishing               150
Kitchen & Dining              150
Watches                       150
Name: uniq_id, dtype: int64

In [136]:
sub_category = df_process.groupby('sub_category')['uniq_id'].nunique()
sub_category 

sub_category
Baby & Kids Gifts        15
Baby Bath & Skin         14
Baby Bedding             15
Baby Grooming             4
Bar & Glassware           8
                       ... 
Tableware & Cutlery       3
Tidy Home Furnishing      1
Wall Decor & Clocks      22
Women's Hygiene           1
Wrist Watches           149
Name: uniq_id, Length: 62, dtype: int64

Les catégories représentées dans la colonne main_category sont mieux réparties que les cotégories de la colonne sub_category. La colonne main_category sera donc choisit pour être le label.

In [138]:
label_encoder = preprocessing.LabelEncoder()
label_encoder .fit(df_process['main_category'])
df_process['main_category_num']=label_encoder.transform(df_process['main_category'])
df_process['main_category_num'][0:3]

0    4
1    0
2    0
Name: main_category_num, dtype: int64

In [139]:
main_category_num = df_process.groupby('main_category_num')['uniq_id'].nunique()
main_category_num

main_category_num
0    150
1    150
2    150
3    150
4    150
5    150
6    150
Name: uniq_id, dtype: int64

# Features exctractions 

## Bag of words

In [76]:
count_vectorizer = CountVectorizer()
bag_of_words_description = count_vectorizer.fit_transform(df_process['description_clean'])
bag_of_words_description = pd.DataFrame(bag_of_words_description.toarray(),columns = count_vectorizer.get_feature_names_out())
bag_of_words_description

Unnamed: 0,ability,able,abode,abrasion,abroad,absolute,absorbency,absorbent,absorber,absorbing,abstract,accent,access,accessory,accident,accidental,accommodate,according,ache,aching,acid,acne,across,acrylic,act,...,wrap,wrapped,wrapper,wring,wrinkle,wrist,write,wrought,yarn,year,yellow,yet,yield,york,youd,young,youth,youthful,youve,zero,zinc,zip,zipper,zone,zoom
0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1046,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1047,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1048,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [77]:
bag_of_words_product_name = count_vectorizer.fit_transform(df_process.product_name_clean)
bag_of_words_product_name = pd.DataFrame(bag_of_words_product_name.toarray(),columns = count_vectorizer.get_feature_names_out())
bag_of_words_product_name

Unnamed: 0,aapno,aari,aarika,abklgrngrngrn,abklplplpnk,abklplpnkpnk,abstract,abstrcts,acer,aci,acne,acrylic,actu,actx,adaa,adapter,addarkorange,addiction,adf,adhesive,adi,adidas,adino,adjustable,adsl,...,wrap,wrapper,wrn,wsc,wyb,xemex,xgb,yardley,ybscr,year,yellow,ygs,yiboo,york,young,youth,yuva,yves,zaicus,zero,zikrak,zingalalaa,zone,zora,zyxel
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1046,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1047,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1048,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


##  TF-IDF

In [79]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df_process['description_clean'])
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
tf_idf_description = pd.DataFrame(denselist, columns=feature_names)

In [80]:
tf_idf_description 

Unnamed: 0,ability,able,abode,abrasion,abroad,absolute,absorbency,absorbent,absorber,absorbing,abstract,accent,access,accessory,accident,accidental,accommodate,according,ache,aching,acid,acne,across,acrylic,act,...,wrap,wrapped,wrapper,wring,wrinkle,wrist,write,wrought,yarn,year,yellow,yet,yield,york,youd,young,youth,youthful,youve,zero,zinc,zip,zipper,zone,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.181157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.07926,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.201118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.074847,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.046048,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1046,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1047,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1048,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [81]:
vectors = vectorizer.fit_transform(df_process['product_name_clean'])
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
tf_idf_product_name = pd.DataFrame(denselist, columns=feature_names)

In [82]:
tf_idf_product_name

Unnamed: 0,aapno,aari,aarika,abklgrngrngrn,abklplplpnk,abklplpnkpnk,abstract,abstrcts,acer,aci,acne,acrylic,actu,actx,adaa,adapter,addarkorange,addiction,adf,adhesive,adi,adidas,adino,adjustable,adsl,...,wrap,wrapper,wrn,wsc,wyb,xemex,xgb,yardley,ybscr,year,yellow,ygs,yiboo,york,young,youth,yuva,yves,zaicus,zero,zikrak,zingalalaa,zone,zora,zyxel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.315748,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.379881,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1046,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1047,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1048,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Test CountVectorizer vs TfidfVectorizer

In [83]:
# Séparation des données en ensembles d'entraînement et de test
train_data, test_data, train_labels, test_labels = train_test_split(bag_of_words_description, df_process.main_category_num, test_size=0.2, random_state=42)

# Entraînement d'un modèle de classification avec les features extraites
classifier = SVC()
classifier.fit(train_data,train_labels)

# Évaluation de la performance du modèle avec l'ensemble de test
predictions = classifier.predict(test_data)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy with {} features: {:.2f}%".format('bag of words description', accuracy * 100))

Accuracy with bag of words description features: 80.48%


In [84]:
# Séparation des données en ensembles d'entraînement et de test
train_data, test_data, train_labels, test_labels = train_test_split(bag_of_words_product_name, df_process.main_category_num, test_size=0.2, random_state=42)

# Entraînement d'un modèle de classification avec les features extraites
classifier = SVC()
classifier.fit(train_data,train_labels)

# Évaluation de la performance du modèle avec l'ensemble de test
predictions = classifier.predict(test_data)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy with {} features: {:.2f}%".format('bag of words product name', accuracy * 100))

Accuracy with bag of words product name features: 89.52%


In [85]:
# Séparation des données en ensembles d'entraînement et de test
train_data, test_data, train_labels, test_labels = train_test_split(tf_idf_description, df_process.main_category_num, test_size=0.2, random_state=42)

# Entraînement d'un modèle de classification avec les features extraites
classifier = SVC()
classifier.fit(train_data,train_labels)

# Évaluation de la performance du modèle avec l'ensemble de test
predictions = classifier.predict(test_data)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy with {} features: {:.2f}%".format('Tf-idf description', accuracy * 100))

Accuracy with Tf-idf description features: 92.38%


In [86]:
# Séparation des données en ensembles d'entraînement et de test
train_data, test_data, train_labels, test_labels = train_test_split(tf_idf_product_name, df_process.main_category_num, test_size=0.2, random_state=42)

# Entraînement d'un modèle de classification avec les features extraites
classifier = SVC()
classifier.fit(train_data,train_labels)

# Évaluation de la performance du modèle avec l'ensemble de test
predictions = classifier.predict(test_data)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy with {} features: {:.2f}%".format('Tf-idf product name', accuracy * 100))

Accuracy with Tf-idf product name features: 91.43%


In [89]:
df_process['description_name'] = df_process.description_clean + df_process.product_name_clean

In [88]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df_process['description_name'])
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
tf_idf_description_name = pd.DataFrame(denselist, columns=feature_names)
tf_idf_description_name

Unnamed: 0,aari,ability,abklgrngrngrn,abklplplpnk,abklplpnkpnk,able,abode,abodecocovey,abrasion,abroad,absolute,absorbency,absorbent,absorber,absorbing,abstract,abstrcts,accent,access,accessory,accident,accidental,accommodate,according,acer,...,wrought,wsc,wyb,xgb,yarn,ybscr,year,yellow,yet,yiboo,yield,york,youd,young,youth,youthful,youve,yuva,zero,zinc,zip,zipper,zone,zoom,zora
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.209474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.185393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068888,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.042383,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1047,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
# Séparation des données en ensembles d'entraînement et de test
train_data, test_data, train_labels, test_labels = train_test_split(tf_idf_description_name, df_process.main_category_num, test_size=0.2, random_state=42)

# Entraînement d'un modèle de classification avec les features extraites
classifier = SVC()
classifier.fit(train_data,train_labels)

# Évaluation de la performance du modèle avec l'ensemble de test
predictions = classifier.predict(test_data)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy with {} features: {:.2f}%".format('Tf-idf description and product name', accuracy * 100))

Accuracy with Tf-idf description and product name features: 94.29%


In [91]:
count_vectorizer = CountVectorizer()
bag_of_words_description_name = count_vectorizer.fit_transform(df_process['description_name'])
bag_of_words_description_name = pd.DataFrame(bag_of_words_description_name.toarray(),columns = count_vectorizer.get_feature_names_out())
bag_of_words_description_name

Unnamed: 0,aari,ability,abklgrngrngrn,abklplplpnk,abklplpnkpnk,able,abode,abodecocovey,abrasion,abroad,absolute,absorbency,absorbent,absorber,absorbing,abstract,abstrcts,accent,access,accessory,accident,accidental,accommodate,according,acer,...,wrought,wsc,wyb,xgb,yarn,ybscr,year,yellow,yet,yiboo,yield,york,youd,young,youth,youthful,youve,yuva,zero,zinc,zip,zipper,zone,zoom,zora
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1046,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1047,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1048,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [45]:
# Séparation des données en ensembles d'entraînement et de test
train_data, test_data, train_labels, test_labels = train_test_split(bag_of_words_description_name, df_process.main_category_num, test_size=0.2, random_state=42)

# Entraînement d'un modèle de classification avec les features extraites
classifier = SVC()
classifier.fit(train_data,train_labels)

# Évaluation de la performance du modèle avec l'ensemble de test
predictions = classifier.predict(test_data)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy with {} features: {:.2f}%".format('Bags of words description and product name', accuracy * 100))

Accuracy with Bags of words description and product name features: 88.10%


### Word2Vec

In [92]:
# Préparer les données pour l'entraînement du modèle Word2Vec
sentences = [text.split() for text in df_process['description_name']]

In [93]:
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Args:
        list_of_docs: List of documents
        model: Gensim's Word Embedding

    Returns:
        List of document vectors
    """
    features = []

    for tokens in list_of_docs:
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for token in tokens:
            if token in model.wv:
                try:
                    vectors.append(model.wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features
    
vectorized_docs = vectorize(sentences, model=model)
len(vectorized_docs), len(vectorized_docs[0])

(1050, 100)

In [142]:
df_process['vectors_means'] = vectorized_docs
df_process['vectors_means']

0       [0.048451997, 0.052741077, -0.017014096, 0.035...
1       [0.018274853, 0.06604099, 0.012273788, 0.05909...
2       [0.020904202, 0.09313281, 0.00014671043, 0.059...
3       [-0.0043165246, 0.051694907, -0.045088943, 0.0...
4       [0.010901001, 0.048768923, -0.0025513915, 0.07...
                              ...                        
1045    [0.013211982, 0.055592205, 0.007732161, 0.0511...
1046    [-0.00069127965, 0.053532355, 0.018493427, 0.0...
1047    [0.055739265, 0.06684657, -0.038131714, 0.1533...
1048    [0.11247452, 0.09372877, 0.0034365447, 0.21565...
1049    [0.07366333, 0.099365234, -0.0085723875, 0.187...
Name: vectors_means, Length: 1050, dtype: object

In [95]:
#Séparation des données en ensembles d'entraînement et de test
train_data, test_data, train_labels, test_labels = train_test_split(vectorized_docs, df_process.main_category_num, test_size=0.2, random_state=42)

# Entraînement d'un modèle de classification avec les features extraites
classifier = SVC()
classifier.fit(train_data,train_labels)

# Évaluation de la performance du modèle avec l'ensemble de test
predictions = classifier.predict(test_data)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy with {} features: {:.2f}%".format('Word2Vec description and product name', accuracy * 100))

Accuracy with Word2Vec description and product name features: 52.38%


In [143]:
wv = api.load('word2vec-google-news-300')

KeyboardInterrupt: 

In [144]:
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Args:
        list_of_docs: List of documents
        model: Gensim's Word Embedding

    Returns:
        List of document vectors
    """
    features = []

    for tokens in list_of_docs:
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for token in tokens:
            if token in wv:
                try:
                    vectors.append(wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features
    
vectorized_docs = vectorize(sentences, model=wv)
len(vectorized_docs), len(vectorized_docs[0])

(1050, 300)

In [145]:
df_process['pre_vectors_means'] = vectorized_docs
df_process['pre_vectors_means']

0       [0.048451997, 0.052741077, -0.017014096, 0.035...
1       [0.018274853, 0.06604099, 0.012273788, 0.05909...
2       [0.020904202, 0.09313281, 0.00014671043, 0.059...
3       [-0.0043165246, 0.051694907, -0.045088943, 0.0...
4       [0.010901001, 0.048768923, -0.0025513915, 0.07...
                              ...                        
1045    [0.013211982, 0.055592205, 0.007732161, 0.0511...
1046    [-0.00069127965, 0.053532355, 0.018493427, 0.0...
1047    [0.055739265, 0.06684657, -0.038131714, 0.1533...
1048    [0.11247452, 0.09372877, 0.0034365447, 0.21565...
1049    [0.07366333, 0.099365234, -0.0085723875, 0.187...
Name: pre_vectors_means, Length: 1050, dtype: object

In [146]:
#Séparation des données en ensembles d'entraînement et de test
train_data, test_data, train_labels, test_labels = train_test_split(vectorized_docs, df_process.main_category_num, test_size=0.2, random_state=42)

# Entraînement d'un modèle de classification avec les features extraites
classifier = SVC()
classifier.fit(train_data,train_labels)

# Évaluation de la performance du modèle avec l'ensemble de test
predictions = classifier.predict(test_data)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy with {} features: {:.2f}%".format('Word2Vec description and product name', accuracy * 100))

Accuracy with Word2Vec description and product name features: 93.81%


### BERT

### USE (Universel Sentence Encoder)