# Feature extraction : Bag of words

## Importation des librairies

In [1]:
import numpy as np
import pandas as pd
from seaborn import countplot
from sklearn import preprocessing
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

## Chargement des données 

In [2]:
# Keep rendered DataFrames compact: at most 10 rows, 10 columns and
# 50 characters per cell.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_colwidth', 50)

# Read the dataset used by every cell below.
df = pd.read_csv('./csv/text_clean.csv')

In [3]:
# Preview the first three rows to check that the CSV loaded as expected.
df.head(3)

Unnamed: 0.1,Unnamed: 0,uniq_id,product_name_clean,description_clean,main_category,main_category_num,description_name
0,0,55b85ea15a1536d46b7190ad6fff8ce7,elegance polyester multicolor abstract eyelet ...,key feature elegance polyester multicolor abst...,Home Furnishing,4,key feature elegance polyester multicolor abst...
1,1,7b72c92c2f6c40268628ec5f14c6d590,sathiyas cotton bath towel,specification cotton bath towel bath towel red...,Baby Care,0,specification cotton bath towel bath towel red...
2,2,64d5d4a258243731dc7bbb1eef49ad74,eurospa cotton terry face towel set,key feature cotton terry face towel set size s...,Baby Care,0,key feature cotton terry face towel set size s...


In [4]:
# Print the frame summary: index range, columns, non-null counts, dtypes
# and memory usage.
# NOTE(review): the saved output below lists 7 columns while the saved
# head() output above shows 8 (an extra 'Unnamed: 0.1') — the outputs look
# like they come from different runs; re-run both cells to confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050 entries, 0 to 1049
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          1050 non-null   int64 
 1   uniq_id             1050 non-null   object
 2   product_name_clean  1050 non-null   object
 3   description_clean   1050 non-null   object
 4   main_category       1050 non-null   object
 5   main_category_num   1050 non-null   int64 
 6   description_name    1050 non-null   object
dtypes: int64(2), object(5)
memory usage: 57.5+ KB


# Feature extraction

## Bag of words vs TF-IDF (test avec les trois colonnes : description, product_name, description+product_name)

In [5]:
# Text columns to vectorize and the vectorizer templates to compare.
column_names = ['description_clean', 'product_name_clean', 'description_name']
vectorizers = [{'name' :'Count', 'vector' : CountVectorizer()},
                {'name' : 'Tfidf', 'vector' : TfidfVectorizer()}]

# One (dataframe name, vectorizer, column) tuple per combination.
# Despite its name this is a list of tuples; the name is kept because the
# evaluation cell iterates over it.
dataframe_dict = []

for col in column_names:
    for vect_item in vectorizers:
        df_name = f"{vect_item['name']}_{col}"
        # Give every combination its own unfitted copy of the template.
        # Sharing one instance across columns would refit it each time, so
        # every tuple would end up holding a vectorizer fitted on the last
        # column only.
        vectorizer = clone(vect_item['vector'])
        dataframe_dict.append((df_name, vectorizer, col))

# Build one dense document-term DataFrame per combination.
bow_frames = {}
for df_name, vectorizer, col in dataframe_dict:
    bow = vectorizer.fit_transform(df[col])
    bow = pd.DataFrame(bow.toarray(), columns = vectorizer.get_feature_names_out())
    # Explicit registry: lets callers fetch a frame by name without relying
    # on dynamically created variables.
    bow_frames[df_name] = bow
    # Also publish the frame as a module-level variable, because
    # classification_accuracy resolves frames by variable name.
    globals()[df_name] = bow

### Test CountVectorizer vs TfidfVectorizer

In [6]:
def classification_accuracy(df_name):
    """Train an SVC on one feature DataFrame and report its test accuracy.

    The feature DataFrame is looked up by name among the module-level
    variables, split 80/20 (``random_state=42``) together with
    ``df.main_category_num`` as labels, and used to fit an ``SVC`` with
    default parameters.

    Parameters
    ----------
    df_name : str
        Name of a module-level variable holding the feature DataFrame,
        e.g. ``"Tfidf_description_name"``.

    Returns
    -------
    float
        The value returned by ``accuracy_score`` on the held-out 20 %.
        The same value is printed as a percentage.

    Raises
    ------
    NameError
        If no module-level variable named ``df_name`` exists.
    """
    # Plain dictionary lookup instead of eval(): eval would execute any
    # expression contained in the string, which is unnecessary for a
    # variable lookup. NameError is kept so a missing name fails the same way.
    try:
        features = globals()[df_name]
    except KeyError:
        raise NameError(f"name '{df_name}' is not defined") from None

    # NOTE(review): the feature frames are produced by fit_transform on the
    # full column before this split, so the vectorizer has already seen the
    # test documents — consider fitting on the training part only.
    train_data, test_data, train_labels, test_labels = train_test_split(
        features, df.main_category_num, test_size=0.2, random_state=42
    )
    classifier = SVC()
    classifier.fit(train_data, train_labels)
    predictions = classifier.predict(test_data)
    accuracy = accuracy_score(test_labels, predictions)
    print(f"Accuracy with {df_name} {round((accuracy * 100),2)}%")
    # Return the score so callers can collect and compare results instead
    # of parsing the printed text.
    return accuracy

In [7]:
# Score every vectorizer/column combination. Only the frame name (first
# field of each tuple) is needed by classification_accuracy.
for combination in dataframe_dict:
    classification_accuracy(combination[0])

Accuracy with Count_description_clean 80.48%
Accuracy with Tfidf_description_clean 92.38%
Accuracy with Count_product_name_clean 89.52%
Accuracy with Tfidf_product_name_clean 91.43%
Accuracy with Count_description_name 88.1%
Accuracy with Tfidf_description_name 94.29%


La meilleure accuracy pour la classification (94,29 %) est obtenue avec la méthode TF-IDF appliquée à la colonne qui combine la description et le nom du produit (`description_name`).