In [1]:
import pandas as pd
import numpy as np
import re


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import FrenchStemmer


# Text-normalisation resources used by the tokenizer cell further down.
stemmer = FrenchStemmer()
french_stop_words = stopwords.words('french')


# Path and worksheet name of an .xlsb workbook.
# NOTE(review): neither name is referenced by the visible cells — confirm they are still needed.
sheet_name = '20210614 Ecommerce sales'
file_path = '../../data/ecommerce_sales.xlsb'

In [2]:
# NOTE(review): `test` is not referenced by any later visible cell — confirm it is still needed.
test = pd.read_csv(filepath_or_buffer='../../data/category_data.csv')

In [3]:
# Frame the model is trained and evaluated on; the cells below read its
# 'Libellé produit' (text) and 'Nature' (label) columns.
data = pd.read_csv(filepath_or_buffer='../../data/category_data_unique_title.csv')

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['Libellé produit'],
    data['Nature'],
    test_size=0.2,
    random_state=42,
)

In [5]:
# Built once: the original rebuilt set(french_stop_words) for every single token.
_FRENCH_STOP_WORD_SET = frozenset(word.lower() for word in french_stop_words)


def clean_and_tokenize(text):
    """Turn a piece of text into a list of stemmed French tokens.

    Steps: remove every character that is neither a word character nor
    whitespace, tokenize with NLTK's French tokenizer, drop French stop words,
    then stem the remaining tokens with the module-level ``stemmer``.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: Stemmed tokens in their original order, stop words removed.
    """
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text, language='french')
    # Case-insensitive match so a capitalised stop word is dropped too when the
    # caller has not lower-cased the text beforehand.
    return [
        stemmer.stem(word)
        for word in tokens
        if word.lower() not in _FRENCH_STOP_WORD_SET
    ]
    

In [6]:

# Unigram + bigram TF-IDF features built from the tokens returned by clean_and_tokenize.
tfidf = TfidfVectorizer(
    stop_words=french_stop_words,
    max_features=5000,
    use_idf=True,
    analyzer='word',
    tokenizer=clean_and_tokenize,
    # A custom tokenizer replaces the default regex; token_pattern=None states that
    # explicitly and avoids scikit-learn's "token_pattern will not be used" UserWarning.
    token_pattern=None,
    ngram_range=(1, 2),
)
# Fit the vocabulary and IDF weights on the training split only, then reuse them on the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)



In [7]:
# Multinomial naive Bayes with default hyper-parameters, fitted on the training TF-IDF matrix.
clf = MultinomialNB()
clf.fit(X=X_train_tfidf, y=y_train)

In [8]:
# Predicted label for every row of the held-out split.
y_pred = clf.predict(X=X_test_tfidf)


In [9]:
print("Model performance:")
# Labels that are never predicted have an undefined precision; zero_division=0 reports
# them as 0.00 (the same numbers as before) without an UndefinedMetricWarning per metric.
print(classification_report(y_test, y_pred, zero_division=0))

Model performance:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                          precision    recall  f1-score   support

               abat jour       0.00      0.00      0.00         4
             abattant wc       1.00      0.17      0.29        12
          abri de jardin       1.00      0.11      0.20         9
          acc telephonie       0.36      0.90      0.51        98
      access photo video       0.00      0.00      0.00         8
     access. pc tablette       0.00      0.00      0.00        28
         accessoire aspi       0.00      0.00      0.00        23
    accessoire autoradio       0.00      0.00      0.00         2
     accessoire barbecue       0.00      0.00      0.00         2
      accessoire biberon       0.00      0.00      0.00         1
       accessoire biblio       0.00      0.00      0.00         1
       accessoire bureau       0.00      0.00      0.00        17
      accessoire camping       0.00      0.00      0.00        14
     accessoire cave vin       0.00      0.00      0.00         1
    acces

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
# Fraction of held-out rows whose predicted label equals the true label.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy: 0.5689330825047031


In [11]:

# Line up each held-out text with its predicted and its true label for visual inspection.
pred_y_df = pd.DataFrame(
    {
        'x_test': X_test,
        'y_pred': y_pred,
        'y_test': y_test,
    }
)
pred_y_df

Unnamed: 0,x_test,y_pred,y_test
23352,lot de 2 chaises velours gris et pieds metal n...,chaise,chaise
20787,matelas merinos mimic 90x190 cm matelas ressorts,matelas,matelas
971,drap plat uni en coton 240 x 290 cm bleu,drap housse,drap housse
3914,canape droit convertible 2 5 places,canape droit,canape droit
21339,pack complet lit tiroir junior petit elephant ...,lit jeune,ensemble chambre
...,...,...,...
624,altolattes sommier 2x20 lattes 140x190cm,sommier,sommier
9846,cible jeu de flechettes electronique 27 jeux j...,meuble tv,jeu de flechettes
7990,housse de couette 220x240 cm avec ses 2 taies ...,housse de couette,housse de couette
18337,chaise fauteuil scandinave frida tissu gris clair,fauteuil,fauteuil


In [12]:
# Score every row of `data` with the already-fitted vectorizer and classifier.
# NOTE(review): `data` is the frame the training split was drawn from, so these
# predictions include rows the model was fitted on.
X_all_tfidf = tfidf.transform(raw_documents=data['Libellé produit'])
data['Predicted_Category'] = clf.predict(X=X_all_tfidf)

In [13]:
# True where the predicted category disagrees with the existing 'Nature' label.
data['Differently_Categorized'] = data['Nature'].ne(data['Predicted_Category'])


In [14]:
# Expose the model's predictions under a second column name, 'Recategorized_Nature'.
data['Recategorized_Nature'] = data['Predicted_Category']

In [15]:
# Print statistics
print(
    "Number of differently categorized items:",
    data['Differently_Categorized'].sum(),
)
print(
    "Percentage of differently categorized items: {:.2f}%".format(
        100 * data['Differently_Categorized'].mean()
    )
)

# Display some examples of recategorized items
print("\nExamples of recategorized items:")
print(
    data.loc[
        data['Differently_Categorized'],
        ['Libellé produit', 'Nature', 'Recategorized_Nature'],
    ].head(10)
)



Number of differently categorized items: 22394
Percentage of differently categorized items: 40.12%

Examples of recategorized items:
                                      Libellé produit        Nature  \
1                   ours en peluche geant 150 cm brun       peluche   
2                  ours en peluche geant 100 cm blanc       peluche   
9             console vintage leoni motifs graphiques        bureau   
10  banc d entree capitonne en bambou tissu gris f...   banc de lit   
14  panneau decoratif universel 160x60 cm flower gris       panneau   
15       bibliotheque vintage noemi bois pied epingle  bibliotheque   
24                   lot de 3 etageres murales grises  bibliotheque   
27  armoire vitrine led bibliotheque 90 cm commode...       vitrine   
46  ecouteur sans fil bluetooth pour smartphone et...    microphone   
51         vestiaire d entree pedro design industriel     vestiaire   

    Recategorized_Nature  
1   tapis de salon et ch  
2              meuble tv  
9   tapis de s