In [44]:
import pandas as pd
import numpy as np
import re


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder

from lightgbm import LGBMClassifier

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import FrenchStemmer


french_stop_words = stopwords.words('french')
stemmer = FrenchStemmer()


file_path = '../../data/ecommerce_sales.xlsb'
sheet_name = '20210614 Ecommerce sales'

In [45]:
test = pd.read_csv('../../data/category_data.csv')

In [46]:

data = pd.read_csv('../../data/category_data_unique_title.csv')

In [47]:
X_train, X_test, y_train, y_test = train_test_split(
    data['Libellé produit'], data['Nature'], test_size=0.20, random_state=42
)

In [48]:
def clean_and_tokenize(text):  
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text, language='french')
    tokens = [stemmer.stem(word) for word in tokens if word not in set(french_stop_words)]
    return tokens
    

In [49]:

tfidf = TfidfVectorizer(stop_words=french_stop_words, max_features=5000, use_idf=True, analyzer='word', tokenizer=clean_and_tokenize, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)



In [50]:
clf = LGBMClassifier(learning_rate=3e-5, objective='multiclass')
clf.fit(X_train_tfidf, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.328245 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 136803
[LightGBM] [Info] Number of data points in the train set: 44648, number of used features: 4227
[LightGBM] [Info] Start training from score -9.097127
[LightGBM] [Info] Start training from score -6.814744
[LightGBM] [Info] Start training from score -6.945365
[LightGBM] [Info] Start training from score -4.683117
[LightGBM] [Info] Start training from score -9.097127
[LightGBM] [Info] Start training from score -7.528511
[LightGBM] [Info] Start training from score -6.006084
[LightGBM] [Info] Start training from score -10.013418
[LightGBM] [Info] Start training from score -6.516910
[LightGBM] [Info] Start training from score -7.873351
[LightGBM] [Info] Start training from score -10.706565
[LightGBM] [Info] Start training from score -7.

In [51]:
y_pred = clf.predict(X_test_tfidf)


In [52]:
print("Model performance:")
print(classification_report(y_test, y_pred))

Model performance:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                          precision    recall  f1-score   support

               abat jour       0.00      0.00      0.00         4
             abattant wc       0.00      0.00      0.00        12
          abri de jardin       0.00      0.00      0.00         9
          acc telephonie       0.00      0.00      0.00        98
      access photo video       0.00      0.00      0.00         8
     access. pc tablette       0.00      0.00      0.00        28
         accessoire aspi       0.00      0.00      0.00        23
    accessoire autoradio       0.00      0.00      0.00         2
     accessoire barbecue       0.00      0.00      0.00         2
      accessoire biberon       0.00      0.00      0.00         1
       accessoire biblio       0.00      0.00      0.00         1
       accessoire bureau       0.00      0.00      0.00        17
      accessoire camping       0.00      0.00      0.00        14
     accessoire cave vin       0.00      0.00      0.00         1
    acces

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [53]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.0546448087431694


In [54]:
pred_y_df = pd.DataFrame({'x_test': X_test,'y_pred': y_pred, 'y_test':y_test})
pred_y_df

Unnamed: 0,x_test,y_pred,y_test
23352,lot de 2 chaises velours gris et pieds metal n...,matelas,chaise
20787,matelas merinos mimic 90x190 cm matelas ressorts,matelas,matelas
971,drap plat uni en coton 240 x 290 cm bleu,matelas,drap housse
3914,canape droit convertible 2 5 places,matelas,canape droit
21339,pack complet lit tiroir junior petit elephant ...,matelas,ensemble chambre
...,...,...,...
624,altolattes sommier 2x20 lattes 140x190cm,matelas,sommier
9846,cible jeu de flechettes electronique 27 jeux j...,matelas,jeu de flechettes
7990,housse de couette 220x240 cm avec ses 2 taies ...,matelas,housse de couette
18337,chaise fauteuil scandinave frida tissu gris clair,matelas,fauteuil


In [55]:
X_all_tfidf = tfidf.transform(data['Libellé produit'])
all_pred = clf.predict(X_all_tfidf)
data['Predicted_Category'] = all_pred

In [56]:
data['Differently_Categorized'] = data['Nature'] != data['Predicted_Category']


In [57]:
data['Recategorized_Nature'] = data['Predicted_Category']

In [58]:
# Print statistics
print("Number of differently categorized items:", data['Differently_Categorized'].sum())
print("Percentage of differently categorized items: {:.2f}%".format(data['Differently_Categorized'].mean() * 100))

# Display some examples of recategorized items
print("\nExamples of recategorized items:")
print(data[data['Differently_Categorized']][['Libellé produit', 'Nature', 'Recategorized_Nature']].head(10))



Number of differently categorized items: 52844
Percentage of differently categorized items: 94.68%

Examples of recategorized items:
                                     Libellé produit               Nature  \
0       table basse carree detroit design industriel          table basse   
1                  ours en peluche geant 150 cm brun              peluche   
2                 ours en peluche geant 100 cm blanc              peluche   
3    lot de 4 chaises mia noires pour salle a manger               chaise   
4                 meuble tv falko bois blanc et gris            meuble tv   
5  meuble a chaussures imitation h tre 3 portes b...  meuble a chaussures   
6  meuble a chaussures gris 3 portes blanches ave...  meuble a chaussures   
7   tagere a double position tania bois imitation...         bibliotheque   
8  table a manger georgia 6 personnes blanche et ...                table   
9            console vintage leoni motifs graphiques               bureau   

  Recategorized_Nat