### Notebook to train a product-category classifier using FastText embeddings and CatBoost

In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd
# The dataset contains 1451 rows in total, after removing 49 rows whose image could not be downloaded.
def get_data_without_images(
    csv_path: str = "../data/processed/cleaned_results.csv",
    test_size: float = 0.2,
    random_state: int = 42,
) -> list:
    """Load the cleaned product table and split it into train and test parts.

    Args:
        csv_path: CSV file to read. It must contain the columns ``name``,
            ``brandName`` and ``categoryName``.
        test_size: Passed to ``train_test_split`` as ``test_size``; the
            default 0.2 holds out 20% of the rows for testing.
        random_state: Seed passed to ``train_test_split`` so that the split
            is the same on every run.

    Returns:
        The list ``[X_train, X_test, y_train, y_test]`` produced by
        ``train_test_split``. The ``X`` parts hold the ``name`` and
        ``brandName`` columns, the ``y`` parts hold ``categoryName``.
    """
    df = pd.read_csv(csv_path)
    X = df[['name', 'brandName']]
    Y = df['categoryName']

    # Stratified splitting cannot be used here: some categories have only a
    # single row, and such a category cannot be divided between train and test.
    return train_test_split(X, Y, test_size=test_size, random_state=random_state)

In [3]:
# adding imports
import pandas as pd
import numpy as np
from gensim.models import FastText
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
import spacy

In [4]:
# Split the data into a training part (80%) and a test part (20%).
X_train,X_test,y_train,y_test = get_data_without_images()

In [5]:
# Load spaCy's small English pipeline; tokenize_text uses `nlp` to split text into tokens.
nlp = spacy.load("en_core_web_sm")

In [6]:

def tokenize_text(text):
    """Split ``text`` into lower-cased tokens, dropping punctuation and whitespace.

    Args:
        text: Value to tokenise. It is converted with ``str()`` first, so a
            non-string value is tokenised via its string form (for example,
            a missing value ``NaN`` produces the token ``'nan'``).

    Returns:
        A list of lower-cased token strings. The list is empty when the text
        contains nothing but punctuation and whitespace.
    """
    # Only the token text and the lexical flags is_punct / is_space are read
    # below, so run just the tokenizer (make_doc) instead of the whole
    # pipeline; the remaining pipeline components are not needed for these
    # attributes and make every call considerably slower.
    doc = nlp.make_doc(str(text))
    return [token.text.lower() for token in doc if not token.is_punct and not token.is_space]

def tokenize_column(column):
    """Tokenise every entry of ``column`` with ``tokenize_text``.

    Returns a list holding one token list per entry. An entry that yields no
    tokens is represented by the placeholder list ``['<unk>']`` so that no
    token list in the result is empty.
    """
    tokenized_rows = []
    for entry in column:
        entry_tokens = tokenize_text(entry)
        if not entry_tokens:
            entry_tokens = ['<unk>']
        tokenized_rows.append(entry_tokens)
    return tokenized_rows

In [7]:
# Tokenise the training product names and brand names: one token list per row.
product_tokens, brand_tokens = (
    tokenize_column(X_train[column_name]) for column_name in ('name', 'brandName')
)

In [8]:
# -------------------------------
# Step 3: Build the FastText training corpus
# -------------------------------
# Every product-name token list followed by every brand-name token list;
# each token list is treated as one "sentence" by FastText.
combined_corpus = [*product_tokens, *brand_tokens]

In [9]:
# Train a FastText model on the combined corpus to obtain 100-dimensional word embeddings.
# `sg` is documented as a 0/1 switch (1 = skip-gram, otherwise CBOW). The earlier value 4
# is not a documented setting; gensim appears to treat any non-zero value as skip-gram,
# so the intent is now stated explicitly with sg=1.
ft_model: FastText = FastText(
    sentences=combined_corpus,
    vector_size=100,  # dimensionality of each word vector
    window=3,
    min_count=1,      # keep tokens even if they occur only once
    sg=1,             # skip-gram
    epochs=100,
)

In [10]:
# Spot-check: display the learned embedding vector for the token 'revlon'.
ft_model.wv['revlon']

array([-0.19380973, -0.17312033,  0.14489503, -0.2472115 ,  0.55954975,
       -0.7385266 , -0.16032806,  0.12783523,  1.0224581 , -0.03216781,
        0.12178689,  0.5559957 , -0.12568694, -0.07842773, -0.37182537,
        0.3525978 , -0.31145954, -2.010154  , -0.81038016, -0.92585725,
        0.19795062,  0.43108428, -1.0330576 , -0.8915576 , -0.7708586 ,
       -0.1356315 , -1.3975393 , -1.343079  , -0.14235164, -0.15676445,
       -0.7619604 ,  0.4027679 ,  0.11804128,  0.32898068,  0.82878506,
        0.1668406 , -1.1847748 ,  0.47173795, -0.5412962 ,  0.24479671,
        0.04600191,  0.11579774,  0.0387099 , -0.6512558 , -0.14157335,
       -0.8928325 , -0.0169696 , -0.02891628, -0.26630569, -0.04782018,
       -0.7522785 ,  0.35130247, -0.6882591 ,  0.73537785, -0.6475959 ,
       -0.7879068 ,  0.1858443 , -0.4022274 , -0.18064018,  0.23419975,
       -0.81957287, -0.5313862 , -0.4055738 ,  0.9982064 , -0.12088407,
        0.03194728,  0.5622262 , -0.19736207,  0.14331809, -0.87

In [11]:
# -------------------------------
# Step 4: Get sentence vectors
# -------------------------------
def sentence_vector(text):
    """Embed ``text`` as the mean of the FastText vectors of its tokens.

    Args:
        text: Value to embed; it is tokenised with ``tokenize_text``.

    Returns:
        A 1-D NumPy array: the element-wise mean of the token vectors taken
        from ``ft_model.wv``.
    """
    # Text that yields no tokens (empty or punctuation-only) would make
    # np.mean() average an empty list and return a NaN scalar instead of a
    # vector, which then breaks np.vstack in the callers. Fall back to the
    # '<unk>' placeholder, mirroring what tokenize_column does for training.
    tokens = tokenize_text(text) or ['<unk>']
    vectors = [ft_model.wv[t] if t in ft_model.wv else ft_model.wv['<unk>'] for t in tokens]
    return np.mean(vectors, axis=0)

In [12]:

# Embed each training product name and brand name, one row per product.
X_product = np.vstack([sentence_vector(product_name) for product_name in X_train['name']])
X_brand = np.vstack([sentence_vector(brand_name) for brand_name in X_train['brandName']])

# Place the product and brand embeddings side by side to form the feature matrix.
X = np.hstack((X_product, X_brand))

In [13]:
# Expect (number of training rows, 200): a 100-dim product vector next to a 100-dim brand vector.
X.shape

(1160, 200)

In [14]:
# -------------------------------
# Step 5: Encode target labels
# -------------------------------
# Learn the category-name -> integer mapping from the training labels,
# then apply it to obtain the integer targets used for training.
label_encoder = LabelEncoder().fit(y_train)
y = label_encoder.transform(y_train)

In [15]:
# Display the distinct encoded class ids present in the training labels.
np.unique(y)

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122])

In [16]:
# Wrap the training feature matrix and the encoded labels in a CatBoost Pool.
from catboost import Pool

train_pool = Pool(data=X, label=y)

In [17]:
# -------------------------------
# Step 6: Train CatBoostClassifier
# -------------------------------
# NOTE(review): fit() is given no eval_set and the saved log runs through
# iteration 999, so early_stopping_rounds does not appear to take effect — confirm.
# NOTE(review): task_type='GPU' with devices='0' presumably needs a GPU to be
# available — confirm before running on another machine.
catboost_params = dict(
    iterations=1000,
    loss_function="MultiClass",
    eval_metric="Accuracy",
    auto_class_weights="Balanced",
    verbose=100,  # print a progress line every 100 iterations
    task_type='GPU',
    devices='0',
    early_stopping_rounds=50,
)
model = CatBoostClassifier(**catboost_params)

# Kept as the last expression so the cell still displays the fitted model.
model.fit(train_pool)

Learning rate set to 0.066901
0:	learn: 0.1227962	total: 153ms	remaining: 2m 33s
100:	learn: 0.9967362	total: 12.1s	remaining: 1m 47s
200:	learn: 0.9991093	total: 24s	remaining: 1m 35s
300:	learn: 0.9998120	total: 35.3s	remaining: 1m 22s
400:	learn: 0.9999191	total: 46.9s	remaining: 1m 10s
500:	learn: 1.0000000	total: 58.2s	remaining: 58s
600:	learn: 1.0000000	total: 1m 9s	remaining: 46.2s
700:	learn: 1.0000000	total: 1m 20s	remaining: 34.5s
800:	learn: 1.0000000	total: 1m 32s	remaining: 22.9s
900:	learn: 1.0000000	total: 1m 43s	remaining: 11.4s
999:	learn: 1.0000000	total: 1m 54s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f8ec31fcad0>

In [18]:
# -------------------------------
# Step 7: Prediction function
# -------------------------------
def predict_category(product_text, brand_text):
    """Predict the category name for a single product.

    Args:
        product_text: Product name/description to classify.
        brand_text: Brand name of the product.

    Returns:
        The predicted category, decoded back to its original label with
        ``label_encoder``.
    """
    prod_vec = sentence_vector(product_text)
    brand_vec = sentence_vector(brand_text)
    # The model expects a 2-D input: one row holding the product embedding
    # followed by the brand embedding, in the same order used for training.
    full_vec = np.concatenate([prod_vec, brand_vec]).reshape(1, -1)
    pred_class = model.predict(full_vec)
    # Flatten to 1-D before decoding: handing inverse_transform a column-shaped
    # array makes scikit-learn emit a column_or_1d DataConversionWarning.
    return label_encoder.inverse_transform(pred_class.astype(int).ravel())[0]

In [19]:
# Try the classifier on one hand-written example.
example_product, example_brand = "Revlon hair dye 10ml", "Revlon"
prediction = predict_category(example_product, example_brand)
print(f"Predicted category: {prediction}")

Predicted category: Hair Dye


  y = column_or_1d(y, warn=True)


In [20]:
# creating a test_pool using X_test
def create_test_pool(X_test, labels=None):
    """Build a CatBoost Pool from a frame of product and brand names.

    Args:
        X_test: Frame with ``name`` and ``brandName`` columns.
        labels: Labels belonging to the rows of ``X_test``. When omitted, the
            notebook-level ``y_test`` is used, which is only correct if
            ``X_test`` is the matching test split.

    Returns:
        A ``Pool`` whose features are the product embedding followed by the
        brand embedding for every row, with ``labels`` attached.
    """
    # Previously the labels were always read from the global y_test, whatever
    # frame was passed in; the default keeps that behaviour for existing calls.
    if labels is None:
        labels = y_test
    # NOTE(review): the notebook's y_test holds raw category names, whereas the
    # training pool was built from label-encoded integers — confirm before
    # using this pool for anything other than predict().
    product_vectors = np.vstack([sentence_vector(text) for text in X_test['name']])
    brand_vectors = np.vstack([sentence_vector(text) for text in X_test['brandName']])
    features = np.concatenate([product_vectors, brand_vectors], axis=1)
    return Pool(data=features, label=labels)

In [21]:
# Build the evaluation pool from the held-out test rows.
X_test_pool = create_test_pool(X_test)

In [22]:
# Predict a class for every row of the test pool.
y_pred = model.predict(X_test_pool)

In [23]:
# Flatten the predictions to 1-D and decode them back to category names.
y_pred_cat = label_encoder.inverse_transform(y_pred.ravel())

In [30]:
# Number of distinct categories the encoder learned from the training labels.
label_encoder.classes_.shape

(123,)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted category equals the true category.
# NOTE(review): the output saved under this cell looks like a classification_report
# table rather than a single accuracy value, so the cell seems to have been edited
# after it was last run — re-run to refresh the output.
accuracy_score(y_test, y_pred_cat)


                                precision    recall  f1-score   support

                          Acne       0.00      0.00      0.00         1
        Anti-Aging Facial Care       0.00      0.00      0.00         2
               Anti-Aging Mask       0.00      0.00      0.00         1
              Anti-Aging Serum       0.00      0.00      0.00         5
                  Baby & Child       0.00      0.00      0.00         1
                     Bleaching       0.00      0.00      0.00         2
                   Body Butter       0.00      0.00      0.00         1
                   Body Lotion       0.83      1.00      0.91         5
                     Body Mist       0.00      0.00      0.00         0
               Cleansing Cream       0.00      0.00      0.00         3
                   Color Rinse       0.00      0.00      0.00         1
                     Concealer       0.00      0.00      0.00         1
             Concealer Brushes       0.00      0.00      0.00  

In [36]:
import json

# Per-class precision/recall/F1 as a nested dictionary; zero_division=0 reports
# 0 for metrics whose denominator is zero.
report = classification_report(y_test, y_pred_cat, zero_division=0, output_dict=True)

# Save the report next to the notebook as indented JSON.
with open("classification_report.json", "w") as json_file:
    json_file.write(json.dumps(report, indent=4))
