In [1]:
import keras
import os
import sklearn
import numpy as np
import pandas as pd

In [2]:
# load the data
csv = os.path.join(os.getcwd(), 'dataset', 'ecommerceDataset.csv')
df = pd.read_csv(csv, header=None, names=['target', 'train'])
df

Unnamed: 0,target,train
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...
...,...,...
50420,Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...
50421,Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...
50422,Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...
50423,Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou..."


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50425 entries, 0 to 50424
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  50425 non-null  object
 1   train   50424 non-null  object
dtypes: object(2)
memory usage: 788.0+ KB


In [4]:
df.isna().sum()

target    0
train     1
dtype: int64

In [5]:
df.duplicated().sum()

np.int64(22622)

In [6]:
# since there are 50k instance of data and only 1 na value we can drop the row
df = df.dropna()
df.isna().sum()

target    0
train     0
dtype: int64

In [7]:
# looking wheter we should drop the duplicated value
print('label distribution:', df['target'].value_counts()) #distribution of classes before dropping the duplicate
# create a copy of our original df to check the distribution after droping dups 
df_copy = df.copy()
df_copy = df_copy.drop_duplicates()
print('label distribution after removing duplicates:', df_copy['target'].value_counts())
# there is still apparent imbalance of data if we remove the dups so we can remove the dups to cut down on training time
df = df.drop_duplicates()
print('label distribution after removing duplicates from original df:', df['target'].value_counts())

label distribution: target
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8670
Name: count, dtype: int64
label distribution after removing duplicates: target
Household                 10564
Books                      6256
Clothing & Accessories     5674
Electronics                5308
Name: count, dtype: int64
label distribution after removing duplicates from original df: target
Household                 10564
Books                      6256
Clothing & Accessories     5674
Electronics                5308
Name: count, dtype: int64


In [8]:
nClass = len(df['target'].unique())
nClass # see how many unique class

4

In [9]:
# split feature and label
feature = df['train'].values
label = df['target'].values
# encode
encoder = sklearn.preprocessing.LabelEncoder()
label_encoded = encoder.fit_transform(label)
print(np.unique(label_encoded))
print(encoder.inverse_transform(np.unique(label_encoded)))

[0 1 2 3]
['Books' 'Clothing & Accessories' 'Electronics' 'Household']


In [10]:
# split data for train test and validation
seed = 42
x_train, x_split, y_train, y_split = sklearn.model_selection.train_test_split(feature, label_encoded, train_size=0.7, random_state=seed, stratify=label_encoded)
x_val, x_test, y_val, y_test = sklearn.model_selection.train_test_split(x_split, y_split, train_size=0.5, random_state=seed, stratify=y_split)

In [11]:
# removing stop word to improve performance
from nltk.corpus import stopwords
stopwords_eng = np.array(stopwords.words('english'))
sample_string = x_train[0]
sample_string_removed = [[x for x in sample_string.split() if x not in stopwords_eng] for string_data in x_train]
sample_string_removed_joined = np.array([' '.join(x) for x in sample_string_removed])

def remove_stopwords(np_data,stopword_list):
    sample_string_removed = [[x for x in string_data.split() if x not in stopword_list] for string_data in np_data]
    sample_string_removed_joined = np.array([' '.join(x) for x in sample_string_removed])
    return sample_string_removed_joined.astype('object')

x_train_removed = remove_stopwords(x_train,stopwords_eng)
x_val_removed = remove_stopwords(x_val, stopwords_eng)
x_test_removed = remove_stopwords(x_test, stopwords_eng)
print(x_train_removed[0]) 

ATOM MZ1 Steel Mortise Handle Legend Double Action Lock (175 mm, Teak Wood) Lock Size : 65 mm Type : Double Stage Locking Levers : 6 Levers Key Set : 3 Keys Lock Metal : Iron, S.S. Brass Lock Coror : Grey Note : Lock color may differ shown image. Material Base Plate : Steel Material Handle : Zamak Material Handle Pipe : MS Mechanism Spring : Spiral Spring, Stainless Steel 304 Handle Set Finish : Teak Wood Handle Set Size : 175 mm (7”) Our products prepaired consummated professional surface treatment anticorrosion technology.


In [12]:
# compare with the stop words included
print(x_train[0])
# we can see word such as with is removed

ATOM MZ1 Steel Mortise Handle with Legend Double Action Lock (175 mm, Teak Wood) Lock Size : 65 mm Type : Double Stage Locking Levers : 6 Levers Key Set : 3 Keys Lock Metal : Iron, S.S. and Brass Lock Coror : Grey Note : Lock color may be differ from shown in image. Material Base Plate : Steel Material Handle : Zamak Material Handle Pipe : MS Mechanism Spring : Spiral Spring, Stainless Steel 304 Handle Set Finish : Teak Wood Handle Set Size : 175 mm (7”) Our products are prepaired by the consummated and professional surface treatment and anticorrosion technology.


In [13]:
# process nlp by tokenizing
vocab_size = 5000
tokenizer = keras.layers.TextVectorization(max_tokens=vocab_size, output_sequence_length=250)
tokenizer.adapt(x_train_removed) #adapting our removed stop word for tokenization

In [14]:
# see our tokenized word and our original word
sample_text = x_train_removed[:2]
sample_token = tokenizer(sample_text)
print(sample_text[0])
print(sample_token[0])

ATOM MZ1 Steel Mortise Handle Legend Double Action Lock (175 mm, Teak Wood) Lock Size : 65 mm Type : Double Stage Locking Levers : 6 Levers Key Set : 3 Keys Lock Metal : Iron, S.S. Brass Lock Coror : Grey Note : Lock color may differ shown image. Material Base Plate : Steel Material Handle : Zamak Material Handle Pipe : MS Mechanism Spring : Spiral Spring, Stainless Steel 304 Handle Set Finish : Teak Wood Handle Set Size : 175 mm (7”) Our products prepaired consummated professional surface treatment anticorrosion technology.
tf.Tensor(
[   1    1   41    1  204    1  404  822  504    1  245 3890  227  504
   14 1848  245  162  404 1660 2080    1   74    1  364    6   37  946
  504  241  432 2942  930  504    1  497  333  504   26  150 3913 1227
  565   34  372  762   41   34  204    1   34  204 1936 1449 1914 1910
    1 1910   71   41 4279  204    6  127 3890  227  204    6   14    1
  245    1  215   68    1    1  328  295 2038    1  137    0    0    0
    0    0    0    0    0    0  

In [15]:
# embedding
embedding = keras.layers.Embedding(input_dim=vocab_size, output_dim=64)
sample_embedding = embedding(sample_token)
print(sample_embedding[0]) # seeour sample embedding

tf.Tensor(
[[ 0.03824066  0.01448122  0.03922582 ... -0.02309498 -0.03289719
   0.03837833]
 [ 0.03824066  0.01448122  0.03922582 ... -0.02309498 -0.03289719
   0.03837833]
 [ 0.03717271  0.01472297 -0.04631726 ... -0.04804343 -0.04691308
  -0.00213372]
 ...
 [ 0.01341579 -0.03251721 -0.03987427 ... -0.04766048  0.03762862
  -0.00856088]
 [ 0.01341579 -0.03251721 -0.03987427 ... -0.04766048  0.03762862
  -0.00856088]
 [ 0.01341579 -0.03251721 -0.03987427 ... -0.04766048  0.03762862
  -0.00856088]], shape=(250, 64), dtype=float32)


In [16]:
# create model
model = keras.Sequential()
# add nlp layers
model.add(tokenizer)
model.add(embedding)
# add rnn layers
reg = keras.regularizers.l1_l2() # add l2 to prevent over fitting
model.add(keras.layers.Bidirectional(keras.layers.LSTM(16, return_sequences=False, kernel_regularizer=reg)))
model.add(keras.layers.Dense(nClass, activation='softmax'))

In [17]:
loss = keras.losses.SparseCategoricalCrossentropy() #int label, softmax
model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

In [18]:
import mlflow
import tensorboard
mlflow.set_experiment('Ecommerce Text Classification')

<Experiment: artifact_location='file:///c:/Users/suhaimi/Desktop/Nidzam/capstone_project/capstone%202/mlruns/974647785270208707', creation_time=1741658183982, experiment_id='974647785270208707', last_update_time=1741658183982, lifecycle_stage='active', name='Ecommerce Text Classification', tags={}>

In [19]:
# train model, log into mlflow and tensorboard
with mlflow.start_run() as run:
    mlflow_callback = mlflow.keras.MLflowCallback(run)
    run_id = run.info.run_id
    log_path = f'logs/{run_id}'
    ts = keras.callbacks.TensorBoard(log_dir=log_path)
    history = model.fit(x_train_removed, y_train, validation_data=(x_val_removed,y_val), batch_size=16, epochs=10, callbacks=[mlflow_callback,ts])
    mlflow.keras.save.log_model(model, artifact_path='model')



Epoch 1/10
[1m1217/1217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 50ms/step - accuracy: 0.7831 - loss: 0.6232 - val_accuracy: 0.9403 - val_loss: 0.2148
Epoch 2/10
[1m1217/1217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 50ms/step - accuracy: 0.9637 - loss: 0.1374 - val_accuracy: 0.9465 - val_loss: 0.1893
Epoch 3/10
[1m1217/1217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 51ms/step - accuracy: 0.9773 - loss: 0.0870 - val_accuracy: 0.9424 - val_loss: 0.2055
Epoch 4/10
[1m1217/1217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 64ms/step - accuracy: 0.9839 - loss: 0.0612 - val_accuracy: 0.9465 - val_loss: 0.2138
Epoch 5/10
[1m1217/1217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 64ms/step - accuracy: 0.9871 - loss: 0.0486 - val_accuracy: 0.9420 - val_loss: 0.2415
Epoch 6/10
[1m1217/1217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 64ms/step - accuracy: 0.9917 - loss: 0.0283 - val_accuracy: 0.9429 - val_loss: 0.2581
Epoc



In [20]:
model.summary()

In [21]:
y_pred = model.predict(x_test_removed)
print(y_pred[0])
y_pred_class = np.argmax(y_pred,axis=1)
print(y_pred_class[0])
y_pred_it = encoder.inverse_transform(y_pred_class)
print(y_pred_it[0])

[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step
[9.2402925e-06 3.7064467e-06 3.1903415e-05 9.9995518e-01]
3
Household


In [22]:
f1 = sklearn.metrics.f1_score(y_test, y_pred_class, average='weighted')
print('f1 score: ', f1)

f1 score:  0.9363751852771114


In [23]:
import pickle
with open('encoder.pkl','wb') as file:
    pickle.dump(encoder, file)