In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec

In [2]:
pre_trained_model = Word2Vec.load("AmazonReviews/amazonreviews_2024-01-18.kvmodel")

In [3]:
vocab = list(pre_trained_model.wv.key_to_index)

In [4]:
df = pd.read_excel("cleaned_vocab.xlsx")

In [5]:
words = list(df.WORDS)
labels = list(df.LABELS)
categories = list(df.LABELS.unique())

Extracting words from the dataset that are available in the vocab

In [6]:
filtered_words = []
for word in words:
    if (str(word) in vocab) and (" " not in str(word)):
        filtered_words.append(word)       

In [None]:
print(len(filtered_words))
print(filtered_words)

Generating word embeddings from the filtered words

In [8]:
word_embeddings = []

for word in filtered_words:
    word_embeddings.append(pre_trained_model.wv[str(word)])

Labelling filtered words

In [48]:
filtered_labels = []

for fw in filtered_words: 
    word_idx = words.index(fw) # finding the filtered word in the words list
    lbl = labels[word_idx] # finding the corresponding lable from the labels list
    cat_idx = categories.index(lbl) # getting index of the label from the category list
    filtered_labels.append(cat_idx) # saving as filtered label
        


In [None]:
len(filtered_labels)

Generateing filter embeddings

In [33]:
from sklearn.model_selection import train_test_split

In [62]:
X_train, X_test, y_train, y_test = train_test_split(np.array(word_embeddings), np.array(filtered_labels), test_size=0.3)

In [69]:
X_train[0].shape

(100,)

In [None]:
X_train

In [None]:
from tensorflow import keras

In [102]:
X_test[0].shape

(100,)

In [70]:
model = keras.Sequential([
    keras.layers.Flatten(input_shape=X_train[0].shape),
    keras.layers.Dense(128, activation = "relu"),
    keras.layers.Dense(64, activation = "relu"),
    keras.layers.Dense(len(categories), activation = "sigmoid")
])

In [71]:
model.compile(
    optimizer = "adam",
    loss = "sparse_categorical_crossentropy",
    metrics = ["accuracy"]
)

In [None]:
X_train

In [None]:
history = model.fit(X_train, y_train, epochs = 5)

In [77]:
import matplotlib.pyplot as plt

Visualising training accuracy and loss

In [162]:
loss, acc = model.evaluate(X_test, y_test, verbose = 2)

2/2 - 0s - loss: 0.4316 - accuracy: 0.8387 - 35ms/epoch - 18ms/step


In [None]:
plt.figure(figsize=(6,3))
plt.subplot(1,2,1)
plt.plot(history.history["loss"], label = "Training loss")
plt.title("Training loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

plt.subplot(1,2,2)
plt.plot(history.history["accuracy"], label = "Training loss")
plt.title("Training accuracy")
plt.xlabel("Epoch")
plt.ylabel("accuracy")
plt.legend()

plt.tight_layout()
plt.show()

In [83]:
predictions = model.predict(X_test)



In [99]:
n = 34
nth_pred = predictions[n]
idx = np.argmax(nth_pred)
categories[idx]

' grocery '

In [160]:
p = model.predict(pre_trained_model.wv["television"].reshape((1,100)))
categories[np.argmax(p)]



' gadget'

Saving the model

In [163]:
from datetime import datetime

In [164]:
model.save(f"AmazonReviews\word_classifier_{str(datetime.now()).replace('-','').split(' ')[0]}.keras")