In [None]:
# Mount Google Drive at /content/drive; the data files read and written by
# later cells live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import pickle
import pandas as pd
import numpy as np
import gensim.downloader as api
import re
import nltk
from nltk.corpus import stopwords
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Books Embeddings

In [None]:
# Load the book metadata. lines=True makes pandas read the file as one JSON
# object per line rather than as a single JSON document.
book_metadata_df = pd.read_json('/content/drive/MyDrive/Priyanka/final_book_metadata.json', lines=True)

In [None]:
book_metadata_df.head()

Unnamed: 0,category,description,title,brand,price,asin
0,"[Books, Literature & Fiction, Dramas & Plays]",[William Shakespeare is widely regarded as the...,Love's Labour's Lost: Performed by Derek Jacob...,Visit Amazon's William Shakespeare Page,$20.93,1050230
1,"[Books, New, Used & Rental Textbooks, Humanities]",[William Shakespeare is widely regarded as the...,Othello: Complete &amp; Unabridged,Visit Amazon's William Shakespeare Page,,1048767
2,"[Books, Children's Books, Literature & Fiction]",[],The Secret Garden,Frances Hodgson [illustrated by ruth sanderson...,$4.72,1945424
3,"[Books, Literature & Fiction, Literary]",[],Ten Little Niggers,Visit Amazon's Agatha Christie Page,$50.63,2318350
4,"[Books, Mystery, Thriller &amp; Suspense, Thri...",[Jan Needle is a well known children's writer....,Dracula (Collins Drama),Jan Needle,,3302245


In [None]:
# Flatten the list-valued columns into single space-separated strings.
# Values that are already strings are passed through untouched so the cell can
# be re-run safely: ' '.join() on a string would otherwise insert a space
# between every character.
for list_column in ('category', 'description'):
    book_metadata_df[list_column] = book_metadata_df[list_column].apply(
        lambda value: value if isinstance(value, str) else ' '.join(value)
    )

# Remove the standalone "Books" label from the category text. The \b word
# boundaries keep the pattern from cutting into longer words: a plain
# substring replace turned "Textbooks" into "Text".
book_metadata_df['category'] = book_metadata_df['category'].str.replace(
    r'\bBooks\b', '', case=False, regex=True
)
#book_metadata_df['brand'] = book_metadata_df['brand'].str.replace('Visit Amazon\'s', '', case=False)
#book_metadata_df['brand'] = book_metadata_df['brand'].str.replace('Page', '', case=False)

In [None]:
book_metadata_df.head()

Unnamed: 0,category,description,title,brand,price,asin
0,Literature & Fiction Dramas & Plays,William Shakespeare is widely regarded as the ...,Love's Labour's Lost: Performed by Derek Jacob...,Visit Amazon's William Shakespeare Page,$20.93,1050230
1,"New, Used & Rental Text Humanities",William Shakespeare is widely regarded as the ...,Othello: Complete &amp; Unabridged,Visit Amazon's William Shakespeare Page,,1048767
2,Children's Literature & Fiction,,The Secret Garden,Frances Hodgson [illustrated by ruth sanderson...,$4.72,1945424
3,Literature & Fiction Literary,,Ten Little Niggers,Visit Amazon's Agatha Christie Page,$50.63,2318350
4,"Mystery, Thriller &amp; Suspense Thrillers &a...",Jan Needle is a well known children's writer. ...,Dracula (Collins Drama),Jan Needle,,3302245


In [None]:
_ENGLISH_STOP_WORDS = None  # filled on first use so the corpus is read only once


def _english_stop_words():
    """Return the NLTK English stop-word set, building it on the first call only."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOP_WORDS


def pre_process_text(text):
    """Convert raw text into a list of clean lowercase tokens.

    Steps:
      1. Decode HTML entities (e.g. ``&amp;`` -> ``&``) so they do not leak
         artefact tokens such as ``amp`` into the result.
      2. Lowercase and tokenize with ``nltk.word_tokenize``.
      3. Keep only tokens made entirely of the letters a-z. Tokens containing
         any other character (digits, punctuation, accented or non-Latin
         letters) are dropped whole instead of being partially stripped, which
         previously produced mangled or empty tokens.
      4. Remove English stop words.

    Args:
        text: The string to process. Any non-string value (``None``, NaN, ...)
            is treated as missing text.

    Returns:
        list[str]: The remaining tokens in their original order; an empty list
        for missing or empty input.
    """
    import html  # local import so this cell does not depend on an edit elsewhere

    # Missing values would otherwise fail on .lower().
    if not isinstance(text, str):
        return []

    tokens = nltk.word_tokenize(html.unescape(text).lower())
    stop_words = _english_stop_words()

    return [
        token
        for token in tokens
        if re.fullmatch(r"[a-z]+", token) and token not in stop_words
    ]

In [None]:
# Replace each free-text column with its cleaned token list.
for text_column in ('category', 'description', 'title'):
    book_metadata_df[text_column] = book_metadata_df[text_column].apply(pre_process_text)
#book_metadata_df['brand'] = book_metadata_df['brand'].apply(pre_process_text)

In [None]:
# The three columns hold token lists at this point, so `+` concatenates them
# row by row: category tokens first, then title, then description.
book_metadata_df['book_data'] = book_metadata_df['category'] + book_metadata_df['title'] + book_metadata_df['description']

In [None]:
book_metadata_df.head()

Unnamed: 0,category,description,title,brand,price,asin,book_data
0,"[literature, fiction, dramas, plays]","[william, shakespeare, widely, regarded, great...","[love, labour, lost, performed, derek, jacobi,...",Visit Amazon's William Shakespeare Page,$20.93,1050230,"[literature, fiction, dramas, plays, love, lab..."
1,"[new, used, rental, text, humanities]","[william, shakespeare, widely, regarded, great...","[othello, complete, amp, unabridged]",Visit Amazon's William Shakespeare Page,,1048767,"[new, used, rental, text, humanities, othello,..."
2,"[children, literature, fiction]",[],"[secret, garden]",Frances Hodgson [illustrated by ruth sanderson...,$4.72,1945424,"[children, literature, fiction, secret, garden]"
3,"[literature, fiction, literary]",[],"[ten, little, niggers]",Visit Amazon's Agatha Christie Page,$50.63,2318350,"[literature, fiction, literary, ten, little, n..."
4,"[mystery, thriller, amp, suspense, thrillers, ...","[jan, needle, well, known, children, writer, n...","[dracula, collins, drama]",Jan Needle,,3302245,"[mystery, thriller, amp, suspense, thrillers, ..."


In [None]:
# book_metadata_df['brand'].value_counts()

In [None]:
# Load the pretrained "word2vec-google-news-300" vectors via gensim's downloader API.
# NOTE(review): presumably a large download on a fresh runtime - confirm before re-running.
word2vec_model = api.load("word2vec-google-news-300")



In [None]:
def generate_sentence_embedding(words, model, embedding_size=300):
    """Average the model vectors of the known words into a single vector.

    Args:
        words: Iterable of tokens to embed.
        model: Lookup object supporting ``token in model`` and ``model[token]``.
        embedding_size: Length of the returned vector; it must match the
            length of the vectors stored in ``model``.

    Returns:
        A numpy array of length ``embedding_size`` holding the mean of the
        vectors of all tokens found in ``model``. If no token is found the
        array is all zeros.
    """
    running_total = np.zeros(embedding_size)

    # Lazily yield the vector of every token the model knows about.
    known_vectors = (model[token] for token in words if token in model)

    matched = 0
    for vector in known_vectors:
        running_total += vector
        matched += 1

    # Nothing matched: hand back the zero vector instead of dividing by zero.
    if matched == 0:
        return running_total

    running_total /= matched
    return running_total

In [None]:
# Compute one embedding per book from its combined token list, using the word2vec model.
book_metadata_df["book_embedding"] = book_metadata_df["book_data"].apply(lambda words: generate_sentence_embedding(words, word2vec_model))

In [None]:
print(book_metadata_df["book_embedding"].head())

0    [0.056060791015625, 0.026201248168945312, 0.03...
1    [0.06280721028645833, 0.011431884765625, 0.040...
2    [0.039111328125, 0.03606414794921875, 0.034558...
3    [0.041753133138020836, -0.050252278645833336, ...
4    [0.060791015625, 0.03930442563949093, -0.00409...
Name: book_embedding, dtype: object


In [None]:
# Build the asin -> embedding lookup, turning each embedding into a plain list.
# If an asin occurs more than once, the last row's embedding is the one kept.
book_initial_embeddings = dict(
    zip(
        book_metadata_df['asin'].tolist(),
        map(list, book_metadata_df["book_embedding"]),
    )
)

In [None]:
#list(book_initial_embeddings.values())[:2]

# Export embeddings

In [None]:
#with open("/content/drive/MyDrive/Priyanka/book_initial_embeddings.json", "w") as f:
#    json.dump(book_initial_embeddings, f)

# Persist the asin -> embedding mapping to Drive as a pickle file.
# NOTE: pickle files can execute code when loaded; only load this file from a trusted location.
with open("/content/drive/MyDrive/Priyanka/book_initial_embeddings.pkl", "wb") as f:
    pickle.dump(book_initial_embeddings, f)