In [None]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os
from google.colab import drive
import gzip
import json
drive.mount('/content/drive')
from tqdm.auto import tqdm
import re
from scipy.cluster.vq import *
import torch
import torch.nn as nn
import torch.optim as optim
import math
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM, RobertaModel
from transformers import AdamW

Load the previously processed DataFrame.

In [None]:
# Read the preprocessed book table from Drive.
grouped_df = pd.read_csv("/content/drive/MyDrive/Data/grouped_df.csv")
# Keep rows 1..24999 as an independent copy, so that columns added in later
# cells are written to this frame and not to a slice of the full table.
# NOTE(review): the slice starts at position 1, so the first row is dropped - confirm this is intended.
grouped_df = grouped_df.iloc[1:25000].copy()

In [None]:
class DoubleStackBERT:
    """Document encoder that stacks a RoBERTa model on top of SBERT sentence vectors.

    A document is split into sentences, each sentence is embedded with a
    SentenceTransformer, the sentence vectors are quantised against a codebook
    with ``scipy.cluster.vq.vq``, and the resulting code ids are mapped through
    a vocabulary to the token ids that are fed to the RoBERTa model.
    """

    def __init__(self, tokenizer_source,
                 model_path='/content/drive/MyDrive/Data/docberta_dummy_25000',
                 codebook_path='/content/drive/MyDrive/Data/tp_codebook_25000.npy',
                 vocab_path='/content/drive/MyDrive/Data/tp_vocabs_25000.json'):
        """Load the RoBERTa checkpoint, the sentence codebook and the vocabulary.

        Args:
            tokenizer_source: Kept for interface compatibility; the constructor
                does not read it.
            model_path: Directory passed to ``RobertaModel.from_pretrained``.
            codebook_path: ``.npy`` file holding the sentence-level codebook.
            vocab_path: JSON file mapping token strings to token ids.
        """
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        # Inference only: put the model on the selected device in eval mode.
        self.model = RobertaModel.from_pretrained(model_path).to(self.device)
        self.model.eval()
        self.codebook = np.load(codebook_path)
        with open(vocab_path) as json_file:
            self.__vocabs = json.load(json_file)
        self.sentenceTransformer = SentenceTransformer('all-mpnet-base-v2')
        # RoBERTa-style special tokens. Each role needs its own vocabulary
        # entry; with empty strings every role would share a single id.
        # NOTE(review): these strings must match the keys stored in the
        # vocabulary JSON - confirm against the file.
        self.bos_token = self.cls_token = '<s>'
        self.eos_token = self.sep_token = '</s>'
        self.unk_token = '<unk>'
        self.pad_token = '<pad>'
        self.mask_token = '<mask>'
        self.mask_token_id = self.__vocabs[self.mask_token]

    def getVocabularySize(self):
        """Return the number of entries in the loaded vocabulary."""
        return len(self.__vocabs)

    def textToSentences(self, text):
        """Split ``text`` into sentences.

        The split happens on spaces that follow ``.`` or ``?`` and precede an
        upper-case letter. Anything that is not a string (for example a missing
        value read from a DataFrame) yields ``[""]``.
        """
        if not isinstance(text, str):
            return [""]
        return re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', text)

    def createSentencesEmbedding(self, data):
        """Embed every sentence of every document with the SentenceTransformer.

        Args:
            data: A single document string or an iterable of document strings.

        Returns:
            A list with one entry per document; each entry is whatever
            ``sentenceTransformer.encode`` returns for that document's list of
            sentences.
        """
        docs = [data] if isinstance(data, str) else data
        return [self.sentenceTransformer.encode(self.textToSentences(text)) for text in docs]

    def createTokens(self, list_of_docs, max_length=-1, padding=True, truncation=True):
        """Turn documents into ``input_ids`` / ``attention_mask`` dictionaries.

        Args:
            list_of_docs: A document string or an iterable of document strings.
            max_length: Target sequence length including the two boundary
                tokens; ``-1`` disables both truncation and padding.
            padding: Pad sequences shorter than ``max_length`` with the pad id.
            truncation: Cut sequences longer than ``max_length``.

        Returns:
            A list of ``{'input_ids': [...], 'attention_mask': [...]}`` dicts,
            one per document.
        """
        bos_id = self.__vocabs[self.bos_token]
        eos_id = self.__vocabs[self.eos_token]
        pad_id = self.__vocabs[self.pad_token]

        docs_tokens = []
        for doc in self.createSentencesEmbedding(list_of_docs):
            # vq returns (codes, distances); only the codes are needed.
            cluster_ids, _ = vq(doc, self.codebook)
            token_ids = [self.__vocabs[str(cluster)] for cluster in cluster_ids]

            # Leave room for the two boundary tokens when truncating.
            if truncation and max_length != -1 and len(token_ids) + 2 > max_length:
                token_ids = token_ids[:max(max_length - 2, 0)]

            input_ids = [bos_id] + token_ids + [eos_id]
            attention_mask = [1] * len(input_ids)

            # Padding is decided per document: the flag is never altered inside
            # the loop, so a truncated document cannot switch padding off for
            # the documents that follow it.
            if padding and max_length != -1:
                padding_len = max(max_length - len(input_ids), 0)
                input_ids.extend([pad_id] * padding_len)
                attention_mask.extend([0] * padding_len)

            docs_tokens.append({'input_ids': input_ids, 'attention_mask': attention_mask})

        return docs_tokens

    def documentTensors(self, dataset, max_length=-1, padding=True, truncation=True):
        """Run each document through the RoBERTa model and collect ``pooler_output``.

        Args:
            dataset: A document string or an iterable of document strings.
            max_length, padding, truncation: Forwarded to ``createTokens``.

        Returns:
            A list with one flattened NumPy vector per document.
        """
        if isinstance(dataset, str):
            dataset = [dataset]
        tensors = []
        # No gradients are needed for inference.
        with torch.no_grad():
            for d in dataset:
                tokens = self.createTokens(d, max_length, padding, truncation)[0]
                input_ids = torch.tensor([tokens['input_ids']], device=self.device)
                # Pass the mask so that padded positions are not attended to.
                attention_mask = torch.tensor([tokens['attention_mask']], device=self.device)
                pooled = self.model(input_ids, attention_mask=attention_mask).pooler_output
                tensors.append(pooled.cpu().numpy().reshape(-1))
        return tensors

    def documentEmbeddings(self, data, max_length=-1, padding=True, truncation=True):
        """Concatenate whole-document SBERT vectors with the RoBERTa document vectors.

        Args:
            data: A document string or an iterable of document strings.
            max_length, padding, truncation: Forwarded to ``documentTensors``.

        Returns:
            A 2-D NumPy array with one row per document: the SentenceTransformer
            embedding followed by the pooled RoBERTa output.
        """
        if isinstance(data, str):
            data = [data]
        embeddings = self.sentenceTransformer.encode(data)
        tensors = self.documentTensors(data, max_length, padding, truncation)
        return np.concatenate((embeddings, tensors), axis=1)

Save all the book embeddings

In [None]:
import numpy as np

# Save the embeddings when they were computed in this session; on a fresh
# kernel, where `book_embeddings` does not exist yet, read the cached file so
# the cells below still run.
EMBEDDINGS_PATH = "/content/drive/MyDrive/Data/book_embeddings_25000.npy"
if "book_embeddings" in globals():
    np.save(EMBEDDINGS_PATH, book_embeddings)
else:
    book_embeddings = np.load(EMBEDDINGS_PATH)

In [None]:
# book_embeddings=np.load('/content/drive/MyDrive/Data/book_embeddings_25000.npy',allow_pickle=True)

In [None]:
#Get book title from id number
def title(index):
    """Return the title of the first `grouped_df` row whose `S_no` equals `index`."""
    matching_titles = grouped_df.loc[grouped_df.S_no == index, "title"]
    return matching_titles.values[0]

In [None]:
from scipy.cluster.vq import kmeans, vq

# Fit the book-level codebook: 100 centroids, keeping the best of 20 k-means runs.
N_CLUSTERS = 100
N_KMEANS_RUNS = 20
KMEANS_SEED = 42  # fixed seed so cluster ids are reproducible across kernel restarts
cluster_centers, _ = kmeans(book_embeddings, N_CLUSTERS, N_KMEANS_RUNS, seed=KMEANS_SEED)

In [None]:
# np.save("/content/drive/MyDrive/Data/book_Codebook_10000.npy",cluster_centers)

In [None]:
# Label every book embedding with the index of its nearest centroid.
# vq returns (codes, distances); only the codes are kept.
cluster = vq(book_embeddings, cluster_centers)[0]

In [None]:
# Attach the cluster label of each book. `assign` builds a new frame, so this
# cell can be re-run safely and never writes into a slice of another frame.
grouped_df = grouped_df.assign(cluster=cluster)

In [None]:
# Show the single row at position 8 (used as the example query below).
grouped_df.iloc[8:9]

Unnamed: 0,language_code,is_ebook,average_rating,similar_books,description,publisher,title,book_id,book_genre,author_ids,n_votes,rating,review_text,review_length,description_len,combined,cluster
2077,en-GB,False,2.93,[],It's not the greatest start to a summer holida...,Oxford,Forever X,2167609,100000000000,['1631'],0.0,5.0,"A delightful, slightly bonkers tale of a car b...",470,942,Oxford Forever X ['1631']2.93 00100000000000It...,54


In [None]:

# Description text of the example query book.
d="""It's not the greatest start to a summer holiday. The car breaks down, miles from anywhere, and suddenly Joy and her family are stuck in the first 
bed-and-breakfast they can find. And it's no ordinary place. Forever X, says the sign on the house. That's strange enough. But inside, everything is even 
weirder - from the girl in an elf costume to the mysterious Mr Angel. And then the police arrive.... a superbly written novel, both funny and 
illuminating.-- School Librarian. * Geraldine McCaughrean is one of the most highly-acclaimed living children's writers. She has won the Carnegie 
Medal, the Whitbread Children's Novel Award (twice), the Guardian Children's Fiction Award, and the inaugural Blue Peter Book of the Year Award. * 
Stunning new cover design for all of Geraldine McCaughrean's novels - this title is being issued simultaneously with new matching edition of 
Plundering Paradise. * Funny, quirky theme will have broad appeal."""
# Title of the example query book.
n="Forever X"


In [None]:
# Build the encoder and embed the query text (title immediately followed by the description).
model = DoubleStackBERT('doublestackbert_tokenizer')
query_text = n + d
query_vector = model.documentEmbeddings(query_text)
print(query_vector.T.shape, cluster_centers[0].shape)
# vq returns (codes, distances); only the nearest-centroid codes are kept.
queryCluster = vq(query_vector, cluster_centers)[0]

Some weights of the model checkpoint at /content/drive/MyDrive/Data/docberta_dummy_10000 were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/Data/docberta_dummy_10000 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able t

(1536, 1) (1536,)


In [None]:
# The query is a single document, so its cluster id is the first (only) code.
query_cluster_id = queryCluster[0]
print("Cluster id of query book:", query_cluster_id)

Cluster id of query book: 68


In [None]:
def elementsInSameCluster(centroid, books_df=None, embeddings=None):
    """Collect the books, and their embeddings, that belong to one cluster.

    Args:
        centroid: Cluster id to look up in the ``cluster`` column.
        books_df: DataFrame with ``cluster``, ``title``, ``book_genre`` and
            ``description`` columns. Defaults to the notebook-level
            ``grouped_df``. The frame is not modified.
        embeddings: Sequence of embeddings aligned row-by-row (by position)
            with ``books_df``. Defaults to the notebook-level
            ``book_embeddings``.

    Returns:
        ``(similar_books, cluster_embeddings)``. ``similar_books`` is a dict
        with keys ``'id'`` (index labels of the matching rows), ``'title'``,
        ``'description'`` and ``'genre'``; ``cluster_embeddings`` is the list
        of matching embedding rows in the same order. Both are empty when no
        book carries ``centroid``.

    Raises:
        ValueError: If ``books_df`` and ``embeddings`` differ in length.
    """
    if books_df is None:
        books_df = grouped_df
    if embeddings is None:
        embeddings = book_embeddings
    if len(books_df) != len(embeddings):
        raise ValueError(
            "books_df has %d rows but embeddings has %d" % (len(books_df), len(embeddings))
        )

    # A boolean mask always yields a DataFrame, even for zero or one matching
    # rows, so titles and descriptions are never iterated character by character.
    in_cluster = (books_df['cluster'] == centroid).to_numpy()
    books = books_df[in_cluster]
    # Row positions double as indices into the embedding matrix.
    positions = np.flatnonzero(in_cluster)

    similar_books = {
        'id': np.asarray(books.index),
        'title': books['title'].tolist(),
        'description': books['description'].tolist(),
        'genre': books['book_genre'].tolist(),
    }
    cluster_embeddings = [embeddings[position] for position in positions]
    return similar_books, cluster_embeddings
# Fetch the books (and their embeddings) that share the query's cluster.
same_cluster_books, cluster_book_embeddings = elementsInSameCluster(
    centroid=queryCluster[0]
)

In [None]:
# Print every book that shares the query's cluster. `same_cluster_books` is a
# dict of parallel lists keyed by 'title', 'genre' and 'description', so the
# lists are walked together and not indexed by position.
for book_title, book_genre, book_description in zip(
    same_cluster_books['title'],
    same_cluster_books['genre'],
    same_cluster_books['description'],
):
    print("************************************************************************")
    print("Title       :", book_title)
    print("Genre :", book_genre)
    print("Description :", book_description)


Title       : The Mysterious Benedict Society (The Mysterious Benedict Society, #1)
Genre : 01111111101010
Description : "Are you a gifted child looking for special opportunities?"
Dozens of children respond to this peculiar ad in the newspaper and are then put through a series of mind-bending tests, which readers take along with them. Only four children-two boys and two girls-succeed. Their challenge: to go on a secret mission that only the most intelligent and inventive children could complete. To accomplish it they will have to go undercover at the Learning Institute for the Very Enlightened, where the only rule is that there are no rules. But what they'll find in the hidden underground tunnels of the school is more than your average school supplies. So, if you're gifted, creative, or happen to know Morse Code, they could probably use your help.
Title       : The Mysterious Benedict Society (The Mysterious Benedict Society, #1)
Genre : 01111111101010
Description : "Are you a gifted 