In [3]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
from google.colab import drive
import os
import gzip
import json
import pandas as pd
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import os
from tqdm.auto import tqdm
import re
from scipy.cluster.vq import *
import torch
import torch.nn as nn
import torch.optim as optim
import math
import json
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM, RobertaModel
from transformers import AdamW

In [27]:
grouped_df=pd.read_csv("/content/drive/MyDrive/Data/grouped_df.csv",dtype=str)
grouped_df=grouped_df[1:25000]

In [28]:
class DoubleStackBERT:
    def __init__(self, tokenizer_source): 
        #Initilize pretrained models, embeddings and vocab files
        self.model = RobertaModel.from_pretrained('/content/drive/MyDrive/Data/docberta_dummy_25000')
        self.codebook = np.load('/content/drive/MyDrive/Data/tp_codebook_25000.npy')
        with open('/content/drive/MyDrive/Data/tp_vocabs_25000.json') as json_file:
            self.__vocabs = json.load(json_file)
        self.sentenceTransformer = SentenceTransformer('all-mpnet-base-v2')
        self.bos_token = self.cls_token = ''
        self.eos_token = self.sep_token = ''
        self.unk_token = ''
        self.pad_token = ''
        self.mask_token ='' 
        self.mask_token_id = self.__vocabs['']

        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        
    def getVocabularySize(self):
        return len(self.__vocabs)
  
    # Splits text into sentences
    def textToSentences(self,text):
        try:
            sentences = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', text)
        except:
            sentences = [""]
        return sentences

    # Splits to whole document into sentences and generates sentence embedding using pretrained SBERT model for 
    # each sentence and returns a  2D list of those embedding
    def createSentencesEmbedding(self, data):
        # splitting each text of the dataset into a list of sentences
        array = []
        if type(data) == str: 
            array.append(data)
        else: 
            array = data

        sentences = [self.textToSentences(text) for text in array]
        embeddings = []
        for i in range(len(sentences)):
            embeddings.append(self.sentenceTransformer.encode(sentences[i]))

        return embeddings
  
    def createTokens(self, list_of_docs, max_length= -1, padding = True, truncation = True): 
        docs_tokens = [] 
        doc_embeddings = self.createSentencesEmbedding(list_of_docs)
        for doc in doc_embeddings:
            cluster_ids = vq(doc,self.codebook)
            token_ids = []
            attention_mask = []
            for cluster in cluster_ids[0]: 
                token_ids.append(self.__vocabs[str(cluster)])
            #If truncation is set, we truncate the tokens above max_length and set padding to false
            if truncation and max_length != -1 :
                if len(token_ids) + 2 > max_length:
                    token_ids = token_ids[:(max_length-2)]
                    padding = False

            input_ids = [self.__vocabs['']] + token_ids + [self.__vocabs['']]
            attention_mask.extend([1] * len(input_ids))
            # If padding is set then added padding to the remaining space of max_length 
            # Because all the input ids are not of the same size and roberta models deal with same sized inputs 
            if padding:
                padding_len = max_length - len(input_ids)
                input_ids.extend([self.__vocabs['']] * padding_len)
                attention_mask.extend([0] * padding_len)
            #creates a list of dictionary of input ids and their attention masks
            docs_tokens.append({'input_ids':input_ids, 'attention_mask':attention_mask})

        return docs_tokens
    #Creates document tensors for all the rows in dataset and returns a tensor of 2D array containing tensors for all documents
    def documentTensors(self,dataset,max_length= -1, padding = True, truncation = True):
        tensors = []
        if type(dataset) is str:
            dataset = [dataset]
        for d in dataset:
            t = torch.tensor([self.createTokens(d,max_length,padding,truncation)[0]['input_ids']])
            output = self.model(t).pooler_output.cpu().detach().numpy()
            tensors.append(output.reshape((output.shape[1])))
        return tensors

    #Creates a document vector by concatenating document tensors and sentence embeddings
    def documentEmbeddings(self,data, max_length= -1, padding = True, truncation = True):
        if type(data) == str:
            data = [data]
        embeddings = self.sentenceTransformer.encode(data)
        tensors = self.documentTensors(data,max_length,padding,truncation)
        return np.concatenate((embeddings, tensors), axis=1)

In [30]:
book_embeddings=np.load('/content/drive/MyDrive/Data/book_embeddings_10000.npy',allow_pickle=True)

In [31]:
def title(index):
    return grouped_df[grouped_df.S_no == index]["title"].values[0]

In [32]:
from scipy.cluster.vq import *
# using 100 cluster
bookCodebook,_ = kmeans(book_embeddings, 100, 20)

In [34]:
cluster, _ = vq(book_embeddings,bookCodebook)

In [35]:
grouped_df['cluster'] = cluster

In [37]:
model = DoubleStackBERT('doublestackbert_tokenizer')


Some weights of the model checkpoint at /content/drive/MyDrive/Data/docberta_dummy_10000 were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/Data/docberta_dummy_10000 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able t

In [38]:
def getElementsInTheSameCluster(centroid):
    cluster =grouped_df
    cluster['embeddings_id'] = [i for i in range(0,book_embeddings.shape[0])]
    multiIndex_cluster = cluster.sort_values(by=['cluster'])
    multiIndex_cluster = multiIndex_cluster.set_index(['cluster'])
    books = multiIndex_cluster.loc[centroid]

    ids = np.asarray(books.index)
    titles = [t for t in books['title']]
    genre=[t for t in books['book_genre']]
    description = [d for d in books['description']]
    embeddings = [book_embeddings[id] for id in books['embeddings_id']]
    similar_books = {'id':ids, 'title':titles, 'description':description,'genre':genre}
    return similar_books, embeddings


In [39]:
#Calculate relevance of document based on matching number of book genres
def relevance(predicted,expected,total):
  child_genres=["classic",'favorite','fiction','children','fantasy','young-adult','childhood','kids','adventure','animal','school','picture-books','library','other']
  baseline=sum([1 for i in expected if i=="1"])
  rel_score=[]
  for p in predicted[:total]:
    score=0
    for idx,p in enumerate(p):
      if expected[idx]==p and p=="1":score+=1
    rel_score.append(score/(baseline+0.01)*1)
  return rel_score

#calculate precision with a threshold of 0.4
def precision(relevance_score):
  return sum([1 for i in relevance_score if i>0.4])/len(relevance_score)


In [40]:
def calculate_score(total=10):
  score=[]
  for row,d in grouped_df.iterrows():
    name=d["title"]
    desc=d["description"]
    #concatenate book name and description and get document embedding for the same
    query_vector=model.getEmbeddings(name+desc)
    #check qhich cluster it belongs to
    queryCluster, _ = vq(query_vector,bookCodebook)
    #Fetch books in the same cluster
    similar_books, cluster_book_embeddings = getElementsInTheSameCluster(queryCluster[0])
    book_g=d["book_genre"]
    similar_books=list(set(similar_books['genre']))
    #calculate precision@total
    relevance_score=relevance(similar_books,book_g,total)
    prec=precision(relevance_score)
    score.append(prec)
  return sum(score)/len(score)
  

In [41]:
result=calculate_score(5)
print("Precision @5: ",result)

Precision @5:  0.9413741374137464


In [42]:
result=calculate_score(10)
print("Precision @10: ",result)

Precision @10:  0.7889888988899418


In [43]:
result=calculate_score(25)
print("Precision @25: ",result)

Precision @25:  0.7621362136214499
