# Search Engine Application Core Functionality 
#### Finding relevant documents 

In [1]:
import os
import sys
import json
import re
import string
import random
import time
import datetime
import copy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import plotly.express as px
from tqdm import tqdm
# import plotly.io as pio

from argparse import Namespace
from tqdm import tqdm
from datasets import Dataset

import transformers
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import pipeline

import torch.nn.functional as F
import torch
from torch.utils.data import DataLoader, TensorDataset

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer

import pickle

import faiss

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
args = Namespace(
    corpus_path="./../Dev/corpus.json",
    model_path="models/mlm_model_manual",
    tfidf_pkl_path="./TFIDF/tfidf.pkl",
    num_results=10
)

## Data Preparation

In [3]:
df = pd.read_json(args.corpus_path)

df.head()

Unnamed: 0,name,main,date,jurisdictions,court,attorneys,extra,id
0,"Ann M. Osborn and Thomas Osborn, Plaintiffs in...","Ann M. Osborn and Thomas Osborn, Plaintiffs in...",1857-12-01,Illinois,Illinois Supreme Court,"[Underwoods, for Plaintiffs in Error., G. Koer...","[{'text': 'Catón, C. J. Mrs. Osborn was entitl...",AnnMOsbornandThomasOsbornPlaintiffsinErrorvJac...
1,"Nathan Prentice, Appellant, v. Phineas Kimball...","Nathan Prentice, Appellant, v. Phineas Kimball...",1857-12-01,Illinois,Illinois Supreme Court,"[C. L. Higbee, for Appellant., J. Grimshaw and...","[{'text': 'Breese, J. It is a rule in courts o...",NathanPrenticeAppellantvPhineasKimballAppellee
2,"Edward Haven et al., Plaintiffs in Error, v. H...","Edward Haven et al., Plaintiffs in Error, v. H...",1857-12-01,Illinois,Illinois Supreme Court,"[G. Koerner, for Plaintiffs in Error., G. Trum...","[{'text': 'Breese, J. On the 12th day of Eebru...",EdwardHavenetalPlaintiffsinErrorvHilmanMehlgar...
3,"Stephen R. Rowan and Nancy Ann, his Wife, Comp...","Stephen R. Rowan and Nancy Ann, his Wife, Comp...",1857-11-01,Illinois,Illinois Supreme Court,"[Nelson & Johnson, for Appellants., J. Olney a...","[{'text': 'Breese, Justice, delivered the opin...",StephenRRowanandNancyAnnhisWifeComplainantsApp...
4,"Thomas Rodney, Plaintiff in Error, v. The Illi...","Thomas Rodney, Plaintiff in Error, v. The Illi...",1857-11-01,Illinois,Illinois Supreme Court,"[J. Dougherty, for Plaintiff in Error., C. G. ...","[{'text': 'Skinner, J. The plaintiff sued the ...",ThomasRodneyPlaintiffinErrorvTheIllinoisCentra...


## Set up Pretrained model and tokeniser

#METHOD 1 : TF-IDF ==> BERT 

In [4]:
nltk.download('stopwords')
class tfidf_corp:
    '''
        Class definition of tfidf_corp object for building TF-IDF matrix of document corpus and performing 
        cosine similarity searches.
    '''


    def __init__(self, datapath):
        '''
            Constructor : initializes vectorizer object, corpus TF-IDF matrix, empty document list, and stopword list
        '''
        self.vectorizer = TfidfVectorizer()
        self.corpus_tfidf = None
        self.documents = []
        self.stop_words = set(stopwords.words('english') + list(string.punctuation))
        self.datapath = datapath

    def set_documents(self, df):
        self.documents = df

    def load_documents(self):
        with open(self.datapath, 'r') as corpus_file:
            self.documents = json.load(corpus_file)

    def add_document(self, document):
        '''
            Appends a single document objects to documents list class-attribute 

            Arguments:
                document : document json object (main, name, ..., extra)
        '''
        self.documents.append(document)

    def add_documents(self, documents):
        '''
            Appends list of documents to documents list class-attribute 

            Arguments:
                documents : list of document json objects [{main, name, ..., extra}]
        '''
        self.documents = self.documents + documents
    
    def generate_tfidf(self):
        '''
            Computes TF-IDF matrix for document corpus 
        '''

        if len(self.documents) < 1:
            print('No documents in corpus')
            return

        self.corpus_tfidf = self.vectorizer.fit_transform([obj['main'] for idx,obj in self.documents.iterrows()])

    def search(self, query, k):
        '''
            Performs cosine similarity search for query against document corpus 
        '''

        query_vector = self.vectorizer.transform([query])
        similarities = linear_kernel(query_vector, self.corpus_tfidf).flatten()

        ranked_documents = [(self.documents.loc[i], score) for i, score in enumerate(similarities) if score > 0]
        ranked_documents.sort(key=lambda x: x[1], reverse=True)

        return ranked_documents[0:k]


    def store_matrix(self, path):
        '''
            Saves TF-IDF matrix into pickle file 
        '''

        with open(path, 'wb') as pickle_file:
            pickle.dump((self.vectorizer, self.corpus_tfidf), pickle_file)


    def load_matrix(self, path):
        ''' 
            Loads TF-IDF matrix from pickle file 
        '''

        with open('Embeddings/tfidf.pkl', 'rb') as pickle_file:
            self.vectorizer, self.corpus_tfidf = pickle.load(pickle_file) # need to save both vectorizer object and matrix to file

[nltk_data] Downloading package stopwords to /home/jz75/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def search1(query, df, model_path):
    engine = tfidf_corp(args.corpus_path)

    # engine.load_documents()
    engine.set_documents(df)

    engine.generate_tfidf()

    # engine.store_matrix(args.tfidf_pkl_path)

    top_k_tfidf = engine.search(query, 10)


    df_rows = [row for row,_ in top_k_tfidf]

    dataframe = pd.concat(df_rows, axis=1).transpose()


    model = transformers.BertModel.from_pretrained(model_path)
    tokenizer = transformers.BertTokenizer.from_pretrained('casehold/legalbert')

    device = "cpu"
 
    if torch.cuda.is_available():
        device = 'cuda'

    model.to(device)

    query_tokens = tokenizer(query, return_tensors='pt', padding=True, truncation=True, max_length=512)
    query_tokens = {key: value.to(device) for key, value in query_tokens.items()}
    
    with torch.no_grad():
        query_embedding = model(**query_tokens).last_hidden_state.mean(dim=1)

    similarity_scores = []

    for main_text in dataframe['main']:

        # Tokenize and encode the text for the model input
        text_tokens = tokenizer(main_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        text_tokens = {key: value.to(device) for key, value in text_tokens.items()}
        
        # Get text embedding
        with torch.no_grad():
            text_embedding = model(**text_tokens).last_hidden_state.mean(dim=1)
        
        # Compute cosine similarity and append to list
        similarity = cosine_similarity(query_embedding.cpu().numpy(), text_embedding.cpu().numpy())[0][0]
        similarity_scores.append(similarity)

    # Add similarity scores to the dataframe
    dataframe['similarity'] = similarity_scores
    
    # Sort the dataframe by similarity scores in descending order
    sorted_dataframe = dataframe.sort_values(by='similarity', ascending=False)
    
    # Optionally, you might want to drop the similarity column before returning
    # sorted_dataframe.drop(columns=['similarity'], inplace=True)
    
    return sorted_dataframe

In [6]:
result = search1('illinois defendent', df, args.model_path)
result.loc[:4, ['similarity', 'name']]

Some weights of BertModel were not initialized from the model checkpoint at models/mlm_model_manual and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,similarity,name
281,0.512436,The Illinois Central Railroad Co. v. Carl Wodr...
166,0.460161,"The Butler Street Foundry and Iron Company, De..."
417,0.446165,The Illinois Educational Association v. Peter ...
89,0.385628,"Charles H. Austin et al., by their next friend..."
84,0.358708,"Charles Sprague, Plaintiff in Error, v. The Il..."
4,0.345513,"Thomas Rodney, Plaintiff in Error, v. The Illi..."


# METHOD 2 : FAISS

In [63]:
def embed_sentence(model, tokenizer, text):
    '''
        For embedding a given sentence using the pre-trained BERT model
    '''

    with torch.no_grad():
        tokens = tokenizer.encode(text)
        batch_tokens = np.expand_dims(tokens, axis=0)
        batch_tokens = torch.tensor(batch_tokens).cuda()
        return model(batch_tokens)[0].cpu()

In [12]:
def compute_embedding_mean(embedding):
    if not isinstance(embedding, torch.Tensor):
        print('Embedding must be a torch.Tensor')
        return
    return embedding.mean(1)

def compute_cosine_measure(x1, x2):
    return cosine_similarity(x1,x2)

def compute_distance(x1, x2):
    return compute_cosine_measure(x1.detach().numpy(), x2.detach().numpy())

In [140]:
CHUNK_SIZE_EACH =50



def __embedding(text, model, tokenizer):
  return compute_embedding_mean(embed_sentence(model,tokenizer, text))



def compute_bert_embeddings(dataframe_chunk, current_index, end_marker, model, tokenizer):

  np_chunk = __embedding(dataframe_chunk.loc[current_index * end_marker]['name'], model, tokenizer).detach().numpy()
  # np_chunk = np_chunk.reshape(np_chunk.shape[1])
  print(end_marker)

  for idx in range(1, end_marker):

    try:
      embedding = __embedding(dataframe_chunk.loc[(current_index * end_marker) + idx]['name'], model, tokenizer).detach().numpy()
      #embedding = embedding.reshape(embedding.shape[1])
      np_chunk = np.append(np_chunk, embedding, axis = 0)
      # print('\r {}'.format(np_chunk.shape), end = '')
    except Exception as e:
      # print(e)
      np_chunk = np.append(np_chunk, np.zeros(shape = (1, 768)), axis = 0)
      continue 

  # print(np_chunk.shape)
  np.savez_compressed('title_{}'.format(current_index), a = np_chunk)


def compute_embeddings_and_save(dataframe, model, tokenizer):

  n_rows = len(dataframe)
  
  chunk_sizes = n_rows // CHUNK_SIZE_EACH
  remaining = n_rows - chunk_sizes * CHUNK_SIZE_EACH

  for i in range(0, 1):
    print('test')

    compute_bert_embeddings(dataframe[i * CHUNK_SIZE_EACH : (i * CHUNK_SIZE_EACH) + CHUNK_SIZE_EACH ], i, CHUNK_SIZE_EACH, model, tokenizer)

In [61]:
model = transformers.BertModel.from_pretrained(args.model_path)
tokenizer = transformers.BertTokenizer.from_pretrained('casehold/legalbert')

device = "cpu"

if torch.cuda.is_available():
    device = 'cuda'

model.to(device)

Some weights of BertModel were not initialized from the model checkpoint at models/mlm_model_manual and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [141]:
bert_embeddings = compute_embeddings_and_save(df, model, tokenizer)

test
50


In [129]:
def do_faiss_lookup(fastIndex, query_text, model, tokenizer, top_k):
    embedding_q = compute_embedding_mean(embed_sentence(model, tokenizer, query_text)).detach().numpy()
    
    #let it be float32
    embedding_q = embedding_q.astype('float32')
    
    #perform the search
    st = time.time()
    matched_em, matched_indexes = fastIndex.search(embedding_q, top_k) # it returns matched vectors and thier respective indexes, we are interested only in indexes.
    
    #indexes are already sorted wrt to closest match
    et = time.time()
    
    return et - st, matched_indexes

In [130]:
index_map = {} 
for index,row in df.iterrows():
    index_map[index] = {
        "Title" : row['name'],
        'Body' : row['main']
    }

In [144]:

bert_embeddings = np.load('title_0.npz')['a']
n_dimensions = bert_embeddings.shape[1] #Number of dimensions (764)

print(n_dimensions)

# We will create an index of type FlatL2, there are many kinds of indexes, you can look at it in their 
fastIndex = faiss.IndexFlatL2(n_dimensions) 

# Add the embedding vector to faiss index, it should of dtype 'float32'
fastIndex.add(bert_embeddings.astype('float32'))

time_faiss_cpu, indexes_top_faiss = do_faiss_lookup(fastIndex, 'illinois defendent', model, tokenizer, 10)

print('Look up time : ', time_faiss_cpu, 'seconds')


print(indexes_top_faiss[0])

for i,idx in enumerate(indexes_top_faiss):
    print('{}. {}'.format(i, df.loc[idx, 'name']))

768
Look up time :  3.719329833984375e-05 seconds
[32 36 41 13  7 42  1 18 48 33]
0. 32    The Nokomis Coal Company, Plaintiff in Error, ...
36    The Peabody Coal Company, Plaintiff in Error, ...
41    Frederick Brown, Plaintiff in Error, v. The Pe...
13    Daniel Finch et al., Apellants, v. Emma C. Mar...
7     Isaac C. Choate, Plaintiff in Error, v. The Pe...
42    Jacob H. Detrick, Appellant, v. Eli Migatt et ...
1     Nathan Prentice, Appellant, v. Phineas Kimball...
18    Richard A. Gregory and Wife, Appellants, v. Ly...
48    George Forquer et al., Appellants, v. Susannah...
33    The Groveland Coal Mining Company, Plaintiff i...
Name: name, dtype: object
