# Search Engine Application Core Functionality 
#### Finding relevant documents 

In [1]:
import os
import sys
import json
import re
import string
import random
import time
import datetime
import copy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import plotly.express as px
from tqdm import tqdm
# import plotly.io as pio

from argparse import Namespace
from tqdm import tqdm
from datasets import Dataset

import transformers
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import pipeline

import torch.nn.functional as F
import torch
from torch.utils.data import DataLoader, TensorDataset

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer

import pickle

import faiss

2024-03-24 03:09:13.592525: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Notebook-wide settings, exposed as attributes of a single Namespace.
args = Namespace(**{
    # CSV read into the corpus DataFrame; also handed to the tfidf_corp constructor
    "corpus_path": "./processed_data/search_test.csv",
    # Directory passed to search1 for loading the sequence-classification model
    "model_path": "models/combined/parallel_combined",
    # Pickle location for the TF-IDF vectorizer/matrix pair
    "tfidf_pkl_path": "./TFIDF/tfidf.pkl",
    # Number of results to return
    "num_results": 10,
})

## Data Preparation

In [12]:
# Load the corpus CSV and keep only the four columns the notebook works with.
df = pd.read_csv(args.corpus_path).loc[:, ['query', 'main', 'label', 'name']]
# Mirror the row index into a regular column so it survives later reshaping.
df = df.assign(idx=df.index)
df.head()

Unnamed: 0,query,main,label,name,idx
0,as significantly restricted in his ability to ...,holding that an employer did not regard the em...,0,,0
1,will my photos be sold to facebook?,"Please contact us, as provided above, to make ...",0,,1
2,do you sell my data,You can object to the processing of your infor...,0,,2
3,which permissions does the app require and wha...,If you want to stop receiving personalized ads...,0,,3
4,interest rates,The rate of interest paid under the Term Note ...,1,,4


## Set up Pretrained model and tokeniser

# Method 1: TF-IDF retrieval ==> BERT re-ranking

In [13]:
# Fetch the NLTK stopword corpus; tfidf_corp.__init__ reads stopwords.words('english').
nltk.download('stopwords')
class tfidf_corp:
    '''
        Class definition of tfidf_corp object for building TF-IDF matrix of document corpus and performing 
        cosine similarity searches.

        The corpus may be held either as a pandas DataFrame with a 'main' text column (see set_documents)
        or as a list of dict-like documents with a 'main' key (see load_documents / add_document /
        add_documents). generate_tfidf and search accept both forms.
    '''

    def __init__(self, datapath):
        '''
            Constructor : initializes vectorizer object, corpus TF-IDF matrix, empty document list, and stopword list

            Arguments:
                datapath : path of the JSON corpus file read by load_documents
        '''
        self.vectorizer = TfidfVectorizer()
        self.corpus_tfidf = None
        self.documents = []
        # NOTE(review): stop_words is built here but never passed to the vectorizer, so it has no
        # effect on the TF-IDF matrix. Left as-is because wiring it in would change search rankings.
        self.stop_words = set(stopwords.words('english') + list(string.punctuation))
        self.datapath = datapath

    def set_documents(self, df):
        '''
            Replaces the document collection with the given object 

            Arguments:
                df : corpus to index; generate_tfidf and search read its 'main' entries
        '''
        self.documents = df

    def load_documents(self):
        '''
            Replaces the document collection with the JSON content of the file at self.datapath 
        '''
        with open(self.datapath, 'r', encoding='utf-8') as corpus_file:
            self.documents = json.load(corpus_file)

    def add_document(self, document):
        '''
            Appends a single document objects to documents list class-attribute 

            Arguments:
                document : document json object (main, name, ..., extra)
        '''
        self.documents.append(document)

    def add_documents(self, documents):
        '''
            Appends list of documents to documents list class-attribute 

            Arguments:
                documents : list of document json objects [{main, name, ..., extra}]
        '''
        self.documents = self.documents + documents

    def _is_frame(self):
        '''
            Tells whether the corpus is DataFrame-like (offers positional .iloc access) rather than a list 
        '''
        return hasattr(self.documents, 'iloc')

    def _main_texts(self):
        '''
            Returns the 'main' text of every document, in corpus order 
        '''
        if self._is_frame():
            return self.documents['main'].tolist()
        return [document['main'] for document in self.documents]

    def _document_at(self, position):
        '''
            Returns the document stored at the given 0-based position 
        '''
        if self._is_frame():
            # Rows of the TF-IDF matrix follow corpus order, so the lookup must be positional;
            # a label lookup (.loc) is only equivalent when the index happens to be 0..n-1.
            return self.documents.iloc[position]
        return self.documents[position]

    def generate_tfidf(self):
        '''
            Computes TF-IDF matrix for document corpus 

            Fits self.vectorizer on the 'main' texts and stores the result in self.corpus_tfidf.
            Prints a message and leaves the matrix untouched when the corpus is empty.
        '''

        # len() rather than truthiness: a DataFrame has no unambiguous truth value
        if len(self.documents) < 1:
            print('No documents in corpus')
            return

        self.corpus_tfidf = self.vectorizer.fit_transform(self._main_texts())

    def search(self, query, k):
        '''
            Performs cosine similarity search for query against document corpus 

            Arguments:
                query : query text
                k     : maximum number of results

            Returns:
                list of (document, score) tuples with score > 0, best score first, at most k entries
        '''

        query_vector = self.vectorizer.transform([query])
        # TfidfVectorizer L2-normalises rows by default, so the linear kernel (dot product)
        # of two TF-IDF rows equals their cosine similarity.
        similarities = linear_kernel(query_vector, self.corpus_tfidf).flatten()

        # Rank positions first and materialise only the k winners; fetching a row for every
        # non-zero hit is wasted work when most of them are discarded by the slice.
        hits = [(position, score) for position, score in enumerate(similarities) if score > 0]
        hits.sort(key=lambda hit: hit[1], reverse=True)

        return [(self._document_at(position), score) for position, score in hits[0:k]]

    def store_matrix(self, path):
        '''
            Saves TF-IDF matrix into pickle file 

            Arguments:
                path : destination file; receives the (vectorizer, matrix) pair
        '''

        with open(path, 'wb') as pickle_file:
            pickle.dump((self.vectorizer, self.corpus_tfidf), pickle_file)

    def load_matrix(self, path):
        ''' 
            Loads TF-IDF matrix from pickle file 

            Arguments:
                path : file previously written by store_matrix
        '''

        # SECURITY: unpickling executes arbitrary code; only load files this application wrote itself.
        with open(path, 'rb') as pickle_file:
            self.vectorizer, self.corpus_tfidf = pickle.load(pickle_file) # need to save both vectorizer object and matrix to file

[nltk_data] Downloading package stopwords to /home/jz75/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Build the TF-IDF engine that search1 later reads as a module-level global.
engine = tfidf_corp(args.corpus_path)

# The corpus comes from the in-memory DataFrame; engine.load_documents() (which would
# json.load the path given above) is intentionally not used here.
engine.set_documents(df)

# Fit the vectorizer on the corpus 'main' texts and keep the resulting matrix on the engine.
engine.generate_tfidf()


In [31]:
# Re-ranker components keyed by model path, so repeated search1 calls reuse them
# instead of reloading the weights from disk for every query.
_RERANKER_CACHE = {}


def _load_reranker(model_path):
    '''
        Returns the (model, tokenizer, device) triple for model_path, loading it on first use only.
    '''
    if model_path not in _RERANKER_CACHE:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

        model = BertForSequenceClassification.from_pretrained(model_path)
        tokenizer = transformers.BertTokenizer.from_pretrained('casehold/legalbert')

        model.to(device)
        # Inference only: make sure dropout and similar training-time behaviour is off
        model.eval()

        _RERANKER_CACHE[model_path] = (model, tokenizer, device)

    return _RERANKER_CACHE[model_path]


def search1(query, df, model_path):
    '''
        Two-stage search: TF-IDF retrieval of up to 100 candidates from the module-level `engine`,
        followed by re-ranking with a BERT sequence classifier.

        Arguments:
            query      : query text
            df         : unused; kept so existing callers keep working (candidates come from `engine`)
            model_path : location of the BertForSequenceClassification weights

        Returns:
            'idx' value of the candidate with the highest classifier score

        Raises:
            ValueError : when TF-IDF retrieval yields no candidate for the query
    '''
    top_k_tfidf = engine.search(query, 100)

    # Explicit version of the ValueError that pd.concat would raise on an empty list;
    # callers rely on ValueError to skip queries without candidates.
    if not top_k_tfidf:
        raise ValueError('No TF-IDF candidates found for query')

    # Each candidate row is a Series; stack them back into a DataFrame (one row per candidate)
    candidates = pd.concat([row for row, _ in top_k_tfidf], axis=1).transpose()

    model, tokenizer, device = _load_reranker(model_path)

    similarity_scores = []

    with torch.no_grad():
        for main_text in candidates['main']:
            # NOTE(review): query and passage are joined into one string with a literal " [SEP] ";
            # presumably this mirrors how the model was fine-tuned - confirm before switching to
            # the tokenizer's text-pair form.
            combined_input = query + " [SEP] " + main_text

            # Tokenize and encode the text for the model input
            text_tokens = tokenizer(combined_input, return_tensors='pt', padding=True, truncation=True, max_length=512)
            text_tokens = {key: value.to(device) for key, value in text_tokens.items()}

            # Score = softmax probability of class index 1 (assumed to be the "relevant" class - TODO confirm)
            logits = model(**text_tokens).logits
            score = torch.nn.functional.softmax(logits, dim=1)[:, 1].item()
            similarity_scores.append(score)

    # Add similarity scores to the dataframe
    candidates['similarity'] = similarity_scores

    # Sort the dataframe by similarity scores in descending order
    sorted_candidates = candidates.sort_values(by='similarity', ascending=False)

    return sorted_candidates.iloc[0]['idx']

In [32]:
# Spot-check a single corpus row, selected by position.
df.iloc[14]

query    is there 2-step verification in case somebody ...
main     If you use Evernote Business, the Account Hold...
label                                                    0
name                                                   NaN
idx                                                     14
Name: 14, dtype: object

In [33]:
# Top-1 evaluation: a query counts as correct when search1 returns the 'idx' of the row it came from.
num_correct = 0
total = 0
skipped = 0
for idx, row in df.iterrows():
    # Only the search itself is expected to raise; search1 signals "no usable candidates" with ValueError
    try:
        result = search1(row['query'], df, args.model_path)
    except ValueError:
        skipped += 1
        continue

    if row['idx'] == result:
        num_correct += 1
    total += 1


# Guard the division: every query may have been skipped
if total:
    print(num_correct / total)
else:
    print('No queries could be evaluated')

# Report skipped queries instead of dropping them silently (they are excluded from the accuracy)
if skipped:
    print(f'Skipped {skipped} queries that raised ValueError')

0.021627188465499485
