### Text Retrieval

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the retrieval dataset; the 'text' column supplies the documents to index.
vi_data_df = pd.read_csv('./vi_text_retrieval.csv')
# Lowercase every document up front; queries are lowercased the same way at
# search time, so matching is case-insensitive.
context = [doc.lower() for doc in vi_data_df['text']]

# Fit TF-IDF on the corpus and keep the (sparse) document-term matrix.
tfidf_vectorizer = TfidfVectorizer()
context_embedded = tfidf_vectorizer.fit_transform(context)
# Spot-check a single weight by indexing the sparse matrix directly instead of
# densifying the whole corpus with toarray() just to read one value.
context_embedded[7, 0]

0.31126580760710637

In [2]:
def tfidf_search(question, tfidf_vectorizer, top_d=5, context_vectors=None):
    """Rank documents by TF-IDF cosine similarity to a question.

    Args:
        question: Query text; it is lowercased before being vectorized.
        tfidf_vectorizer: Fitted vectorizer exposing ``transform``. It should
            be the one the document matrix was built with, so that the
            feature columns line up.
        top_d: Maximum number of hits to return. ``0`` yields an empty list.
        context_vectors: Document-term matrix to search. When omitted, the
            notebook-level ``context_embedded`` is used (it must already be
            defined).

    Returns:
        A list of ``{'idx': ..., 'cosine_score': ...}`` dicts ordered from the
        highest to the lowest cosine similarity.
    """
    if context_vectors is None:
        # Fall back to the matrix built earlier in the notebook.
        context_vectors = context_embedded

    # lowercasing before encoding
    query_embedded = tfidf_vectorizer.transform([question.lower()])
    cosine_scores = cosine_similarity(context_vectors, query_embedded).flatten()

    # Reverse before truncating: slicing with `[-top_d:]` would return every
    # document when top_d == 0, because `-0 == 0`.
    ranked = cosine_scores.argsort()[::-1][:top_d]
    return [
        {'idx': idx, 'cosine_score': cosine_scores[idx]}
        for idx in ranked
    ]

# Use the first question in the dataset as the query.
question = vi_data_df['question'].iloc[0]
results = tfidf_search(question=question, tfidf_vectorizer=tfidf_vectorizer)
# Cosine similarity of the best-matching document.
results[0]['cosine_score']

0.6279910475266974

In [9]:
def corr_search(question, tfidf_vectorizer, top_d=5, context_vectors=None):
    """Rank documents by Pearson correlation between TF-IDF vectors.

    Args:
        question: Query text; it is lowercased before being vectorized.
        tfidf_vectorizer: Fitted vectorizer exposing ``transform``. It should
            be the one the document matrix was built with, so that the
            feature columns line up.
        top_d: Maximum number of hits to return. ``0`` yields an empty list.
        context_vectors: Document-term matrix (exposing ``toarray``) to
            search. When omitted, the notebook-level ``context_embedded`` is
            used (it must already be defined).

    Returns:
        A list of ``{'idx': ..., 'corr_score': ...}`` dicts ordered from the
        highest to the lowest correlation. Documents whose correlation is
        undefined (a constant vector on either side) are ranked last and
        reported with a NaN score.
    """
    if context_vectors is None:
        # Fall back to the matrix built earlier in the notebook.
        context_vectors = context_embedded

    # lowercasing before encoding
    query_embedded = tfidf_vectorizer.transform([question.lower()])
    query_vec = np.asarray(query_embedded.toarray()[0], dtype=float)
    doc_matrix = np.asarray(context_vectors.toarray(), dtype=float)

    # Correlate the query with each document directly. Handing both to
    # np.corrcoef would also compute every document-to-document correlation
    # (an (N+1) x (N+1) matrix) only to keep a single row of it.
    query_centered = query_vec - query_vec.mean()
    docs_centered = doc_matrix - doc_matrix.mean(axis=1, keepdims=True)
    covariance = docs_centered @ query_centered
    scale = np.sqrt(
        (query_centered @ query_centered) * (docs_centered ** 2).sum(axis=1)
    )
    # A zero scale means a constant vector: the correlation is undefined (NaN).
    with np.errstate(divide='ignore', invalid='ignore'):
        corr_scores = np.clip(covariance / scale, -1.0, 1.0)

    # argsort places NaN after every number, so a plain descending sort would
    # rank undefined correlations first; sort them as -inf instead.
    sort_key = np.where(np.isnan(corr_scores), -np.inf, corr_scores)
    ranked = sort_key.argsort()[::-1][:top_d]
    return [
        {'idx': idx, 'corr_score': corr_scores[idx]}
        for idx in ranked
    ]

# Use the first question in the dataset as the query, ranked by correlation.
question = vi_data_df['question'].iloc[0]
results = corr_search(question=question, tfidf_vectorizer=tfidf_vectorizer)
# Correlation score of the second-ranked document.
results[1]['corr_score']

0.2073424647197336