In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
import json
from sklearn.metrics import accuracy_score
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

In [2]:
def remove_punctuations(text):
    return text.replace('?',"")

In [3]:
def remove_accents(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

In [4]:
def remove_stopwords(text):
    list_sentences = text.split(".")
    new_lines = []
    for line in list_sentences:
        list_words = line.split(" ")
        new_lines.append(" ".join([word.lower() for word in list_words if word not in stopwords.words('english')]))
    return  ".".join(new_lines)

In [5]:
stemmer_obj = SnowballStemmer('english')  
lem = WordNetLemmatizer()

def lemmatizer_stemmer(text):
    list_sentences = text.split(".")
    new_lines = []
    for line in list_sentences:
        list_words = line.split(" ")
        # print(line)
        new_line = []
        for word in list_words:
            new_line.append(stemmer_obj.stem(lem.lemmatize(word)))
        new_lines.append(" ".join(new_line))
        # print(new_line)
    return ".".join(new_lines)

In [6]:
def preprocess_text(text: str) -> str :
    return lemmatizer_stemmer(remove_stopwords(remove_punctuations(remove_accents(text))))

## Train/Test Split

In [7]:
df = pd.read_csv("Data/train_data_original.csv", index_col=0)

In [8]:
train_df = df.drop(df[(df['Theme']=='Uranium') | (df['Theme'] == 'Alps')].index)

In [9]:
test_df = df[(df['Theme'] == 'Uranium') | (df['Theme'] == 'Alps')]

In [10]:
len(test_df)+len(train_df)

75055

## Training

In [18]:
train_paras = [preprocess_text(i) for i in train_df.Paragraph.unique()]

In [19]:
vectorizer = TfidfVectorizer()
vectorizer.fit(train_paras)

TfidfVectorizer()

In [20]:
len(train_paras)#, len(train_quesn)

15427

In [15]:
def para_prepro(i: int):
    return lemmatizer_stemmer(remove_stopwords(remove_punctuations(remove_accents(train_df.Paragraph.iloc[i]))))

In [16]:
def ques_prepro(i: int):
    return lemmatizer_stemmer(remove_stopwords(remove_punctuations(remove_accents(train_df.Question.iloc[i]))))

In [53]:
l1 = []
l2 = []
for i in tqdm(range(50000)):
    tfidf_ques = vectorizer.transform([ques_prepro(i)])
    tfidf_para = vectorizer.transform([para_prepro(i)])
    l1.append(float(cosine_similarity(tfidf_ques, tfidf_para)))
    l2.append(float(cosine_similarity(tfidf_ques, vectorizer.transform([para_prepro(i+50*np.random.choice(np.array([-2, -1, 1, 2])))]))))

  0%|          | 0/50000 [00:00<?, ?it/s]

In [54]:
m1 = sum(l1)/len(l1)
m2 = sum(l2)/len(l2)
th = (m1+m2)/2
print(th)
# learned value for 'th' is : 0.15951137477530242

0.15951137477530242


In [55]:
print(m1, m2)

0.2756027601667071 0.04341998938389775


## Testing

In [21]:
test_paras = test_df.Paragraph.unique()

In [22]:
cos_sim = np.full((len(test_df.Question), len(test_paras)), 0.0)

In [23]:
for i in tqdm(range(len(test_df.Question))):
    for j in range(len(test_paras)):
        cos_sim[i,j] = float(cosine_similarity(vectorizer.transform([preprocess_text(test_df.Question.iloc[i])]), vectorizer.transform([preprocess_text(test_paras[j])])))

  0%|          | 0/523 [00:00<?, ?it/s]

In [25]:
len(test_df.Question), len(test_paras)

(523, 128)

In [26]:
type(test_df.Question), type(test_paras)

(pandas.core.series.Series, numpy.ndarray)

In [27]:
best_para_idx = np.argmax(cos_sim,axis=1)

In [28]:
true_para_idx = []
for i in list(test_df.Paragraph):
    true_para_idx.append(list(test_paras).index(i))

In [29]:
type(true_para_idx), type(best_para_idx)

(list, numpy.ndarray)

In [30]:
len(true_para_idx), len(best_para_idx)

(523, 523)

In [31]:
accuracy_score(true_para_idx, best_para_idx)*100

64.24474187380497

In [34]:
true_para_idx

[0,
 0,
 0,
 0,
 1,
 1,
 2,
 2,
 2,
 2,
 3,
 3,
 4,
 4,
 5,
 5,
 6,
 7,
 7,
 7,
 8,
 8,
 8,
 9,
 10,
 10,
 11,
 11,
 11,
 11,
 12,
 13,
 13,
 14,
 14,
 14,
 14,
 15,
 15,
 15,
 16,
 16,
 16,
 16,
 16,
 17,
 17,
 17,
 18,
 18,
 18,
 19,
 19,
 19,
 19,
 20,
 21,
 21,
 21,
 22,
 22,
 23,
 23,
 24,
 24,
 24,
 24,
 25,
 25,
 25,
 26,
 26,
 26,
 26,
 27,
 28,
 29,
 29,
 30,
 30,
 30,
 30,
 31,
 31,
 32,
 32,
 32,
 33,
 33,
 33,
 33,
 34,
 34,
 34,
 34,
 34,
 35,
 35,
 36,
 36,
 37,
 37,
 37,
 38,
 38,
 38,
 39,
 39,
 39,
 39,
 39,
 40,
 40,
 40,
 41,
 41,
 41,
 41,
 41,
 42,
 42,
 43,
 43,
 44,
 45,
 45,
 45,
 46,
 46,
 46,
 47,
 48,
 48,
 48,
 49,
 49,
 49,
 50,
 50,
 51,
 51,
 51,
 52,
 52,
 52,
 53,
 53,
 53,
 53,
 54,
 54,
 54,
 55,
 55,
 55,
 55,
 55,
 56,
 56,
 56,
 57,
 57,
 57,
 57,
 58,
 58,
 58,
 58,
 59,
 59,
 59,
 60,
 60,
 60,
 60,
 61,
 61,
 61,
 62,
 62,
 62,
 62,
 63,
 63,
 64,
 64,
 64,
 65,
 65,
 65,
 66,
 66,
 66,
 66,
 66,
 67,
 67,
 67,
 68,
 68,
 69,
 70,
 70,
 70,
 71,

In [32]:
best_para_cos_sim = np.max(cos_sim,axis=1)

In [33]:
best_para_cos_sim

array([0.41610977, 0.36560519, 0.51362324, 0.40554515, 0.19823942,
       0.14202045, 0.18591284, 0.2615077 , 0.26229456, 0.3678208 ,
       0.48982123, 0.41086587, 0.53255801, 0.61685095, 0.38767974,
       0.34907354, 0.33290861, 0.4619204 , 0.24753279, 0.25658075,
       0.35117266, 0.33709083, 0.37165002, 0.19549459, 0.23652574,
       0.60504678, 0.47413893, 0.33557475, 0.46948594, 0.2823495 ,
       0.05558193, 0.24016843, 0.15502893, 0.4919657 , 0.33928052,
       0.28015694, 0.37325321, 0.35088203, 0.54552199, 0.45399542,
       0.43482898, 0.22453612, 0.40626038, 0.28833183, 0.41323031,
       0.20259604, 0.45191897, 0.37601485, 0.24282262, 0.34336933,
       0.16991703, 0.15044767, 0.37331235, 0.43761093, 0.47565785,
       0.29105052, 0.24685169, 0.1786868 , 0.20012424, 0.60650388,
       0.30574278, 0.38732026, 0.40235987, 0.52786856, 0.44688609,
       0.40390738, 0.48513792, 0.48828125, 0.26690699, 0.22193312,
       0.36162982, 0.47033879, 0.27784372, 0.53581297, 0.42710