In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import string

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel



### Reading Datasets

In [4]:
DataFrame1 = pd.read_csv("/home/hasan/DATA SET/question answering/2735_4525_bundle_archive/S08_question_answer_pairs.txt", sep='\t')
DataFrame2 = pd.read_csv("/home/hasan/DATA SET/question answering/2735_4525_bundle_archive/S09_question_answer_pairs.txt", sep='\t')
DataFrame3 = pd.read_csv("/home/hasan/DATA SET/question answering/2735_4525_bundle_archive/S10_question_answer_pairs.txt", sep='\t', encoding='ISO-8859-1')


### Adding together

In [5]:
final_dataset = pd.concat([DataFrame1, DataFrame2, DataFrame3])

### Data Initialization

In [6]:
final_dataset.head()

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,S08_set3_a4
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.,easy,easy,S08_set3_a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,S08_set3_a4
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.,easy,easy,S08_set3_a4
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,S08_set3_a4


In [7]:
final_dataset.shape

(3998, 6)

In [8]:
final_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3998 entries, 0 to 1457
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ArticleTitle              3998 non-null   object
 1   Question                  3961 non-null   object
 2   Answer                    3422 non-null   object
 3   DifficultyFromQuestioner  3043 non-null   object
 4   DifficultyFromAnswerer    3418 non-null   object
 5   ArticleFile               3996 non-null   object
dtypes: object(6)
memory usage: 218.6+ KB


### Feature Engineering

#### Selecting Necessary Columns

In [9]:
final_dataset = final_dataset.loc[:, ['ArticleTitle','Question', 'Answer']]
final_dataset.head()

Unnamed: 0,ArticleTitle,Question,Answer
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.
4,Abraham_Lincoln,Did his mother die of pneumonia?,no


#### Dropping rows those have nan data

In [10]:
final_dataset.dropna(axis=0, inplace=True)
final_dataset.shape

(3422, 3)

#### Removing _(under_score) from ArticleTitle column

In [11]:
final_dataset['ArticleTitle'] = final_dataset['ArticleTitle'].str.replace('_', ' ')
final_dataset.head()

Unnamed: 0,ArticleTitle,Question,Answer
0,Abraham Lincoln,Was Abraham Lincoln the sixteenth President of...,yes
1,Abraham Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.
2,Abraham Lincoln,Did Lincoln sign the National Banking Act of 1...,yes
3,Abraham Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.
4,Abraham Lincoln,Did his mother die of pneumonia?,no


#### Adding ArticleTitle and Question columns together

In [12]:
final_dataset['Question'] = final_dataset['ArticleTitle'] + ' ' +final_dataset['Question']

# Taking necessary two columns
final_dataset = final_dataset.loc[:,['Question', 'Answer']]
final_dataset.head()


Unnamed: 0,Question,Answer
0,Abraham Lincoln Was Abraham Lincoln the sixtee...,yes
1,Abraham Lincoln Was Abraham Lincoln the sixtee...,Yes.
2,Abraham Lincoln Did Lincoln sign the National ...,yes
3,Abraham Lincoln Did Lincoln sign the National ...,Yes.
4,Abraham Lincoln Did his mother die of pneumonia?,no


#### Dropping duplicate question

In [13]:
# dropping duplicate data from Question column
final_dataset = final_dataset.drop_duplicates(subset='Question')

# dropping duplicate rows
final_dataset.dropna(inplace=True)
final_dataset.head()

Unnamed: 0,Question,Answer
0,Abraham Lincoln Was Abraham Lincoln the sixtee...,yes
2,Abraham Lincoln Did Lincoln sign the National ...,yes
4,Abraham Lincoln Did his mother die of pneumonia?,no
6,Abraham Lincoln How many long was Lincoln's fo...,18 months
8,Abraham Lincoln When did Lincoln begin his pol...,1832


In [14]:
final_dataset.shape

(2206, 2)

### Vectorizer

I am going to use different different vectorizer those are :

    1. TfidfVectorizer
    2. CountVectorizer
    

#### Using TfidfVectorizer with linear kernel

In [15]:
stopwords_list = stopwords.words('english')

lemmatizer = WordNetLemmatizer()

def my_tokenizer(doc):
    
    words = word_tokenize(doc)
    pos_tags = pos_tag(words)
    
    non_stopwords = [w for w in pos_tags if not w[0].lower() in stopwords_list]
    
    non_punctuation = [w for w in non_stopwords if not w[0] in string.punctuation]
    
    lemmas = []
    for w in non_punctuation:
        if w[1].startswith('J'):
            pos = wordnet.ADJ
        elif w[1].startswith('V'):
            pos = wordnet.VERB
        elif w[1].startswith('N'):
            pos = wordnet.NOUN
        elif w[1].startswith('R'):
            pos = wordnet.ADV
        else:
            pos = wordnet.NOUN
        
        lemmas.append(lemmatizer.lemmatize(w[0], pos))

    return lemmas


In [16]:
tfidf_vectorizer = TfidfVectorizer(tokenizer=my_tokenizer)
tfidf_matrix = tfidf_vectorizer.fit_transform(tuple(final_dataset['Question']))
print('Shape of tfidf_matrix is :', tfidf_matrix.shape)


Shape of tfidf_matrix is : (2206, 3565)


#### Testing with new questions

In [17]:
def get_question(question):
    query_vect = tfidf_vectorizer.transform([question])
    similarity = linear_kernel(query_vect, tfidf_matrix)
    max_similarity = np.argmax(similarity, axis=None)
    
    print('The New Question is : ', question)
    print('The Closest question found:', final_dataset.iloc[max_similarity]['Question'])
    print('Score of similarity: {:.2%}'.format(similarity[0, max_similarity]))
    print('The Answer is :', final_dataset.iloc[max_similarity]['Answer'])


In [18]:
get_question('When Abraham Lincoln started his political career')

The New Question is :  When Abraham Lincoln started his political career
The Closest question found: Abraham Lincoln Did Lincoln start his political career in 1832?
Score of similarity: 87.73%
The Answer is : Yes


In [19]:
get_question('Where did Volta enter retirement')

The New Question is :  Where did Volta enter retirement
The Closest question found: Alessandro Volta Where did Volta enter retirement?
Score of similarity: 90.66%
The Answer is : Spain


In [20]:
get_question('Can whales fly')

The New Question is :  Can whales fly
The Closest question found: Otter Do sea otters have a layer of fat like whales?
Score of similarity: 32.07%
The Answer is : No


In [21]:
get_question('Who was the fourth president of the United States')

The New Question is :  Who was the fourth president of the United States
The Closest question found: James Monroe Was James Monroe President of the United States?
Score of similarity: 44.30%
The Answer is : yes


In [22]:
get_question('How high are crime rates in Brazil')

The New Question is :  How high are crime rates in Brazil
The Closest question found: Saint Petersburg When did the crime level become higher?
Score of similarity: 51.49%
The Answer is : After the October revolution.


#### using CountVectorizer with cosine_similarity

In [23]:
stopwords_list = stopwords.words('english')

lemmatizer = WordNetLemmatizer()

def my_tokenizer(doc):
    
    words = word_tokenize(doc)
    pos_tags = pos_tag(words)
    
    non_stopwords = [w for w in pos_tags if not w[0].lower() in stopwords_list]
    
    non_punctuation = [w for w in non_stopwords if not w[0] in string.punctuation]
    
    lemmas = []
    for w in non_punctuation:
        if w[1].startswith('J'):
            pos = wordnet.ADJ
        elif w[1].startswith('V'):
            pos = wordnet.VERB
        elif w[1].startswith('N'):
            pos = wordnet.NOUN
        elif w[1].startswith('R'):
            pos = wordnet.ADV
        else:
            pos = wordnet.NOUN
        
        lemmas.append(lemmatizer.lemmatize(w[0], pos))

    return lemmas


In [24]:
tfidf_vectorizer = CountVectorizer(tokenizer=my_tokenizer)
tfidf_matrix = tfidf_vectorizer.fit_transform(tuple(final_dataset['Question']))
print('Shape of tfidf_matrix is :', tfidf_matrix.shape)


Shape of tfidf_matrix is : (2206, 3565)


#### Testing with new questions

In [25]:
def get_question(question):
    query_vect = tfidf_vectorizer.transform([question])
    similarity = cosine_similarity(query_vect, tfidf_matrix)
    max_similarity = np.argmax(similarity, axis=None)
    
    print('The New Question is : ', question)
    print('The Closest question found:', final_dataset.iloc[max_similarity]['Question'])
    print('Score of similarity: {:.2%}'.format(similarity[0, max_similarity]))
    print('The Answer is :', final_dataset.iloc[max_similarity]['Answer'])


In [26]:
get_question('What was Alessandro Volta`s profession')

The New Question is :  What was Alessandro Volta`s profession
The Closest question found: Alessandro Volta What was Alessandro Volta`s profession?
Score of similarity: 96.23%
The Answer is : physisist


In [27]:
get_question('Who is Alessandro Volta')

The New Question is :  Who is Alessandro Volta
The Closest question found: Alessandro Volta What was Alessandro Volta`s profession?
Score of similarity: 94.28%
The Answer is : physisist


In [28]:
get_question('Who created battery ')

The New Question is :  Who created battery 
The Closest question found: London What had the Anglo-Saxons created by the 600s?
Score of similarity: 35.36%
The Answer is : By the 600s, the Anglo-Saxons had created a new S09_settlement called Lundenwic.


In [29]:
get_question('What order Santiago gave ')

The New Question is :  What order Santiago gave 
The Closest question found: Santiago What is the Order of Santiago?
Score of similarity: 77.46%
The Answer is : a Spanish knightly order


In [30]:
get_question('What is made by gut ')

The New Question is :  What is made by gut 
The Closest question found: Lyre Are the strings of a classical lyre made of gut?
Score of similarity: 50.00%
The Answer is : Yes
