In [124]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [125]:
df = pd.read_csv('./SQuAD_csv.csv')

In [126]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86821 entries, 0 to 86820
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    86821 non-null  int64 
 1   context       86821 non-null  object
 2   question      86821 non-null  object
 3   id            86821 non-null  object
 4   answer_start  86821 non-null  int64 
 5   text          86818 non-null  object
dtypes: int64(2), object(4)
memory usage: 4.0+ MB


In [127]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,context,question,id,answer_start,text
0,0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,56be85543aeaaa14008c9063,269,in the late 1990s
1,1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,56be85543aeaaa14008c9065,207,singing and dancing
2,2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,56be85543aeaaa14008c9066,526,2003
3,3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,56bf6b0f3aeaaa14008c9601,166,"Houston, Texas"
4,4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,56bf6b0f3aeaaa14008c9602,276,late 1990s
5,5,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what R&B group was she the lead singer?,56bf6b0f3aeaaa14008c9603,320,Destiny's Child
6,6,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What album made her a worldwide known artist?,56bf6b0f3aeaaa14008c9604,505,Dangerously in Love
7,7,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Who managed the Destiny's Child group?,56bf6b0f3aeaaa14008c9605,360,Mathew Knowles
8,8,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyoncé rise to fame?,56d43c5f2ccc5a1400d830a9,276,late 1990s
9,9,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What role did Beyoncé have in Destiny's Child?,56d43c5f2ccc5a1400d830aa,290,lead singer


In [128]:
# Define the fraction of rows to keep (e.g., keep 80% of the rows)
fraction_to_keep = 0.2

df = df.sample(frac=fraction_to_keep, random_state=1).reset_index(drop=True)

In [129]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17364 entries, 0 to 17363
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    17364 non-null  int64 
 1   context       17364 non-null  object
 2   question      17364 non-null  object
 3   id            17364 non-null  object
 4   answer_start  17364 non-null  int64 
 5   text          17364 non-null  object
dtypes: int64(2), object(4)
memory usage: 814.1+ KB


In [130]:
df.columns

Index(['Unnamed: 0', 'context', 'question', 'id', 'answer_start', 'text'], dtype='object')

In [131]:
df = df.drop(labels={'Unnamed: 0','id', 'answer_start'},axis=1)

In [132]:
df = df.dropna()

In [133]:
if df.duplicated().sum() > 0:
    print("Duplicates exists.")
    df = df.drop_duplicates()
    print("Remove Duplicates")

else:
    print("No duplicates")

No duplicates


In [134]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17364 entries, 0 to 17363
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   context   17364 non-null  object
 1   question  17364 non-null  object
 2   text      17364 non-null  object
dtypes: object(3)
memory usage: 407.1+ KB


In [135]:
df.head(5)

Unnamed: 0,context,question,text
0,"Despite the death of Queen Mary on 24 March, t...",When was the coronation of Elizabeth as Queen?,2 June 1953
1,Clothing can and has in history been made from...,What is an article that is carried rather than...,purses
2,In 2013–14 a pornographic actor was trying to ...,What legal system did the actor use after fili...,Federal Court of Canada
3,The Cold War drew to a close in the late 1980s...,What was the Soviet Union suffering from in t...,severe economic stagnation
4,Commercial turkeys are usually reared indoors ...,What the average for the amount of turkeys ar...,60 million birds in the United States


In [136]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [137]:
def clean_text(text):
    if not isinstance(text,str):
        return ""
    
    text = text.lower()
    
    text = re.sub(r'[^\w\s\d]','',text)
    
    words =word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # join the words back together
    
    text = ''.join(words)
    return text

In [None]:
# df['new_context'] =df['context'] + df['question']

In [138]:
df['context_clean'] = df['context'].apply(clean_text)

In [139]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17364 entries, 0 to 17363
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   context        17364 non-null  object
 1   question       17364 non-null  object
 2   text           17364 non-null  object
 3   context_clean  17364 non-null  object
dtypes: object(4)
memory usage: 542.8+ KB


In [140]:
df.head(5)

Unnamed: 0,context,question,text,context_clean
0,"Despite the death of Queen Mary on 24 March, t...",When was the coronation of Elizabeth as Queen?,2 June 1953,despitedeathqueenmary24marchcoronation2june195...
1,Clothing can and has in history been made from...,What is an article that is carried rather than...,purses,clothinghistorymadewidevarietymaterialmaterial...
2,In 2013–14 a pornographic actor was trying to ...,What legal system did the actor use after fili...,Federal Court of Canada,201314pornographicactortryingremovearchivedima...
3,The Cold War drew to a close in the late 1980s...,What was the Soviet Union suffering from in t...,severe economic stagnation,coldwardrewcloselate1980searly1990sunitedstate...
4,Commercial turkeys are usually reared indoors ...,What the average for the amount of turkeys ar...,60 million birds in the United States,commercialturkeyusuallyrearedindoorscontrolled...


In [157]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix


In [158]:
train_data, test_data = train_test_split(df, test_size=0.2,random_state=42)

In [160]:
cv = CountVectorizer(max_features=10000,stop_words='english')

In [161]:
vector = cv.fit_transform(df['context_clean'].values.astype('U')).toarray()

In [185]:
df['context']

0        Despite the death of Queen Mary on 24 March, t...
1        Clothing can and has in history been made from...
2        In 2013–14 a pornographic actor was trying to ...
3        The Cold War drew to a close in the late 1980s...
4        Commercial turkeys are usually reared indoors ...
                               ...                        
17359    There is usually an indication for a specific ...
17360    The islands are at relatively low altitudes, w...
17361    During World War II, the British destroyed the...
17362    Tourism is considered another important indust...
17363    London has a diverse range of peoples and cult...
Name: context, Length: 17364, dtype: object

In [162]:
vector.shape

(17364, 10000)

In [146]:
similarity = cosine_similarity(vector)

In [147]:
similarity

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [186]:
distance = sorted(list(enumerate(similarity[17317])), reverse=True, key= lambda vector:vector[1])
for i in distance[0:5]:
    print(df.iloc[i[0]].question)

What groups can be combined to describe every group?
What group can be the quotient of the free group over the generators of the group?
When was the coronation of Elizabeth as Queen?
What is an article that is carried rather than worn that isn't regarded as clothing?
What legal system did the actor use after filing DMCA petitions?


In [181]:
def recommend(question):
    try:
        index = df[df['question'] == question].index[0]
        print(index)
        distance = sorted(list(enumerate(similarity[index])), reverse=True, key= lambda vector:vector[1])
        print(similarity[index])
        for i in distance[0:5]:
            print(df.iloc[i[0]].question)
    except:
        print("No question")
        

In [184]:
recommend("What are these statutes essential to?")

17301
[0. 0. 0. ... 0. 0. 0.]
What are these statutes essential to?
When was the coronation of Elizabeth as Queen?
What is an article that is carried rather than worn that isn't regarded as clothing?
What legal system did the actor use after filing DMCA petitions?
What was  the Soviet Union suffering from in the 1980's?


In [None]:
df.to_csv('new_questions.csv',index=False)