In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(123)

<torch._C.Generator at 0x19503a7b3b0>

In [2]:
# import training data
dft_eng = pd.read_csv('../../data/dft_eng.csv')
dft_jap = pd.read_csv('../../data/dft_jap.csv')
dft_fin = pd.read_csv('../../data/dft_fin.csv')

# import validation data
dfv_eng = pd.read_csv('../../data/dfv_eng.csv')
dfv_jap = pd.read_csv('../../data/dfv_jap.csv')
dfv_fin = pd.read_csv('../../data/dfv_fin.csv')

#import word count
word_count = pd.read_csv('../../data/question_word_count.csv')

In [3]:
stop_words = ['\n','!','"','#','$','%','&',"'",'(',')','*','+',',','-','.','/',':',';','<','=','>','?','@','[','\\',']','^','_','`','a','about','above','after','again','against','all','am','an','and','any','are','as','at','be','because','been','before','being','below','between','both','but','by','can','did','do','does','doing','don','down','during','each','few','for','from','further','had','has','have','having','he','her','here','hers','herself','him','himself','his','how','i','if','in','into','is','it','its','itself','just','me','more','most','my','myself','no','nor','not','now','of','off','on','once','only','or','other','our','ours','ourselves','out','over','own','s','same','she','should','so','some','such','t','than','that','the','their','theirs','them','themselves','then','there','these','they','this','those','through','to','too','under','until','up','very','was','we','were','what','when','where','which','while','who','whom','why','will','with','you','your','yours','yourself','yourselves','{','|','}','~']

In [4]:
def ans_freq(que, doc):

    freq = [x for x in que if x in doc]
    freq = len(freq)/len(que)
    return freq

In [5]:
# calculating frequenzy of words in answer that are in text 
for df in [dft_jap,dfv_jap,dft_eng,dfv_eng,dft_fin,dfv_fin]:
    frequency = []
    for question, answer in zip(df['question_text_tokenized'], df['document_plaintext_tokenized']):
        frequency.append(ans_freq(eval(question), eval(answer)))

    df['word_frequency_score'] = frequency

In [6]:
dft_eng

Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url,answer_start,answer_text,question_text_tokenized,document_plaintext_tokenized,answer_text_tokenized,labels,word_frequency_score
0,When was quantum field theory developed?,Quantum field theory,english,"{'answer_start': array([159]), 'answer_text': ...",Quantum field theory naturally began with the ...,https://en.wikipedia.org/wiki/Quantum%20field%...,[159],['1920s'],"['when', 'was', 'quantum', 'field', 'theory', ...","['quantum', 'field', 'theory', 'naturally', 'b...",['1920s'],1,0.571429
1,Who was the first Nobel prize winner for Liter...,List of Nobel laureates in Literature,english,"{'answer_start': array([610]), 'answer_text': ...",The Nobel Prize in Literature (Swedish: Nobelp...,https://en.wikipedia.org/wiki/List%20of%20Nobe...,[610],['Sully Prudhomme'],"['who', 'was', 'the', 'first', 'nobel', 'prize...","['the', 'nobel', 'prize', 'in', 'literature', ...","['sully', 'prudhomme']",1,0.700000
2,When is the dialectical method used?,Dialectic,english,"{'answer_start': array([129]), 'answer_text': ...","Dialectic or dialectics (Greek: διαλεκτική, di...",https://en.wikipedia.org/wiki/Dialectic,[129],['discourse between two or more people holding...,"['when', 'is', 'the', 'dialectical', 'method',...","['dialectic', 'or', 'dialectics', '(', 'greek'...","['discourse', 'between', 'two', 'or', 'more', ...",1,0.571429
3,Who invented Hangul?,Origin of Hangul,english,"{'answer_start': array([88]), 'answer_text': a...",Hangul was personally created and promulgated ...,https://en.wikipedia.org/wiki/Origin%20of%20Ha...,[88],['Sejong the Great'],"['who', 'invented', 'hangul', '?']","['hangul', 'was', 'personally', 'created', 'an...","['sejong', 'the', 'great']",1,0.250000
4,What do Grasshoppers eat?,Grasshopper,english,"{'answer_start': array([0]), 'answer_text': ar...","Grasshoppers are plant-eaters, with a few spec...",https://en.wikipedia.org/wiki/Grasshopper,[0],"['Grasshoppers are plant-eaters, with a few sp...","['what', 'do', 'grasshoppers', 'eat', '?']","['grasshoppers', 'are', 'plant-eaters', ',', '...","['grasshoppers', 'are', 'plant-eaters', ',', '...",1,0.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7384,What was Neil Brooks' fastest recorded time?,Swimming at the 1980 Summer Olympics – Men's 4...,english,"{'answer_start': array([-1]), 'answer_text': a...",The medley relay was scheduled in the Olympisk...,https://en.wikipedia.org/wiki/Swimming%20at%20...,[-1],[''],"['what', 'was', 'neil', 'brooks', ""'"", 'fastes...","['the', 'medley', 'relay', 'was', 'scheduled',...",[],0,0.444444
7385,Who are the three most important eastern philo...,Eastern philosophy,english,"{'answer_start': array([-1]), 'answer_text': a...",Sāmkhya is a dualist philosophical tradition b...,https://en.wikipedia.org/wiki/Eastern%20philos...,[-1],[''],"['who', 'are', 'the', 'three', 'most', 'import...","['sāmkhya', 'is', 'a', 'dualist', 'philosophic...",[],0,0.222222
7386,Who was costume designer for the first Star Wa...,John Mollo,english,"{'answer_start': array([-1]), 'answer_text': a...",Mollo was surprised by the success of Star War...,https://en.wikipedia.org/wiki/John%20Mollo,[-1],[''],"['who', 'was', 'costume', 'designer', 'for', '...","['mollo', 'was', 'surprised', 'by', 'the', 'su...",[],0,0.545455
7387,Who developed the first thermonuclear weapon?,History of nuclear weapons,english,"{'answer_start': array([-1]), 'answer_text': a...","In the end, President Truman made the final de...",https://en.wikipedia.org/wiki/History%20of%20n...,[-1],[''],"['who', 'developed', 'the', 'first', 'thermonu...","['in', 'the', 'end', ',', 'president', 'truman...",[],0,0.285714


In [7]:
def get_highest_wordscore(que, language, data):
    count = word_count.loc[(word_count['word'].isin(eval(que)))&(word_count['language']==language)]
    return count.iloc[np.argmax(count['norm'])]['norm']
     

get_highest_wordscore(dft_eng.question_text_tokenized[0],dft_eng.language[0], word_count)

0.9996961121346224

In [8]:
dft_eng.document_plaintext_tokenized[0]

"['quantum', 'field', 'theory', 'naturally', 'began', 'with', 'the', 'study', 'of', 'electromagnetic', 'interactions', ',', 'as', 'the', 'electromagnetic', 'field', 'was', 'the', 'only', 'known', 'classical', 'field', 'as', 'of', 'the', '1920s.', '[', '8', ']', ':1']"

## Bag of Words

In [9]:
bag_df = dft_eng[["question_text_tokenized", "document_plaintext_tokenized", "labels"]]

bag_df

Unnamed: 0,question_text_tokenized,document_plaintext_tokenized,labels
0,"['when', 'was', 'quantum', 'field', 'theory', ...","['quantum', 'field', 'theory', 'naturally', 'b...",1
1,"['who', 'was', 'the', 'first', 'nobel', 'prize...","['the', 'nobel', 'prize', 'in', 'literature', ...",1
2,"['when', 'is', 'the', 'dialectical', 'method',...","['dialectic', 'or', 'dialectics', '(', 'greek'...",1
3,"['who', 'invented', 'hangul', '?']","['hangul', 'was', 'personally', 'created', 'an...",1
4,"['what', 'do', 'grasshoppers', 'eat', '?']","['grasshoppers', 'are', 'plant-eaters', ',', '...",1
...,...,...,...
7384,"['what', 'was', 'neil', 'brooks', ""'"", 'fastes...","['the', 'medley', 'relay', 'was', 'scheduled',...",0
7385,"['who', 'are', 'the', 'three', 'most', 'import...","['sāmkhya', 'is', 'a', 'dualist', 'philosophic...",0
7386,"['who', 'was', 'costume', 'designer', 'for', '...","['mollo', 'was', 'surprised', 'by', 'the', 'su...",0
7387,"['who', 'developed', 'the', 'first', 'thermonu...","['in', 'the', 'end', ',', 'president', 'truman...",0


In [10]:
data = [(eval(x), y) for x, y in zip((dft_eng["document_plaintext_tokenized"]), dft_eng["labels"])]

test_data = [(eval(x), y) for x, y in zip((dft_eng["question_text_tokenized"]), dft_eng["labels"])]

label_to_ix = {0: 0, 1: 1}

In [11]:
test_data

[(['when', 'was', 'quantum', 'field', 'theory', 'developed', '?'], 1),
 (['who',
   'was',
   'the',
   'first',
   'nobel',
   'prize',
   'winner',
   'for',
   'literature',
   '?'],
  1),
 (['when', 'is', 'the', 'dialectical', 'method', 'used', '?'], 1),
 (['who', 'invented', 'hangul', '?'], 1),
 (['what', 'do', 'grasshoppers', 'eat', '?'], 1),
 (['how',
   'large',
   'is',
   'the',
   'kerman',
   'province',
   'of',
   'southeastern',
   'iran',
   '?'],
  1),
 (['when', 'was', 'guitar', 'hero', 'live', 'first', 'released', '?'], 1),
 (['when',
   'were',
   'bluebonnets',
   'named',
   'the',
   'state',
   'flower',
   'of',
   'texas',
   '?'],
  1),
 (['who', 'created', 'the', 'series', 'clannad', '?'], 1),
 (['when', 'was', 'the', 'uss', 'taylor', 'built', '?'], 1),
 (['what',
   "'s",
   'the',
   'difference',
   'between',
   'man-slaughter',
   'and',
   'homicide',
   '?'],
  1),
 (['when',
   'did',
   'dc',
   'comics',
   'first',
   'introduce',
   'the',
   'gu

In [12]:
# Function to map each word in the vocab to an unique integer
# Indexing the Bag of words vector
from collections import Counter
word_to_ix = {}
for sent, _ in data + test_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)


word_to_ix

{'quantum': 0,
 'field': 1,
 'theory': 2,
 'naturally': 3,
 'began': 4,
 'with': 5,
 'the': 6,
 'study': 7,
 'of': 8,
 'electromagnetic': 9,
 'interactions': 10,
 ',': 11,
 'as': 12,
 'was': 13,
 'only': 14,
 'known': 15,
 'classical': 16,
 '1920s.': 17,
 '[': 18,
 '8': 19,
 ']': 20,
 ':1': 21,
 'nobel': 22,
 'prize': 23,
 'in': 24,
 'literature': 25,
 '(': 26,
 'swedish': 27,
 ':': 28,
 'nobelpriset': 29,
 'i': 30,
 'litteratur': 31,
 ')': 32,
 'is': 33,
 'awarded': 34,
 'annually': 35,
 'by': 36,
 'academy': 37,
 'to': 38,
 'authors': 39,
 'for': 40,
 'outstanding': 41,
 'contributions': 42,
 'literature.': 43,
 'it': 44,
 'one': 45,
 'five': 46,
 'prizes': 47,
 'established': 48,
 '1895': 49,
 'will': 50,
 'alfred': 51,
 'which': 52,
 'are': 53,
 'chemistry': 54,
 'physics': 55,
 'peace': 56,
 'and': 57,
 'physiology': 58,
 'or': 59,
 'medicine.': 60,
 '1': 61,
 'dictated': 62,
 "'s": 63,
 'award': 64,
 'administered': 65,
 'foundation': 66,
 'a': 67,
 'committee': 68,
 'that': 69,


In [13]:
tdoc_data = [(eval(x)) for x in dft_eng["document_plaintext_tokenized"]]
vdoc_data = [(eval(x)) for x in dfv_eng["document_plaintext_tokenized"]]

tque_data = [eval(x) for x in dft_eng["question_text_tokenized"]]
vque_data = [eval(x) for x in dfv_eng["question_text_tokenized"]]




In [24]:
flat_tdoc = np.array([item for sublist in tdoc_data for item in sublist])
flat_vdoc = np.array([item for sublist in vdoc_data for item in sublist])


countt = Counter(flat_tdoc)
countv = Counter(flat_vdoc)

countv.most_common(10)

[('the', 7333),
 (',', 5886),
 ('of', 3602),
 ('and', 2728),
 ('in', 2514),
 ('[', 1937),
 (']', 1936),
 ('to', 1899),
 ('a', 1795),
 ('(', 1186)]

In [15]:
countt['the']

52652

In [16]:
# Functions to create BoW vectors
def make_bow_vector(sentence, word_to_ix):
    vec = np.zeros(len(countt))
    for word in sentence:
        vec[countt[word]] += 1
    return vec


print(make_bow_vector(eval(dft_eng["document_plaintext_tokenized"][0]),countt))
print(dft_eng["document_plaintext_tokenized"][0])
print(countt['quantum'])

[0. 1. 0. ... 0. 0. 0.]
['quantum', 'field', 'theory', 'naturally', 'began', 'with', 'the', 'study', 'of', 'electromagnetic', 'interactions', ',', 'as', 'the', 'electromagnetic', 'field', 'was', 'the', 'only', 'known', 'classical', 'field', 'as', 'of', 'the', '1920s.', '[', '8', ']', ':1']
17


In [20]:
X_train = np.array([make_bow_vector(eval(x), countt) for x in dft_eng["document_plaintext_tokenized"]])
X_val = np.array([make_bow_vector(eval(x), countv) for x in dfv_eng["document_plaintext_tokenized"]])

y_train = dft_eng.labels.values
y_val = dfv_eng.labels.values

print(X_val.shape)
X_val

(7389, 65106)


array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 5., ..., 0., 0., 0.],
       ...,
       [0., 2., 0., ..., 0., 0., 0.],
       [0., 3., 2., ..., 0., 0., 0.],
       [0., 0., 4., ..., 0., 0., 0.]])

In [28]:
dft_eng["document_plaintext_tokenized"]


  X_train == X_val


False

In [29]:
clf = LogisticRegression(C=1000, penalty='l1', random_state=1, solver='liblinear').fit(X_train, y_train)
pred = clf.predict(X_val)
accuracy_score(pred, y_val)

0.7202020202020202

0       1
1       1
2       1
3       1
4       1
       ..
7384    0
7385    0
7386    0
7387    0
7388    0
Name: labels, Length: 7389, dtype: int64