In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from collections import Counter
import torch

In [2]:
# import training data
dft_eng = pd.read_csv('../../data/dft_eng.csv')
dft_jap = pd.read_csv('../../data/dft_jap.csv')
dft_fin = pd.read_csv('../../data/dft_fin.csv')

# import validation data
dfv_eng = pd.read_csv('../../data/dfv_eng.csv')
dfv_jap = pd.read_csv('../../data/dfv_jap.csv')
dfv_fin = pd.read_csv('../../data/dfv_fin.csv')

#import word count
word_count = pd.read_csv('../../data/question_word_count.csv')

In [3]:
stop_words = ['\n','!','"','#','$','%','&',"'",'(',')','*','+',',','-','.','/',':',';','<','=','>','?','@','[','\\',']','^','_','`','a','about','above','after','again','against','all','am','an','and','any','are','as','at','be','because','been','before','being','below','between','both','but','by','can','did','do','does','doing','don','down','during','each','few','for','from','further','had','has','have','having','he','her','here','hers','herself','him','himself','his','how','i','if','in','into','is','it','its','itself','just','me','more','most','my','myself','no','nor','not','now','of','off','on','once','only','or','other','our','ours','ourselves','out','over','own','s','same','she','should','so','some','such','t','than','that','the','their','theirs','them','themselves','then','there','these','they','this','those','through','to','too','under','until','up','very','was','we','were','what','when','where','which','while','who','whom','why','will','with','you','your','yours','yourself','yourselves','{','|','}','~']

## Bag of Words

In [4]:
bag_df = dft_eng[["question_text_tokenized", "document_plaintext_tokenized", "labels"]]

bag_df

Unnamed: 0,question_text_tokenized,document_plaintext_tokenized,labels
0,"['when', 'was', 'quantum', 'field', 'theory', ...","['quantum', 'field', 'theory', 'naturally', 'b...",1
1,"['who', 'was', 'the', 'first', 'nobel', 'prize...","['the', 'nobel', 'prize', 'in', 'literature', ...",1
2,"['when', 'is', 'the', 'dialectical', 'method',...","['dialectic', 'or', 'dialectics', '(', 'greek'...",1
3,"['who', 'invented', 'hangul', '?']","['hangul', 'was', 'personally', 'created', 'an...",1
4,"['what', 'do', 'grasshoppers', 'eat', '?']","['grasshoppers', 'are', 'plant-eaters', ',', '...",1
...,...,...,...
7384,"['what', 'was', 'neil', 'brooks', ""'"", 'fastes...","['the', 'medley', 'relay', 'was', 'scheduled',...",0
7385,"['who', 'are', 'the', 'three', 'most', 'import...","['sāmkhya', 'is', 'a', 'dualist', 'philosophic...",0
7386,"['who', 'was', 'costume', 'designer', 'for', '...","['mollo', 'was', 'surprised', 'by', 'the', 'su...",0
7387,"['who', 'developed', 'the', 'first', 'thermonu...","['in', 'the', 'end', ',', 'president', 'truman...",0


In [5]:
tdoc_data = [eval(x) for x in dft_eng["document_plaintext_tokenized"]]
vdoc_data = [eval(x) for x in dfv_eng["document_plaintext_tokenized"]]

tque_data = [eval(x) for x in dft_eng["question_text_tokenized"]]
vque_data = [eval(x) for x in dfv_eng["question_text_tokenized"]]




In [25]:
flat_tdoc = np.array([item for sublist in tdoc_data for item in sublist])
flat_vdoc = np.array([item for sublist in vdoc_data for item in sublist])

flat_tque = np.array([item for sublist in tque_data for item in sublist])
flat_vque = np.array([item for sublist in vque_data for item in sublist])

flat_t = np.concatenate((flat_tdoc, flat_tque))
flat_v = np.concatenate((flat_vdoc, flat_vque))

countt = Counter(flat_t)
countv = Counter(flat_v)

countt["what"]

2371

In [7]:
# Functions to create BoW vectors
def make_bow_vector(sentence, word_to_ix):
    vec = np.zeros(len(countt))
    for word in sentence:
        vec[countt[word]] += 1
    return vec


print(make_bow_vector(eval(dft_eng["document_plaintext_tokenized"][0]), countt))

[0. 1. 0. ... 0. 0. 0.]


In [8]:
X_train_doc = np.array([make_bow_vector(eval(x), countt) for x in dft_eng["document_plaintext_tokenized"]])
X_train_que = np.array([make_bow_vector(eval(x), countt) for x in dft_eng["question_text_tokenized"]])

X_val_doc = np.array([make_bow_vector(eval(x), countv) for x in dfv_eng["document_plaintext_tokenized"]])
X_val_que = np.array([make_bow_vector(eval(x), countv) for x in dfv_eng["question_text_tokenized"]])

y_train = dft_eng.labels.values
y_val = dfv_eng.labels.values


print(X_train_doc.shape)
print(X_train_que.shape)
print(X_val_doc.shape)
print(X_val_que.shape)
# print(X_val.shape)

(7389, 65320)
(7389, 65320)
(990, 65320)
(990, 65320)


In [22]:
clf = LogisticRegression(C=10, penalty='l2', random_state=1, solver='liblinear').fit(X_train, y_train)
pred = clf.predict(X_val)
accuracy_score(pred, y_val)

0.7343434343434343

## Pytorch