In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from collections import Counter

In [10]:
# import training data
dft_eng = pd.read_csv('../../data/dft_eng.csv')

# import validation data
dfv_eng = pd.read_csv('../../data/dfv_eng.csv')

#import word count
word_count = pd.read_csv('../../data/question_word_count.csv')

In [11]:
stop_words = ['\n','!','"','#','$','%','&',"'",'(',')','*','+',',','-','.','/',':',';','<','=','>','?','@','[','\\',']','^','_','`','a','about','above','after','again','against','all','am','an','and','any','are','as','at','be','because','been','before','being','below','between','both','but','by','can','did','do','does','doing','don','down','during','each','few','for','from','further','had','has','have','having','he','her','here','hers','herself','him','himself','his','how','i','if','in','into','is','it','its','itself','just','me','more','most','my','myself','no','nor','not','now','of','off','on','once','only','or','other','our','ours','ourselves','out','over','own','s','same','she','should','so','some','such','t','than','that','the','their','theirs','them','themselves','then','there','these','they','this','those','through','to','too','under','until','up','very','was','we','were','what','when','where','which','while','who','whom','why','will','with','you','your','yours','yourself','yourselves','{','|','}','~']

## Bag of Words

In [12]:
bag_df = dft_eng[["question_text_tokenized", "document_plaintext_tokenized", "labels"]]

bag_df

Unnamed: 0,question_text_tokenized,document_plaintext_tokenized,labels
0,"['when', 'was', 'quantum', 'field', 'theory', ...","['quantum', 'field', 'theory', 'naturally', 'b...",1
1,"['who', 'was', 'the', 'first', 'nobel', 'prize...","['the', 'nobel', 'prize', 'in', 'literature', ...",1
2,"['when', 'is', 'the', 'dialectical', 'method',...","['dialectic', 'or', 'dialectics', '(', 'greek'...",1
3,"['who', 'invented', 'hangul', '?']","['hangul', 'was', 'personally', 'created', 'an...",1
4,"['what', 'do', 'grasshoppers', 'eat', '?']","['grasshoppers', 'are', 'plant-eaters', ',', '...",1
...,...,...,...
7384,"['what', 'was', 'neil', 'brooks', ""'"", 'fastes...","['the', 'medley', 'relay', 'was', 'scheduled',...",0
7385,"['who', 'are', 'the', 'three', 'most', 'import...","['sāmkhya', 'is', 'a', 'dualist', 'philosophic...",0
7386,"['who', 'was', 'costume', 'designer', 'for', '...","['mollo', 'was', 'surprised', 'by', 'the', 'su...",0
7387,"['who', 'developed', 'the', 'first', 'thermonu...","['in', 'the', 'end', ',', 'president', 'truman...",0


In [79]:
tdoc_data = [eval(x) for x in dft_eng["document_plaintext_tokenized"]]
vdoc_data = [eval(x) for x in dfv_eng["document_plaintext_tokenized"]]

tque_data = [eval(x) for x in dft_eng["question_text_tokenized"]]
vque_data = [eval(x) for x in dfv_eng["question_text_tokenized"]]

t_comb = [eval(que) + eval(doc) for que, doc in zip(dft_eng["question_text_tokenized"],dft_eng["document_plaintext_tokenized"])]
v_comb = [eval(que) + eval(doc) for que, doc in zip(dfv_eng["question_text_tokenized"],dfv_eng["document_plaintext_tokenized"])]

In [84]:
tdoc_data = [x for x in tdoc_data if not x in stop_words]
vdoc_data = [eval(x) for x in dfv_eng["document_plaintext_tokenized"]]

tque_data = [eval(x) for x in dft_eng["question_text_tokenized"]]
vque_data = [eval(x) for x in dfv_eng["question_text_tokenized"]]

t_comb = [eval(que) + eval(doc) for que, doc in zip(dft_eng["question_text_tokenized"],dft_eng["document_plaintext_tokenized"])]
v_comb = [eval(que) + eval(doc) for que, doc in zip(dfv_eng["question_text_tokenized"],dfv_eng["document_plaintext_tokenized"])]

In [86]:
'''
doc_data_nostop = []
for doc in tdoc_data:
    tdoc_data.append([word for word in doc if word not in stop_words ])

vdoc_data_nostop = []
for doc in tdoc_data:
    vdoc_data_nostop.vdoc_data([word for word in doc if word not in stop_words ])

tque_data_nostop = []
for doc in tque_data:
    tque_data.append([word for word in doc if word not in stop_words ])

vque_data_nostop = []
for doc in vque_data:
    vdoc_data_nostop.vdoc_data([word for word in doc if word not in stop_words ])

'''
t_comb_nostop = []
for doc in t_comb:
    t_comb_nostop.append([word for word in doc if word not in stop_words ])

v_comb_nostop = []
for doc in v_comb:
    v_comb_nostop.append([word for word in doc if word not in stop_words ])

In [87]:
flat_tdoc = np.array([item for sublist in tdoc_data for item in sublist])
flat_vdoc = np.array([item for sublist in vdoc_data for item in sublist])

flat_tque = np.array([item for sublist in tque_data for item in sublist])
flat_vque = np.array([item for sublist in vdoc_data for item in sublist])

flat_tcomb = np.array([item for sublist in t_comb for item in sublist])
flat_vcomb = np.array([item for sublist in v_comb for item in sublist])

flat_tcomb_nostop = np.array([item for sublist in t_comb_nostop for item in sublist])
flat_vcomb_nostop = np.array([item for sublist in v_comb_nostop for item in sublist])

countt_doc = Counter(flat_tdoc)
countt_que = Counter(flat_tque)
countt_comb = Counter(flat_tcomb)
countt_comb_nostop = Counter(flat_vcomb_nostop)

In [89]:
# Functions to create BoW vectors
def make_bow_vector(sentence):
    vec = np.zeros(len(countt_comb))
    for word in sentence:
        vec[countt_comb[word]] += 1
    return vec

In [90]:
l1 = [make_bow_vector(comb) for comb in t_comb_nostop]
v1 = [make_bow_vector(comb) for comb in v_comb_nostop]

In [19]:
l1 = [make_bow_vector(comb) for comb in t_comb]
l2 = [make_bow_vector(que) for que in tque_data]
l3 = [make_bow_vector(doc) for doc in tdoc_data]
#np.array([make_bow_vector(que, countt_comb) for x in v_comb])

In [26]:
v1 = [make_bow_vector(comb) for comb in v_comb]
v2 = [make_bow_vector(que) for que in vque_data]
v3 = [make_bow_vector(doc) for doc in vdoc_data]

In [91]:
X_train = np.array(l1)
#X_train = np.array([np.concatenate([que,doc,comb]) for que, doc, comb in zip(l2,l3,l1)])
#X_train = np.array([np.concatenate([que,doc]) for que, doc in zip(l2,l3)])

In [92]:
X_val = np.array(v1)
#X_val = np.array([np.concatenate([que, doc, comb]) for que, doc, comb in zip(v2,v3,v1)])
#X_val = np.array([np.concatenate([que,doc]) for que, doc in zip(v2,v3)])


In [93]:
#X_train = np.array([np.concatenate([make_bow_vector(que, countt_comb) + make_bow_vector(doc, countt_comb) + make_bow_vector(comb, countt_comb)]) for que, doc, comb in zip(flat_tque, flat_tdoc, flat_tcomb)])
#X_val = np.array([np.concatenate([make_bow_vector(que, countt_comb) + make_bow_vector(doc, countt_comb) + make_bow_vector(comb, countt_comb)]) for que, doc, comb in zip(flat_vque, flat_vdoc, flat_vcomb)])
#X_val = np.array([make_bow_vector(x, countt_comb) for x in v_comb])

y_train = dft_eng.labels.values
y_val = dfv_eng.labels.values

In [94]:
print(X_train.shape)
print(y_train.shape)

print(X_val.shape)
print(y_val.shape)

(7389, 65320)
(7389,)
(990, 65320)
(990,)


In [95]:
clf = LogisticRegression(C=1000, penalty='l1', random_state=1, solver='liblinear').fit(X_train, y_train)
pred = clf.predict(X_val)
accuracy_score(pred, y_val)

0.7141414141414142