In [51]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from collections import Counter

In [52]:
# import training data
dft_eng = pd.read_csv('../data/dft_eng.csv')

# import validation data
dfv_eng = pd.read_csv('../data/dfv_eng.csv')

## Bag of Words

### Functions to create BoW vectors

In [55]:
def make_bow_vector(sentence, countt):
    vec = np.zeros(len(countt))
    for word in sentence:
        vec[countt[word]] += 1
    return vec

### Combine and flatten

In [56]:
t_comb = [eval(que) + eval(doc) for que, doc in zip(dft_eng["question_text_tokenized"],dft_eng["document_plaintext_tokenized"])]
v_comb = [eval(que) + eval(doc) for que, doc in zip(dfv_eng["question_text_tokenized"],dfv_eng["document_plaintext_tokenized"])]


flat_tcomb = np.array([item for sublist in t_comb for item in sublist])
flat_vcomb = np.array([item for sublist in v_comb for item in sublist])

### Initial dictionary

In [57]:
countt_comb = Counter(flat_tcomb)

### Create bow vectors

In [58]:
bow_t = [make_bow_vector(comb, countt_comb) for comb in t_comb]
bow_v = [make_bow_vector(comb, countt_comb) for comb in v_comb]

### Add word frequency to bow vectors

In [59]:
def ans_freq(que, doc):

    freq = [x for x in que if x in doc]
    freq = len(freq)/len(que)
    return freq

def freq_words_in_text (df_t,df_v):
    for df in [df_t,df_v]:
        frequency = []
        for question, answer in zip(df['question_text_tokenized'], df['document_plaintext_tokenized']):
            frequency.append(ans_freq(eval(question), eval(answer)))

        df['word_frequency_score'] = frequency

freq_words_in_text(dft_eng,dfv_eng)

for index in range(len(bow_t)):
    bow_t[index] = np.append(dft_eng['word_frequency_score'][index],bow_t[index])

for index in range(len(bow_v)):
    bow_v[index] = np.append(dfv_eng['word_frequency_score'][index],bow_v[index])

### Create train and validation data

In [61]:
X_train = np.array(bow_t)
X_val = np.array(bow_v)
y_train = dft_eng.label.values
y_val = dfv_eng.label.values

### Initialise logres model

In [62]:
clf = LogisticRegression(C=1000, penalty='l1', random_state=1, solver='liblinear').fit(X_train, y_train)

### predict val data

In [72]:
val_pred = clf.predict(X_val)
accuracy_score(val_pred, y_val)

0.7555555555555555

In [73]:
train_pred = clf.predict(X_train)
accuracy_score(train_pred, y_train)

0.794694816619299

In [74]:
dft_eng['logres_pred'] = train_pred
dfv_eng['logres_pred'] = val_pred


In [64]:
import pickle

In [65]:
filename = 'logres_model.sav'
pickle.dump(clf, open(filename, 'wb'))

In [76]:
dft_eng.to_csv('../data/dft_eng.csv',index=False) 
dfv_eng.to_csv('../data/dfv_eng.csv',index=False) 