In [106]:
import numpy as np, pandas as pd
import json
import ast 
from textblob import TextBlob
import nltk
import torch
import pickle
from scipy import spatial
import spacy
from nltk import Tree
en_nlp = spacy.load('en')
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
import xgboost
from sklearn import ensemble

In [2]:
# Load the raw training JSON; the next cell walks the nested records stored in its first column.
train = pd.read_json("data/training.json")

In [25]:
# Flatten the nested training records into parallel lists, one entry per question.
# Unanswerable questions (empty 'answers') get None for both answer fields.
contexts, questions = [], []
answers_text, answers_start, is_impossible = [], [], []
for row in range(len(train)):
    article = train.iloc[row, 0]
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            questions.append(qa['question'])
            # Only the first listed answer is kept.
            first_answer = qa['answers'][0] if len(qa['answers']) > 0 else None
            answers_start.append(None if first_answer is None else first_answer['answer_start'])
            answers_text.append(None if first_answer is None else first_answer['text'])
            is_impossible.append(qa['is_impossible'])
            contexts.append(paragraph['context'])
train_df = pd.DataFrame({
    "context": contexts,
    "question": questions,
    "answer_start": answers_start,
    "text": answers_text,
    "target": is_impossible,
})

In [12]:
# Load the raw development (validation) JSON; it is flattened the same way as the training file.
valid = pd.read_json("data/development.json")

In [26]:
# Flatten the nested validation records into parallel lists, one entry per question.
# Unanswerable questions (empty 'answers') get None for both answer fields.
contexts, questions = [], []
answers_text, answers_start, is_impossible = [], [], []
for row in range(len(valid)):
    article = valid.iloc[row, 0]
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            questions.append(qa['question'])
            # Only the first listed answer is kept.
            first_answer = qa['answers'][0] if len(qa['answers']) > 0 else None
            answers_start.append(None if first_answer is None else first_answer['answer_start'])
            answers_text.append(None if first_answer is None else first_answer['text'])
            is_impossible.append(qa['is_impossible'])
            contexts.append(paragraph['context'])
valid_df = pd.DataFrame({
    "context": contexts,
    "question": questions,
    "answer_start": answers_start,
    "text": answers_text,
    "target": is_impossible,
})

In [14]:
# Load the raw test JSON; only questions and contexts are extracted from it below.
test = pd.read_json("data/test.json")

In [15]:
# Flatten the nested test records: one (context, question) pair per question.
# No answer or label fields are read for the test split.
contexts, questions = [], []
for row in range(len(test)):
    article = test.iloc[row, 0]
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            questions.append(qa['question'])
            contexts.append(paragraph['context'])
test_df = pd.DataFrame({
    "context": contexts,
    "question": questions,
})

In [4]:
# Load the precomputed lookup that process_data indexes by sentence / question text.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open this file if it comes from a trusted source.
with open("data/dict_embeddings.pickle", "rb") as f:
    dict_emb = pickle.load(f)

In [27]:
def process_data(df, emb_dim=4096):
    """Add sentence and embedding columns to `df` (in place) and return it.

    Columns written:
      * ``sentences`` - raw sentence strings from ``TextBlob(context).sentences``.
      * ``sent_emb``  - for each sentence, ``dict_emb[sentence][0]``, or a zero
        vector of length `emb_dim` when the sentence has no entry.
      * ``quest_emb`` - ``dict_emb[question]`` as stored, or a zero array of
        shape ``(1, emb_dim)`` when the question has no entry.

    Parameters
    ----------
    df : DataFrame with ``context`` and ``question`` columns.
    emb_dim : length of the zero vector used when a text is missing from
        ``dict_emb`` (defaults to the previously hard-coded 4096).

    Returns
    -------
    The same DataFrame object, with the three columns added.
    """
    def sentence_vector(sentence):
        return dict_emb[sentence][0] if sentence in dict_emb else np.zeros(emb_dim)

    def question_matrix(question):
        # cosine_sim reads quest_emb[0]; a 1-D fallback would make that a scalar
        # instead of a vector, so the fallback keeps a leading axis of size 1.
        # Assumes stored embeddings are 2-D with the vector in row 0, as the
        # `[0]` indexing on sentence lookups suggests - TODO confirm.
        return dict_emb[question] if question in dict_emb else np.zeros((1, emb_dim))

    df['sentences'] = df['context'].apply(lambda x: [item.raw for item in TextBlob(x).sentences])
    df['sent_emb'] = df['sentences'].apply(lambda x: [sentence_vector(item) for item in x])
    df['quest_emb'] = df['question'].apply(question_matrix)
    return df

In [28]:
# Add the sentences / sent_emb / quest_emb columns to the training frame.
train_df = process_data(train_df)

In [29]:
# Add the sentences / sent_emb / quest_emb columns to the validation frame.
valid_df = process_data(valid_df)

In [30]:
# Add the sentences / sent_emb / quest_emb columns to the test frame.
test_df = process_data(test_df)

In [31]:
def cosine(u, v):
    """Return the cosine similarity of vectors `u` and `v`.

    The result is ``dot(u, v) / (||u|| * ||v||)``. When either vector has zero
    norm the similarity is undefined; 0.0 is returned in that case instead of
    dividing by zero (which would yield NaN and a NumPy RuntimeWarning).
    """
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    if denom == 0:
        return 0.0
    return np.dot(u, v) / denom

In [32]:
def cosine_sim(x):
    """Return the cosine similarity of each sentence embedding with the question.

    `x` is one row holding ``sent_emb`` (an iterable of sentence vectors) and
    ``quest_emb`` (whose first element is used as the question vector). The
    result is a list with one similarity per sentence, in sentence order.
    """
    return [cosine(sentence_vec, x["quest_emb"][0]) for sentence_vec in x["sent_emb"]]

In [33]:
import warnings

# Suppress all Python warnings from here on, as this cell always did.
# `np.warnings` was never a public NumPy API (it merely exposed the stdlib
# module) and is missing from recent NumPy releases, so call the stdlib directly.
warnings.filterwarnings('ignore')

# One list of per-sentence similarities per row, for every split.
train_df["cosine_sim"] = train_df.apply(cosine_sim, axis = 1)
valid_df["cosine_sim"] = valid_df.apply(cosine_sim, axis = 1)
test_df["cosine_sim"] = test_df.apply(cosine_sim, axis = 1)

In [34]:
def get_candidate(distances):
    """Return the position of the smallest entry in `distances` (via ``np.argmin``)."""
    best_index = np.argmin(distances)
    return best_index

In [61]:
# Spot-check: how many sentences were extracted for the training row labelled 44.
len(train_df["sentences"][44])

5

In [62]:
def create_features(df):
    """Expand the ``cosine_sim`` lists of `df` into one column per sentence position.

    Column ``column_cos_<i>`` holds the i-th entry of each row's list; rows with
    fewer entries are padded with NaN. The result keeps `df`'s index and has one
    row per input row, including rows whose list is empty, so the feature matrix
    stays aligned with the labels. The ``target`` column is copied across when
    `df` has one; frames without labels (such as a test split) are accepted.

    The matrix is filled in a single pre-allocated array rather than enlarging a
    DataFrame one cell at a time, which is prohibitively slow on large frames.
    """
    rows = df["cosine_sim"].tolist()
    width = max((len(row) for row in rows), default=0)

    # NaN marks "no sentence at this position".
    values = np.full((len(rows), width), np.nan)
    for k, row in enumerate(rows):
        if len(row):
            values[k, :len(row)] = row

    columns = ["column_cos_%s" % i for i in range(width)]
    df_vect = pd.DataFrame(values, index=df.index, columns=columns)

    if "target" in df.columns:
        df_vect["target"] = df["target"]
    return df_vect

In [63]:
# Expand the per-row similarity lists of the training frame into one column per sentence position.
train_vect = create_features(train_df)

In [66]:
# Preview the first rows of the training feature table.
train_vect.head()

Unnamed: 0,column_cos_0,column_cos_1,column_cos_2,column_cos_3,column_cos_4,column_cos_5,column_cos_6,column_cos_7,column_cos_8,column_cos_9,...,column_cos_18,column_cos_19,column_cos_20,column_cos_21,column_cos_22,column_cos_23,column_cos_24,column_cos_25,column_cos_26,target
0,0.697562,0.67801,0.650128,0.73902,,,,,,,...,,,,,,,,,,False
1,0.576485,0.621842,0.588227,0.589809,,,,,,,...,,,,,,,,,,False
2,0.533887,0.592701,0.647803,0.57086,,,,,,,...,,,,,,,,,,False
3,0.624988,0.622416,0.611707,0.635369,,,,,,,...,,,,,,,,,,False
4,0.708255,0.735227,0.70217,0.797786,,,,,,,...,,,,,,,,,,False


In [95]:
# Keep the first ten sentence positions, treat missing sentences as similarity 0,
# then turn similarity into distance (1 - similarity).
# NOTE: this overwrites train_vect, so re-running the cell on its own output
# flips the values back.
train_vect = 1 - train_vect.loc[:, "column_cos_0":"column_cos_9"].fillna(0)

In [91]:
# Confirm the dimensions of the training feature matrix.
train_vect.shape

(69596, 10)

In [92]:
# Expand the per-row similarity lists of the validation frame into one column per sentence position.
valid_vect = create_features(valid_df)

In [99]:
# Keep the first ten sentence positions, treat missing sentences as similarity 0,
# then turn similarity into distance (1 - similarity).
# NOTE: this overwrites valid_vect, so re-running the cell on its own output
# flips the values back.
valid_vect = 1 - valid_vect.loc[:, "column_cos_0":"column_cos_9"].fillna(0)

In [101]:
# Sanity check: list any training rows that contain a negative value.
train_vect[(train_vect < 0).any(axis=1)]

Unnamed: 0,column_cos_0,column_cos_1,column_cos_2,column_cos_3,column_cos_4,column_cos_5,column_cos_6,column_cos_7,column_cos_8,column_cos_9


In [109]:
def train_model(classifier, feature_vector_train, target_train, feature_vector_valid, target_valid):
    """Fit `classifier`, print a validation report, and return validation accuracy.

    Parameters
    ----------
    classifier : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    feature_vector_train, target_train : features and labels passed to ``fit``.
    feature_vector_valid, target_valid : features passed to ``predict`` and the
        labels the predictions are scored against.

    Returns
    -------
    The value of ``metrics.accuracy_score`` for the validation predictions.

    Side effects
    ------------
    Fits `classifier` in place and prints ``metrics.classification_report``.
    """
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, target_train)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    print(metrics.classification_report(target_valid, predictions))
    # scikit-learn metrics take (y_true, y_pred); use the same order as the report above.
    return metrics.accuracy_score(target_valid, predictions)

In [110]:
# Fit a default XGBoost classifier on the training features and score it on the validation split.
accuracy = train_model(xgboost.XGBClassifier(), train_vect, train_df['target'], valid_vect, valid_df['target'])
# The label names the model actually trained (it previously read "GLM").
print("XGBoost: ", accuracy)

             precision    recall  f1-score   support

      False       0.56      0.48      0.52      8700
       True       0.54      0.62      0.58      8700

avg / total       0.55      0.55      0.55     17400

GLM:  0.550459770115
