In [1]:
import pandas as pd
import json
import pickle
from numpy import linalg as LA
import numpy as np

from scipy import spatial
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# with open('training.json', "r") as f:
#     d = json.loads(f.read())

# train = pd.DataFrame.from_dict(d["data"])
# print(train.head(3))

## read data

- **Load preprocessed dataframes**

In [2]:
# Cached question table and context table for the training and development splits.
train, train_context = (
    pd.read_pickle("data/train.pickle"),
    pd.read_pickle("data/train_context.pickle"),
)
dev, dev_context = (
    pd.read_pickle("data/dev.pickle"),
    pd.read_pickle("data/dev_context.pickle"),
)

In [14]:
# Cached question table and context table for the test split.
test, test_context = (
    pd.read_pickle("data/test.pickle"),
    pd.read_pickle("data/test_context.pickle"),
)

In [5]:
# The question tables come out of read_train with a per-paragraph index that
# repeats, so give each split a unique 0..n-1 index.
# Fix: dev was previously rebuilt from `test`, which silently replaced the
# development split with the test data.
train = train.reset_index(drop=True)
dev = dev.reset_index(drop=True)

In [7]:
# Preview the first rows of the training question table.
train.head()

Unnamed: 0,answers,context_idx,id,is_impossible,plausible_answers,question
0,"[{'text': 'Dangerously in Love', 'answer_start...",0,56d43c5f2ccc5a1400d830ab,False,,What was the first album Beyoncé released as a...
1,"[{'text': 'Houston, Texas', 'answer_start': 166}]",0,56bf6b0f3aeaaa14008c9601,False,,In what city and state did Beyonce grow up?
2,"[{'text': 'Mathew Knowles', 'answer_start': 360}]",0,56bf6b0f3aeaaa14008c9605,False,,Who managed the Destiny's Child group?
3,"[{'text': 'late 1990s', 'answer_start': 276}]",0,56bf6b0f3aeaaa14008c9602,False,,In which decade did Beyonce become famous?
4,"[{'text': 'five', 'answer_start': 590}]",0,56d43c5f2ccc5a1400d830ad,False,,How many Grammy awards did Beyoncé win for her...


In [4]:
def read_train(filename):
    """Flatten a SQuAD-style JSON file into a context table and a question table.

    The file is read with ``pd.read_json`` and must have a top-level ``"data"``
    column. Each entry of that column holds a ``"paragraphs"`` list, and every
    paragraph supplies a ``"context"`` string and a ``"qas"`` list of records.

    Parameters
    ----------
    filename : str
        Path of the JSON file to read.

    Returns
    -------
    context_df : pandas.DataFrame
        One row per paragraph with a single ``"context"`` column. A paragraph's
        row position is its ``context_idx``.
    df : pandas.DataFrame
        All question records concatenated, each tagged with the ``context_idx``
        of the paragraph it came from. The per-paragraph index is kept, so index
        labels repeat; call ``reset_index(drop=True)`` for a unique index.
        Empty when the file contains no paragraphs.
    """
    raw = pd.read_json(filename)
    contexts = []
    qa_dfs = []
    for article in raw["data"]:
        for paragraph in article["paragraphs"]:  # each contains context, qas
            # columns in qa_df: answers, id, is_impossible, question, context_idx
            # (plus any other keys present in the qas records)
            qa_df = pd.DataFrame.from_records(paragraph["qas"])
            # The paragraph about to be appended will sit at this position.
            qa_df["context_idx"] = len(contexts)
            qa_dfs.append(qa_df)
            contexts.append(paragraph["context"])
    context_df = pd.DataFrame.from_dict({"context": contexts})
    if not qa_dfs:
        # pd.concat raises on an empty list; return an empty question table instead.
        return context_df, pd.DataFrame()
    # sort=False is stated explicitly: paragraphs can differ in their columns,
    # and older pandas warns about the changing default when it is left out.
    df = pd.concat(qa_dfs, sort=False)
    return context_df, df

In [12]:
def save_pickle(filename, obj):
    """Pickle ``obj`` to the file ``filename + ".pickle"``.

    Parameters
    ----------
    filename : str
        Destination path without the ``.pickle`` extension, which is appended.
    obj : object
        Any picklable object.
    """
    # The context manager closes the file even if pickle.dump raises.
    with open(filename + ".pickle", "wb") as pickle_file:
        pickle.dump(obj, pickle_file)

In [None]:
# Rebuild the training context/question tables from the raw JSON file.
# NOTE(review): the dev and test JSON files are read from "data/"; confirm that
# "training.json" really lives in the working directory.
train_context, train = read_train("training.json")
# Optional caching step (would write train_context.pickle / train.pickle to the
# working directory, not to data/):
# save_pickle("train_context", train_context)
# save_pickle("train", train)



In [9]:
# Build the development context/question tables from the raw JSON file.
dev_context, dev = read_train("data/development.json")

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [13]:
# Cache the development tables as data/dev_context.pickle and data/dev.pickle.
save_pickle("data/dev_context", dev_context)
save_pickle("data/dev", dev)

In [None]:
# Build the test context/question tables from the raw JSON file.
test_context, test = read_train("data/test.json")

In [None]:
# Number of question rows in the test table.
len(test)

In [None]:
# Sanity check: row counts of the training question and context tables,
# followed by a preview of the first question rows.
print("length of training is", len(train))
print("length of train context is ", len(train_context))
train.head()

In [None]:
# Sanity check: row counts of the test question and context tables,
# followed by a preview of the first question rows.
print("length of test is", len(test))
print("length of test context is ", len(test_context))
test.head()

## baseline system

In [None]:
def cosine_similarity(arr1, arr2):
    """Return the cosine similarity of two vectors.

    Parameters
    ----------
    arr1, arr2 : array_like
        Vectors with the same number of elements. Any shape is accepted
        (for example ``(n,)``, ``(1, n)`` or ``(n, 1)``); both are flattened
        before the comparison.

    Returns
    -------
    float
        ``1 - scipy.spatial.distance.cosine(arr1, arr2)``.
    """
    # scipy's cosine distance is documented for 1-D input, so row or column
    # matrices (such as the result of .toarray()) are flattened first.
    u = np.ravel(arr1)
    v = np.ravel(arr2)
    return 1 - spatial.distance.cosine(u, v)
#     return np.dot(arr1.T, arr2)/(LA.norm(arr1)*LA.norm(arr2))

In [None]:
# add a column with similarity score in df
def compute_sim(context_df, df):
    """Score every question by bag-of-words cosine similarity to its own context.

    For each row position ``i`` of ``context_df`` a fresh ``CountVectorizer``
    (English stop words removed) is fitted on that context alone. Each question
    whose ``context_idx`` equals ``i`` is then vectorized with the same
    vocabulary and compared to the context with ``cosine_similarity``.
    A question that shares no vocabulary word with its context scores 0.

    Parameters
    ----------
    context_df : pandas.DataFrame
        Has a ``"context"`` column and is addressed by labels ``0..len-1``.
    df : pandas.DataFrame
        Has ``"context_idx"`` and ``"question"`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df`` grouped by context in ascending
        ``context_idx`` order, with the previous index kept in an ``"index"``
        column, a float ``"similarity"`` column, and a fresh 0..n-1 index.
    """
    scored = []
    for i in range(len(context_df)):
        vectorizer = CountVectorizer(stop_words="english") # build a vectorizer for each context
        curr_context = [context_df.loc[i, "context"]]
        context_arr = vectorizer.fit_transform(curr_context).toarray()
        qas = df[df["context_idx"] == i].reset_index()
        scores = []
        for question in qas["question"]:
            q_vec = vectorizer.transform([question])
            if q_vec.sum() != 0:
                scores.append(cosine_similarity(q_vec.toarray(), context_arr))
            else:
                # No overlap with the context vocabulary: cosine is undefined, use 0.
                scores.append(0.0)
        # Assign the whole column at once as float; this avoids writing floats
        # cell by cell into an integer column and leaves the caller's df untouched.
        qas["similarity"] = pd.Series(scores, index=qas.index, dtype=float)
        scored.append(qas)
    res_df = pd.concat(scored)
    return res_df.reset_index(drop = True)

In [None]:
# Score every training question against its own context.
train_res = compute_sim(train_context, train)

In [None]:
# Display the scored training questions.
train_res

In [None]:
# Treat any missing (NaN) similarity as a score of 0.
train_res["similarity"] = train_res["similarity"].fillna(0)

In [None]:
from matplotlib import pyplot

# Split the similarity scores by whether the question is marked impossible.
true_sim = train_res.loc[train_res["is_impossible"] == True, "similarity"]
false_sim = train_res.loc[train_res["is_impossible"] == False, "similarity"]
# The bins start at 0.01, so scores below that are left out of the histogram.
bins = np.linspace(0.01, 1, 200)

# Overlay the two distributions on the same axes.
pyplot.hist(true_sim, bins, alpha=0.5, label='true')
pyplot.hist(false_sim, bins, alpha=0.5, label='false')
pyplot.xlabel('similarity score')
pyplot.ylabel('question count')
pyplot.legend(loc='upper right')
# pyplot.show()

In [None]:
# Summary statistics of the similarity score for questions marked impossible.
train_res[train_res["is_impossible"] == True]["similarity"].describe()

In [None]:
# Summary statistics of the similarity score for questions not marked impossible.
train_res[train_res["is_impossible"] == False]["similarity"].describe()

In [None]:
# Score every test question against its own context.
test_res = compute_sim(test_context, test)

In [None]:
# Display the scored test questions.
test_res

In [None]:
def predict_baseline(x, thred):
    """Binarize a score: 0 when ``x`` is strictly below ``thred``, otherwise 1."""
    return 0 if x < thred else 1

In [None]:
# Cut-off applied to the similarity score: below it predicts 0, otherwise 1.
threshold = 0.25
test_res["predicted"] = list(
    map(lambda score: predict_baseline(score, threshold), test_res["similarity"])
)

In [None]:
# Serialize the predictions as one JSON object mapping question id -> prediction.
# json.dumps replaces the previous bracket/quote string surgery, which would
# corrupt the output if an id ever contained '[', ']' or '",'.
# The separators reproduce the earlier layout exactly: {"id1": 0,"id2": 1}
output = json.dumps(
    {str(qid): int(pred) for qid, pred in zip(test_res["id"], test_res["predicted"])},
    separators=(",", ": "),
)

In [None]:
# Write the id/prediction pairs to a CSV file without the index column.
submission = test_res[["id", "predicted"]]
submission.to_csv("baseline_submission.csv", index = False)

In [None]:
# Write the JSON prediction string to disk.
# NOTE(review): the file is named dev_baseline.json but `output` is built from
# test_res; confirm which split this file is meant to hold.
with open('dev_baseline.json', 'w') as f:
    f.write(output)

## NER tag parsing

In [17]:
import spacy

In [18]:
# Load spaCy's small English pipeline; it is used below for entity tagging.
nlp = spacy.load('en_core_web_sm')

In [19]:
# Sample paragraph used below to try out spaCy's entity tagging.
context = 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.'

In [20]:
# Run the spaCy pipeline over the sample paragraph.
a = nlp(context)

In [21]:
# Materialize the parsed document's tokens as a list.
lst = list(a)

In [22]:
# Print each token next to its entity label; the label is empty for tokens
# that are not part of any entity.
for item in lst:
    print(item.text, '->', item.ent_type_) 

The -> 
Normans -> NORP
( -> 
Norman -> PERSON
: -> 
Nourmands -> 
; -> 
French -> NORP
: -> 
Normands -> 
; -> 
Latin -> LANGUAGE
: -> 
Normanni -> 
) -> 
were -> 
the -> 
people -> 
who -> 
in -> 
the -> DATE
10th -> DATE
and -> DATE
11th -> DATE
centuries -> DATE
gave -> 
their -> 
name -> 
to -> 
Normandy -> PERSON
, -> 
a -> 
region -> 
in -> 
France -> GPE
. -> 
They -> 
were -> 
descended -> 
from -> 
Norse -> PERSON
( -> 
" -> 
Norman -> WORK_OF_ART
" -> 
comes -> 
from -> 
" -> 
Norseman -> WORK_OF_ART
" -> 
) -> 
raiders -> 
and -> 
pirates -> 
from -> 
Denmark -> GPE
, -> 
Iceland -> GPE
and -> 
Norway -> GPE
who -> 
, -> 
under -> 
their -> 
leader -> 
Rollo -> PERSON
, -> 
agreed -> 
to -> 
swear -> 
fealty -> 
to -> 
King -> 
Charles -> PERSON
III -> PERSON
of -> 
West -> 
Francia -> 
. -> 
Through -> 
generations -> 
of -> 
assimilation -> 
and -> 
mixing -> 
with -> 
the -> 
native -> 
Frankish -> NORP
and -> 
Roman -> NORP
- -> NORP
Gaulish -> NORP
populations -> 
, ->

In [25]:
# Entity spans found in the sample paragraph (multi-token entities appear as one span).
a.ents

(Normans,
 Norman,
 French,
 Latin,
 the 10th and 11th centuries,
 Normandy,
 France,
 Norse,
 Norman,
 Norseman,
 Denmark,
 Iceland,
 Norway,
 Rollo,
 Charles III,
 Frankish,
 Roman-Gaulish,
 Carolingian,
 West Francia,
 Normans,
 the first half of the 10th century,
 centuries)

In [26]:
# Sample question about the paragraph; the commented line is an alternative to try.
# q_sample = "In what country is Normandy located"
q_sample = "Who is responsible for Normandy?"

In [27]:
# Run the spaCy pipeline over the sample question.
q = nlp(q_sample)

In [29]:
# Entity spans found in the sample question.
list(q.ents)

[Normandy]

## sentence embedding