In [None]:
import json
# Load the development split into `data`. The encoding is pinned so the read
# does not depend on the platform's default locale encoding.
with open("development.json", encoding="utf-8") as datafile:
  data = json.load(datafile)

In [None]:
import torch
from models import InferSent
# InferSent version; it selects the checkpoint file and is passed to the model.
V = 2
MODEL_PATH = 'encoder/infersent{}.pkl'.format(V)

# Constructor arguments handed to InferSent as a single dict.
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=V,
)

infersent = InferSent(params_model)
# NOTE(review): torch.load unpickles the file, so MODEL_PATH must point at a
# trusted checkpoint.
infersent.load_state_dict(torch.load(MODEL_PATH))

In [None]:
import nltk
# Tokenizer models used by nltk.word_tokenize / nltk.sent_tokenize.
nltk.download('punkt')
# Tagger model used by nltk.pos_tag in the feature-extraction cells; without it
# pos_tag raises LookupError on a fresh environment.
# NOTE(review): recent NLTK releases may name these resources 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' - confirm against the installed version.
nltk.download('averaged_perceptron_tagger')

# Point InferSent at the word vectors and restrict its vocabulary to K words.
W2V_PATH = 'dataset/fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)
infersent.build_vocab_k_words(K=100000)

In [None]:
import json
import nltk
from collections import defaultdict
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

training_json = {}
development_json = {}
test_json = {}
training_features = {}
training_labels = {}

# Column order of every feature vector: 18 spaCy entity-label counts for the
# context, then question-word flags, then three overlap fractions.
# Every name here must match a key produced by _question_feature_vector; a name
# that is never produced silently becomes a constant 0 column because Counter
# returns 0 for missing keys. ('where' and 'percentage NN' are computed per
# question, so they are listed; 'percentage NNPS' is never computed.)
feats = ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE", "DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL", "what", "when", "where", "how", "many", "why", "who", 'percentage NNP', 'percentage NN', 'percentage CD']

training_dict = {}
development_dict = {}
test_dict = {}

# Helpers are defined in this cell so it does not rely on functions from other cells.
_QUESTION_WORDS = ("what", "when", "where", "how", "many", "why", "who")


def _overlap_fraction(tokens, context):
    """Return the fraction of `tokens` that occur in `context`.

    The check is a case-sensitive substring test (`token in context`).
    Returns 0 when `tokens` is empty.
    """
    if not tokens:
        return 0
    hits = sum(1 for token in tokens if token in context)
    return float(hits) / len(tokens)


def _question_feature_vector(question, context, ner_counts):
    """Build one feature vector, ordered by `feats`, for a question/context pair.

    question   -- question text
    context    -- paragraph text the question refers to
    ner_counts -- Counter of entity labels found in `context`
    """
    words = nltk.word_tokenize(question)
    lowered = {word.lower() for word in words}

    # Copy so question-level keys never leak into the shared per-paragraph counts.
    features = Counter(ner_counts)
    for question_word in _QUESTION_WORDS:
        features[question_word] = int(question_word in lowered)

    tagged = nltk.pos_tag(words)
    features['percentage NNP'] = _overlap_fraction(
        [word for word, pos in tagged if pos == 'NNP'], context)
    features['percentage NN'] = _overlap_fraction(
        [word for word, pos in tagged if pos == 'NN'], context)
    features['percentage CD'] = _overlap_fraction(
        [word for word, pos in tagged if pos == 'CD'], context)

    return [features[name] for name in feats]


####TRAINING PREPROCESSING - Creating feature vectors
with open('training.json', 'r', encoding='utf-8') as f:
    training_json = json.load(f)

for article in training_json['data']:
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        # Entity-label counts depend only on the context, so compute them once
        # per paragraph and reuse them for each of its questions.
        ner_counts = Counter(ent.label_ for ent in nlp(context).ents)
        questions = {}
        for qa in paragraph['qas']:
            qid = qa['id']
            questions[qid] = {qa['question']: qa['is_impossible']}
            training_features[qid] = _question_feature_vector(
                qa['question'], context, ner_counts)
            training_labels[qid] = int(qa['is_impossible'])

        training_dict[context] = questions

with open('training_features.json', 'w') as fp:
    json.dump(training_features, fp)
with open('training_labels.json', 'w') as fp:
    json.dump(training_labels, fp)

In [None]:
#### PLANNED SENTENCE EMBEDDING
# Collect every context sentence and every question from the training set, then
# store one InferSent embedding per distinct string.
sentence_dict = {}
sentences = []
for article in training_json['data']:
    for paragraph in article['paragraphs']:
        # Only `nltk` itself is imported in this notebook, so the tokenizer is
        # referenced through the package.
        sentences.extend(nltk.sent_tokenize(paragraph['context']))
        sentences.extend(qa['question'] for qa in paragraph['qas'])

for text in sentences:
    # The dict is keyed by the text, so a repeated sentence would only
    # overwrite its own entry; skip the redundant encode call.
    if text in sentence_dict:
        continue
    embedding = infersent.encode([text], tokenize=True)
    # json.dump cannot serialise array objects; convert to nested lists.
    # NOTE(review): assumes encode() returns a NumPy array (or another object
    # with .tolist()) - confirm against the InferSent version in use.
    sentence_dict[text] = embedding.tolist()

with open('sentence_embed.json', 'w') as fp:
    json.dump(sentence_dict, fp)

In [None]:
####DEVELOPMENT PREPROCESSING
# Helpers are defined in this cell so it does not rely on functions from other cells.
_QUESTION_WORDS = ("what", "when", "where", "how", "many", "why", "who")


def _overlap_fraction(tokens, context):
    """Return the fraction of `tokens` that occur in `context`.

    The check is a case-sensitive substring test (`token in context`).
    Returns 0 when `tokens` is empty.
    """
    if not tokens:
        return 0
    hits = sum(1 for token in tokens if token in context)
    return float(hits) / len(tokens)


def _question_feature_vector(question, context, ner_counts):
    """Build one feature vector, ordered by `feats`, for a question/context pair.

    question   -- question text
    context    -- paragraph text the question refers to
    ner_counts -- Counter of entity labels found in `context`
    """
    words = nltk.word_tokenize(question)
    lowered = {word.lower() for word in words}

    # Copy so question-level keys never leak into the shared per-paragraph counts.
    features = Counter(ner_counts)
    for question_word in _QUESTION_WORDS:
        features[question_word] = int(question_word in lowered)

    tagged = nltk.pos_tag(words)
    features['percentage NNP'] = _overlap_fraction(
        [word for word, pos in tagged if pos == 'NNP'], context)
    features['percentage NN'] = _overlap_fraction(
        [word for word, pos in tagged if pos == 'NN'], context)
    features['percentage CD'] = _overlap_fraction(
        [word for word, pos in tagged if pos == 'CD'], context)

    # Names in `feats` that were never set come out as 0 (Counter default).
    return [features[name] for name in feats]


def _featurize_split(dataset, with_answerability):
    """Extract feature vectors for every question in a SQuAD-style dict.

    dataset            -- parsed JSON with data -> paragraphs -> qas nesting
    with_answerability -- True: store {question: is_impossible} per question id;
                          False: store only the question text (the test split
                          is not read for 'is_impossible')

    Returns (features_by_qid, questions_by_context).
    """
    features_by_qid = {}
    questions_by_context = {}
    for article in dataset['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            # Entity-label counts depend only on the context: once per paragraph.
            ner_counts = Counter(ent.label_ for ent in nlp(context).ents)
            questions = {}
            for qa in paragraph['qas']:
                qid = qa['id']
                question = qa['question']
                if with_answerability:
                    questions[qid] = {question: qa['is_impossible']}
                else:
                    questions[qid] = question
                features_by_qid[qid] = _question_feature_vector(
                    question, context, ner_counts)
            questions_by_context[context] = questions
    return features_by_qid, questions_by_context


development_features = {}
test_features = {}

with open('development.json', 'r', encoding='utf-8') as f:
    development_json = json.load(f)

development_features, _dev_questions = _featurize_split(
    development_json, with_answerability=True)
# development_dict is created in the training cell; fill it in place.
development_dict.update(_dev_questions)

with open('development_features.json', 'w') as fp:
    json.dump(development_features, fp)


####TESTING
with open('test.json', 'r', encoding='utf-8') as f:
    test_json = json.load(f)

test_features, _test_questions = _featurize_split(
    test_json, with_answerability=False)
# test_dict is created in the training cell; fill it in place.
test_dict.update(_test_questions)

with open('test_features.json', 'w') as fp:
    json.dump(test_features, fp)

In [None]:
## Finding the root
from nltk import Tree
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()

import json

import en_core_web_sm
nlp = en_core_web_sm.load()

with open('training.json', 'r', encoding='utf-8') as f:
    training_json = json.load(f)

# question id -> 1 if the stemmed root of the question's first sentence equals
# the stemmed head of some noun chunk in the (lower-cased) context, else 0.
question_root_dict = {}
for article in training_json['data']:
    for paragraph in article['paragraphs']:
        context_doc = nlp(paragraph['context'].lower())
        # The context-side stems do not depend on the question, so build them
        # once per paragraph instead of once per question and sentence.
        context_heads = {
            st.stem(chunk.root.head.text.lower())
            for sent in context_doc.sents
            for chunk in sent.noun_chunks
        }
        for qa in paragraph['qas']:
            qid = qa['id']
            question_doc = nlp(qa['question'].lower())
            # If the question has several sentences, only the first is used.
            first_sentence = next(iter(question_doc.sents), None)
            if first_sentence is None:
                # Empty/whitespace question: there is no root to compare.
                question_root_dict[qid] = 0
                continue
            qroot = st.stem(str(first_sentence.root))
            question_root_dict[qid] = int(qroot in context_heads)

with open('question_root_train.json', 'w') as fp:
    json.dump(question_root_dict, fp)

In [None]:
#### Reloading features
def _load_json(path):
    """Read and parse the JSON file at `path`."""
    with open(path, 'r') as f:
        return json.load(f)


# Load straight into the names the later cells read. The raw dataset variables
# (training_json / test_json) are deliberately left untouched so cells that
# index them by 'data' still work after this cell has run.
training_features = _load_json('training_features.json')
test_features = _load_json('test_features.json')
question_root_dict = _load_json('question_root_train.json')

# These two are written next to the files above by the preprocessing cells and
# are read by later cells; reloading them lets the notebook resume from here
# without re-running preprocessing.
training_labels = _load_json('training_labels.json')
development_features = _load_json('development_features.json')

In [None]:
# Attempt 2 -> mixing NER counts and removing extraneous ones
# Positions refer to the order of `feats`: columns 0-17 are entity-label
# counts, which are merged into six groups (7, 8, 9 and 16 are dropped);
# columns 18 onward are carried over unchanged.
new_training_features = {}
for qid, vector in training_features.items():
    merged = [
        vector[0],
        vector[1] + vector[3],
        vector[4] + vector[5],
        vector[2] + vector[6] + vector[10],
        vector[11] + vector[12],
        vector[13] + vector[14] + vector[15] + vector[17],
    ]
    merged.extend(vector[idx] for idx in range(18, len(feats)))
    new_training_features[qid] = merged

In [None]:
# Attempt 3 - only keeping the last 3 features
# new_training_features = {}
# for key in training_features:
#     features = training_features[key]
#     new_features = features[-3:]
#     new_training_features[key] = new_features

# Append the question-root flag as the last column of each training vector.
# The lists are extended in place, so running this cell twice without
# rebuilding new_training_features appends the flag twice.
for qid in training_features:
    vector = new_training_features[qid]
    vector.append(question_root_dict[qid])

In [None]:
# Design matrix and targets for logistic regression, in training_features key order.
# The stored label is 'is_impossible', so the target is its negation.
X = [new_training_features[qid] for qid in training_features]
y = [int(not training_labels[qid]) for qid in training_features]

In [None]:
# Implementation of Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# X_std = scaler.fit_transform(X)

# Logistic regression on the unscaled feature vectors, with a fixed seed.
clf = LogisticRegression(solver='lbfgs', random_state=0)

# `model` is the value returned by fit(); later cells predict with it.
model = clf.fit(X, y)

In [None]:
# Creating root boolean for test set
with open('test.json', 'r', encoding='utf-8') as f:
    testing_json = json.load(f)

# question id -> 1 if the stemmed root of the question's first sentence equals
# the stemmed head of some noun chunk in the (lower-cased) context, else 0.
question_root_dict_test = {}
for article in testing_json['data']:
    for paragraph in article['paragraphs']:
        context_doc = nlp(paragraph['context'].lower())
        # The context-side stems do not depend on the question, so build them
        # once per paragraph instead of once per question and sentence.
        context_heads = {
            st.stem(chunk.root.head.text.lower())
            for sent in context_doc.sents
            for chunk in sent.noun_chunks
        }
        for qa in paragraph['qas']:
            qid = qa['id']
            question_doc = nlp(qa['question'].lower())
            # If the question has several sentences, only the first is used.
            first_sentence = next(iter(question_doc.sents), None)
            if first_sentence is None:
                # Empty/whitespace question: there is no root to compare.
                question_root_dict_test[qid] = 0
                continue
            qroot = st.stem(str(first_sentence.root))
            question_root_dict_test[qid] = int(qroot in context_heads)

with open('question_root_test.json', 'w') as fp:
    json.dump(question_root_dict_test, fp)

In [None]:
# Matching attempts 2+3 for the dev/test sets
import csv

def _merge_ner_counts(vector):
    """Return the 'attempt 2' layout of a raw feature vector.

    Positions refer to the order of `feats`: columns 0-17 are entity-label
    counts, merged into six groups (7, 8, 9 and 16 are dropped); columns 18
    onward are carried over unchanged. A new list is returned.
    """
    merged = [
        vector[0],
        vector[1] + vector[3],
        vector[4] + vector[5],
        vector[2] + vector[6] + vector[10],
        vector[11] + vector[12],
        vector[13] + vector[14] + vector[15] + vector[17],
    ]
    merged.extend(vector[idx] for idx in range(18, len(feats)))
    return merged


# Dev vectors keep only the last three columns ('attempt 3').
# NOTE(review): this layout does not match the vectors the classifier was
# fitted on (merged NER columns plus the root flag), and no root flag is
# computed for the dev split - rebuild these before enabling dev predictions.
new_dev_features = {qid: vector[-3:] for qid, vector in development_features.items()}

# dev_predictions = {}
# for key in development_features:
#     val = int(model.predict([new_dev_features[key]]))
#     dev_predictions[key] = val

# with open('dev_predictions.json', 'w') as fp:
#     json.dump(dev_predictions, fp)

# Test vectors: merged NER layout plus the question-root flag as last column,
# i.e. the same layout as the training vectors.
new_test_features = {}
for qid, vector in test_features.items():
    merged = _merge_ner_counts(vector)
    merged.append(question_root_dict_test[qid])
    new_test_features[qid] = merged

# `predictions` has to exist before it is filled.
predictions = {}
test_ids = list(new_test_features)
if test_ids:
    # One batched call instead of one predict() per row; int() is then applied
    # to scalars rather than to 1-element arrays.
    predicted = model.predict([new_test_features[qid] for qid in test_ids])
    predictions = {qid: int(label) for qid, label in zip(test_ids, predicted)}

# Creating prediction csv
with open('preds_p4.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile)
    spamwriter.writerow(["Category", "Id"])
    for qid, label in predictions.items():
        spamwriter.writerow([label, qid])
    