In [1]:
import json
import logging
from collections import defaultdict
from copy import deepcopy

from stanfordcorenlp import StanfordCoreNLP

In [4]:
# NOTE: start the Stanford CoreNLP server and install the `stanfordcorenlp` package before running this notebook.
# Setup instructions: https://www.khalidalnajjar.com/setup-use-stanford-corenlp-server-python/

# example for running server:
# %%bash
# cd /Users/emily/workspace/research/spring2020/stanford-corenlp-full-2018-10-05
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 30000




In [2]:

class StanfordNLP:
    """Thin convenience wrapper around a ``StanfordCoreNLP`` client.

    Each instance method forwards its argument to the corresponding method of
    the client object stored in ``self.nlp``.
    """

    def __init__(self, host='http://localhost', port=9000):
        """Create the underlying client and the default annotation properties.

        Args:
            host: Server URL handed to ``StanfordCoreNLP``.
            port: Server port handed to ``StanfordCoreNLP``.
        """
        self.nlp = StanfordCoreNLP(host, port=port,
                                   timeout=30000)  # , quiet=False, logging_level=logging.DEBUG)
        # Properties sent along with every ``annotate`` request.
        self.props = {
            'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,depparse,dcoref,relation',
            'pipelineLanguage': 'en',
            'outputFormat': 'json'
        }

    def word_tokenize(self, sentence):
        """Return the client's ``word_tokenize`` result for ``sentence``."""
        return self.nlp.word_tokenize(sentence)

    def pos(self, sentence):
        """Return the client's ``pos_tag`` result for ``sentence``."""
        return self.nlp.pos_tag(sentence)

    def ner(self, sentence):
        """Return the client's ``ner`` result for ``sentence``."""
        return self.nlp.ner(sentence)

    def parse(self, sentence):
        """Return the client's ``parse`` result for ``sentence``."""
        return self.nlp.parse(sentence)

    def dependency_parse(self, sentence):
        """Return the client's ``dependency_parse`` result for ``sentence``."""
        return self.nlp.dependency_parse(sentence)

    def annotate(self, sentence):
        """Annotate ``sentence`` using ``self.props`` and decode the JSON reply."""
        return json.loads(self.nlp.annotate(sentence, properties=self.props))

    @staticmethod
    def tokens_to_dict(_tokens):
        """Index token records by their integer ``index`` field.

        Args:
            _tokens: Iterable of mappings, each providing the keys ``index``,
                ``word``, ``lemma``, ``pos`` and ``ner``.

        Returns:
            A ``defaultdict(dict)`` mapping ``int(token['index'])`` to a dict
            holding that token's ``word``, ``lemma``, ``pos`` and ``ner``.
            Any other keys of the input records are dropped.
        """
        # ``defaultdict`` comes from ``collections``; it must be imported at the
        # top of the notebook or this method fails with a NameError.
        kept_fields = ('word', 'lemma', 'pos', 'ner')
        tokens = defaultdict(dict)
        for token in _tokens:
            tokens[int(token['index'])] = {field: token[field] for field in kept_fields}
        return tokens


# Shared wrapper instance used by the cells below; built with the default
# host and port (http://localhost, 9000).
sNLP = StanfordNLP()
#     print("Annotate:", sNLP.annotate(text))

#     print("Parse:", sNLP.parse(text))
#     print("Dep Parse:", sNLP.dependency_parse(text))

In [3]:
# Quick check of the wrapper: POS-tag a short sentence (result shown below).
sNLP.pos('the wind is caused by the storms')

[('the', 'DT'),
 ('wind', 'NN'),
 ('is', 'VBZ'),
 ('caused', 'VBN'),
 ('by', 'IN'),
 ('the', 'DT'),
 ('storms', 'NNS')]

In [6]:
#write to textbook_context.txt for tony manov model
# import json
# import random

# texts = []
# f = open("textbook_context.txt","w") 
# with open('tqa_train_val_test/train/tqa_v1_train.json') as json_file:
#     data = json.load(json_file)
#     for lesson in data:
#         #one lesson
        
#         if len(texts)>=50:
#             break
#         #vocab
#         vocabs = list(lesson["adjunctTopics"]["Vocabulary"].keys())
        
#         if len(vocabs)!= 0 and random.uniform(0, 1)>0.5:
#             #topics
#             for tid in lesson["topics"]:
#                 if random.uniform(0, 1)>0.5:
#                     text = lesson["topics"][tid]['content']['text']
#                     texts.append(text)
#                     f.write(text)
#                     f.write('\n')
# f.close()

In [11]:
#tony and manov with vocab
# import json
# import random
# import csv


# text_ans = []
# f = open("textbook_context_vocab.txt","w") 
# with open('tqa_train_val_test/train/tqa_v1_train.json') as json_file:
#     data = json.load(json_file)
#     for lesson in data:
#         #one lesson
        
#         if len(text_ans)>=50:
#             break
#         #vocab
#         vocabs = list(lesson["adjunctTopics"]["Vocabulary"].keys())
        
#         if len(vocabs)!= 0 and random.uniform(0, 1)>0.5:
#             #topics
#             for tid in lesson["topics"]:
#                 text = lesson["topics"][tid]['content']['text']
#                 relevent_vocabs = []

#                 for vocab in vocabs:
#                     start = text.find(vocab)
#                     if start >=0: #vocab term found
#                         relevent_vocabs.append(vocab)
#                 if len(relevent_vocabs)>0 and random.uniform(0, 1)>0.5 and len(text_ans)<50: 
#                     text_ans.append((text, relevent_vocabs))

# data=[('smith, bob',2),('carol',3),('ted',4),('alice',5)]

# with open('textbook_context_vocab.csv','w') as f:
#     writer=csv.writer(f)
#     writer.writerow(['text','vocabs'])
#     for row in text_ans:
#         writer.writerow(row)


In [39]:

import json
import random


# Randomly sample up to 50 (passage, answer-spans) pairs from the TQA training
# lessons. An answer span is the list of token positions covered by a lesson
# vocabulary term that occurs verbatim in the passage.
text_vocabs = []

with open('tqa_train_val_test/train/tqa_v1_train.json') as json_file:
    data = json.load(json_file)

for lesson in data:
    if len(text_vocabs) >= 50:
        break
    vocabs = list(lesson["adjunctTopics"]["Vocabulary"].keys())
    # Lessons without vocabulary are skipped; each remaining lesson is kept
    # only when the random draw exceeds 0.5.
    if not vocabs or random.uniform(0, 1) <= 0.5:
        continue
    for tid in lesson["topics"]:
        text = lesson["topics"][tid]['content']['text']
        ansIdx = []
        for vocab in vocabs:
            char_start = text.find(vocab)
            if char_start < 0:
                # term does not occur in this passage
                continue
            char_stop = char_start + len(vocab)
            # Tokens before the match give the first token position; tokens in
            # the match give the span length. token_stop itself is exclusive.
            first_token = len(sNLP.word_tokenize(text[:char_start]))
            token_stop = first_token + len(sNLP.word_tokenize(text[char_start:char_stop]))
            ansIdx.append(list(range(first_token, token_stop)))
        if ansIdx and random.uniform(0, 1) > 0.5 and len(text_vocabs) < 50:
            text_vocabs.append((text, ansIdx))

# Tokenized passage paired with only the first answer span of each sample.
context_ans = [(sNLP.word_tokenize(passage), spans[0]) for passage, spans in text_vocabs]

# print(json.dumps(person_dict, indent = 4, sort_keys=True))

In [56]:
# One (tokenized passage, answer span) pair for every answer span of every
# sampled passage; the passage is tokenized once per span.
context_ans = [
    (sNLP.word_tokenize(passage), span)
    for passage, spans in text_vocabs
    for span in spans
]
len(context_ans)

95

In [60]:
def answer_to_idx(context, answer):
    """Return the token positions that ``answer`` occupies inside ``context``.

    The first occurrence of ``answer`` is located with ``str.find``. The text
    in front of the match and the matched text are each tokenized with
    ``sNLP.word_tokenize``; the first count is the starting token position and
    the second is the number of tokens in the span.

    Example inputs:
        context = "But when the Silk Road, the long overland trading route from China to the Mediterranean, became costlier and more dangerous to travel, Europeans searched for a more efficient and inexpensive trade route over water, initiating the development of what we now call the Atlantic World."
        answer = "Europeans searched for a more efficient and inexpensive trade route over water"

    Returns:
        A list of consecutive token positions, or ``None`` when ``answer``
        does not occur in ``context``.
    """
    char_start = context.find(answer)
    if char_start < 0:
        # answer text not present in the context
        return None
    char_stop = char_start + len(answer)
    first_token = len(sNLP.word_tokenize(context[:char_start]))
    span_length = len(sNLP.word_tokenize(context[char_start:char_stop]))
    # The stop value of the range is exclusive.
    return list(range(first_token, first_token + span_length))
        


In [80]:
#Tony and Manov model: given csv of context, modelanswer
def preprocess_csv_context_answer(filename):
    """Load (context, model answer) rows from a CSV and locate each answer.

    Args:
        filename: Path of a CSV file with exactly two columns; they are
            renamed to ``Context`` and ``modelanswer`` after loading.

    Returns:
        A pair ``(text_ansidx, tokenized_text_ansidx)``:
        * ``text_ansidx`` - list of ``(context, [ansIdx])`` tuples.
        * ``tokenized_text_ansidx`` - list of
          ``(sNLP.word_tokenize(context), ansIdx)`` tuples.
        Rows for which ``answer_to_idx`` returns ``None`` are left out of both
        lists.
    """
    import pandas as pd

    df = pd.read_csv(filename)
    # Rename the columns once, up front, instead of on every loop iteration.
    df.columns = ['Context', 'modelanswer']
    text_ansidx = []
    tokenized_text_ansidx = []
    for c, a in zip(df['Context'], df['modelanswer']):
        ansIdx = answer_to_idx(c, a)
        print(ansIdx)
        if ansIdx is not None:  # the answer occurs in the context
            text_ansidx.append((c, [ansIdx]))
            tokenized_text_ansidx.append((sNLP.word_tokenize(c), ansIdx))
    return text_ansidx, tokenized_text_ansidx

# Build both views from effect.csv. Note that this rebinds `context_ans`.
text_ans, context_ans = preprocess_csv_context_answer('effect.csv')

[48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61]
[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61]
[30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]
[27, 28]
[27, 28]
[30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]
[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
[5, 6]
[28, 29]
[31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41]
[31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41]
[48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61]
[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]


In [81]:
# Save context_ans with IPython's %store magic, then remove the name from the
# current namespace.
%store context_ans
del context_ans

Stored 'context_ans' (list)


In [82]:

# Given text_ans, a list of (text, ansIdxs) pairs, build dataAns: a list of
# dictionaries (ex), one per answer, each containing:
# document
# pos
# ner
# doc_case
# ansInd

dataAns = []
for text, ansIdxs in text_ans:
    # Three separate requests for the same text: tokens, POS tags, NER tags.
    document = sNLP.word_tokenize(text)
    pos = [pair[1] for pair in sNLP.pos(text)]
    ner = [pair[1] for pair in sNLP.ner(text)]

    ex = {'document': document, 'pos': pos, 'ner': ner}

    # Case feature: 'U' for alphabetic tokens that are not all lowercase,
    # 'L' for everything else (including non-alphabetic tokens).
    ex['doc_case'] = ['U' if (w.isalpha() and not w.islower()) else 'L'
                      for w in document]

    # Answer spans come from text_ans. An earlier variant, since disabled,
    # derived them from runs of identical non-'O' NER tags instead.
    # Build one indicator sequence per span: 'A' at answer positions, '-'
    # everywhere else.
    ansInds = []
    for span in ansIdxs:
        marks = ['-'] * len(ner)
        for j in span:
            marks[j] = 'A'
        ansInds.append(marks)

    # Emit one independent copy of the example per answer span.
    for marks in ansInds:
        record = deepcopy(ex)
        record['ansInd'] = marks
        dataAns.append(record)
    
#     print("finding answers completed")
    
    #end

    
    

    



finding answers completed
finding answers completed
finding answers completed
finding answers completed
finding answers completed
finding answers completed
finding answers completed
finding answers completed
finding answers completed
finding answers completed
finding answers completed
finding answers completed
finding answers completed
finding answers completed
finding answers completed
finding answers completed
finding answers completed


In [83]:
# append case, pos, ner, ansInd as features to context
# One line per example: every token is written as
# word￨case￨pos￨ner￨ansInd, tokens separated by single spaces.
# The `with` block closes the file, so no explicit close() is needed.
with open("inputformat.txt", 'wb') as f:
    for ex in dataAns:
        featured_tokens = []
        for idx in range(len(ex['document'])):
            word = ex['document'][idx].replace(' ', '').lower()
            featured_tokens.append('￨'.join((word, ex['doc_case'][idx], ex['pos'][idx],
                                             ex['ner'][idx], ex['ansInd'][idx])))
        encoded = ' '.join(featured_tokens).encode('utf-8').strip()
        f.write(encoded + b'\n')


In [84]:
#answer ||| context in readable format
# One line per example: the answer tokens, the ' ||| ' separator, then the
# lowercased context tokens, all joined with single spaces.
# The `with` block closes the file, so no explicit close() is needed.
with open("readable.txt", 'wb') as f:
    for ex in dataAns:
        answer_tokens = [ex['document'][i]
                         for i, flag in enumerate(ex['ansInd']) if flag == 'A']
        context_tokens = [tok.replace(' ', '').lower() for tok in ex['document']]
        joined = ' '.join(answer_tokens + [' ||| '] + context_tokens)
        f.write(joined.encode('utf-8').strip() + b'\n')

### read output_questions

In [23]:
#read output
import csv

# Read both inputs completely and close their handles right away; only the
# resulting lists of byte lines are used afterwards.
with open("output_questions_QG-Net.pt.txt.prob.txt", 'rb') as questions_file:
    questions = questions_file.readlines()
with open("readable.txt", 'rb') as ans_context_file:
    ans_context = ans_context_file.readlines()
# NOTE(review): scores.csv is opened (and truncated) here, but nothing in this
# cell writes to it or closes it. It is left open in case a later cell uses it
# - confirm, and close it once writing is done.
scores = open("scores.csv", 'wb')


# Show the "answer ||| context" split for the first three examples.
for i in range(3):
    row = []
    row.append(questions[i])

    print(ans_context[i].split(('|||').encode('utf-8')))
    


[b'Europeans searched for a more efficient and inexpensive trade route over water ,  ', b'  globalization , the ever-increasing interconnectedness of the world , is not a new phenomenon , but it accelerated when western europeans discovered the riches of the east . during the crusades ( 1095 \xe2\x80\x93 1291 ) , europeans developed an appetite for spices , silk , porcelain , sugar , and other luxury items from the east , for which they traded fur , timber , and slavic people they captured and sold ( hence the word slave ) . but when the silk road , the long overland trading route from china to the mediterranean , became costlier and more dangerous to travel , europeans searched for a more efficient and inexpensive trade route over water , initiating the development of what we now call the atlantic world .\n']
[b'Mistakenly believing they had reached the East Indies ,  ', b'  in pursuit of commerce in asia , fifteenth-century traders unexpectedly encountered a \xe2\x80\x9c new world \x