### causal preprocessing

In [10]:
from stanfordcorenlp import StanfordCoreNLP
import numpy as np
import logging
import json
from copy import deepcopy
import pandas as pd
import numpy as np
from nltk.tree import *
from nltk import RegexpParser
from nltk import tokenize
from string import punctuation
# Show full cell contents when displaying DataFrames. `None` means "no limit";
# the legacy sentinel -1 was deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

In [4]:
# PREREQUISITE: install the `stanfordcorenlp` package and start a Stanford CoreNLP
# server before running the cells below. Setup instructions:
# https://www.khalidalnajjar.com/setup-use-stanford-corenlp-server-python/

# example for running server:
# %%bash
# cd /Users/emily/workspace/research/spring2020/stanford-corenlp-full-2018-10-05
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 30000

In [2]:
class StanfordNLP:
    """Thin wrapper around a `StanfordCoreNLP` client.

    Exposes tokenization, POS tagging and NER by delegating to the client
    stored in `self.nlp`.
    """

    def __init__(self, host='http://localhost', port=9000):
        """Create the client for the CoreNLP server at `host`:`port`.

        NOTE(review): this presumably requires the server to be running
        already -- confirm against the stanfordcorenlp documentation.
        """
        self.nlp = StanfordCoreNLP(host, port=port,
                                   timeout=30000)  # , quiet=False, logging_level=logging.DEBUG)
        # Annotation properties. NOTE(review): none of the methods in this class
        # pass `self.props` to the client, so it appears to be unused here.
        self.props = {
            'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,depparse,dcoref,relation',
            'pipelineLanguage': 'en',
            'outputFormat': 'json'
        }
    def word_tokenize(self, sentence):
        """Return the client's tokenization of `sentence`."""
        return self.nlp.word_tokenize(sentence)

    def pos(self, sentence):
        """Return the client's POS tagging of `sentence`.

        Callers in this notebook read the tag as element [1] of each returned item.
        """
        return self.nlp.pos_tag(sentence)

    def ner(self, sentence):
        """Return the client's NER tagging of `sentence`.

        Callers in this notebook read the tag as element [1] of each returned item.
        """
        return self.nlp.ner(sentence)
# Module-level client shared by the helper functions and cells below.
sNLP = StanfordNLP()

In [8]:
def replace_cue(context, cue_before, cue_after) -> str:
    """Return `context` with every occurrence of `cue_before` swapped for `cue_after`."""
    rewritten = context.replace(cue_before, cue_after)
    return rewritten

def answer_str(context, cue, comp) -> str:
    """Extract the text on one side of `cue` in the first sentence that contains it.

    The sentence is split on `cue` and piece number `comp` is returned:
    0 is the text before the cue, 1 the text after it. Surrounding whitespace
    and then surrounding punctuation are stripped from the piece.
    Returns None when no sentence of `context` contains `cue`.
    """
    cue_sentence = next((s for s in tokenize.sent_tokenize(context) if cue in s), None)
    if cue_sentence is None:
        return None
    pieces = cue_sentence.split(cue)
    return pieces[comp].strip().strip(punctuation)
        
def make_shorter(context, cue, ambig):
    """Reduce `context` to its causal sentence when that sentence is self-contained.

    The causal sentence is the first sentence of `context` containing `cue`.
    If none of its word tokens is listed in `ambig` (ambiguous pronouns), the
    surrounding sentences are dropped and only that sentence is returned.

    Returns:
        (text, made_shorter): `(sentence, True)` when the context was shortened,
        otherwise `(context, False)`. The unshortened pair is also returned when
        no sentence contains `cue`, so callers can always index the result.
    """
    for sent in tokenize.sent_tokenize(context):
        if cue not in sent:
            continue
        # sent is the causal sentence
        words = tokenize.word_tokenize(sent)
        if any(w in ambig for w in words):
            # An ambiguous pronoun needs the neighbouring sentences for its referent.
            return context, False
        return sent, True
    # No sentence contains the cue: nothing to shorten. Previously this fell
    # through and returned None, which broke callers indexing [0] / [1].
    return context, False

def find_cue(context, cuewords) ->str:
    """Return the first entry of `cuewords` that occurs in `context`, or None if none does."""
    return next((candidate for candidate in cuewords if candidate in context), None)

def cue_sent(context, cue):
    """Return the first sentence of `context` that contains `cue` (the causal sentence).

    Returns None when no sentence contains the cue.
    """
    matching_sentences = (s for s in tokenize.sent_tokenize(context) if cue in s)
    return next(matching_sentences, None)
        
def answer_to_idx(context, answer):
    """Return the token positions occupied by `answer` inside `context` (case sensitive).

    The first character-level occurrence of `answer` is located, then the text
    before it and the answer itself are tokenized separately with
    `sNLP.word_tokenize`. The result is the list of consecutive token indices
    starting at the number of tokens preceding the answer; the end index is
    exclusive, as with `range`. Returns None when `answer` does not occur.

    Example inputs:
        context = "But when the Silk Road, the long overland trading route from China to
                   the Mediterranean, became costlier and more dangerous to travel,
                   Europeans searched for a more efficient and inexpensive trade route
                   over water, initiating the development of what we now call the
                   Atlantic World."
        answer  = "Europeans searched for a more efficient and inexpensive trade route
                   over water"
    """
    char_start = context.find(answer)
    if char_start < 0:
        # answer text not found in the context
        return None
    char_end = char_start + len(answer)
    tokens_before = sNLP.word_tokenize(context[:char_start])
    answer_tokens = sNLP.word_tokenize(context[char_start:char_end])
    first_token = len(tokens_before)
    return list(range(first_token, first_token + len(answer_tokens)))
    

In [93]:
ambiguous_pronouns = ['That','Thats','This','It','Its','These','Those']
cuewords = ['because of', 'as a result of', 'due to']
anschoice = ['cause', 'effect']
contextchoice = ['because of','as a result of', 'due to', 'shorter']

# The input holds one context paragraph per line. Read it line by line:
# pd.read_csv(..., sep='\n') is rejected by current pandas releases.
# Blank lines are skipped, as read_csv did by default.
with open('context_causal.txt', encoding='utf-8') as context_file:
    context_lines = [line.rstrip('\r\n') for line in context_file]
C = pd.DataFrame({'c': [line for line in context_lines if line]})

# CASE SENSITIVE: keep only the contexts that contain one of the cue phrases.
C['cue'] = [find_cue(c, cuewords) for c in C['c']]
C = C[C['cue'].notna()].copy()  # .copy() so later column assignments hit a real frame

# Causal sentence = first sentence containing the cue.
C['sent'] = [cue_sent(c, cue) for c, cue in zip(C['c'], C['cue'])]
# Get rid of causal sentences that mention a Figure. na=True also drops rows whose
# causal sentence could not be isolated, which would otherwise break `~` and the
# cause/effect extraction below.
C = C[~C['sent'].str.contains('Figure', na=True)].copy()

# Text after the cue is the cause, text before it is the effect.
C['cause'] = [answer_str(c, cue, 1) for c, cue in zip(C['c'], C['cue'])]
C['effect'] = [answer_str(c, cue, 0) for c, cue in zip(C['c'], C['cue'])]

# One column per cue phrase: the context rewritten to use that cue instead of its own.
for target_cue in cuewords:
    C[target_cue] = [replace_cue(c, cue, target_cue) for c, cue in zip(C['c'], C['cue'])]

# Evaluate make_shorter once per row and unpack both parts of its result.
shortened = [make_shorter(c, cue, ambiguous_pronouns) for c, cue in zip(C['c'], C['cue'])]
C['madeshorter'] = [was_shortened for _, was_shortened in shortened]
C['shorter'] = [text for text, _ in shortened]

C = C.reset_index(drop=True)
C.to_csv('causal.csv')
C.head(1)

Unnamed: 0,c,cue,sent,cause,effect,because of,as a result of,due to,madeshorter,shorter
0,"Streams often start in mountains, where the land is very steep. You can see an example in Figure 10.4. A mountain stream flows very quickly because of the steep slope. This causes a lot of erosion and very little deposition. The rapidly falling water digs down into the stream bed and makes it deeper. It carves a narrow, V-shaped channel.",because of,A mountain stream flows very quickly because of the steep slope.,the steep slope,A mountain stream flows very quickly,"Streams often start in mountains, where the land is very steep. You can see an example in Figure 10.4. A mountain stream flows very quickly because of the steep slope. This causes a lot of erosion and very little deposition. The rapidly falling water digs down into the stream bed and makes it deeper. It carves a narrow, V-shaped channel.","Streams often start in mountains, where the land is very steep. You can see an example in Figure 10.4. A mountain stream flows very quickly as a result of the steep slope. This causes a lot of erosion and very little deposition. The rapidly falling water digs down into the stream bed and makes it deeper. It carves a narrow, V-shaped channel.","Streams often start in mountains, where the land is very steep. You can see an example in Figure 10.4. A mountain stream flows very quickly due to the steep slope. This causes a lot of erosion and very little deposition. The rapidly falling water digs down into the stream bed and makes it deeper. It carves a narrow, V-shaped channel.",True,A mountain stream flows very quickly because of the steep slope.


### The cells below prepare input for QG-Net; skip them if QG-Net is not needed

In [27]:
def preprocess_df(df, ans, context):
    """Pair each context with the token indices of its answer.

    Args:
        df: DataFrame holding the context and answer columns.
        ans: name of the answer column, e.g. 'cause' or 'effect'.
        context: name of the context column, e.g. 'because of' or 'shorter'.

    Returns:
        A list of `(context_text, [answer_token_indices])` tuples, one per row
        whose answer was located in its context by `answer_to_idx`. Rows with a
        missing (non-string) context or answer, or whose answer is not found,
        are skipped.
    """
    text_ansidx = []
    # Iterate positionally so the function does not depend on df having a 0..n-1 index.
    for c, a in zip(df[context], df[ans]):
        if not isinstance(c, str) or not isinstance(a, str):
            continue  # missing value (None/NaN): nothing to locate
        ansIdx = answer_to_idx(c, a)
        if ansIdx is not None:  # answer exists in the context
            text_ansidx.append((c, [ansIdx]))
    return text_ansidx

# Gather the (context, [answer indices]) pairs for every answer type / context
# variant combination; the answer type varies slowest, the context variant fastest.
all_text_ansidx = [
    pair
    for a in anschoice
    for c in contextchoice
    for pair in preprocess_df(C, a, c)
]


In [34]:
# Given all_text_ansidx (text, answer-index lists), build dataAns: a list of example
# dicts holding 'document', 'pos', 'ner', 'doc_case' and 'ansInd'.
# NOTE(review): the original cell tagged the loop header "!!!!!!!!!CHANGE" --
# presumably the input list is meant to be swapped per run; confirm.
dataAns = []
for text, ansIdxs in all_text_ansidx:
    tokens = sNLP.word_tokenize(text)
    pos_tags = [pair[1] for pair in sNLP.pos(text)]
    ner_tags = [pair[1] for pair in sNLP.ner(text)]

    # Case feature: 'U' for alphabetic tokens that are not all lower-case, else 'L'.
    case_flags = ['U' if tok.isalpha() and not tok.islower() else 'L' for tok in tokens]

    base_example = {
        'document': tokens,
        'pos': pos_tags,
        'ner': ner_tags,
        'doc_case': case_flags,
    }

    # One indicator list per answer: 'A' at answer token positions, '-' elsewhere.
    indicator_lists = []
    for answer_positions in ansIdxs:
        indicators = ['-'] * len(ner_tags)
        for position in answer_positions:
            indicators[position] = 'A'
        indicator_lists.append(indicators)

    # One independent example per answer.
    for indicators in indicator_lists:
        example = deepcopy(base_example)
        example['ansInd'] = indicators
        dataAns.append(example)

In [35]:
# Write one line per example: every token (spaces removed, lower-cased) followed by
# its case, POS, NER and answer-indicator features, all glued with the '￨' separator.
with open("inputformat.txt", 'wb') as f:
    for ex in dataAns:
        annotated_tokens = []
        for idx, word in enumerate(ex['document']):
            features = [
                word.replace(' ', '').lower(),
                ex['doc_case'][idx],
                ex['pos'][idx],
                ex['ner'][idx],
                ex['ansInd'][idx],
            ]
            annotated_tokens.append('￨'.join(features))
        line = u' '.join(annotated_tokens).encode('utf-8').strip()
        f.write(line + u'\n'.encode('utf-8'))
# The with-block has already closed the file; no explicit close() is needed.


In [36]:
# Human-readable dump, one line per example: answer tokens ||| lower-cased context tokens.
with open("readable.txt", 'wb') as f:
    for ex in dataAns:
        # Tokens flagged 'A' in the answer indicator make up the answer.
        ans = [ex['document'][pos] for pos, flag in enumerate(ex['ansInd']) if flag == 'A']
        context_tokens = [tok.replace(' ', '').lower() for tok in ex['document']]
        line = u' '.join(ans + [' ||| '] + context_tokens).encode('utf-8').strip()
        f.write(line + u'\n'.encode('utf-8'))
# The with-block has already closed the file; no explicit close() is needed.

### read output

In [118]:
# anschoice = ['cause', 'effect']
# contextchoice = ['because of','as a result of', 'due to', 'shorter']

# The QG-Net output has one generated question per line. Read it line by line:
# pd.read_csv(..., sep='\n') is rejected by current pandas releases.
# Blank lines are skipped to match read_csv's default behaviour.
# NOTE(review): if a blank line stands for an empty generation, keep it instead so
# the rows stay aligned with the inputs.
with open("output_questions_QG-Net.pt.txt.prob.txt", encoding='utf-8') as questions_file:
    question_lines = [line.rstrip('\r\n') for line in questions_file]
out = pd.DataFrame([line for line in question_lines if line])

# The inputs were written with all 'cause' examples first, then all 'effect' examples.
out_cause, out_effect = np.array_split(out, 2)

In [119]:
causedf = C.copy()

# Split the cause questions into one chunk per context variant (in contextchoice
# order) and re-index each chunk from 0 so it aligns with the rows of C.
cause_split = np.array_split(out_cause, len(contextchoice))
for chunk in cause_split:
    chunk.reset_index(drop=True, inplace=True)

# Attach each chunk as a '<variant>_cause' column.
for variant, chunk in zip(contextchoice, cause_split):
    causedf[variant + '_cause'] = chunk

In [120]:
# Spot-check a single row (index label 157) of the cause-question table.
causedf.loc[157]

c                       A wedge is simple machine that consists of two inclined planes, giving it a thin end and thick end, as you can see in the Figure 1.1. A wedge is used to cut or split apart objects. Force is applied to the thick end of the wedge, and the wedge, in turn, applies force to the object along both of its sloping sides. This force causes the object to split apart. A knife is another example of a wedge. In the Figure 1.2, a knife is being used to chop tough pecans. The job is easy to do with the knife because of the wedge shape of the blade. The very thin edge of the blade easily enters and cuts through the pecans.     
cue                     because of                                                                                                                                                                                                                                                                                                                                   

In [121]:
# NOTE(review): the result of this drop() is discarded (no assignment, no
# inplace=True), so out_effect is left unchanged. The original author flagged the
# line as an error; confirm whether the first row was meant to be removed.
out_effect.drop(out_effect.index[0]) #!!!!!!! error

effectdf = C.copy()

# Split the effect questions into one chunk per context variant (in contextchoice
# order) and re-index each chunk from 0 so it aligns with the rows of C.
effect_split = np.array_split(out_effect, len(contextchoice))
for chunk in effect_split:
    chunk.reset_index(drop=True, inplace=True)

# Attach each chunk as a '<variant>_effect' column.
for variant, chunk in zip(contextchoice, effect_split):
    effectdf[variant + '_effect'] = chunk

In [127]:
# causedf and effectdf are both copies of C with question columns added, so they
# share C's row index. Join the cause-question columns on that index: merge() with
# no keys joins on every shared column and multiplies rows whenever C contains
# duplicate rows.
cause_only_cols = [col for col in causedf.columns if col not in effectdf.columns]
outputdf = effectdf.join(causedf[cause_only_cols])


In [130]:
# Save first, then end the cell with the preview: a notebook only displays the
# value of a cell's last expression, so head() placed before to_csv() showed nothing.
outputdf.to_csv('causal_output.csv',index=False)
outputdf.head()


In [35]:
import json
import parser
import re
import nltk

def load(filepath='/Users/tonytu/desktop/old_stuff/berkeley/fall2020/research/CE_extractor--Patterns_Based/CE_extractor/dev-v2.0.json'):
    """Load a SQuAD-style JSON file and return its list of articles.

    Args:
        filepath: path to the JSON file. Defaults to the original author's local
            copy of dev-v2.0.json; pass your own path on any other machine.

    Returns:
        The value stored under the top-level 'data' key.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform default.
    with open(filepath, encoding='utf-8') as file:
        data = json.load(file)
    articles = data['data']
    print('succesfully loaded {} articles from squad dataset'.format(len(articles)))
    return articles

def parse(data):
    """Collect the article titles and every paragraph context from SQuAD-style data.

    Returns:
        (titles, paragraphs): the 'title' of each article, and the 'context' string
        of each entry in each article's 'paragraphs', in document order.
    """
    titles = [article['title'] for article in data]
    paragraphs = [
        entry['context']
        for article in data
        for entry in article['paragraphs']
    ]
    return titles, paragraphs

def split(paragraphs):
    """Split each paragraph into sentences.

    Sentences are cut at '.', '?' or '!' (optionally followed by closing quotes or
    brackets); the terminator and surrounding spaces are not part of the returned
    sentence strings.

    Returns:
        A list with one list of sentence strings per input paragraph.
    """
    sentences = []
    for paragraph in paragraphs:
        pieces = re.split(r' *[\.\?!][\'"\)\]]* *', paragraph)
        # A paragraph ending in a terminator leaves an empty trailing piece; drop
        # only that. Unconditionally dropping the last piece lost the final
        # sentence of paragraphs that do not end with punctuation.
        if pieces and not pieces[-1].strip():
            pieces.pop()
        sentences.append(pieces)
    return sentences
		
if __name__ == '__main__':
    # Load the articles, pull out their paragraphs, then flatten the per-paragraph
    # sentence lists into one list of sentences.
    articles = load()
    titles, paragraphs = parse(articles)
    textbook_sentences = []
    for sentence_list in split(paragraphs):
        textbook_sentences.extend(sentence_list)


succesfully loaded 35 articles from squad dataset


In [40]:
len(textbook_sentences)

7099

In [37]:
a = 'They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia'

In [38]:
a += ' .'

In [39]:
a

'They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia .'

In [15]:
a = [1,3,5,6,7,9,10]
b = [2,4,6,8,11]

given_value = 7
a_list = [1,4,8]
def find_closest(given_value, a_list):
    """Return the element of `a_list` nearest to `given_value` (first one wins ties)."""
    def distance(candidate):
        return abs(candidate - given_value)
    return min(a_list, key=distance)
find_closest(given_value, a_list)

# For every value in a, print the position in b of its nearest neighbour.
for value in a:
    nearest = find_closest(value, b)
    print(b.index(nearest))

0
0
1
2
2
3
4


In [47]:
textbook_sentences[1:2000]

['They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia',
 'Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia',
 'The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries',
 'The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East',
 'The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated',
 'They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or No

In [49]:
['Isaac surrendered and was confined with silver chains, because Richard had promised that he would not place him in irons',
'Because of this, Ethelred fled to Normandy in 1013, when he was forced from his kingdom by Sweyn Forkbeard',
'The Albanian forces could not take part in the ensuing battle because it had started before their arrival',
'They are called hierarchy theorems because they induce a proper hierarchy on the classes defined by constraining the respective resources',
'Because the problem P = NP is not solved, being able to reduce a known NP-complete problem, Π2, to another problem, Π1, would indicate that there is no known polynomial-time solution for Π1',
'This is because a polynomial-time solution to Π1 would yield a polynomial-time solution to Π2',
'Similarly, because all NP problems can be reduced to the set, finding an NP-complete problem that can be solved in polynomial time would mean that P = NP',
'The question of whether P equals NP is one of the most important open questions in theoretical computer science because of the wide implications of a solution',
"Victoria's total gross state product (GSP) is ranked second in Australia, although Victoria is ranked fourth in terms of GSP per capita because of its limited mining activity",
'In Geneva, Hugues, though Catholic, was a leader of the "Confederate Party", so called because it favoured independence from the Duke of Savoy through an alliance between the city-state of Geneva and the Swiss Confederation',
]










['Isaac surrendered and was confined with silver chains, because Richard had promised that he would not place him in irons',
 'Because of this, Ethelred fled to Normandy in 1013, when he was forced from his kingdom by Sweyn Forkbeard',
 'The Albanian forces could not take part in the ensuing battle because it had started before their arrival',
 'They are called hierarchy theorems because they induce a proper hierarchy on the classes defined by constraining the respective resources',
 'Because the problem P = NP is not solved, being able to reduce a known NP-complete problem, Π2, to another problem, Π1, would indicate that there is no known polynomial-time solution for Π1',
 'This is because a polynomial-time solution to Π1 would yield a polynomial-time solution to Π2',
 'Similarly, because all NP problems can be reduced to the set, finding an NP-complete problem that can be solved in polynomial time would mean that P = NP',
 'The question of whether P equals NP is one of the most impor