In [2]:
import json

with open("semeval_datasetV3.json", "r") as f:
    data = json.load(f)
    
import spacy

# data but only keep examples that are labeled as causal
causal_data = [ d for d in data if d["relation_type"] == 0][:]





In [2]:
nlp = spacy.load("en_core_web_lg")

docs = []
for d in causal_data:
    docs.append(nlp(d["sent"]))

"""
#get token dependencies
for t in doc:
    #subject would be
    print(t, t.dep_)
    if t.dep_ == "nsubj" or t.dep_ == "nsubjpass":
        subject = t.orth_
        print("subject",subject)
    #iobj for indirect object
    if t.dep_ == "iobj":
        indirect_object = t.orth_
    #dobj for direct object
    if t.dep_ == "dobj":
        direct_object = t.orth_
    if t.dep_ == "pobj":
        passive_object  = t.orth_
print("---------------------------------")
print(subject)
print(direct_object)
print(indirect_object)
print(passive_subject)
print(passive_object)
"""

'\n#get token dependencies\nfor t in doc:\n    #subject would be\n    print(t, t.dep_)\n    if t.dep_ == "nsubj" or t.dep_ == "nsubjpass":\n        subject = t.orth_\n        print("subject",subject)\n    #iobj for indirect object\n    if t.dep_ == "iobj":\n        indirect_object = t.orth_\n    #dobj for direct object\n    if t.dep_ == "dobj":\n        direct_object = t.orth_\n    if t.dep_ == "pobj":\n        passive_object  = t.orth_\nprint("---------------------------------")\nprint(subject)\nprint(direct_object)\nprint(indirect_object)\nprint(passive_subject)\nprint(passive_object)\n'

In [3]:
nouns_and_verbs = []
print(causal_data[2]["sent"])

#get token dependencies
for t in docs[2]:
    word = {}
    if t.orth_ == "from":
        print("from", t.pos_, t.dep_)
    if t.pos_ == "NOUN" or t.pos_ == "PROPN" or t.pos_ == "VERB":
        word["pos"] = t.pos_
        word["text"] = str(t.orth_)
        word["index"] = t.i
        word["dep"] = t.dep_
        nouns_and_verbs.append(word)
print(nouns_and_verbs)

The singer, who performed three of the nominated songs, also caused a commotion on the red carpet.
[{'pos': 'NOUN', 'text': 'singer', 'index': 1, 'dep': 'nsubj'}, {'pos': 'VERB', 'text': 'performed', 'index': 4, 'dep': 'relcl'}, {'pos': 'VERB', 'text': 'nominated', 'index': 8, 'dep': 'amod'}, {'pos': 'NOUN', 'text': 'songs', 'index': 9, 'dep': 'pobj'}, {'pos': 'VERB', 'text': 'caused', 'index': 12, 'dep': 'ROOT'}, {'pos': 'NOUN', 'text': 'commotion', 'index': 14, 'dep': 'dobj'}, {'pos': 'NOUN', 'text': 'carpet', 'index': 18, 'dep': 'pobj'}]


In [4]:
def generate_sent_info(sent):
    doc = nlp(sent)
    nouns_and_verbs = []
    for t in doc:
        word = {}
        if t.pos_ == "NOUN" or t.pos_ == "PROPN" or t.pos_ == "VERB":
            word["pos"] = t.pos_
            word["text"] = str(t.orth_)
            word["index"] = t.i
            word["dep"] = t.dep_
            nouns_and_verbs.append(word)
    return nouns_and_verbs

In [5]:
def spacy_doc_data(data):
    word_data = []
    for d in data:
        n_and_v = generate_sent_info(d["sent"])
        word_data.append(n_and_v)
    return word_data

In [6]:
# remove nouns from consideration that are not separated by a verb
def filter_contiguous_nouns(n_v_list, pairings):
    filtered = []
    for pairing in pairings:
        item_1_index = pairing[0]["index"]
        item_2_index = pairing[1]["index"]
        for word in n_v_list:
            if word["pos"] == "VERB":
                if word["index"] > item_1_index and word["index"] < item_2_index:
                    filtered.append(pairing)
    return filtered
    

In [65]:
#
def filter_dep_types(pairings):
    filtered = []
    for pairing in pairings:
        if pairing[0]["dep"] in ["nsubj", "nsubjpass", "dobj", "pobj", "iobj", "nsubj","advcl", "meta", "xcomp", "conj"]:
            if pairing[1]["dep"] in ["nsubj", "nsubjpass", "dobj", "pobj", "iobj", "nsubj","advcl", "meta", "xcomp", "conj"]:
                filtered.append(pairing)
    return filtered
    

In [8]:
# get all possible cause effect pairs in a sentence using nouns and verbs list
def get_possible_relations(n_v_list):
    pairings = []
    for i, word in enumerate(n_v_list):
        for j in range(i + 1, len(n_v_list)):
            word2 = n_v_list[j]
            if word["pos"] != "VERB" and word2["pos"] != "VERB":
                pairings.append((word, word2))
    return pairings
                    
                    
        
    

In [37]:
def evaluate_containment(dataset, pairings_list):
    num_contained = 0
    for i, d in enumerate(dataset):
        e1 = d["e1_contents"]
        e2 = d["e2_contents"]
        pairings = pairings_list[i]
        temp = num_contained
        for pairing in pairings:
            if pairing[0]["text"] in e1 or pairing[0]["text"] in e2:
                if pairing[1]["text"] in e1 or pairing[1]["text"] in e2:
                    num_contained += 1
                    break
        if num_contained == temp:
            pass
            #print(d["sent"], pairing)
    return num_contained
                    
        

In [66]:
#print(len(nouns_and_verbs))

print(len(causal_data))
n_and_v_list = spacy_doc_data(causal_data)

pairings_list = []
for n_and_v in n_and_v_list:
    pairings_list.append(get_possible_relations(n_and_v))
    
overall_num_pairings = 0
for pairing in pairings_list:
    overall_num_pairings += len(pairing)
    
filtered_list = []
for i, pairing in enumerate(pairings_list):
    filtered_list.append(filter_dep_types(pairing))

num_contained = evaluate_containment(causal_data, filtered_list)
print("contained", num_contained, "as percent", num_contained / len(filtered_list))

reduced_num_pairings = 0
for filtered in filtered_list:
    reduced_num_pairings += len(filtered)
print(overall_num_pairings)
print(reduced_num_pairings)

"""
print(len(pairings))
filtered = filter_contiguous_nouns(nouns_and_verbs, pairings)
print(len(filtered))
filtered_dep = filter_dep_types(filtered)
print(len(filtered_dep))
print(filtered_dep)

print("contained", evaluate_containment(causal_data, pairings_list))
"""



    
    
    
    

1003
contained 860 as percent 0.8574277168494516
20229
11044


'\nprint(len(pairings))\nfiltered = filter_contiguous_nouns(nouns_and_verbs, pairings)\nprint(len(filtered))\nfiltered_dep = filter_dep_types(filtered)\nprint(len(filtered_dep))\nprint(filtered_dep)\n\nprint("contained", evaluate_containment(causal_data, pairings_list))\n'

In [38]:
all_spacy_deps = ["ROOT",
"acl",
"acomp",
"advcl",
"advmod",
"agent",
"amod",
"appos",
"attr",
"aux",
"auxpass",
"case",
"cc",
"ccomp",
"compound",
"conj",
"csubj",
"csubjpass",
"dative",
"dep",
"det",
"dobj",
"expl",
"intj",
"mark",
"meta",
"neg",
"nmod",
"npadvmod",
"nsubj",
"nsubjpass",
"nummod",
"oprd",
"parataxis",
"pcomp",
"pobj",
"poss",
"preconj",
"predet",
"prep",
"prt",
"punct",
"quantmod",
"relcl",
"xcomp"]
for dep in all_spacy_deps:
    print(dep, "-", spacy.explain(dep))

ROOT - None
acl - clausal modifier of noun (adjectival clause)
acomp - adjectival complement
advcl - adverbial clause modifier
advmod - adverbial modifier
agent - agent
amod - adjectival modifier
appos - appositional modifier
attr - attribute
aux - auxiliary
auxpass - auxiliary (passive)
case - case marking
cc - coordinating conjunction
ccomp - clausal complement
compound - compound
conj - conjunct
csubj - clausal subject
csubjpass - clausal subject (passive)
dative - dative
dep - unclassified dependent
det - determiner
dobj - direct object
expl - expletive
intj - interjection
mark - marker
meta - meta modifier
neg - negation modifier
nmod - modifier of nominal
npadvmod - noun phrase as adverbial modifier
nsubj - nominal subject
nsubjpass - nominal subject (passive)
nummod - numeric modifier
oprd - object predicate
parataxis - parataxis
pcomp - complement of preposition
pobj - object of preposition
poss - possession modifier
preconj - pre-correlative conjunction
predet - None
prep - pr

In [64]:
def filter_dep_types_m(pairings, dep):
    filtered = []
    for pairing in pairings:
        if pairing[0]["dep"] in ["nsubj", "nsubjpass", "dobj", "pobj", "iobj", "ROOT", "acomp", "advcl", "meta", "xcomp","conj" dep]:
            if pairing[1]["dep"] in ["nsubj", "nsubjpass", "dobj", "pobj", "iobj", "nsubj","advcl", "meta", "xcomp","conj", dep]:
                filtered.append(pairing)
    return filtered

for dep in all_spacy_deps:
    print(dep, "-", spacy.explain(dep))
    
    n_and_v_list = spacy_doc_data(causal_data)

    pairings_list = []
    for n_and_v in n_and_v_list:
        pairings_list.append(get_possible_relations(n_and_v))

    overall_num_pairings = 0
    for pairing in pairings_list:
        overall_num_pairings += len(pairing)

    filtered_list = []
    for i, pairing in enumerate(pairings_list):
        filtered_list.append(filter_dep_types_m(pairing, dep))

    num_contained = evaluate_containment(causal_data, filtered_list)
    #print("contained", num_contained, "as percent", num_contained / len(filtered_list))

    reduced_num_pairings = 0
    for filtered in filtered_list:
        reduced_num_pairings += len(filtered)
    print(overall_num_pairings)
    if reduced_num_pairings == 8435:
        print("num contained", num_contained-768, "num pairings", 8435, "\npercentage", 0)
    else:
        print("num contained", num_contained-768, "num pairings", reduced_num_pairings-8435)
        print("percentage", ((num_contained-768)/(reduced_num_pairings-8435)))
    print("--------------------------------------")


SyntaxError: invalid syntax (<ipython-input-64-4d049df82d09>, line 4)

In [157]:
total = 0
for filtered in filtered_list:
    total += len(filtered)
print(len(filtered_list))
print(total/len(filtered_list))
    
# get simple sentences with fewer pairs of nouns
simple = []
for i, sent_pairings in enumerate(filtered_list):
    if len(sent_pairings) > 6:
        pass
    else:
        simple.append((sent_pairings, i))
print(len(simple))
print(simple[0])


1003
11.010967098703889
498
([({'pos': 'NOUN', 'text': 'burst', 'index': 1, 'dep': 'nsubjpass'}, {'pos': 'NOUN', 'text': 'pressure', 'index': 8, 'dep': 'pobj'})], 1)


In [87]:
# Adds the e1 e2 tags back into a sentence
def insert_e1_e2_labels(sent, e1, e2):
    e1_start = sent.find(e1)
    e1_end = sent.find(" ", e1_start)
    if e1_start == -1:
        print("error unable to find", e1, "in", sent)
    e1_desig_s = "<e1>"
    e1_desig_e = "</e1>"
    sent = sent[:e1_start] + e1_desig_s + sent[e1_start:e1_end] + e1_desig_e + sent[e1_end:]
    
    e2_start = sent.find(e2)
    e2_end = sent.find(" ", e2_start)
    if e2_start == -1:
        print("error unable to find", e2, "in", sent)
    e2_desig_s = "<e2>"
    e2_desig_e = "</e2>"
    sent = sent[:e2_start] + e2_desig_s + sent[e2_start:e2_end] + e2_desig_e + sent[e2_end:]
    return sent
    
# Adds the e1 e2 labels to all datapoints in dataset
def add_e1_e2_labels(dataset):
    for i, d in enumerate(dataset):
        d["sent"] = insert_e1_e2_labels(d["sent"], d["e1_contents"], d["e2_contents"])
    return dataset




In [161]:
# create augmented data

def convert_pairings_to_data(sent, pairing):
    sent = insert_e1_e2_labels(sent, pairing[0]["text"], pairing[1]["text"])
    return sent
    
#simple list item - tuple_num - access tuple - tuple item

def convert_all(data, simple):
    augmented = []
    for s in simple:
        pairings = s[0]
        for p in pairings:
            aug = {}
            aug["sent"] = convert_pairings_to_data(data[s[1]]["sent"], p)
            aug["e1_contents"] = p[0]["text"]
            aug["e2_contents"] = p[1]["text"]
            aug["relation_type"] = 1
            e1 = data[s[1]]["e1_contents"]
            e2 = data[s[1]]["e2_contents"]
            #these if statements remove 
            if p[0]["text"] in e1 or p[0]["text"] in e2:
                    if p[1]["text"] in e1 or p[1]["text"] in e2:
                        continue
            augmented.append(aug)
    return augmented
augmented = convert_all(causal_data, simple[:])
print(len(augmented))
print(augmented)

# add in positive examples
augmented = augmented + causal_data




1530
[{'sent': 'The <e1>singer,</e1> who performed three of the nominated <e2>songs,</e2> also caused a commotion on the red carpet.', 'e1_contents': 'singer', 'e2_contents': 'songs', 'relation_type': 1}, {'sent': 'The <e1>singer,</e1> who performed three of the nominated songs, also caused a commotion on the red <e2>carpet</e2>.', 'e1_contents': 'singer', 'e2_contents': 'carpet', 'relation_type': 1}, {'sent': 'The singer, who performed three of the nominated <e1>songs,</e1> also caused a <e2>commotion</e2> on the red carpet.', 'e1_contents': 'songs', 'e2_contents': 'commotion', 'relation_type': 1}, {'sent': 'The singer, who performed three of the nominated <e1>songs,</e1> also caused a commotion on the red <e2>carpet</e2>.', 'e1_contents': 'songs', 'e2_contents': 'carpet', 'relation_type': 1}, {'sent': 'The singer, who performed three of the nominated songs, also caused a <e1>commotion</e1> on the red <e2>carpet</e2>.', 'e1_contents': 'commotion', 'e2_contents': 'carpet', 'relation_ty

In [1]:
import random
augmented = augmented + causal_data
random.shuffle(augmented)
with open("augmented_candidate_causal_only.json", "w") as f:
    json.dump(augmented, f, indent = 4)

NameError: name 'augmented' is not defined