In [54]:
import sklearn
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
import urllib
import spacy
from tqdm import tqdm
import en_core_web_sm
import re
import itertools
from IPython.display import display, HTML

In [55]:
# Init
tqdm.pandas()
pd.set_option('display.max_colwidth', -1)

  from pandas import Panel


In [159]:
ARG1 = "$ARG1"
ARG2 = "$ARG2"
FINAL_DF_COLUMNS = ['sample', 'extr_sample', 'pattern', 'weak_label', 'gold_label']

Upload the CONLL Data

In [57]:
conll_file = "dev.conll"
with open(conll_file, "r+") as input_file:
    train_data = input_file.read()

The first step is to define the relations we want to be extracted in our dataset:

In [58]:
def process_data(path_to_data):
    samples, relations = [], []
    with open(path_to_data, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line.startswith("# id="):    # Instance starts
                sample = ""
                label = line.split(" ")[3][5:]
            elif line == "":  # Instance ends
                samples.append(sample)
                relations.append(label)
            elif line.startswith("#"):  # comment
                continue
            else:
                parts = line.split("\t")
                token = parts[1]
                if token == "-LRB-":
                    token = "("
                elif token == "-RRB-":
                    token = ")"
                sample += " " + token
    return samples, relations

samples, labels = process_data(conll_file)

Let's inspect all relation labels that we have in our dataset

In [59]:
print(set(labels))

{'org:country_of_headquarters', 'per:origin', 'per:city_of_birth', 'org:member_of', 'per:children', 'per:stateorprovinces_of_residence', 'per:alternate_names', 'per:spouse', 'per:parents', 'per:country_of_death', 'org:founded', 'per:age', 'org:top_members/employees', 'per:siblings', 'no_relation', 'per:cities_of_residence', 'per:country_of_birth', 'org:shareholders', 'per:cause_of_death', 'per:stateorprovince_of_birth', 'per:other_family', 'per:countries_of_residence', 'per:city_of_death', 'org:dissolved', 'org:website', 'org:parents', 'per:date_of_death', 'per:charges', 'org:founded_by', 'org:political/religious_affiliation', 'per:date_of_birth', 'per:stateorprovince_of_death', 'org:number_of_employees/members', 'org:alternate_names', 'org:members', 'per:employee_of', 'per:religion', 'org:stateorprovince_of_headquarters', 'org:city_of_headquarters', 'per:schools_attended', 'org:subsidiaries', 'per:title'}


In [60]:
samples = pd.DataFrame.from_dict({"samples": samples, "labels": labels})
samples.head()

Unnamed: 0,samples,labels
0,"At the same time , Chief Financial Officer Douglas Flint will become chairman , succeeding Stephen Green who is leaving to take a government job .",per:title
1,U.S. District Court Judge Jeffrey White in mid-February issued an injunction against Wikileaks after the Zurich-based Bank Julius Baer accused the site of posting sensitive account information stolen by a disgruntled former employee .,no_relation
2,"PARIS 2009-07-07 11:07:32 UTC French media earlier reported that Montcourt , ranked 119 , was found dead by his girlfriend in the stairwell of his Paris apartment .",per:city_of_death
3,"The current holdings of Blackstone-operated funds include Universal Orlando , Cadbury Schweppes , Freedom Communications , Nielsen Co. , Orangina and Vanguard Health Systems .",no_relation
4,The ICDC was formed after the Nepali government and the guerrillas reached in an understanding during summit talks held on July 16 at Prime Minister Girija Prashad Koirala 's residence at Baluwatar in downtown Kathmandu city .,no_relation


To make the calculation quicklier, let's choose only the samples that contain a relation (that is, labelled not with "no_relation" label). Moreover, let's reduce the number of relation we want to find and choose 5 relations from the TACRED relation labels.

In [61]:
selected_labels = {'org:alternate_names', 'per:date_of_birth', "org:top_members/employees"} '#not used
# relevant_samples = samples[samples['labels'].isin(selected_labels)]
# samples.groupby('labels').count()

In [62]:
# todo: simplify
relevant_samples = samples[samples["labels"]!='no_relation'].sample(n=400)
relevant_samples.groupby('labels').count()

Unnamed: 0_level_0,samples
labels,Unnamed: 1_level_1
org:alternate_names,19
org:city_of_headquarters,8
org:country_of_headquarters,10
org:dissolved,1
org:founded,2
org:founded_by,5
org:member_of,2
org:members,9
org:number_of_employees/members,2
org:parents,5


In [63]:
relevant_samples.head()

Unnamed: 0,samples,labels
14713,"ALBA -- the Bolivarian Alternative for the Americas -- was founded by Venezuelan President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the Caribbean island of Dominica .",org:members
5313,"His conversion to Islam came after he attended the Islamic Centre of Orange County , California , where he is believed to have come under the influence of two foreign-born Islamist radicals .",per:religion
12148,The United Nations appointed two foreign members of Afghanistan 's Election Complaints Commission ( ECC ) here on Saturday .,org:alternate_names
17841,"Marks & Spencer has said the move will give Rose time to groom a new chief executive to replace him by 2011 , but some investors say the move concentrates too much power in one person .",per:title
15602,"She was the poet laureate of Maryland from 1979 to 1985 , a finalist for the Pulitzer Prize in 1988 and a visiting professor at Columbia and Duke universities .",per:title


In order to turn the data into distantly supervised one, let's write down for each relation a couple of simple patterns that could help us to find the relations


In [172]:
patterns = pd.DataFrame.from_dict({"org:alternate_names": 
                                   [["$ARG1 ( $ARG2 ", 
                                     "$ARG1 formerly known as $ARG2", 
                                     "$ARG1 aka $ARG2", 
                                     "$ARG1 ( also known as $ARG2 )"]], 
                                   "per:date_of_birth": 
                                   [["$ARG1 ( born $ARG2 )", 
                                    "$ARG1 ( born $ARG2 in", 
                                    "$ARG1 ( $ARG2 -", 
                                    "$ARG1 was born in $ARG2"]],
                                   "org:top_members/employees":
                                   [["$ARG1 , executive director of $ARG2", 
                                     "$ARG1 , head of $ARG2", 
                                     "$ARG1 , who heads $ARG2",
                                     "$ARG1 , chief executive of $ARG2"]]}, orient='index', columns = ["Raw patterns"])

In [173]:
def escape_dollar(string):
    return re.sub("\\$", "\\\\$", str(string))

In [174]:
patterns.apply(lambda x: [escape_dollar(pattern) for pattern in x]).head()

Unnamed: 0,Raw patterns
org:alternate_names,"['\$ARG1 ( \$ARG2 ', '\$ARG1 formerly known as \$ARG2', '\$ARG1 aka \$ARG2', '\$ARG1 ( also known as \$ARG2 )']"
per:date_of_birth,"['\$ARG1 ( born \$ARG2 )', '\$ARG1 ( born \$ARG2 in', '\$ARG1 ( \$ARG2 -', '\$ARG1 was born in \$ARG2']"
org:top_members/employees,"['\$ARG1 , executive director of \$ARG2', '\$ARG1 , head of \$ARG2', '\$ARG1 , who heads \$ARG2', '\$ARG1 , chief executive of \$ARG2']"


Since we want to make a simple regex search, convert patterns into regexes

In [175]:
def preprocess_patterns(patterns):
    regex_patterns = [re.sub("\\\\\\$ARG", "(A )?(a )?(The )?(the )?\\$ARG", re.escape(pattern)) for pattern in patterns]
    return regex_patterns

patterns["Regex patterns"] = patterns["Raw patterns"].apply(preprocess_patterns)
patterns.apply(lambda x: [escape_dollar(pattern) for pattern in x]).head()

Unnamed: 0,Raw patterns,Regex patterns
org:alternate_names,"['\$ARG1 ( \$ARG2 ', '\$ARG1 formerly known as \$ARG2', '\$ARG1 aka \$ARG2', '\$ARG1 ( also known as \$ARG2 )']","['(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ ', '(A )?(a )?(The )?(the )?\\\$ARG1\\ formerly\\ known\\ as\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ aka\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ also\\ known\\ as\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ \\)']"
per:date_of_birth,"['\$ARG1 ( born \$ARG2 )', '\$ARG1 ( born \$ARG2 in', '\$ARG1 ( \$ARG2 -', '\$ARG1 was born in \$ARG2']","['(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ born\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ \\)', '(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ born\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ in', '(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ \\-', '(A )?(a )?(The )?(the )?\\\$ARG1\\ was\\ born\\ in\\ (A )?(a )?(The )?(the )?\\\$ARG2']"
org:top_members/employees,"['\$ARG1 , executive director of \$ARG2', '\$ARG1 , head of \$ARG2', '\$ARG1 , who heads \$ARG2', '\$ARG1 , chief executive of \$ARG2']","['(A )?(a )?(The )?(the )?\\\$ARG1\\ ,\\ executive\\ director\\ of\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ ,\\ head\\ of\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ ,\\ who\\ heads\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ ,\\ chief\\ executive\\ of\\ (A )?(a )?(The )?(the )?\\\$ARG2']"


Now we can simply search patterns in the sentences. Because of the speed reasons, let's take only samples where we will definetly find smth :)

In [69]:
nlp = spacy.load("en_core_web_sm")
relevant_samples["Spacy_info"] = relevant_samples["samples"].apply(lambda x: nlp(x).to_json())

In [70]:
relevant_samples.apply(lambda x: [escape_dollar(pattern) for pattern in x]).head()

Unnamed: 0,samples,labels,Spacy_info
14713,"ALBA -- the Bolivarian Alternative for the Americas -- was founded by Venezuelan President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the Caribbean island of Dominica .",org:members,"{'text': ' ALBA -- the Bolivarian Alternative for the Americas -- was founded by Venezuelan President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the Caribbean island of Dominica .', 'ents': [{'start': 44, 'end': 52, 'label': 'LOC'}, {'start': 71, 'end': 81, 'label': 'NORP'}, {'start': 92, 'end': 103, 'label': 'PERSON'}, {'start': 108, 'end': 113, 'label': 'NORP'}, {'start': 121, 'end': 133, 'label': 'PERSON'}, {'start': 137, 'end': 141, 'label': 'DATE'}, {'start': 160, 'end': 167, 'label': 'GPE'}, {'start': 170, 'end': 179, 'label': 'GPE'}, {'start': 188, 'end': 197, 'label': 'LOC'}, {'start': 208, 'end': 216, 'label': 'LOC'}], 'sents': [{'start': 0, 'end': 218}], 'tokens': [{'id': 0, 'start': 0, 'end': 1, 'pos': 'SPACE', 'tag': '_SP', 'dep': '', 'head': 1}, {'id': 1, 'start': 1, 'end': 5, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'nsubjpass', 'head': 11}, {'id': 2, 'start': 6, 'end': 8, 'pos': 'PUNCT', 'tag': ':', 'dep': 'punct', 'head': 1}, {'id': 3, 'start': 9, 'end': 12, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 5}, {'id': 4, 'start': 13, 'end': 23, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'amod', 'head': 5}, {'id': 5, 'start': 24, 'end': 35, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'appos', 'head': 1}, {'id': 6, 'start': 36, 'end': 39, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 5}, {'id': 7, 'start': 40, 'end': 43, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 8}, {'id': 8, 'start': 44, 'end': 52, 'pos': 'PROPN', 'tag': 'NNPS', 'dep': 'pobj', 'head': 6}, {'id': 9, 'start': 53, 'end': 55, 'pos': 'PUNCT', 'tag': ':', 'dep': 'punct', 'head': 1}, {'id': 10, 'start': 56, 'end': 59, 'pos': 'AUX', 'tag': 'VBD', 'dep': 'auxpass', 'head': 11}, {'id': 11, 'start': 60, 'end': 67, 'pos': 'VERB', 'tag': 'VBN', 'dep': 'ROOT', 'head': 11}, {'id': 12, 'start': 68, 'end': 70, 'pos': 'ADP', 'tag': 'IN', 'dep': 'agent', 'head': 11}, {'id': 13, 'start': 71, 'end': 81, 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'head': 14}, {'id': 14, 'start': 82, 'end': 91, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'compound', 'head': 16}, {'id': 15, 'start': 92, 'end': 96, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'compound', 'head': 16}, {'id': 16, 'start': 97, 'end': 103, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'pobj', 'head': 12}, {'id': 17, 'start': 104, 'end': 107, 'pos': 'CCONJ', 'tag': 'CC', 'dep': 'cc', 'head': 16}, {'id': 18, 'start': 108, 'end': 113, 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'head': 19}, {'id': 19, 'start': 114, 'end': 120, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'compound', 'head': 21}, {'id': 20, 'start': 121, 'end': 126, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'compound', 'head': 21}, {'id': 21, 'start': 127, 'end': 133, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'conj', 'head': 16}, {'id': 22, 'start': 134, 'end': 136, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 11}, {'id': 23, 'start': 137, 'end': 141, 'pos': 'NUM', 'tag': 'CD', 'dep': 'pobj', 'head': 22}, {'id': 24, 'start': 142, 'end': 145, 'pos': 'CCONJ', 'tag': 'CC', 'dep': 'cc', 'head': 11}, {'id': 25, 'start': 146, 'end': 150, 'pos': 'ADV', 'tag': 'RB', 'dep': 'advmod', 'head': 26}, {'id': 26, 'start': 151, 'end': 159, 'pos': 'VERB', 'tag': 'VBZ', 'dep': 'conj', 'head': 11}, {'id': 27, 'start': 160, 'end': 167, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'dobj', 'head': 26}, {'id': 28, 'start': 168, 'end': 169, 'pos': 'PUNCT', 'tag': ',', 'dep': 'punct', 'head': 27}, {'id': 29, 'start': 170, 'end': 179, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'conj', 'head': 27}, {'id': 30, 'start': 180, 'end': 183, 'pos': 'CCONJ', 'tag': 'CC', 'dep': 'cc', 'head': 29}, {'id': 31, 'start': 184, 'end': 187, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 33}, {'id': 32, 'start': 188, 'end': 197, 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'head': 33}, {'id': 33, 'start': 198, 'end': 204, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'conj', 'head': 29}, {'id': 34, 'start': 205, 'end': 207, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 33}, {'id': 35, 'start': 208, 'end': 216, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'pobj', 'head': 34}, {'id': 36, 'start': 217, 'end': 218, 'pos': 'PUNCT', 'tag': '.', 'dep': 'punct', 'head': 11}]}"
5313,"His conversion to Islam came after he attended the Islamic Centre of Orange County , California , where he is believed to have come under the influence of two foreign-born Islamist radicals .",per:religion,"{'text': ' His conversion to Islam came after he attended the Islamic Centre of Orange County , California , where he is believed to have come under the influence of two foreign-born Islamist radicals .', 'ents': [{'start': 19, 'end': 24, 'label': 'ORG'}, {'start': 48, 'end': 83, 'label': 'ORG'}, {'start': 86, 'end': 96, 'label': 'GPE'}, {'start': 156, 'end': 159, 'label': 'CARDINAL'}, {'start': 173, 'end': 181, 'label': 'NORP'}], 'sents': [{'start': 0, 'end': 192}], 'tokens': [{'id': 0, 'start': 0, 'end': 1, 'pos': 'SPACE', 'tag': '_SP', 'dep': '', 'head': 1}, {'id': 1, 'start': 1, 'end': 4, 'pos': 'DET', 'tag': 'PRP\$', 'dep': 'poss', 'head': 2}, {'id': 2, 'start': 5, 'end': 15, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'nsubj', 'head': 5}, {'id': 3, 'start': 16, 'end': 18, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 2}, {'id': 4, 'start': 19, 'end': 24, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'pobj', 'head': 3}, {'id': 5, 'start': 25, 'end': 29, 'pos': 'VERB', 'tag': 'VBD', 'dep': 'ROOT', 'head': 5}, {'id': 6, 'start': 30, 'end': 35, 'pos': 'ADP', 'tag': 'IN', 'dep': 'mark', 'head': 8}, {'id': 7, 'start': 36, 'end': 38, 'pos': 'PRON', 'tag': 'PRP', 'dep': 'nsubj', 'head': 8}, {'id': 8, 'start': 39, 'end': 47, 'pos': 'VERB', 'tag': 'VBD', 'dep': 'advcl', 'head': 5}, {'id': 9, 'start': 48, 'end': 51, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 16}, {'id': 10, 'start': 52, 'end': 59, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'compound', 'head': 11}, {'id': 11, 'start': 60, 'end': 66, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'nmod', 'head': 16}, {'id': 12, 'start': 67, 'end': 69, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 11}, {'id': 13, 'start': 70, 'end': 76, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'compound', 'head': 14}, {'id': 14, 'start': 77, 'end': 83, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'pobj', 'head': 12}, {'id': 15, 'start': 84, 'end': 85, 'pos': 'PUNCT', 'tag': ',', 'dep': 'punct', 'head': 14}, {'id': 16, 'start': 86, 'end': 96, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'dobj', 'head': 8}, {'id': 17, 'start': 97, 'end': 98, 'pos': 'PUNCT', 'tag': ',', 'dep': 'punct', 'head': 16}, {'id': 18, 'start': 99, 'end': 104, 'pos': 'ADV', 'tag': 'WRB', 'dep': 'advmod', 'head': 24}, {'id': 19, 'start': 105, 'end': 107, 'pos': 'PRON', 'tag': 'PRP', 'dep': 'nsubjpass', 'head': 21}, {'id': 20, 'start': 108, 'end': 110, 'pos': 'AUX', 'tag': 'VBZ', 'dep': 'auxpass', 'head': 21}, {'id': 21, 'start': 111, 'end': 119, 'pos': 'VERB', 'tag': 'VBN', 'dep': 'relcl', 'head': 16}, {'id': 22, 'start': 120, 'end': 122, 'pos': 'PART', 'tag': 'TO', 'dep': 'aux', 'head': 24}, {'id': 23, 'start': 123, 'end': 127, 'pos': 'AUX', 'tag': 'VB', 'dep': 'aux', 'head': 24}, {'id': 24, 'start': 128, 'end': 132, 'pos': 'VERB', 'tag': 'VBN', 'dep': 'xcomp', 'head': 21}, {'id': 25, 'start': 133, 'end': 138, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 24}, {'id': 26, 'start': 139, 'end': 142, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 27}, {'id': 27, 'start': 143, 'end': 152, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'pobj', 'head': 25}, {'id': 28, 'start': 153, 'end': 155, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 27}, {'id': 29, 'start': 156, 'end': 159, 'pos': 'NUM', 'tag': 'CD', 'dep': 'nummod', 'head': 34}, {'id': 30, 'start': 160, 'end': 167, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'amod', 'head': 32}, {'id': 31, 'start': 167, 'end': 168, 'pos': 'PUNCT', 'tag': 'HYPH', 'dep': 'punct', 'head': 32}, {'id': 32, 'start': 168, 'end': 172, 'pos': 'VERB', 'tag': 'VBN', 'dep': 'amod', 'head': 34}, {'id': 33, 'start': 173, 'end': 181, 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'head': 34}, {'id': 34, 'start': 182, 'end': 190, 'pos': 'NOUN', 'tag': 'NNS', 'dep': 'pobj', 'head': 28}, {'id': 35, 'start': 191, 'end': 192, 'pos': 'PUNCT', 'tag': '.', 'dep': 'punct', 'head': 5}]}"
12148,The United Nations appointed two foreign members of Afghanistan 's Election Complaints Commission ( ECC ) here on Saturday .,org:alternate_names,"{'text': "" The United Nations appointed two foreign members of Afghanistan 's Election Complaints Commission ( ECC ) here on Saturday ."", 'ents': [{'start': 1, 'end': 19, 'label': 'ORG'}, {'start': 30, 'end': 33, 'label': 'CARDINAL'}, {'start': 53, 'end': 64, 'label': 'GPE'}, {'start': 68, 'end': 98, 'label': 'ORG'}, {'start': 115, 'end': 123, 'label': 'DATE'}], 'sents': [{'start': 0, 'end': 125}], 'tokens': [{'id': 0, 'start': 0, 'end': 1, 'pos': 'SPACE', 'tag': '_SP', 'dep': '', 'head': 1}, {'id': 1, 'start': 1, 'end': 4, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 3}, {'id': 2, 'start': 5, 'end': 11, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'compound', 'head': 3}, {'id': 3, 'start': 12, 'end': 19, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'nsubj', 'head': 4}, {'id': 4, 'start': 20, 'end': 29, 'pos': 'VERB', 'tag': 'VBD', 'dep': 'ROOT', 'head': 4}, {'id': 5, 'start': 30, 'end': 33, 'pos': 'NUM', 'tag': 'CD', 'dep': 'nummod', 'head': 7}, {'id': 6, 'start': 34, 'end': 41, 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'head': 7}, {'id': 7, 'start': 42, 'end': 49, 'pos': 'NOUN', 'tag': 'NNS', 'dep': 'dobj', 'head': 4}, {'id': 8, 'start': 50, 'end': 52, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 7}, {'id': 9, 'start': 53, 'end': 64, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'poss', 'head': 13}, {'id': 10, 'start': 65, 'end': 67, 'pos': 'PART', 'tag': 'POS', 'dep': 'case', 'head': 9}, {'id': 11, 'start': 68, 'end': 76, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'compound', 'head': 12}, {'id': 12, 'start': 77, 'end': 87, 'pos': 'PROPN', 'tag': 'NNPS', 'dep': 'compound', 'head': 13}, {'id': 13, 'start': 88, 'end': 98, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'pobj', 'head': 8}, {'id': 14, 'start': 99, 'end': 100, 'pos': 'PUNCT', 'tag': '-LRB-', 'dep': 'punct', 'head': 13}, {'id': 15, 'start': 101, 'end': 104, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'appos', 'head': 13}, {'id': 16, 'start': 105, 'end': 106, 'pos': 'PUNCT', 'tag': '-RRB-', 'dep': 'punct', 'head': 13}, {'id': 17, 'start': 107, 'end': 111, 'pos': 'ADV', 'tag': 'RB', 'dep': 'advmod', 'head': 4}, {'id': 18, 'start': 112, 'end': 114, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 17}, {'id': 19, 'start': 115, 'end': 123, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'pobj', 'head': 18}, {'id': 20, 'start': 124, 'end': 125, 'pos': 'PUNCT', 'tag': '.', 'dep': 'punct', 'head': 4}]}"
17841,"Marks & Spencer has said the move will give Rose time to groom a new chief executive to replace him by 2011 , but some investors say the move concentrates too much power in one person .",per:title,"{'text': ' Marks & Spencer has said the move will give Rose time to groom a new chief executive to replace him by 2011 , but some investors say the move concentrates too much power in one person .', 'ents': [{'start': 1, 'end': 16, 'label': 'ORG'}, {'start': 45, 'end': 49, 'label': 'PRODUCT'}, {'start': 104, 'end': 108, 'label': 'DATE'}, {'start': 174, 'end': 177, 'label': 'CARDINAL'}], 'sents': [{'start': 0, 'end': 186}], 'tokens': [{'id': 0, 'start': 0, 'end': 1, 'pos': 'SPACE', 'tag': '_SP', 'dep': '', 'head': 1}, {'id': 1, 'start': 1, 'end': 6, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'nsubj', 'head': 5}, {'id': 2, 'start': 7, 'end': 8, 'pos': 'CCONJ', 'tag': 'CC', 'dep': 'cc', 'head': 1}, {'id': 3, 'start': 9, 'end': 16, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'conj', 'head': 1}, {'id': 4, 'start': 17, 'end': 20, 'pos': 'AUX', 'tag': 'VBZ', 'dep': 'aux', 'head': 5}, {'id': 5, 'start': 21, 'end': 25, 'pos': 'VERB', 'tag': 'VBN', 'dep': 'ROOT', 'head': 5}, {'id': 6, 'start': 26, 'end': 29, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 7}, {'id': 7, 'start': 30, 'end': 34, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'nsubj', 'head': 9}, {'id': 8, 'start': 35, 'end': 39, 'pos': 'VERB', 'tag': 'MD', 'dep': 'aux', 'head': 9}, {'id': 9, 'start': 40, 'end': 44, 'pos': 'VERB', 'tag': 'VB', 'dep': 'ccomp', 'head': 5}, {'id': 10, 'start': 45, 'end': 49, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'compound', 'head': 11}, {'id': 11, 'start': 50, 'end': 54, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'dobj', 'head': 9}, {'id': 12, 'start': 55, 'end': 57, 'pos': 'PART', 'tag': 'TO', 'dep': 'aux', 'head': 13}, {'id': 13, 'start': 58, 'end': 63, 'pos': 'VERB', 'tag': 'VB', 'dep': 'advcl', 'head': 9}, {'id': 14, 'start': 64, 'end': 65, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 17}, {'id': 15, 'start': 66, 'end': 69, 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'head': 17}, {'id': 16, 'start': 70, 'end': 75, 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'head': 17}, {'id': 17, 'start': 76, 'end': 85, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'dobj', 'head': 13}, {'id': 18, 'start': 86, 'end': 88, 'pos': 'PART', 'tag': 'TO', 'dep': 'aux', 'head': 19}, {'id': 19, 'start': 89, 'end': 96, 'pos': 'VERB', 'tag': 'VB', 'dep': 'advcl', 'head': 13}, {'id': 20, 'start': 97, 'end': 100, 'pos': 'PRON', 'tag': 'PRP', 'dep': 'dobj', 'head': 19}, {'id': 21, 'start': 101, 'end': 103, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 19}, {'id': 22, 'start': 104, 'end': 108, 'pos': 'NUM', 'tag': 'CD', 'dep': 'pobj', 'head': 21}, {'id': 23, 'start': 109, 'end': 110, 'pos': 'PUNCT', 'tag': ',', 'dep': 'punct', 'head': 5}, {'id': 24, 'start': 111, 'end': 114, 'pos': 'CCONJ', 'tag': 'CC', 'dep': 'cc', 'head': 5}, {'id': 25, 'start': 115, 'end': 119, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 26}, {'id': 26, 'start': 120, 'end': 129, 'pos': 'NOUN', 'tag': 'NNS', 'dep': 'nsubj', 'head': 27}, {'id': 27, 'start': 130, 'end': 133, 'pos': 'VERB', 'tag': 'VBP', 'dep': 'conj', 'head': 5}, {'id': 28, 'start': 134, 'end': 137, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 29}, {'id': 29, 'start': 138, 'end': 142, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'nsubj', 'head': 30}, {'id': 30, 'start': 143, 'end': 155, 'pos': 'VERB', 'tag': 'VBZ', 'dep': 'ccomp', 'head': 27}, {'id': 31, 'start': 156, 'end': 159, 'pos': 'ADV', 'tag': 'RB', 'dep': 'advmod', 'head': 32}, {'id': 32, 'start': 160, 'end': 164, 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'head': 33}, {'id': 33, 'start': 165, 'end': 170, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'dobj', 'head': 30}, {'id': 34, 'start': 171, 'end': 173, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 30}, {'id': 35, 'start': 174, 'end': 177, 'pos': 'NUM', 'tag': 'CD', 'dep': 'nummod', 'head': 36}, {'id': 36, 'start': 178, 'end': 184, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'pobj', 'head': 34}, {'id': 37, 'start': 185, 'end': 186, 'pos': 'PUNCT', 'tag': '.', 'dep': 'punct', 'head': 27}]}"
15602,"She was the poet laureate of Maryland from 1979 to 1985 , a finalist for the Pulitzer Prize in 1988 and a visiting professor at Columbia and Duke universities .",per:title,"{'text': ' She was the poet laureate of Maryland from 1979 to 1985 , a finalist for the Pulitzer Prize in 1988 and a visiting professor at Columbia and Duke universities .', 'ents': [{'start': 30, 'end': 38, 'label': 'GPE'}, {'start': 44, 'end': 56, 'label': 'DATE'}, {'start': 74, 'end': 92, 'label': 'WORK_OF_ART'}, {'start': 96, 'end': 100, 'label': 'DATE'}, {'start': 129, 'end': 137, 'label': 'ORG'}, {'start': 142, 'end': 146, 'label': 'ORG'}], 'sents': [{'start': 0, 'end': 161}], 'tokens': [{'id': 0, 'start': 0, 'end': 1, 'pos': 'SPACE', 'tag': '_SP', 'dep': '', 'head': 1}, {'id': 1, 'start': 1, 'end': 4, 'pos': 'PRON', 'tag': 'PRP', 'dep': 'nsubj', 'head': 2}, {'id': 2, 'start': 5, 'end': 8, 'pos': 'AUX', 'tag': 'VBD', 'dep': 'ROOT', 'head': 2}, {'id': 3, 'start': 9, 'end': 12, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 5}, {'id': 4, 'start': 13, 'end': 17, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'compound', 'head': 5}, {'id': 5, 'start': 18, 'end': 26, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'attr', 'head': 2}, {'id': 6, 'start': 27, 'end': 29, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 5}, {'id': 7, 'start': 30, 'end': 38, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'pobj', 'head': 6}, {'id': 8, 'start': 39, 'end': 43, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 2}, {'id': 9, 'start': 44, 'end': 48, 'pos': 'NUM', 'tag': 'CD', 'dep': 'pobj', 'head': 8}, {'id': 10, 'start': 49, 'end': 51, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 8}, {'id': 11, 'start': 52, 'end': 56, 'pos': 'NUM', 'tag': 'CD', 'dep': 'pobj', 'head': 10}, {'id': 12, 'start': 57, 'end': 58, 'pos': 'PUNCT', 'tag': ',', 'dep': 'punct', 'head': 2}, {'id': 13, 'start': 59, 'end': 60, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 14}, {'id': 14, 'start': 61, 'end': 69, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'attr', 'head': 2}, {'id': 15, 'start': 70, 'end': 73, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 14}, {'id': 16, 'start': 74, 'end': 77, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 18}, {'id': 17, 'start': 78, 'end': 86, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'compound', 'head': 18}, {'id': 18, 'start': 87, 'end': 92, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'pobj', 'head': 15}, {'id': 19, 'start': 93, 'end': 95, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 14}, {'id': 20, 'start': 96, 'end': 100, 'pos': 'NUM', 'tag': 'CD', 'dep': 'pobj', 'head': 19}, {'id': 21, 'start': 101, 'end': 104, 'pos': 'CCONJ', 'tag': 'CC', 'dep': 'cc', 'head': 14}, {'id': 22, 'start': 105, 'end': 106, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 24}, {'id': 23, 'start': 107, 'end': 115, 'pos': 'VERB', 'tag': 'VBG', 'dep': 'amod', 'head': 24}, {'id': 24, 'start': 116, 'end': 125, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'conj', 'head': 14}, {'id': 25, 'start': 126, 'end': 128, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 24}, {'id': 26, 'start': 129, 'end': 137, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'nmod', 'head': 29}, {'id': 27, 'start': 138, 'end': 141, 'pos': 'CCONJ', 'tag': 'CC', 'dep': 'cc', 'head': 26}, {'id': 28, 'start': 142, 'end': 146, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'conj', 'head': 26}, {'id': 29, 'start': 147, 'end': 159, 'pos': 'NOUN', 'tag': 'NNS', 'dep': 'pobj', 'head': 25}, {'id': 30, 'start': 160, 'end': 161, 'pos': 'PUNCT', 'tag': '.', 'dep': 'punct', 'head': 2}]}"


After that, take the arguments pairwise in each sentence and try if there is a pattern match

In [71]:
def get_extracted_sample(sample):
    return [(ARG1 + sample["text"][ent1["end"]:ent2["start"]] + ARG2) if ent1["end"] < ent2["end"] 
            else (ARG2 + sample["text"][ent2["end"]:ent1["start"]] + ARG1)
            for ent1, ent2 in itertools.permutations(sample["ents"],2)]

relevant_samples["candidates"] = relevant_samples["Spacy_info"].apply(lambda x: get_extracted_sample(x))

In [72]:
relevant_samples.apply(lambda x: [escape_dollar(pattern) for pattern in x]).head()

Unnamed: 0,samples,labels,Spacy_info,candidates
14713,"ALBA -- the Bolivarian Alternative for the Americas -- was founded by Venezuelan President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the Caribbean island of Dominica .",org:members,"{'text': ' ALBA -- the Bolivarian Alternative for the Americas -- was founded by Venezuelan President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the Caribbean island of Dominica .', 'ents': [{'start': 44, 'end': 52, 'label': 'LOC'}, {'start': 71, 'end': 81, 'label': 'NORP'}, {'start': 92, 'end': 103, 'label': 'PERSON'}, {'start': 108, 'end': 113, 'label': 'NORP'}, {'start': 121, 'end': 133, 'label': 'PERSON'}, {'start': 137, 'end': 141, 'label': 'DATE'}, {'start': 160, 'end': 167, 'label': 'GPE'}, {'start': 170, 'end': 179, 'label': 'GPE'}, {'start': 188, 'end': 197, 'label': 'LOC'}, {'start': 208, 'end': 216, 'label': 'LOC'}], 'sents': [{'start': 0, 'end': 218}], 'tokens': [{'id': 0, 'start': 0, 'end': 1, 'pos': 'SPACE', 'tag': '_SP', 'dep': '', 'head': 1}, {'id': 1, 'start': 1, 'end': 5, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'nsubjpass', 'head': 11}, {'id': 2, 'start': 6, 'end': 8, 'pos': 'PUNCT', 'tag': ':', 'dep': 'punct', 'head': 1}, {'id': 3, 'start': 9, 'end': 12, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 5}, {'id': 4, 'start': 13, 'end': 23, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'amod', 'head': 5}, {'id': 5, 'start': 24, 'end': 35, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'appos', 'head': 1}, {'id': 6, 'start': 36, 'end': 39, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 5}, {'id': 7, 'start': 40, 'end': 43, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 8}, {'id': 8, 'start': 44, 'end': 52, 'pos': 'PROPN', 'tag': 'NNPS', 'dep': 'pobj', 'head': 6}, {'id': 9, 'start': 53, 'end': 55, 'pos': 'PUNCT', 'tag': ':', 'dep': 'punct', 'head': 1}, {'id': 10, 'start': 56, 'end': 59, 'pos': 'AUX', 'tag': 'VBD', 'dep': 'auxpass', 'head': 11}, {'id': 11, 'start': 60, 'end': 67, 'pos': 'VERB', 'tag': 'VBN', 'dep': 'ROOT', 'head': 11}, {'id': 12, 'start': 68, 'end': 70, 'pos': 'ADP', 'tag': 'IN', 'dep': 'agent', 'head': 11}, {'id': 13, 'start': 71, 'end': 81, 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'head': 14}, {'id': 14, 'start': 82, 'end': 91, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'compound', 'head': 16}, {'id': 15, 'start': 92, 'end': 96, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'compound', 'head': 16}, {'id': 16, 'start': 97, 'end': 103, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'pobj', 'head': 12}, {'id': 17, 'start': 104, 'end': 107, 'pos': 'CCONJ', 'tag': 'CC', 'dep': 'cc', 'head': 16}, {'id': 18, 'start': 108, 'end': 113, 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'head': 19}, {'id': 19, 'start': 114, 'end': 120, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'compound', 'head': 21}, {'id': 20, 'start': 121, 'end': 126, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'compound', 'head': 21}, {'id': 21, 'start': 127, 'end': 133, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'conj', 'head': 16}, {'id': 22, 'start': 134, 'end': 136, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 11}, {'id': 23, 'start': 137, 'end': 141, 'pos': 'NUM', 'tag': 'CD', 'dep': 'pobj', 'head': 22}, {'id': 24, 'start': 142, 'end': 145, 'pos': 'CCONJ', 'tag': 'CC', 'dep': 'cc', 'head': 11}, {'id': 25, 'start': 146, 'end': 150, 'pos': 'ADV', 'tag': 'RB', 'dep': 'advmod', 'head': 26}, {'id': 26, 'start': 151, 'end': 159, 'pos': 'VERB', 'tag': 'VBZ', 'dep': 'conj', 'head': 11}, {'id': 27, 'start': 160, 'end': 167, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'dobj', 'head': 26}, {'id': 28, 'start': 168, 'end': 169, 'pos': 'PUNCT', 'tag': ',', 'dep': 'punct', 'head': 27}, {'id': 29, 'start': 170, 'end': 179, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'conj', 'head': 27}, {'id': 30, 'start': 180, 'end': 183, 'pos': 'CCONJ', 'tag': 'CC', 'dep': 'cc', 'head': 29}, {'id': 31, 'start': 184, 'end': 187, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 33}, {'id': 32, 'start': 188, 'end': 197, 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'head': 33}, {'id': 33, 'start': 198, 'end': 204, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'conj', 'head': 29}, {'id': 34, 'start': 205, 'end': 207, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 33}, {'id': 35, 'start': 208, 'end': 216, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'pobj', 'head': 34}, {'id': 36, 'start': 217, 'end': 218, 'pos': 'PUNCT', 'tag': '.', 'dep': 'punct', 'head': 11}]}","['\$ARG1 -- was founded by \$ARG2', '\$ARG1 -- was founded by Venezuelan President \$ARG2', '\$ARG1 -- was founded by Venezuelan President Hugo Chavez and \$ARG2', '\$ARG1 -- was founded by Venezuelan President Hugo Chavez and Cuban leader \$ARG2', '\$ARG1 -- was founded by Venezuelan President Hugo Chavez and Cuban leader Fidel Castro in \$ARG2', '\$ARG1 -- was founded by Venezuelan President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes \$ARG2', '\$ARG1 -- was founded by Venezuelan President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes Bolivia , \$ARG2', '\$ARG1 -- was founded by Venezuelan President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the \$ARG2', '\$ARG1 -- was founded by Venezuelan President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the Caribbean island of \$ARG2', '\$ARG2 -- was founded by \$ARG1', '\$ARG1 President \$ARG2', '\$ARG1 President Hugo Chavez and \$ARG2', '\$ARG1 President Hugo Chavez and Cuban leader \$ARG2', '\$ARG1 President Hugo Chavez and Cuban leader Fidel Castro in \$ARG2', '\$ARG1 President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes \$ARG2', '\$ARG1 President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes Bolivia , \$ARG2', '\$ARG1 President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the \$ARG2', '\$ARG1 President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the Caribbean island of \$ARG2', '\$ARG2 -- was founded by Venezuelan President \$ARG1', '\$ARG2 President \$ARG1', '\$ARG1 and \$ARG2', '\$ARG1 and Cuban leader \$ARG2', '\$ARG1 and Cuban leader Fidel Castro in \$ARG2', '\$ARG1 and Cuban leader Fidel Castro in 2004 and also includes \$ARG2', '\$ARG1 and Cuban leader Fidel Castro in 2004 and also includes Bolivia , \$ARG2', '\$ARG1 and Cuban leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the \$ARG2', '\$ARG1 and Cuban leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the Caribbean island of \$ARG2', '\$ARG2 -- was founded by Venezuelan President Hugo Chavez and \$ARG1', '\$ARG2 President Hugo Chavez and \$ARG1', '\$ARG2 and \$ARG1', '\$ARG1 leader \$ARG2', '\$ARG1 leader Fidel Castro in \$ARG2', '\$ARG1 leader Fidel Castro in 2004 and also includes \$ARG2', '\$ARG1 leader Fidel Castro in 2004 and also includes Bolivia , \$ARG2', '\$ARG1 leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the \$ARG2', '\$ARG1 leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the Caribbean island of \$ARG2', '\$ARG2 -- was founded by Venezuelan President Hugo Chavez and Cuban leader \$ARG1', '\$ARG2 President Hugo Chavez and Cuban leader \$ARG1', '\$ARG2 and Cuban leader \$ARG1', '\$ARG2 leader \$ARG1', '\$ARG1 in \$ARG2', '\$ARG1 in 2004 and also includes \$ARG2', '\$ARG1 in 2004 and also includes Bolivia , \$ARG2', '\$ARG1 in 2004 and also includes Bolivia , Nicaragua and the \$ARG2', '\$ARG1 in 2004 and also includes Bolivia , Nicaragua and the Caribbean island of \$ARG2', '\$ARG2 -- was founded by Venezuelan President Hugo Chavez and Cuban leader Fidel Castro in \$ARG1', '\$ARG2 President Hugo Chavez and Cuban leader Fidel Castro in \$ARG1', '\$ARG2 and Cuban leader Fidel Castro in \$ARG1', '\$ARG2 leader Fidel Castro in \$ARG1', '\$ARG2 in \$ARG1', '\$ARG1 and also includes \$ARG2', '\$ARG1 and also includes Bolivia , \$ARG2', '\$ARG1 and also includes Bolivia , Nicaragua and the \$ARG2', '\$ARG1 and also includes Bolivia , Nicaragua and the Caribbean island of \$ARG2', '\$ARG2 -- was founded by Venezuelan President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes \$ARG1', '\$ARG2 President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes \$ARG1', '\$ARG2 and Cuban leader Fidel Castro in 2004 and also includes \$ARG1', '\$ARG2 leader Fidel Castro in 2004 and also includes \$ARG1', '\$ARG2 in 2004 and also includes \$ARG1', '\$ARG2 and also includes \$ARG1', '\$ARG1 , \$ARG2', '\$ARG1 , Nicaragua and the \$ARG2', '\$ARG1 , Nicaragua and the Caribbean island of \$ARG2', '\$ARG2 -- was founded by Venezuelan President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes Bolivia , \$ARG1', '\$ARG2 President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes Bolivia , \$ARG1', '\$ARG2 and Cuban leader Fidel Castro in 2004 and also includes Bolivia , \$ARG1', '\$ARG2 leader Fidel Castro in 2004 and also includes Bolivia , \$ARG1', '\$ARG2 in 2004 and also includes Bolivia , \$ARG1', '\$ARG2 and also includes Bolivia , \$ARG1', '\$ARG2 , \$ARG1', '\$ARG1 and the \$ARG2', '\$ARG1 and the Caribbean island of \$ARG2', '\$ARG2 -- was founded by Venezuelan President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the \$ARG1', '\$ARG2 President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the \$ARG1', '\$ARG2 and Cuban leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the \$ARG1', '\$ARG2 leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the \$ARG1', '\$ARG2 in 2004 and also includes Bolivia , Nicaragua and the \$ARG1', '\$ARG2 and also includes Bolivia , Nicaragua and the \$ARG1', '\$ARG2 , Nicaragua and the \$ARG1', '\$ARG2 and the \$ARG1', '\$ARG1 island of \$ARG2', '\$ARG2 -- was founded by Venezuelan President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the Caribbean island of \$ARG1', '\$ARG2 President Hugo Chavez and Cuban leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the Caribbean island of \$ARG1', '\$ARG2 and Cuban leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the Caribbean island of \$ARG1', '\$ARG2 leader Fidel Castro in 2004 and also includes Bolivia , Nicaragua and the Caribbean island of \$ARG1', '\$ARG2 in 2004 and also includes Bolivia , Nicaragua and the Caribbean island of \$ARG1', '\$ARG2 and also includes Bolivia , Nicaragua and the Caribbean island of \$ARG1', '\$ARG2 , Nicaragua and the Caribbean island of \$ARG1', '\$ARG2 and the Caribbean island of \$ARG1', '\$ARG2 island of \$ARG1']"
5313,"His conversion to Islam came after he attended the Islamic Centre of Orange County , California , where he is believed to have come under the influence of two foreign-born Islamist radicals .",per:religion,"{'text': ' His conversion to Islam came after he attended the Islamic Centre of Orange County , California , where he is believed to have come under the influence of two foreign-born Islamist radicals .', 'ents': [{'start': 19, 'end': 24, 'label': 'ORG'}, {'start': 48, 'end': 83, 'label': 'ORG'}, {'start': 86, 'end': 96, 'label': 'GPE'}, {'start': 156, 'end': 159, 'label': 'CARDINAL'}, {'start': 173, 'end': 181, 'label': 'NORP'}], 'sents': [{'start': 0, 'end': 192}], 'tokens': [{'id': 0, 'start': 0, 'end': 1, 'pos': 'SPACE', 'tag': '_SP', 'dep': '', 'head': 1}, {'id': 1, 'start': 1, 'end': 4, 'pos': 'DET', 'tag': 'PRP\$', 'dep': 'poss', 'head': 2}, {'id': 2, 'start': 5, 'end': 15, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'nsubj', 'head': 5}, {'id': 3, 'start': 16, 'end': 18, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 2}, {'id': 4, 'start': 19, 'end': 24, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'pobj', 'head': 3}, {'id': 5, 'start': 25, 'end': 29, 'pos': 'VERB', 'tag': 'VBD', 'dep': 'ROOT', 'head': 5}, {'id': 6, 'start': 30, 'end': 35, 'pos': 'ADP', 'tag': 'IN', 'dep': 'mark', 'head': 8}, {'id': 7, 'start': 36, 'end': 38, 'pos': 'PRON', 'tag': 'PRP', 'dep': 'nsubj', 'head': 8}, {'id': 8, 'start': 39, 'end': 47, 'pos': 'VERB', 'tag': 'VBD', 'dep': 'advcl', 'head': 5}, {'id': 9, 'start': 48, 'end': 51, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 16}, {'id': 10, 'start': 52, 'end': 59, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'compound', 'head': 11}, {'id': 11, 'start': 60, 'end': 66, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'nmod', 'head': 16}, {'id': 12, 'start': 67, 'end': 69, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 11}, {'id': 13, 'start': 70, 'end': 76, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'compound', 'head': 14}, {'id': 14, 'start': 77, 'end': 83, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'pobj', 'head': 12}, {'id': 15, 'start': 84, 'end': 85, 'pos': 'PUNCT', 'tag': ',', 'dep': 'punct', 'head': 14}, {'id': 16, 'start': 86, 'end': 96, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'dobj', 'head': 8}, {'id': 17, 'start': 97, 'end': 98, 'pos': 'PUNCT', 'tag': ',', 'dep': 'punct', 'head': 16}, {'id': 18, 'start': 99, 'end': 104, 'pos': 'ADV', 'tag': 'WRB', 'dep': 'advmod', 'head': 24}, {'id': 19, 'start': 105, 'end': 107, 'pos': 'PRON', 'tag': 'PRP', 'dep': 'nsubjpass', 'head': 21}, {'id': 20, 'start': 108, 'end': 110, 'pos': 'AUX', 'tag': 'VBZ', 'dep': 'auxpass', 'head': 21}, {'id': 21, 'start': 111, 'end': 119, 'pos': 'VERB', 'tag': 'VBN', 'dep': 'relcl', 'head': 16}, {'id': 22, 'start': 120, 'end': 122, 'pos': 'PART', 'tag': 'TO', 'dep': 'aux', 'head': 24}, {'id': 23, 'start': 123, 'end': 127, 'pos': 'AUX', 'tag': 'VB', 'dep': 'aux', 'head': 24}, {'id': 24, 'start': 128, 'end': 132, 'pos': 'VERB', 'tag': 'VBN', 'dep': 'xcomp', 'head': 21}, {'id': 25, 'start': 133, 'end': 138, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 24}, {'id': 26, 'start': 139, 'end': 142, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 27}, {'id': 27, 'start': 143, 'end': 152, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'pobj', 'head': 25}, {'id': 28, 'start': 153, 'end': 155, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 27}, {'id': 29, 'start': 156, 'end': 159, 'pos': 'NUM', 'tag': 'CD', 'dep': 'nummod', 'head': 34}, {'id': 30, 'start': 160, 'end': 167, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'amod', 'head': 32}, {'id': 31, 'start': 167, 'end': 168, 'pos': 'PUNCT', 'tag': 'HYPH', 'dep': 'punct', 'head': 32}, {'id': 32, 'start': 168, 'end': 172, 'pos': 'VERB', 'tag': 'VBN', 'dep': 'amod', 'head': 34}, {'id': 33, 'start': 173, 'end': 181, 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'head': 34}, {'id': 34, 'start': 182, 'end': 190, 'pos': 'NOUN', 'tag': 'NNS', 'dep': 'pobj', 'head': 28}, {'id': 35, 'start': 191, 'end': 192, 'pos': 'PUNCT', 'tag': '.', 'dep': 'punct', 'head': 5}]}","['\$ARG1 came after he attended \$ARG2', '\$ARG1 came after he attended the Islamic Centre of Orange County , \$ARG2', '\$ARG1 came after he attended the Islamic Centre of Orange County , California , where he is believed to have come under the influence of \$ARG2', '\$ARG1 came after he attended the Islamic Centre of Orange County , California , where he is believed to have come under the influence of two foreign-born \$ARG2', '\$ARG2 came after he attended \$ARG1', '\$ARG1 , \$ARG2', '\$ARG1 , California , where he is believed to have come under the influence of \$ARG2', '\$ARG1 , California , where he is believed to have come under the influence of two foreign-born \$ARG2', '\$ARG2 came after he attended the Islamic Centre of Orange County , \$ARG1', '\$ARG2 , \$ARG1', '\$ARG1 , where he is believed to have come under the influence of \$ARG2', '\$ARG1 , where he is believed to have come under the influence of two foreign-born \$ARG2', '\$ARG2 came after he attended the Islamic Centre of Orange County , California , where he is believed to have come under the influence of \$ARG1', '\$ARG2 , California , where he is believed to have come under the influence of \$ARG1', '\$ARG2 , where he is believed to have come under the influence of \$ARG1', '\$ARG1 foreign-born \$ARG2', '\$ARG2 came after he attended the Islamic Centre of Orange County , California , where he is believed to have come under the influence of two foreign-born \$ARG1', '\$ARG2 , California , where he is believed to have come under the influence of two foreign-born \$ARG1', '\$ARG2 , where he is believed to have come under the influence of two foreign-born \$ARG1', '\$ARG2 foreign-born \$ARG1']"
12148,The United Nations appointed two foreign members of Afghanistan 's Election Complaints Commission ( ECC ) here on Saturday .,org:alternate_names,"{'text': "" The United Nations appointed two foreign members of Afghanistan 's Election Complaints Commission ( ECC ) here on Saturday ."", 'ents': [{'start': 1, 'end': 19, 'label': 'ORG'}, {'start': 30, 'end': 33, 'label': 'CARDINAL'}, {'start': 53, 'end': 64, 'label': 'GPE'}, {'start': 68, 'end': 98, 'label': 'ORG'}, {'start': 115, 'end': 123, 'label': 'DATE'}], 'sents': [{'start': 0, 'end': 125}], 'tokens': [{'id': 0, 'start': 0, 'end': 1, 'pos': 'SPACE', 'tag': '_SP', 'dep': '', 'head': 1}, {'id': 1, 'start': 1, 'end': 4, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 3}, {'id': 2, 'start': 5, 'end': 11, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'compound', 'head': 3}, {'id': 3, 'start': 12, 'end': 19, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'nsubj', 'head': 4}, {'id': 4, 'start': 20, 'end': 29, 'pos': 'VERB', 'tag': 'VBD', 'dep': 'ROOT', 'head': 4}, {'id': 5, 'start': 30, 'end': 33, 'pos': 'NUM', 'tag': 'CD', 'dep': 'nummod', 'head': 7}, {'id': 6, 'start': 34, 'end': 41, 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'head': 7}, {'id': 7, 'start': 42, 'end': 49, 'pos': 'NOUN', 'tag': 'NNS', 'dep': 'dobj', 'head': 4}, {'id': 8, 'start': 50, 'end': 52, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 7}, {'id': 9, 'start': 53, 'end': 64, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'poss', 'head': 13}, {'id': 10, 'start': 65, 'end': 67, 'pos': 'PART', 'tag': 'POS', 'dep': 'case', 'head': 9}, {'id': 11, 'start': 68, 'end': 76, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'compound', 'head': 12}, {'id': 12, 'start': 77, 'end': 87, 'pos': 'PROPN', 'tag': 'NNPS', 'dep': 'compound', 'head': 13}, {'id': 13, 'start': 88, 'end': 98, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'pobj', 'head': 8}, {'id': 14, 'start': 99, 'end': 100, 'pos': 'PUNCT', 'tag': '-LRB-', 'dep': 'punct', 'head': 13}, {'id': 15, 'start': 101, 'end': 104, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'appos', 'head': 13}, {'id': 16, 'start': 105, 'end': 106, 'pos': 'PUNCT', 'tag': '-RRB-', 'dep': 'punct', 'head': 13}, {'id': 17, 'start': 107, 'end': 111, 'pos': 'ADV', 'tag': 'RB', 'dep': 'advmod', 'head': 4}, {'id': 18, 'start': 112, 'end': 114, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 17}, {'id': 19, 'start': 115, 'end': 123, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'pobj', 'head': 18}, {'id': 20, 'start': 124, 'end': 125, 'pos': 'PUNCT', 'tag': '.', 'dep': 'punct', 'head': 4}]}","['\$ARG1 appointed \$ARG2', '\$ARG1 appointed two foreign members of \$ARG2', ""\$ARG1 appointed two foreign members of Afghanistan 's \$ARG2"", ""\$ARG1 appointed two foreign members of Afghanistan 's Election Complaints Commission ( ECC ) here on \$ARG2"", '\$ARG2 appointed \$ARG1', '\$ARG1 foreign members of \$ARG2', ""\$ARG1 foreign members of Afghanistan 's \$ARG2"", ""\$ARG1 foreign members of Afghanistan 's Election Complaints Commission ( ECC ) here on \$ARG2"", '\$ARG2 appointed two foreign members of \$ARG1', '\$ARG2 foreign members of \$ARG1', ""\$ARG1 's \$ARG2"", ""\$ARG1 's Election Complaints Commission ( ECC ) here on \$ARG2"", ""\$ARG2 appointed two foreign members of Afghanistan 's \$ARG1"", ""\$ARG2 foreign members of Afghanistan 's \$ARG1"", ""\$ARG2 's \$ARG1"", '\$ARG1 ( ECC ) here on \$ARG2', ""\$ARG2 appointed two foreign members of Afghanistan 's Election Complaints Commission ( ECC ) here on \$ARG1"", ""\$ARG2 foreign members of Afghanistan 's Election Complaints Commission ( ECC ) here on \$ARG1"", ""\$ARG2 's Election Complaints Commission ( ECC ) here on \$ARG1"", '\$ARG2 ( ECC ) here on \$ARG1']"
17841,"Marks & Spencer has said the move will give Rose time to groom a new chief executive to replace him by 2011 , but some investors say the move concentrates too much power in one person .",per:title,"{'text': ' Marks & Spencer has said the move will give Rose time to groom a new chief executive to replace him by 2011 , but some investors say the move concentrates too much power in one person .', 'ents': [{'start': 1, 'end': 16, 'label': 'ORG'}, {'start': 45, 'end': 49, 'label': 'PRODUCT'}, {'start': 104, 'end': 108, 'label': 'DATE'}, {'start': 174, 'end': 177, 'label': 'CARDINAL'}], 'sents': [{'start': 0, 'end': 186}], 'tokens': [{'id': 0, 'start': 0, 'end': 1, 'pos': 'SPACE', 'tag': '_SP', 'dep': '', 'head': 1}, {'id': 1, 'start': 1, 'end': 6, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'nsubj', 'head': 5}, {'id': 2, 'start': 7, 'end': 8, 'pos': 'CCONJ', 'tag': 'CC', 'dep': 'cc', 'head': 1}, {'id': 3, 'start': 9, 'end': 16, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'conj', 'head': 1}, {'id': 4, 'start': 17, 'end': 20, 'pos': 'AUX', 'tag': 'VBZ', 'dep': 'aux', 'head': 5}, {'id': 5, 'start': 21, 'end': 25, 'pos': 'VERB', 'tag': 'VBN', 'dep': 'ROOT', 'head': 5}, {'id': 6, 'start': 26, 'end': 29, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 7}, {'id': 7, 'start': 30, 'end': 34, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'nsubj', 'head': 9}, {'id': 8, 'start': 35, 'end': 39, 'pos': 'VERB', 'tag': 'MD', 'dep': 'aux', 'head': 9}, {'id': 9, 'start': 40, 'end': 44, 'pos': 'VERB', 'tag': 'VB', 'dep': 'ccomp', 'head': 5}, {'id': 10, 'start': 45, 'end': 49, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'compound', 'head': 11}, {'id': 11, 'start': 50, 'end': 54, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'dobj', 'head': 9}, {'id': 12, 'start': 55, 'end': 57, 'pos': 'PART', 'tag': 'TO', 'dep': 'aux', 'head': 13}, {'id': 13, 'start': 58, 'end': 63, 'pos': 'VERB', 'tag': 'VB', 'dep': 'advcl', 'head': 9}, {'id': 14, 'start': 64, 'end': 65, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 17}, {'id': 15, 'start': 66, 'end': 69, 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'head': 17}, {'id': 16, 'start': 70, 'end': 75, 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'head': 17}, {'id': 17, 'start': 76, 'end': 85, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'dobj', 'head': 13}, {'id': 18, 'start': 86, 'end': 88, 'pos': 'PART', 'tag': 'TO', 'dep': 'aux', 'head': 19}, {'id': 19, 'start': 89, 'end': 96, 'pos': 'VERB', 'tag': 'VB', 'dep': 'advcl', 'head': 13}, {'id': 20, 'start': 97, 'end': 100, 'pos': 'PRON', 'tag': 'PRP', 'dep': 'dobj', 'head': 19}, {'id': 21, 'start': 101, 'end': 103, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 19}, {'id': 22, 'start': 104, 'end': 108, 'pos': 'NUM', 'tag': 'CD', 'dep': 'pobj', 'head': 21}, {'id': 23, 'start': 109, 'end': 110, 'pos': 'PUNCT', 'tag': ',', 'dep': 'punct', 'head': 5}, {'id': 24, 'start': 111, 'end': 114, 'pos': 'CCONJ', 'tag': 'CC', 'dep': 'cc', 'head': 5}, {'id': 25, 'start': 115, 'end': 119, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 26}, {'id': 26, 'start': 120, 'end': 129, 'pos': 'NOUN', 'tag': 'NNS', 'dep': 'nsubj', 'head': 27}, {'id': 27, 'start': 130, 'end': 133, 'pos': 'VERB', 'tag': 'VBP', 'dep': 'conj', 'head': 5}, {'id': 28, 'start': 134, 'end': 137, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 29}, {'id': 29, 'start': 138, 'end': 142, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'nsubj', 'head': 30}, {'id': 30, 'start': 143, 'end': 155, 'pos': 'VERB', 'tag': 'VBZ', 'dep': 'ccomp', 'head': 27}, {'id': 31, 'start': 156, 'end': 159, 'pos': 'ADV', 'tag': 'RB', 'dep': 'advmod', 'head': 32}, {'id': 32, 'start': 160, 'end': 164, 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'head': 33}, {'id': 33, 'start': 165, 'end': 170, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'dobj', 'head': 30}, {'id': 34, 'start': 171, 'end': 173, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 30}, {'id': 35, 'start': 174, 'end': 177, 'pos': 'NUM', 'tag': 'CD', 'dep': 'nummod', 'head': 36}, {'id': 36, 'start': 178, 'end': 184, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'pobj', 'head': 34}, {'id': 37, 'start': 185, 'end': 186, 'pos': 'PUNCT', 'tag': '.', 'dep': 'punct', 'head': 27}]}","['\$ARG1 has said the move will give \$ARG2', '\$ARG1 has said the move will give Rose time to groom a new chief executive to replace him by \$ARG2', '\$ARG1 has said the move will give Rose time to groom a new chief executive to replace him by 2011 , but some investors say the move concentrates too much power in \$ARG2', '\$ARG2 has said the move will give \$ARG1', '\$ARG1 time to groom a new chief executive to replace him by \$ARG2', '\$ARG1 time to groom a new chief executive to replace him by 2011 , but some investors say the move concentrates too much power in \$ARG2', '\$ARG2 has said the move will give Rose time to groom a new chief executive to replace him by \$ARG1', '\$ARG2 time to groom a new chief executive to replace him by \$ARG1', '\$ARG1 , but some investors say the move concentrates too much power in \$ARG2', '\$ARG2 has said the move will give Rose time to groom a new chief executive to replace him by 2011 , but some investors say the move concentrates too much power in \$ARG1', '\$ARG2 time to groom a new chief executive to replace him by 2011 , but some investors say the move concentrates too much power in \$ARG1', '\$ARG2 , but some investors say the move concentrates too much power in \$ARG1']"
15602,"She was the poet laureate of Maryland from 1979 to 1985 , a finalist for the Pulitzer Prize in 1988 and a visiting professor at Columbia and Duke universities .",per:title,"{'text': ' She was the poet laureate of Maryland from 1979 to 1985 , a finalist for the Pulitzer Prize in 1988 and a visiting professor at Columbia and Duke universities .', 'ents': [{'start': 30, 'end': 38, 'label': 'GPE'}, {'start': 44, 'end': 56, 'label': 'DATE'}, {'start': 74, 'end': 92, 'label': 'WORK_OF_ART'}, {'start': 96, 'end': 100, 'label': 'DATE'}, {'start': 129, 'end': 137, 'label': 'ORG'}, {'start': 142, 'end': 146, 'label': 'ORG'}], 'sents': [{'start': 0, 'end': 161}], 'tokens': [{'id': 0, 'start': 0, 'end': 1, 'pos': 'SPACE', 'tag': '_SP', 'dep': '', 'head': 1}, {'id': 1, 'start': 1, 'end': 4, 'pos': 'PRON', 'tag': 'PRP', 'dep': 'nsubj', 'head': 2}, {'id': 2, 'start': 5, 'end': 8, 'pos': 'AUX', 'tag': 'VBD', 'dep': 'ROOT', 'head': 2}, {'id': 3, 'start': 9, 'end': 12, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 5}, {'id': 4, 'start': 13, 'end': 17, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'compound', 'head': 5}, {'id': 5, 'start': 18, 'end': 26, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'attr', 'head': 2}, {'id': 6, 'start': 27, 'end': 29, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 5}, {'id': 7, 'start': 30, 'end': 38, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'pobj', 'head': 6}, {'id': 8, 'start': 39, 'end': 43, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 2}, {'id': 9, 'start': 44, 'end': 48, 'pos': 'NUM', 'tag': 'CD', 'dep': 'pobj', 'head': 8}, {'id': 10, 'start': 49, 'end': 51, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 8}, {'id': 11, 'start': 52, 'end': 56, 'pos': 'NUM', 'tag': 'CD', 'dep': 'pobj', 'head': 10}, {'id': 12, 'start': 57, 'end': 58, 'pos': 'PUNCT', 'tag': ',', 'dep': 'punct', 'head': 2}, {'id': 13, 'start': 59, 'end': 60, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 14}, {'id': 14, 'start': 61, 'end': 69, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'attr', 'head': 2}, {'id': 15, 'start': 70, 'end': 73, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 14}, {'id': 16, 'start': 74, 'end': 77, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 18}, {'id': 17, 'start': 78, 'end': 86, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'compound', 'head': 18}, {'id': 18, 'start': 87, 'end': 92, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'pobj', 'head': 15}, {'id': 19, 'start': 93, 'end': 95, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 14}, {'id': 20, 'start': 96, 'end': 100, 'pos': 'NUM', 'tag': 'CD', 'dep': 'pobj', 'head': 19}, {'id': 21, 'start': 101, 'end': 104, 'pos': 'CCONJ', 'tag': 'CC', 'dep': 'cc', 'head': 14}, {'id': 22, 'start': 105, 'end': 106, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 24}, {'id': 23, 'start': 107, 'end': 115, 'pos': 'VERB', 'tag': 'VBG', 'dep': 'amod', 'head': 24}, {'id': 24, 'start': 116, 'end': 125, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'conj', 'head': 14}, {'id': 25, 'start': 126, 'end': 128, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 24}, {'id': 26, 'start': 129, 'end': 137, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'nmod', 'head': 29}, {'id': 27, 'start': 138, 'end': 141, 'pos': 'CCONJ', 'tag': 'CC', 'dep': 'cc', 'head': 26}, {'id': 28, 'start': 142, 'end': 146, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'conj', 'head': 26}, {'id': 29, 'start': 147, 'end': 159, 'pos': 'NOUN', 'tag': 'NNS', 'dep': 'pobj', 'head': 25}, {'id': 30, 'start': 160, 'end': 161, 'pos': 'PUNCT', 'tag': '.', 'dep': 'punct', 'head': 2}]}","['\$ARG1 from \$ARG2', '\$ARG1 from 1979 to 1985 , a finalist for \$ARG2', '\$ARG1 from 1979 to 1985 , a finalist for the Pulitzer Prize in \$ARG2', '\$ARG1 from 1979 to 1985 , a finalist for the Pulitzer Prize in 1988 and a visiting professor at \$ARG2', '\$ARG1 from 1979 to 1985 , a finalist for the Pulitzer Prize in 1988 and a visiting professor at Columbia and \$ARG2', '\$ARG2 from \$ARG1', '\$ARG1 , a finalist for \$ARG2', '\$ARG1 , a finalist for the Pulitzer Prize in \$ARG2', '\$ARG1 , a finalist for the Pulitzer Prize in 1988 and a visiting professor at \$ARG2', '\$ARG1 , a finalist for the Pulitzer Prize in 1988 and a visiting professor at Columbia and \$ARG2', '\$ARG2 from 1979 to 1985 , a finalist for \$ARG1', '\$ARG2 , a finalist for \$ARG1', '\$ARG1 in \$ARG2', '\$ARG1 in 1988 and a visiting professor at \$ARG2', '\$ARG1 in 1988 and a visiting professor at Columbia and \$ARG2', '\$ARG2 from 1979 to 1985 , a finalist for the Pulitzer Prize in \$ARG1', '\$ARG2 , a finalist for the Pulitzer Prize in \$ARG1', '\$ARG2 in \$ARG1', '\$ARG1 and a visiting professor at \$ARG2', '\$ARG1 and a visiting professor at Columbia and \$ARG2', '\$ARG2 from 1979 to 1985 , a finalist for the Pulitzer Prize in 1988 and a visiting professor at \$ARG1', '\$ARG2 , a finalist for the Pulitzer Prize in 1988 and a visiting professor at \$ARG1', '\$ARG2 in 1988 and a visiting professor at \$ARG1', '\$ARG2 and a visiting professor at \$ARG1', '\$ARG1 and \$ARG2', '\$ARG2 from 1979 to 1985 , a finalist for the Pulitzer Prize in 1988 and a visiting professor at Columbia and \$ARG1', '\$ARG2 , a finalist for the Pulitzer Prize in 1988 and a visiting professor at Columbia and \$ARG1', '\$ARG2 in 1988 and a visiting professor at Columbia and \$ARG1', '\$ARG2 and a visiting professor at Columbia and \$ARG1', '\$ARG2 and \$ARG1']"


In [160]:
def pattern_search(sample_sent, patterns, row):
    for relation, rel_patterns in patterns.iterrows():
        matches = [[row["samples"], sample_sent, p, relation, row["labels"]] 
                   for p in rel_patterns["Regex patterns"] 
                   if re.match(p, sample_sent) is not None]
        if len(matches) > 0:
            return pd.DataFrame(matches, columns = FINAL_DF_COLUMNS)

In [161]:
all_matches = pd.DataFrame(columns = FINAL_DF_COLUMNS)
for _, row in relevant_samples.iterrows():
    for cand_samples in row["candidates"]:
        df_found = pattern_search(cand_samples, patterns, row)
        if isinstance(df_found, pd.DataFrame) and not df_found.empty:
            all_matches = pd.concat([all_matches, df_found])

In [162]:
all_matches.apply(lambda x: [escape_dollar(pattern) for pattern in x]).head()

Unnamed: 0,sample,extr_sample,pattern,weak_label,gold_label
0,"Anatoly Isaikin , head of Russia 's state-run arms exporter Rosoboronexport , told a news conference Russia would consider exports only after meeting the requirements of its own armed forces .","\$ARG1 , head of \$ARG2","(A )?(a )?(The )?(the )?\\$ARG1\ ,\ head\ of\ (A )?(a )?(The )?(the )?\\$ARG2",org:top_members/employees,org:top_members/employees
0,"But Sergey Chemezov , head of the Russian arms export company Rosoboronexport denied the report .","\$ARG1 , head of the \$ARG2","(A )?(a )?(The )?(the )?\\$ARG1\ ,\ head\ of\ (A )?(a )?(The )?(the )?\\$ARG2",org:top_members/employees,org:top_members/employees
0,"Ronald James Padavona was born in Portsmouth , NH , and grew up in Cortland , NY .",\$ARG1 was born in \$ARG2,(A )?(a )?(The )?(the )?\\$ARG1\ was\ born\ in\ (A )?(a )?(The )?(the )?\\$ARG2,per:date_of_birth,per:cities_of_residence


But here we can observe some misclassified sentences: for example, a sentence 

"A professor emeritus at Yale University , Mandelbrot was born in Poland but as a child moved with his family to France where he was educated" 

was assigned with a label "per:date_of_birth" (presumably by a pattern "ARG1 was born in ARG2"), what is definitely wrong. In order to avoid such mistake, let's add additional constraints on the argument types.

In [176]:
relation_to_types = {"org:alternate_names": ['PERSON', 'PERSON'], 
                     "per:date_of_birth": ['PERSON', 'DATE'],
                     "org:top_members/employees": ['PERSON', 'ORG']}

So, when we look for these patterns in samples, we should take into account the entity types referred to corresponding relation

In [177]:
patterns["types"] = pd.Series(relation_to_types)

In [178]:
patterns.apply(lambda x: [escape_dollar(pattern) for pattern in x]).head()

Unnamed: 0,Raw patterns,Regex patterns,types
org:alternate_names,"['\$ARG1 ( \$ARG2 ', '\$ARG1 formerly known as \$ARG2', '\$ARG1 aka \$ARG2', '\$ARG1 ( also known as \$ARG2 )']","['(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ ', '(A )?(a )?(The )?(the )?\\\$ARG1\\ formerly\\ known\\ as\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ aka\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ also\\ known\\ as\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ \\)']","['PERSON', 'PERSON']"
per:date_of_birth,"['\$ARG1 ( born \$ARG2 )', '\$ARG1 ( born \$ARG2 in', '\$ARG1 ( \$ARG2 -', '\$ARG1 was born in \$ARG2']","['(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ born\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ \\)', '(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ born\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ in', '(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ \\-', '(A )?(a )?(The )?(the )?\\\$ARG1\\ was\\ born\\ in\\ (A )?(a )?(The )?(the )?\\\$ARG2']","['PERSON', 'DATE']"
org:top_members/employees,"['\$ARG1 , executive director of \$ARG2', '\$ARG1 , head of \$ARG2', '\$ARG1 , who heads \$ARG2', '\$ARG1 , chief executive of \$ARG2']","['(A )?(a )?(The )?(the )?\\\$ARG1\\ ,\\ executive\\ director\\ of\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ ,\\ head\\ of\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ ,\\ who\\ heads\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ ,\\ chief\\ executive\\ of\\ (A )?(a )?(The )?(the )?\\\$ARG2']","['PERSON', 'ORG']"


In [345]:
final_samples_with_entity_types = pd.DataFrame(columns=['Sentence', 'Pattern', 'Label'])

for sample in parsed_samples:
    sentence = sample["text"]
    for ent1, ent2 in itertools.permutations(sample["ents"],2):
        entity_types = [ent1["label"], ent2["label"]]
        if entity_types in relation_to_types.values():
            curr_relation = [rel for rel,types in relation_to_types.items() if types==entity_types][0]
            extr_sentences = [get_extracted_sample(ARG1, ARG2, ent1, ent2, sentence) if ent1["end"] < ent2["end"] 
                              else get_extracted_sample(ARG2, ARG1, ent2, ent1, sentence)
                              for ent1, ent2 in itertools.permutations(sample["ents"],2)]
            for sent in extr_sentences:
                for pattern in relation_patterns_regex[curr_relation]:
                    if re.match(pattern, sent) is not None:
                        match = [sentence, re.escape(regex_pattern_to_regex[pattern]), curr_relation] 
                        a_series = pd.Series(match, index = final_samples_with_entity_types.columns)
                        final_samples_with_entity_types = final_samples_with_entity_types.append(a_series, ignore_index=True)

In [346]:
pd.set_option('display.max_rows', len(final_samples_with_entity_types))
print(display(final_samples_with_entity_types))

Unnamed: 0,Sentence,Pattern,Label
0,"Gwathmey was born in 1938 , the only child of painter Robert Gwathmey and his wife , Rosalie , a photographer .",\$ARG1\ was\ born\ in\ \$ARG2,per:date_of_birth
1,"Gwathmey was born in 1938 , the only child of painter Robert Gwathmey and his wife , Rosalie , a photographer .",\$ARG1\ was\ born\ in\ \$ARG2,per:date_of_birth
2,"Gwathmey was born in 1938 , the only child of painter Robert Gwathmey and his wife , Rosalie , a photographer .",\$ARG1\ was\ born\ in\ \$ARG2,per:date_of_birth
3,"Khan was born April 14 , 1922 , in Shivpur , East Bengal ( now Bangladesh ) .",\$ARG1\ was\ born\ \$ARG2,per:date_of_birth
4,"`` The model would generate tax revenues while respecting the privacy of bank clients and it would represent an efficient alternative to a system of automatic information exchange , '' said Urs Roth , chief executive of the Swiss Bankers Association which has almost 360 members from the financial industry .","\$ARG1\ ,\ chief\ executive\ of\ \$ARG2",org:top_members/employees


None


Let's try more sophisticated way: not just look for patterns in samples, but to get the entities first and look at the more precise relation between them. We are going to do it with SpaCy package. 