In [21]:
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
from IPython.display import display, HTML
import urllib
import spacy
import sklearn
import pandas as pd
import en_core_web_sm
import re
import itertools
from tqdm import tqdm
tqdm.pandas()

pd.set_option('display.max_colwidth', -1)

  from pandas import Panel


In [22]:
ARG1 = "$ARG1"
ARG2 = "$ARG2"
FINAL_DF_COLUMNS = ['sample', 'extr_sample', 'pattern', 'weak_label', 'gold_label']

conll_file = "dev.conll"

In [23]:
def escape_dollar(strings):
    return [re.sub("\\$", "\\\\$", str(string)) for string in strings]

Upload the CONLL Data

In [24]:
def process_data(path_to_data):
    samples, relations = [], []
    with open(path_to_data, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line.startswith("# id="):    # Instance starts
                sample = ""
                label = line.split(" ")[3][5:]
            elif line == "":  # Instance ends
                samples.append(sample)
                relations.append(label)
            elif line.startswith("#"):  # comment
                continue
            else:
                parts = line.split("\t")
                token = parts[1]
                if token == "-LRB-":
                    token = "("
                elif token == "-RRB-":
                    token = ")"
                sample += " " + token
    return pd.DataFrame.from_dict({"sample": samples, "label": relations})

samples = process_data(conll_file)

Let's inspect all relation labels that we have in our dataset

In [25]:
print(set(samples["label"]))

{'org:website', 'org:alternate_names', 'per:other_family', 'org:number_of_employees/members', 'per:parents', 'per:country_of_death', 'per:cities_of_residence', 'per:city_of_death', 'org:top_members/employees', 'per:stateorprovince_of_birth', 'per:date_of_birth', 'per:stateorprovince_of_death', 'org:members', 'per:employee_of', 'org:city_of_headquarters', 'per:siblings', 'per:children', 'per:countries_of_residence', 'per:cause_of_death', 'no_relation', 'org:parents', 'per:schools_attended', 'per:country_of_birth', 'org:shareholders', 'org:subsidiaries', 'org:country_of_headquarters', 'per:title', 'per:religion', 'org:stateorprovince_of_headquarters', 'per:stateorprovinces_of_residence', 'org:founded', 'org:dissolved', 'org:member_of', 'per:charges', 'per:date_of_death', 'org:founded_by', 'org:political/religious_affiliation', 'per:age', 'per:origin', 'per:alternate_names', 'per:city_of_birth', 'per:spouse'}


To make the calculation quicklier, let's choose only the samples that contain a relation (that is, labelled not with "no_relation" label). Moreover, let's reduce the number of relation we want to find and choose 5 relations from the TACRED relation labels.

In [26]:
selected_samples = samples[samples["label"]!='no_relation'].sample(n=400, random_state=5)

In [27]:
selected_samples.head()

Unnamed: 0,sample,label
10731,French filmmaker Claude Chabrol dies at 80,per:title
14931,"But in a reversal of that decision by former CEO Stuart Rose , Bolland said he will cut the range of non-M & S branded foodstuffs from 400 lines to 100 , leaving products the company itself can not replicate .",per:title
10047,"Neal succumbed to complications from lung cancer in Edgarton , Massachussetts , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in Knoxville , Tennessee , told AFP .",per:country_of_death
23392,Deshpande made a fortune in communications companies and is chairman of battery-maker A123 Systems .,org:top_members/employees
17081,"CHICAGO 2009-11-10 00:25:56 UTC The filings are part of prosecutors ' efforts to bolster their case that Chicago grocery store owner and immigration counselor Tahawwur Rana , 48 , should not be freed on bond pending resolution of terrorism charges against him and David C Headley .",per:title


In order to turn the data into distantly supervised one, let's write down for each relation a couple of simple patterns that could help us to find the relations


In [28]:
relation_patterns_df = pd.DataFrame.from_dict({"org:alternate_names": 
                                               [["$ARG1 ( $ARG2 ",
                                                 "$ARG1 formerly known as $ARG2",
                                                 "$ARG1 aka $ARG2", 
                                                 "$ARG1 ( also known as $ARG2 )"]],
                                               "per:date_of_birth": 
                                               [["$ARG1 ( born $ARG2 )", 
                                                 "$ARG1 ( born $ARG2 in",
                                                 "$ARG1 ( $ARG2 -",
                                                 "$ARG1 was born in $ARG2"]],
                                               "org:top_members/employees":
                                                [["$ARG1 , executive director of $ARG2",
                                                  "$ARG1 , head of $ARG2",
                                                  "$ARG1 , who heads $ARG2",
                                                  "$ARG1 , chief executive of $ARG2"]]}, 
                                              orient='index', columns = ["raw pattern"])

In [29]:
relation_patterns_df.apply(lambda x: escape_dollar(x)).head()

Unnamed: 0,raw pattern
org:alternate_names,"['\$ARG1 ( \$ARG2 ', '\$ARG1 formerly known as \$ARG2', '\$ARG1 aka \$ARG2', '\$ARG1 ( also known as \$ARG2 )']"
per:date_of_birth,"['\$ARG1 ( born \$ARG2 )', '\$ARG1 ( born \$ARG2 in', '\$ARG1 ( \$ARG2 -', '\$ARG1 was born in \$ARG2']"
org:top_members/employees,"['\$ARG1 , executive director of \$ARG2', '\$ARG1 , head of \$ARG2', '\$ARG1 , who heads \$ARG2', '\$ARG1 , chief executive of \$ARG2']"


Since we want to make a simple regex search, convert patterns into regexes

In [30]:
def preprocess_patterns(patterns):
    regex_patterns = [re.sub("\\\\\\$ARG", "(A )?(a )?(The )?(the )?\\$ARG", re.escape(pattern)) for pattern in patterns]
    return regex_patterns

relation_patterns_df["regex pattern"] = relation_patterns_df["raw pattern"].apply(preprocess_patterns)

In [31]:
relation_patterns_df.apply(lambda x: escape_dollar(x)).head()

Unnamed: 0,raw pattern,regex pattern
org:alternate_names,"['\$ARG1 ( \$ARG2 ', '\$ARG1 formerly known as \$ARG2', '\$ARG1 aka \$ARG2', '\$ARG1 ( also known as \$ARG2 )']","['(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ ', '(A )?(a )?(The )?(the )?\\\$ARG1\\ formerly\\ known\\ as\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ aka\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ also\\ known\\ as\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ \\)']"
per:date_of_birth,"['\$ARG1 ( born \$ARG2 )', '\$ARG1 ( born \$ARG2 in', '\$ARG1 ( \$ARG2 -', '\$ARG1 was born in \$ARG2']","['(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ born\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ \\)', '(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ born\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ in', '(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ \\-', '(A )?(a )?(The )?(the )?\\\$ARG1\\ was\\ born\\ in\\ (A )?(a )?(The )?(the )?\\\$ARG2']"
org:top_members/employees,"['\$ARG1 , executive director of \$ARG2', '\$ARG1 , head of \$ARG2', '\$ARG1 , who heads \$ARG2', '\$ARG1 , chief executive of \$ARG2']","['(A )?(a )?(The )?(the )?\\\$ARG1\\ ,\\ executive\\ director\\ of\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ ,\\ head\\ of\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ ,\\ who\\ heads\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ ,\\ chief\\ executive\\ of\\ (A )?(a )?(The )?(the )?\\\$ARG2']"


Now we can simply search patterns in the sentences. Because of the speed reasons, let's take only samples where we will definetly find smth :)

In [32]:
analyzer = spacy.load("en_core_web_sm")
selected_samples["spacy info"] = selected_samples["sample"].apply(lambda x: analyzer(x).to_json())

After that, take the arguments pairwise in each sentence and try if there is a pattern match

In [33]:
def get_extracted_sample(sample):
    return [(ARG1 + sample["text"][ent1["end"]:ent2["start"]] + ARG2) if ent1["end"] < ent2["end"] 
            else (ARG2 + sample["text"][ent2["end"]:ent1["start"]] + ARG1)
            for ent1, ent2 in itertools.permutations(sample["ents"],2)]

selected_samples["extr"] = selected_samples["spacy info"].apply(lambda x: get_extracted_sample(x))

In [35]:
selected_samples[["sample", "label", "extr"]].apply(lambda x: escape_dollar(x)).head()

Unnamed: 0,sample,label,extr
10731,French filmmaker Claude Chabrol dies at 80,per:title,"['\$ARG1 filmmaker \$ARG2', '\$ARG1 filmmaker Claude Chabrol dies at \$ARG2', '\$ARG2 filmmaker \$ARG1', '\$ARG1 dies at \$ARG2', '\$ARG2 filmmaker Claude Chabrol dies at \$ARG1', '\$ARG2 dies at \$ARG1']"
14931,"But in a reversal of that decision by former CEO Stuart Rose , Bolland said he will cut the range of non-M & S branded foodstuffs from 400 lines to 100 , leaving products the company itself can not replicate .",per:title,"['\$ARG1 , \$ARG2', '\$ARG1 , Bolland said he will cut the range of \$ARG2', '\$ARG1 , Bolland said he will cut the range of non-M & S branded foodstuffs from \$ARG2', '\$ARG1 , Bolland said he will cut the range of non-M & S branded foodstuffs from 400 lines to \$ARG2', '\$ARG2 , \$ARG1', '\$ARG1 said he will cut the range of \$ARG2', '\$ARG1 said he will cut the range of non-M & S branded foodstuffs from \$ARG2', '\$ARG1 said he will cut the range of non-M & S branded foodstuffs from 400 lines to \$ARG2', '\$ARG2 , Bolland said he will cut the range of \$ARG1', '\$ARG2 said he will cut the range of \$ARG1', '\$ARG1 branded foodstuffs from \$ARG2', '\$ARG1 branded foodstuffs from 400 lines to \$ARG2', '\$ARG2 , Bolland said he will cut the range of non-M & S branded foodstuffs from \$ARG1', '\$ARG2 said he will cut the range of non-M & S branded foodstuffs from \$ARG1', '\$ARG2 branded foodstuffs from \$ARG1', '\$ARG1 lines to \$ARG2', '\$ARG2 , Bolland said he will cut the range of non-M & S branded foodstuffs from 400 lines to \$ARG1', '\$ARG2 said he will cut the range of non-M & S branded foodstuffs from 400 lines to \$ARG1', '\$ARG2 branded foodstuffs from 400 lines to \$ARG1', '\$ARG2 lines to \$ARG1']"
10047,"Neal succumbed to complications from lung cancer in Edgarton , Massachussetts , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in Knoxville , Tennessee , told AFP .",per:country_of_death,"['\$ARG1 succumbed to complications from lung cancer in \$ARG2', '\$ARG1 succumbed to complications from lung cancer in Edgarton , \$ARG2', '\$ARG1 succumbed to complications from lung cancer in Edgarton , Massachussetts , Julie Dougherty , marketing manager of \$ARG2', '\$ARG1 succumbed to complications from lung cancer in Edgarton , Massachussetts , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in \$ARG2', '\$ARG1 succumbed to complications from lung cancer in Edgarton , Massachussetts , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in Knoxville , \$ARG2', '\$ARG1 succumbed to complications from lung cancer in Edgarton , Massachussetts , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in Knoxville , Tennessee , told \$ARG2', '\$ARG2 succumbed to complications from lung cancer in \$ARG1', '\$ARG1 , \$ARG2', '\$ARG1 , Massachussetts , Julie Dougherty , marketing manager of \$ARG2', '\$ARG1 , Massachussetts , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in \$ARG2', '\$ARG1 , Massachussetts , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in Knoxville , \$ARG2', '\$ARG1 , Massachussetts , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in Knoxville , Tennessee , told \$ARG2', '\$ARG2 succumbed to complications from lung cancer in Edgarton , \$ARG1', '\$ARG2 , \$ARG1', '\$ARG1 , Julie Dougherty , marketing manager of \$ARG2', '\$ARG1 , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in \$ARG2', '\$ARG1 , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in Knoxville , \$ARG2', '\$ARG1 , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in Knoxville , Tennessee , told \$ARG2', '\$ARG2 succumbed to complications from lung cancer in Edgarton , Massachussetts , Julie Dougherty , marketing manager of \$ARG1', '\$ARG2 , Massachussetts , Julie Dougherty , marketing manager of \$ARG1', '\$ARG2 , Julie Dougherty , marketing manager of \$ARG1', '\$ARG1 in \$ARG2', '\$ARG1 in Knoxville , \$ARG2', '\$ARG1 in Knoxville , Tennessee , told \$ARG2', '\$ARG2 succumbed to complications from lung cancer in Edgarton , Massachussetts , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in \$ARG1', '\$ARG2 , Massachussetts , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in \$ARG1', '\$ARG2 , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in \$ARG1', '\$ARG2 in \$ARG1', '\$ARG1 , \$ARG2', '\$ARG1 , Tennessee , told \$ARG2', '\$ARG2 succumbed to complications from lung cancer in Edgarton , Massachussetts , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in Knoxville , \$ARG1', '\$ARG2 , Massachussetts , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in Knoxville , \$ARG1', '\$ARG2 , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in Knoxville , \$ARG1', '\$ARG2 in Knoxville , \$ARG1', '\$ARG2 , \$ARG1', '\$ARG1 , told \$ARG2', '\$ARG2 succumbed to complications from lung cancer in Edgarton , Massachussetts , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in Knoxville , Tennessee , told \$ARG1', '\$ARG2 , Massachussetts , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in Knoxville , Tennessee , told \$ARG1', '\$ARG2 , Julie Dougherty , marketing manager of the Patricia Neal Rehabilitation Center in Knoxville , Tennessee , told \$ARG1', '\$ARG2 in Knoxville , Tennessee , told \$ARG1', '\$ARG2 , Tennessee , told \$ARG1', '\$ARG2 , told \$ARG1']"
23392,Deshpande made a fortune in communications companies and is chairman of battery-maker A123 Systems .,org:top_members/employees,"['\$ARG1 made a fortune in communications companies and is chairman of battery-maker \$ARG2', '\$ARG2 made a fortune in communications companies and is chairman of battery-maker \$ARG1']"
17081,"CHICAGO 2009-11-10 00:25:56 UTC The filings are part of prosecutors ' efforts to bolster their case that Chicago grocery store owner and immigration counselor Tahawwur Rana , 48 , should not be freed on bond pending resolution of terrorism charges against him and David C Headley .",per:title,"['\$ARG1 grocery store owner and immigration counselor \$ARG2', '\$ARG1 grocery store owner and immigration counselor Tahawwur Rana , \$ARG2', '\$ARG1 grocery store owner and immigration counselor Tahawwur Rana , 48 , should not be freed on bond pending resolution of terrorism charges against him and \$ARG2', '\$ARG2 grocery store owner and immigration counselor \$ARG1', '\$ARG1 , \$ARG2', '\$ARG1 , 48 , should not be freed on bond pending resolution of terrorism charges against him and \$ARG2', '\$ARG2 grocery store owner and immigration counselor Tahawwur Rana , \$ARG1', '\$ARG2 , \$ARG1', '\$ARG1 , should not be freed on bond pending resolution of terrorism charges against him and \$ARG2', '\$ARG2 grocery store owner and immigration counselor Tahawwur Rana , 48 , should not be freed on bond pending resolution of terrorism charges against him and \$ARG1', '\$ARG2 , 48 , should not be freed on bond pending resolution of terrorism charges against him and \$ARG1', '\$ARG2 , should not be freed on bond pending resolution of terrorism charges against him and \$ARG1']"


In [431]:
def pattern_search(extr_sample, patterns, row):
    for relation, rel_patterns in patterns.iterrows():
        matches = [[row["sample"], extr_sample, pattern, relation, row["label"]] 
                   for pattern in rel_patterns["regex pattern"]].str.match(pattern) 
                   # if re.match(pattern, extr_sample) is not None]   # todo
        if len(matches) > 0:
            return pd.DataFrame(matches, columns = FINAL_DF_COLUMNS)

In [432]:
all_matches = pd.DataFrame(columns = FINAL_DF_COLUMNS)
for _, row in selected_samples.iterrows():
    for cand_sample in row["extr"]:
        df_found = pattern_search(cand_sample, relation_patterns_df, row)
        if isinstance(df_found, pd.DataFrame) and not df_found.empty:
            all_matches = pd.concat([all_matches, df_found])

In [433]:
all_matches.apply(lambda x: escape_dollar(x)).head()

Unnamed: 0,sample,extr_sample,pattern,weak_label,gold_label
0,"Kissel was born in Adrian , Michigan , but her family had also lived in Minneapolis .",\$ARG1 was born in \$ARG2,(A )?(a )?(The )?(the )?\\$ARG1\ was\ born\ in\ (A )?(a )?(The )?(the )?\\$ARG2,per:date_of_birth,per:cities_of_residence
0,"Gwathmey was born in 1938 , the only child of painter Robert Gwathmey and his wife , Rosalie , a photographer .",\$ARG1 was born in \$ARG2,(A )?(a )?(The )?(the )?\\$ARG1\ was\ born\ in\ (A )?(a )?(The )?(the )?\\$ARG2,per:date_of_birth,per:date_of_birth
0,"Anders Berntell , head of the Stockholm International Water Institute , says that , although `` water is absolutely crucial for all sectors in society , '' water issues have played too small a role in climate talks .","\$ARG1 , head of \$ARG2","(A )?(a )?(The )?(the )?\\$ARG1\ ,\ head\ of\ (A )?(a )?(The )?(the )?\\$ARG2",org:top_members/employees,org:top_members/employees
0,"A professor emeritus at Yale University , Mandelbrot was born in Poland but as a child moved with his family to France where he was educated .",\$ARG1 was born in \$ARG2,(A )?(a )?(The )?(the )?\\$ARG1\ was\ born\ in\ (A )?(a )?(The )?(the )?\\$ARG2,per:date_of_birth,per:countries_of_residence
0,"Gwathmey was born in 1938 , the only child of painter Robert Gwathmey and his wife , Rosalie , a photographer .",\$ARG1 was born in \$ARG2,(A )?(a )?(The )?(the )?\\$ARG1\ was\ born\ in\ (A )?(a )?(The )?(the )?\\$ARG2,per:date_of_birth,per:children


But here we can observe some misclassified sentences: for example, a sentence 

"A professor emeritus at Yale University , Mandelbrot was born in Poland but as a child moved with his family to France where he was educated" 

was assigned with a label "per:date_of_birth" (presumably by a pattern "ARG1 was born in ARG2"), what is definitely wrong. In order to avoid such mistake, let's add additional constraints on the argument types.

In [434]:
relation_to_types = {"org:alternate_names": ['PERSON', 'PERSON'], 
                     "per:date_of_birth": ['PERSON', 'DATE'],
                     "org:top_members/employees": ['PERSON', 'ORG']}

So, when we look for these patterns in samples, we should take into account the entity types referred to corresponding relation

In [418]:
relation_patterns_df["type"] = pd.Series(relation_to_types)

In [419]:
relation_patterns_df.apply(lambda x: escape_dollar(x)).head()

Unnamed: 0,raw pattern,regex pattern,type
org:alternate_names,"['\$ARG1 ( \$ARG2 ', '\$ARG1 formerly known as \$ARG2', '\$ARG1 aka \$ARG2', '\$ARG1 ( also known as \$ARG2 )']","['(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ ', '(A )?(a )?(The )?(the )?\\\$ARG1\\ formerly\\ known\\ as\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ aka\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ also\\ known\\ as\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ \\)']","['PERSON', 'PERSON']"
per:date_of_birth,"['\$ARG1 ( born \$ARG2 )', '\$ARG1 ( born \$ARG2 in', '\$ARG1 ( \$ARG2 -', '\$ARG1 was born in \$ARG2']","['(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ born\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ \\)', '(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ born\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ in', '(A )?(a )?(The )?(the )?\\\$ARG1\\ \\(\\ (A )?(a )?(The )?(the )?\\\$ARG2\\ \\-', '(A )?(a )?(The )?(the )?\\\$ARG1\\ was\\ born\\ in\\ (A )?(a )?(The )?(the )?\\\$ARG2']","['PERSON', 'DATE']"
org:top_members/employees,"['\$ARG1 , executive director of \$ARG2', '\$ARG1 , head of \$ARG2', '\$ARG1 , who heads \$ARG2', '\$ARG1 , chief executive of \$ARG2']","['(A )?(a )?(The )?(the )?\\\$ARG1\\ ,\\ executive\\ director\\ of\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ ,\\ head\\ of\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ ,\\ who\\ heads\\ (A )?(a )?(The )?(the )?\\\$ARG2', '(A )?(a )?(The )?(the )?\\\$ARG1\\ ,\\ chief\\ executive\\ of\\ (A )?(a )?(The )?(the )?\\\$ARG2']","['PERSON', 'ORG']"
