In [1]:
import pandas as pd
import spacy
import re
from spacy.matcher import PhraseMatcher, Matcher
# Show full cell text when displaying DataFrames (the sentences are long).
# `None` means "no limit"; the legacy `-1` sentinel was deprecated in pandas 1.0
# and is rejected by current releases. Pre-1.0 pandas only accepts integers,
# so fall back to the old sentinel there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
# Load the spaCy model once; `nlp` is the shared pipeline that extract_title
# uses to parse both the sentence and the person's name.
nlp = spacy.load('en_core_web_sm')

In [13]:
# Labelled examples: one tab-separated row per (sentence, person name, expected title).
labels = pd.read_csv("./data/title_name_label.txt", sep='\t', names=['text', 'name', 'title'])

# Title vocabulary: one entry per line, lines starting with '#' are skipped.
# The file is opened in a `with` block so the handle is closed deterministically.
# NOTE(review): method_search_noun_chunk looks up *lower-cased* token text in this
# set, so entries containing uppercase characters can never match -- confirm the
# list is stored in lowercase.
with open("./data/title_list_v2.tsv") as title_file:
    title_set = frozenset(
        line.strip('\t\n') for line in title_file if not line.startswith("#")
    )

In [14]:
# Number of labelled rows that were loaded.
labels.shape[0]

37

In [15]:
# Trim stray whitespace from the label columns. The vectorised `.str` accessor
# leaves missing values (NaN) untouched, whereas calling `.strip()` on them via
# `apply` raises AttributeError.
labels['name'] = labels['name'].str.strip()
# Titles are only trimmed of tabs and spaces, as before.
labels['title'] = labels['title'].str.strip('\t ')

In [16]:
# Leading / trailing runs of whitespace and curly double quotes; post_process
# removes them from an extracted title. Raw strings keep `\s` a regex escape
# instead of an (invalid) Python string escape.
pat_beg = re.compile(r'^[\s“”]+')
pat_end = re.compile(r'[\s“”]+$')

In [17]:
# Rule-based title extraction: helper functions and the three detection methods
def post_process(title):
    """Strip leading and trailing whitespace / curly quotes from a title.

    Uses the module-level `pat_beg` and `pat_end` patterns. `None` is passed
    through unchanged.
    """
    if title is None:
        return title
    without_prefix = pat_beg.sub("", title)
    return pat_end.sub("", without_prefix)

def debug_show_tokens_in_span(span):
    """Print one line per token in `span`: text, fine-grained tag and coarse POS."""
    for tok in span:
        print(tok.text, tok.tag_, tok.pos_, sep=' # ')
  
def remove_det_punt_for_title_span(doc, span, start, end):
    """Trim leading punctuation/determiners and trailing punctuation from a span.

    Example: for "Yana Pechenik, a physician assistant at MyBotoxLA," the
    candidate ", a physician assistant at MyBotoxLA," becomes
    "physician assistant at MyBotoxLA".

    Args:
        doc: the parsed document; it is indexed and sliced by token position.
        span: candidate title span (tokens expose `.i`, `.is_punct`, `.tag_`),
            or None.
        start, end: token bounds of the name; unused, kept for interface
            compatibility.

    Returns:
        None if `span` is None; `span` itself if it is empty; otherwise
        `doc[first:last + 1]`, which is empty when every token was trimmed.
    """
    if span is None or len(span) == 0:
        return span
    first, last = span[0].i, span[-1].i
    # Drop leading punctuation and determiners (tag "DT").
    while first <= last and (doc[first].is_punct or doc[first].tag_ == "DT"):
        first += 1
    # Drop trailing punctuation.
    while last >= first and doc[last].is_punct:
        last -= 1
    # Always re-slice: previously a span that trimmed down to one token was
    # returned untrimmed, and the final remaining token was never checked.
    return doc[first:last + 1]
      
def method_depend_parsing(doc, start,  end, debug=False):
    """Find a title that follows the name as a comma-separated insertion.

    Handles sentences such as "Jack, CEO of Apple, said ...": the last token
    of the name is taken as the root, and the part of its dependency subtree
    that follows the name is the title candidate.

    Args:
        doc: parsed document.
        start, end: token bounds of the name (`end` exclusive).
        debug: when True, print the tokens of the candidate span.

    Returns:
        The trimmed title span, or None when the pattern does not apply.
    """
    # The root is usually the last token of the name.
    root = doc[end-1]
    subtree = list(root.subtree)
    first_i = subtree[0].i
    last_i = subtree[-1].i
    # Only handle a contiguous subtree that begins exactly at the name.
    if last_i - first_i != len(subtree) - 1 or first_i != start:
        return None
    title_span = doc[end:last_i+1]
    # Nothing follows the name inside its subtree: there is no candidate, and
    # indexing an empty span would raise or read outside the span.
    if len(title_span) == 0:
        return None
    if debug:
        debug_show_tokens_in_span(title_span)
    # Only the insertion case is supported, e.g. "Jack, ceo of apple": the
    # candidate has to start with a comma.
    if not (title_span[0].is_punct and title_span[0].lemma_ == ","):
        return None
    title_span = remove_det_punt_for_title_span(doc, title_span, start, end)
    if title_span is not None and len(title_span) == 0:
        title_span = None
    return title_span
    
def method_noun_chunk(doc, start, end, debug=False):
    """Find a title that directly precedes the name inside one noun chunk.

    Covers the "title before name" pattern, e.g.
        U.S. Treasury Secretary Steven Mnuchin
        Redskins pregame host Kevin Sheehan

    The first noun chunk that begins before the name and reaches at least to
    the end of the name is used; the candidate is the part of that chunk in
    front of the name. The candidate is then passed through
    error_check_method_noun_chunk.

    Returns:
        The candidate span, or None when no chunk qualifies, the check rejects
        it, or it is empty.
    """
    candidate = None
    for noun_phrase in doc.noun_chunks:
        chunk_start = noun_phrase[0].i
        chunk_stop = noun_phrase[-1].i + 1
        if debug:
            print(noun_phrase.text, chunk_start, chunk_stop, start, end)
        # Skip chunks that do not begin before the name or stop short of its end.
        if chunk_start >= start or chunk_stop < end:
            continue
        if debug:
            debug_show_tokens_in_span(noun_phrase)
        candidate = doc[chunk_start:start]
        break
    # Normalise rejected and empty candidates to None.
    candidate = error_check_method_noun_chunk(candidate)
    if candidate is None or len(candidate) == 0:
        return None
    return candidate


def method_search_noun_chunk(doc, start, end, debug=False):
    """Search all noun chunks for one that ends in a known title word.

    Covers copular sentences such as
        Justin Williams is a Canadian-American professional ice hockey right winger

    A chunk is a candidate when its last token, lower-cased, is in the
    module-level `title_set`. A candidate is selected when either
      * its last token and the last token of the name share the same head and
        that head's lemma is "be" (e.g. "James is player"), or
      * it ends immediately before the name.
    When several candidates qualify, the last one in document order wins.

    Returns:
        The selected span after error_check_method_noun_chunk and
        remove_det_punt_for_title_span, or None.
    """
    chosen = None
    for candidate in doc.noun_chunks:
        tail = candidate[-1]
        if tail.text.lower() not in title_set:
            continue
        name_head = doc[end - 1].head
        shares_head = tail.head == name_head
        if debug:
            print(shares_head, name_head.lemma_)
        if (shares_head and name_head.lemma_ == "be") or tail.i + 1 == start:
            chosen = candidate
    chosen = error_check_method_noun_chunk(chosen)
    return remove_det_punt_for_title_span(doc, chosen, start, end)
            
            
            
def error_check_method_noun_chunk(title_span):
    """Sanity-check a title candidate produced from a noun chunk.

    * None or an empty span -> None. Emptiness is checked first so the span
      is never indexed when it has no tokens.
    * A span starting with punctuation is probably a parsing error, e.g.
      "(left) greets Nebraska coach Tim Miles": everything up to and including
      the last VERB token is dropped, unless that would leave nothing. No
      length limit is applied on this branch.
    * Any other span longer than 10 tokens is most likely wrong -> None.

    Returns:
        The (possibly shortened) span, or None.
    """
    if title_span is None or len(title_span) == 0:
        return None
    if title_span[0].is_punct:
        # Offset just past the last verb; 0 when the span holds no verb.
        cut = 0
        for offset, token in enumerate(title_span):
            if token.pos_ == 'VERB':
                cut = offset + 1
        if cut < len(title_span):
            title_span = title_span[cut:]
    elif len(title_span) > 10:
        title_span = None
    return title_span
            
        
def extract_title(sent, name, debug=False):
    """Extract the job title / description attached to `name` in `sent`.

    The name is located with a PhraseMatcher, then three strategies are tried
    in order until one yields a span:
      1. method_noun_chunk        - title directly before the name
      2. method_depend_parsing    - ", title," insertion after the name
      3. method_search_noun_chunk - noun chunk ending in a known title word

    Args:
        sent: sentence text.
        name: person name; surrounding tabs, spaces and newlines are ignored.
        debug: when True, print diagnostic information.

    Returns:
        The title text passed through post_process, or None when no title is
        found or the name cannot be aligned to the sentence tokens.

    Raises:
        Exception: if `name` does not occur in `sent` (case-insensitive
            substring test).
    """
    _name = name.strip('\t \n')
    if _name.lower() not in sent.lower():
        raise Exception('name should be in the sent!')
    title = None
    name_matcher = PhraseMatcher(nlp.vocab)
    name_matcher.add('pat', None, nlp(_name))
    doc = nlp(sent)
    name_matches = name_matcher(doc)
    if not name_matches:
        # The check above is a case-insensitive substring test, whereas the
        # matcher works on tokens, so the name can pass the check and still
        # produce no match (different casing, or a name that is only part of
        # a token). Report "no title" instead of failing on name_matches[0].
        if debug:
            print("[Name Match]: None")
        return None
    # Only the first occurrence of the name is considered.
    _, start, end = name_matches[0]

    # step 1: use noun chunks
    title_span = method_noun_chunk(doc, start, end, debug)
    if title_span is None:
        if debug:
            print("[Noun Chunk]: None")
        # step 2: use dependency parsing
        title_span = method_depend_parsing(doc, start, end, debug)
    if title_span is None:
        if debug:
            print("[Depend Parsing]: None")
        # step 3: search the noun chunks for a known title word
        title_span = method_search_noun_chunk(doc, start, end, debug)
    if title_span is not None:
        # Final clean-up of the extracted text.
        title = post_process(title_span.text)
    return title

def batch_run(labels, debug=False):
    """Run extract_title over every labelled row and print summary metrics.

    Args:
        labels: DataFrame whose first three columns are, in order, the
            sentence, the person name and the expected title.
        debug: forwarded to extract_title.

    Prints one "name : [None]->[expected]" line for every row where no title
    was extracted, followed by
      coverage = rows with an extracted title / all rows
      accuracy = rows whose extracted title equals the expected one / covered rows
    A ratio is reported as 0.0 when its denominator is zero (no rows, or no
    title extracted at all) instead of raising ZeroDivisionError.
    """
    total = len(labels)
    coverage_num = 0
    accuracy_num = 0
    # Plain tuples give positional access without relying on integer keys
    # being treated as positions on a string-labelled Series.
    for row in labels.itertuples(index=False, name=None):
        text, name, title_y = row[:3]
        title_p = extract_title(text, name, debug)
        if title_p is not None:
            coverage_num += 1
        if title_p == title_y:
            accuracy_num += 1
        if title_p is None:
            print("%s : [%s]->[%s]"%(name, title_p, title_y))
    coverage = coverage_num / total if total else 0.0
    accuracy = accuracy_num / coverage_num if coverage_num else 0.0
    print("coverage:", coverage, "accuracy:", accuracy)

In [18]:
# Trace the extraction for a single labelled row with debug output enabled.
DEBUG = True
sample_row = labels.iloc[35]
text, name = sample_row['text'], sample_row['name']
print(text, name)
title = extract_title(text, name, DEBUG)
print(title)

All together, InSight’s readings will help scientists figure out how planets are put together and how they evolve, says Suzanne Smrekar, the mission’s deputy principal investigator. Suzanne Smrekar
InSight’s readings 3 6 23 25
scientists 8 9 23 25
planets 12 13 23 25
they 18 19 23 25
Suzanne Smrekar 23 25 23 25
the mission’s deputy principal investigator 26 32 23 25
[Noun Chunk]: None
, # , # PUNCT
the # DT # DET
mission # NN # NOUN
’s # POS # PART
deputy # NN # NOUN
principal # JJ # ADJ
investigator # NN # NOUN
mission’s deputy principal investigator


In [19]:
# Evaluate the extractor on every labelled row: prints each row for which no
# title was extracted, then the overall coverage and accuracy.
batch_run(labels)

Josh Norman : [None]->[Washington Redskins cornerback]
John Huggins : [None]->[former civil engineer]
Landon Dutton : [None]->[with American Family Care]
coverage: 0.918918918918919 accuracy: 1.0


In [10]:
# NOTE(review): this cell's execution count (In [10]) is lower than that of the
# cells above it, so the displayed table may come from an earlier run -- confirm
# with Restart Kernel -> Run All.
labels

Unnamed: 0,text,name,title
0,"U.S. Treasury Secretary Steven Mnuchin said on Saturday that Washington wants to include a provision to deter currency manipulation in future trade deals, including with Japan, based on the currency chapter in the new deal to revamp NAFTA.",Steven Mnuchin,U.S. Treasury Secretary
1,"Norman was not on the field because he decided to tune out coach Jay Gruden at halftime, said Redskins pregame host Kevin Sheehan on his podcast on Friday.",Kevin Sheehan,Redskins pregame host
2,"Washington Redskins cornerback Josh Norman was burned for a long touchdown pass in Monday night's loss in New Orleans, but that wasn't the reason he was benched to start the second half, according to one report.",Josh Norman,Washington Redskins cornerback
3,"Yana Pechenik, a physician assistant at MyBotoxLA, said the woman agreed to receive $5,000 in services.",Yana Pechenik,physician assistant at MyBotoxLA
4,"“It’s still a search and recovery at this point,” Quogue Village Police Chief Christopher Isola said at a news conference late afternoon on Oct. 13, according to Newsday.",Christopher Isola,Quogue Village Police Chief
5,Quogue resident Tim Carbone told WABC.,Tim Carbone,Quogue resident
6,"Kirby Evans, who lost his nose and one of his eyes during a battle with cancer, told WCSC he was asked to cover up if he wanted to eat at the store and gas station called the Forks Pit Stop.",Kirby Evans,who lost his nose and one of his eyes during a battle with cancer
7,"LYNCHBURG, Va. -- Troy redshirt sophomore kicker Tyler Sumpter had a trio of opportunities to put points on the board for the Trojans.",Tyler Sumpter,Troy redshirt sophomore kicker
8,"Like Dr. Frankenstein, the modder was fully aware that the product was not ideal, practical, or perfect by any stretch. Shank wrote on their blog that the product dubbed “Kill Mii” was “not logical, comfortable, or practical. But it must be done… for the memes.”",Frankenstein,Dr.
9,"Her colleagues are 78-year-old Marion Gilham and 72-year-old former civil engineer John Huggins, who last year lost a leg to diabetes.",John Huggins,former civil engineer
