In [783]:
import spacy
from spacy import displacy
from spacy.tokens import Doc, Token
from spacy.tokenizer import Tokenizer
from spacy.matcher import Matcher, PhraseMatcher
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import sys

# imports required for BabelNet API calls
import urllib2
import urllib
import json
import gzip

from StringIO import StringIO

# other imports
import os

In [361]:
# Load the 'en_core_web_lg' spaCy model into the notebook-wide `nlp` object.
# The generic 'en' shortcut is kept as a commented-out alternative:
#nlp = spacy.load('en')
nlp = spacy.load('en_core_web_lg')

In [None]:
# Whitespace tokenizer, used when the input text has already been tokenized
class WhitespaceTokenizer(object):
    """Tokenizer that cuts text on single space characters only.

    Every piece produced by ``text.split(' ')`` becomes one token, and every
    token (including the last one) is flagged as being followed by a space.
    """

    def __init__(self, vocab):
        # vocabulary handed to every Doc this tokenizer builds
        self.vocab = vocab

    def __call__(self, text):
        """Return a ``Doc`` made of the space-separated pieces of ``text``."""
        pieces = text.split(' ')
        trailing_space = [True for _ in pieces]
        return Doc(self.vocab, words=pieces, spaces=trailing_space)

In [None]:
# use Whitespace tokenizer if my text has already been tokenized
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

doc = nlp(" ".join(word_list))
for token in doc:
    print token.text, token.lemma_, token.pos_, token.head, token.dep_
    
# this line restores the tokeniser back to the spaCy default
nlp.tokenizer = nlp.Defaults.create_tokenizer(nlp)


In [None]:
# If the full pipeline has been executed, several details are available for
# each token: index, character offset, text, lemma, POS tag, head and
# dependency label.
# NOTE(review): `doc3` is not created in any cell visible here - confirm it
# exists in the kernel before running this cell.
for token in doc3:
    print token.i, token.idx, token.text, token.lemma_, token.pos_, token.head, token.dep_

# Once a document is parsed, its named entities can be listed with:
print u"\nPrinting entities"

for ents in doc3.ents:    
    print ents.text, ents.label_, ents.start, ents.end, ents.lower_

# Similarly, noun phrases can be emitted with:
print u"\nPrinting noun chunks"

for ents in doc3.noun_chunks:    
    print ents.text, ents.label_, ents.start, ents.end


In [11]:
# Custom pipeline component that captures 2- and 3-token entity and concept
# terms, merges each matched span into a single token and tags that token
# with the custom `is_entity` or `is_concept` extension.

class NGramMerger(object):
    """Merge known bi-gram / tri-gram phrases and flag them as entity or concept.

    Phrase lists are passed as keyword arguments. Each key must end in "2"
    (list of bi-grams) or "3" (list of tri-grams). Keys starting with "entity"
    mark entity phrases; every other key is treated as a concept list.
    Example: ``NGramMerger(nlp, entity3=[...], concept2=[...])``.
    """

    def __init__(self, nlp, **patterns):
        # register the custom token extensions unless they already exist
        if not Token.has_extension('is_concept'):
            Token.set_extension('is_concept', default=False)

        if not Token.has_extension('is_entity'):
            Token.set_extension('is_entity', default=False)

        # one matcher per n-gram size so that tri-grams can be merged first
        self.matcher_bi = PhraseMatcher(nlp.vocab)
        self.matcher_tri = PhraseMatcher(nlp.vocab)

        for key, phrases in patterns.items():
            # validate the key before spending time on the phrase list
            assert key.endswith("2") or key.endswith("3"), \
                "pattern key %r must end in '2' or '3'" % (key,)
            # Phrase patterns only need tokenised text (see the spaCy
            # PhraseMatcher docs), so make_doc() is used instead of running the
            # tagger, parser and NER over every phrase.
            pattern_docs = [nlp.make_doc(text) for text in phrases]
            matcher = self.matcher_bi if key.endswith("2") else self.matcher_tri
            matcher.add(key, None, *pattern_docs)

    def __call__(self, doc):
        """Merge every matched phrase in ``doc``, flag the merged tokens and return ``doc``."""
        # Resolve match ids through the vocab of the document being processed.
        # The notebook-global `nlp` used before may have been reloaded, or may
        # not exist at all where this component runs.
        strings = doc.vocab.strings
        # run the matchers in order, tri-grams first: they are merged into
        # single tokens before the bi-gram matcher sees the document
        for matcher in (self.matcher_tri, self.matcher_bi):
            entity_spans = []
            concept_spans = []
            for match_id, start, end in matcher(doc):
                span = doc[start:end]
                if strings[match_id].startswith("entity"):
                    entity_spans.append(span)
                else:
                    concept_spans.append(span)

            for span in entity_spans:
                span.merge()
                for t in span:
                    t._.is_entity = True

            for span in concept_spans:
                span.merge()
                for t in span:
                    t._.is_concept = True
        return doc
        

In [190]:
# Example phrase lists; the real patterns will come from the gold standard
# datasets.
# entity_3
p1 = [u'Robert De Niro', u"John Fitzgerald Kennedy"]
# entity_2
p2 = [u'De Niro', u'Adolf Hitler']
# concept_3
p3 = [u'executive vice president', u'8 mm film']
# concept_2
# NOTE(review): p4 is defined but never passed below - `concept2` receives
# `bi_gram_terms` instead, which is only built in a later cell. Confirm intent.
p4 = [u'vice president', u'Vice president']

# Reload the generic 'en' model (this rebinds the global `nlp`) and append the
# merger as the last pipeline component.
nlp = spacy.load('en')
ngram_merger = NGramMerger(nlp, entity3=p1, entity2=p2, concept3=p3, concept2=bi_gram_terms)
nlp.add_pipe(ngram_merger, last=True)
# list the names of the pipeline components in execution order
for pipe_id, _ in nlp.pipeline:
    print pipe_id

tagger
parser
ner
NGramMerger


In [318]:
# Longer multi-sentence example, currently disabled:
#input_str = u"Al Pacino and Robert De Niro starred together in The Godfather Part II. " +\
 #            "De Niro and executive vice president Robert Evans disagreed on aspects of the film. "+\
  #           "Disagreeing is one thing but apples are another."

# Single-sentence example that is actually run through the pipeline
input_str = u'The Albanian mafia have been responsible for heroin trafficking in Italy.'

print input_str
doc = nlp(input_str)

print u"\nPrinting entities"

# text, label, start token index, end token index and lower-cased text
for ents in doc.ents:    
    print ents.text, ents.label_, ents.start, ents.end, ents.lower_



The Albanian mafia have been responsible for heroin trafficking in Italy.

Printing entities
Albanian NORP 1 2 albanian
Italy GPE 10 11 italy


In [194]:
def process_token(t, doc):
    """Build a callable that maps one token of ``doc`` to its output string.

    Args:
        t: unused; kept so existing ``process_token(None, doc)`` calls still work.
        doc: parsed document whose ``ents`` supply the text of named entities.

    Returns:
        A function ``f(token)`` that returns

        * the full entity text with spaces replaced by "_" for the first token
          of a named entity (DATE, TIME, PERCENT, ORDINAL and CARDINAL entities
          are not treated as entities here),
        * ``None`` for the remaining tokens of such an entity and for
          punctuation / whitespace tokens,
        * the token text with "_" for spaces when the token is flagged
          ``is_entity`` (case retained) or ``is_concept`` (lower-cased),
        * the lemma otherwise.
    """
    skipped_types = ('DATE', 'TIME', 'PERCENT', 'ORDINAL', 'CARDINAL')
    # Index the entities by the position of their first token. Counting "B"
    # tokens instead goes out of step with doc.ents as soon as an entity of a
    # skipped type (e.g. a DATE) precedes a kept one, because doc.ents holds
    # both kinds. It also made the returned function unusable for a second pass.
    ents_by_start = dict((ent.start, ent) for ent in doc.ents)

    def _process_token(t):
        type_is_kept = t.ent_type_ not in skipped_types
        # start of a named entity: emit the whole entity as one string
        if type_is_kept and t.ent_iob_ == 'B':
            ent = ents_by_start.get(t.i)
            # fall back to the token's own text if no entity starts here
            surface = ent.text if ent is not None else t.text
            return surface.replace(" ", "_")
        # inside a named entity, punctuation or space: ignore the token
        if (type_is_kept and t.ent_iob_ == 'I') or t.pos_ in ('PUNCT', 'SPACE'):
            return None
        # merged entity: retain case, create single token
        if t._.is_entity:
            return t.text.replace(" ", "_")
        # merged concept: lower case, create single token
        if t._.is_concept:
            return t.text.lower().replace(" ", "_")
        # plain uni-gram: use the lemma
        return t.lemma_

    return _process_token

In [319]:
pt = process_token(None, doc)
string_list = []
for sent in doc.sents:
    string_list.append(filter(None, map(pt, sent)))

print string_list

[[u'the', u'Albanian', u'mafia', u'have', u'be', u'responsible', u'for', u'heroin', u'trafficking', u'in', u'Italy']]


In [136]:
# Build the phrase lists: read the vocabulary and the training, test and
# trial term files (tab-separated, no header row).

_tsv_options = dict(header=None, delimiter="\t")
_labelled_options = dict(names=['term', 'term_type'], **_tsv_options)

vocab = pd.read_csv("1A.english.vocabulary.txt", names=['term'], keep_default_na=False, **_tsv_options)
train = pd.read_csv("1A.english.training.data.txt", **_labelled_options)
test = pd.read_csv("1A.english.test.data.txt", **_labelled_options)
trial = pd.read_csv("1A.english.trial.data.txt", **_labelled_options)

In [137]:
# row counts of the vocabulary and of the training, test and trial sets
[len(df.index) for df in (vocab, train, test, trial)]

[218753, 1500, 1500, 50]

In [538]:
# find token count of vocab terms
ser = vocab.term.str.split().str.len()
if 'term_count' not in vocab.columns:
    vocab['term_count'] = ser

bi_gram_terms = list(vocab.query('term_count == 2').term)
# convert to unicode for loading as Patterns in spaCy
bi_gram_terms = [unicode(s, "utf-8") for s in bi_gram_terms]

tri_gram_terms = list(vocab.query('term_count == 3').term)
tri_gram_terms = [unicode(s, "utf-8") for s in tri_gram_terms]

print len(bi_gram_terms)
print len(tri_gram_terms)
print len(bi_gram_terms) + len(tri_gram_terms)



51622
6651
58273


In [521]:
# Stack training, test and trial data into a single dataframe with a fresh
# 0..n-1 index. pd.concat replaces the chained DataFrame.append calls, which
# copied the data once per call and no longer exist in pandas >= 2.0.
full_list = pd.concat([train, test, trial], ignore_index=True)

# token count of every term, added as a column if it doesn't exist already
term_count = full_list.term.str.split().str.len()
if 'term_count' not in full_list.columns:
    full_list['term_count'] = term_count

full_list.head()

# split full_list between 2- and 3-term concepts and entities
bi_gram_concepts = full_list.query("term_type == 'Concept' & term_count == 2")
tri_gram_concepts = full_list.query("term_type == 'Concept' & term_count == 3")

bi_gram_entities = full_list.query("term_type == 'Entity' & term_count == 2")
tri_gram_entities = full_list.query("term_type == 'Entity' & term_count == 3")


In [522]:
# print length of train/test/trial data split by entity/concept and token count
print [len(l) for l in bi_gram_concepts, tri_gram_concepts, bi_gram_entities, tri_gram_entities]


[305, 19, 509, 114]


In [539]:
# Entities and concepts are mixed in the vocab terms.
# I need to know the original case to capture them: by lower-casing
# everything I lose surface-form detail and will therefore lose sense as well.
# NOTE(review): the original comment spoke of getting tri-gram terms from the
# vocab, but the expression below displays the two-token entities taken from
# the train/test/trial data.
bi_gram_entities

Unnamed: 0,term,term_type,term_count
14,Carly Fiorina,Entity,2
18,Erna Brodber,Entity,2
19,Raoul Dufy,Entity,2
22,Skara Brae,Entity,2
23,Murder One,Entity,2
24,Emmett Tyrrell,Entity,2
28,Sachs Harbour,Entity,2
36,A1 motorway,Entity,2
51,North Carolina,Entity,2
53,Forest Way,Entity,2


In [578]:
from nltk.corpus import wordnet as wn

# number of WordNet synsets found for every bi-gram / tri-gram term
# (spaces are replaced by underscores for the lookup)
bi_dict = dict((term, len(wn.synsets(term.replace(" ", "_")))) for term in bi_gram_terms)
tri_dict = dict((term, len(wn.synsets(term.replace(" ", "_")))) for term in tri_gram_terms)
#wn.synsets("the_simpsons")[0].lemma_names()
#.lemma_names()

In [639]:
# some Wordnet sysnet stats

from collections import Counter
bi_match_counter = Counter(bi_dict.values())
tri_match_counter = Counter(tri_dict.values())

#[key for key, value in bi_dict.iteritems() if value == 19]
#[syn.lemma_names() for syn in wn.synsets("break_up")]

print sum([v for k, v in bi_match_counter.iteritems() if k>0])
print sum([v for k, v in tri_match_counter.iteritems() if k>0])
print bi_match_counter[0]
print tri_match_counter[0]

29205
2460
22417
4191


In [684]:
# bi-gram and tri-gram terms for which WordNet returned no synset
bi_wn_notfound = [k for k in bi_dict if bi_dict[k] == 0]

tri_wn_notfound = [k for k in tri_dict if tri_dict[k] == 0]

# there are 446 court cases with a specific surface form X+ v. X+
# these are assumed to be named entities; we don't need to look these
# up in BabelNet
tri_wn_notfound = [k for k in tri_wn_notfound if k.split()[1] != 'v.']

In [692]:
potential_entities = {}
alternate_case = {}

# Find potential entities among the bi-gram terms: a WordNet lemma that equals
# the term when lower-cased and is title-cased in every word is recorded as the
# entity's surface form; a lemma with only some title-cased words is recorded
# in `alternate_case` instead.
for term in bi_gram_terms:
    synsets = wn.synsets(term.replace(" ", "_"))
    found_lemma = False
    for syn in synsets:
        for lemma in syn.lemma_names():
            lemma = lemma.replace("_", " ")
            if lemma.lower() != term:
                continue
            # all()/any() give the same answer as the former pairwise
            # reduce(lambda ...) for two words, but do not depend on the lemma
            # having exactly two words.
            titled = [word.istitle() for word in lemma.split()]
            if all(titled):
                potential_entities[term] = lemma
                found_lemma = True
                break
            elif any(titled):
                alternate_case[term] = lemma
        if found_lemma:
            break
                

In [695]:
# Court cases among the tri-gram terms (surface form "X v. Y").
# count them first
sum(1 for case_term in tri_gram_terms if case_term.split()[1] == 'v.')
# cases can be added to the potential entities automatically, with both
# party names title-cased
for term in tri_gram_terms:
    tokens = term.split()
    if tokens[1] != 'v.':
        continue
    potential_entities[term] = "%s v. %s" % (tokens[0].title(), tokens[2].title())
        
  

In [712]:
# Find potential entities among the tri-grams: a WordNet lemma that equals the
# term when lower-cased and whose first and third words are title-cased (the
# middle word is not checked) is recorded as the entity's surface form.
for term in tri_gram_terms:
    synsets = wn.synsets(term.replace(" ", "_"))
    found_lemma = False
    for syn in synsets:
        for lemma in syn.lemma_names():
            lemma = lemma.replace("_", " ")
            if lemma.lower() != term:
                continue
            words = lemma.split()
            if all(word.istitle() for word in words[0::2]):
                potential_entities[term] = lemma
                #print term, '-', lemma
                found_lemma = True
                break
            # The former reduce() tested type(y) instead of type(x), so its
            # result was just "is the last word title-cased". any() checks
            # every word, matching the bi-gram version of this loop.
            elif any(word.istitle() for word in words):
                alternate_case[term] = lemma
        if found_lemma:
            break


In [829]:
# Now we'll write some code to lookup lexicalisations in BabelNet
# using the provided API
class BabelSense():
    """Callable client for the BabelNet ``getSenses`` HTTP endpoint.

    Calling an instance with a lemma returns the distinct ``simpleLemma``
    values of the senses BabelNet reports for that lemma.
    """
    #lemma = 'the simpsons'
    # default query parameters; override them in a subclass if needed
    lang = 'EN'
    targetLang = 'EN'
    #_key  = 'e906d134-a25f-47ac-8596-2bb4a9cc16cf'
    pos = 'NOUN'
    source = 'WIKI'

    # NOTE(review): the default API key is hardcoded in the notebook. It is
    # kept so existing BabelSense() calls keep working, but it should be passed
    # in explicitly (e.g. read from an environment variable) and rotated.
    def __init__(self, key='e906d134-a25f-47ac-8596-2bb4a9cc16cf'):
        """Store the query parameters sent with every request.

        Args:
            key: BabelNet API key.
        """
        # Class attributes are not visible as bare names inside a method, so
        # they must be read through `self` (bare `lang` etc. raise NameError
        # unless globals of the same name happen to exist).
        self.params = {
            'searchLang' : self.lang,
            'targetLang' : self.targetLang,
            'key'  : key,
            'pos'  : self.pos,
            'source' : self.source
        }

        self.service_url = 'https://babelnet.io/v5/getSenses'

    def __call__(self, lemma):
        """Query BabelNet for ``lemma`` and return its distinct simple lemmas.

        Args:
            lemma: unicode term to look up.

        Returns:
            List of distinct ``simpleLemma`` strings; empty if no sense is found.

        Raises:
            ValueError: if the service answers with something other than a
                list of senses (e.g. an error message object).
        """
        self.params['lemma'] = lemma.encode('utf-8')
        url = self.service_url + '?' + urllib.urlencode(self.params)
        request = urllib2.Request(url)
        request.add_header('Accept-encoding', 'gzip')
        response = urllib2.urlopen(request)
        try:
            body = response.read()
            # gzip was requested but is not guaranteed; only decompress when
            # the server says it compressed (previously a plain response made
            # this method return None).
            if response.info().get('Content-Encoding') == 'gzip':
                body = gzip.GzipFile(fileobj=StringIO(body)).read()
        finally:
            response.close()

        data = json.loads(body)
        if not isinstance(data, list):
            # report the payload instead of failing with an obscure TypeError
            # when indexing into it below
            raise ValueError("unexpected BabelNet response: %r" % (data,))
        # retrieving BabelSense data
        return list(set(result['properties']['simpleLemma'] for result in data))
            

In [834]:
# Draw 50 random terms for a trial lookup.
# NOTE(review): the original comment said "bi_terms", but the sample is drawn
# from `tri_wn_notfound`; `tests` is not used by the lookup call at the end of
# this cell, and no random seed is set in the cells shown here.
babel_results = {}
ran = [np.random.randint(len(tri_wn_notfound)) for _ in range(50)]
tests = [tri_wn_notfound[x] for x in ran]

def get_sense_from_BabelNet(searchList, babel=None):
    """Look every term of ``searchList`` up in BabelNet.

    Args:
        searchList: iterable of terms to query.
        babel: optional callable ``term -> list of lemmas``; a fresh
            ``BabelSense()`` is created when omitted.

    Returns:
        dict mapping each successfully queried term to the list returned for
        it. Terms whose lookup raised are reported on stdout and left out.
    """
    if babel is None:
        babel = BabelSense()
    babel_results = {}
    for idx, t in enumerate(searchList):
        # progress marker every 500 terms
        if (idx + 1) % 500 == 0:
            print("Done %d" % (idx + 1))
        try:
            babel_results[t] = babel(t)
        except Exception as err:
            # Best effort: report the failing index and error, then carry on.
            # Catching Exception rather than using a bare except lets Ctrl-C
            # (KeyboardInterrupt) still stop a long run.
            print(idx)
            print(repr(err))
    return babel_results

# run the lookup for every tri-gram term that was not found in WordNet
babel_results = get_sense_from_BabelNet(tri_wn_notfound)

Done 500
Done 1000
Done 1500
Done 2000
Done 2500
Done 3000
Done 3500


In [837]:
for k, v in babel_results.iteritems():
    for lemma in v:
        lemma = lemma.replace("_", " ")
        
        if ((lemma.lower() == k) and reduce(lambda x,y: x.istitle() & y.istitle(), lemma.split()[0::2])):
            print lemma
            break

Mountain Time Zone
De la Cruz
Theory of Art
Ex parte Bollman
Tropical Storm Arlene
Battle of Cannae
R v Brown
Socialisme ou Barbarie
Agusan del Sur
Wooden Roller Coaster
Code of Ethics
A New Life
Nevus of Ota
Axiom of Choice
First Person Shooter
Tropical Storm Claudette
Piazza del Popolo
Heart of Stone
Cold Dark Matter
Boulevard des Capucines
Happy New Year
Torre del Greco
Ministry of Labor
Puerta del Sol
Shock and Awe
Strength of Materials
The Very Reverend
Koigi wa Wamwere
Jornal do Brasil
Van den Berg
Structural Equation Modeling
Near Eastern Archaeology
Monetary Policy Committee
Torre del Mangia
Sociology of Religion
Past Is Prologue
St. Nicholas Church
Rock Paper Scissors
Freedom from Fear
Carrier Air Wing
History of Psychology
Ministry of Defense
Orion Publishing Group
Juan del Castillo
Potato virus Y
Life Goes On
Paul et Virginie
Salar del Huasco
Lusa News Agency
Tropical Storm Alpha
Barra do Corda
Eduardo Dos Santos
Call to Arms
Laurentide Ice Sheet
Religion of Humanity
Maderas

In [835]:
import pickle
# pickle the results because it took us a long time to query BabelNet
# plus technically, we normally only get 1,000 coins a day

# directory (relative to the working directory) that holds the pickled results
dest = 'pickle'
if not os.path.exists(dest):
    os.mkdir(dest)

#pickle.dump(babel_results, 
#            open(os.path.join(dest,'bi_babel_results.pkl'), 'wb'),
#            protocol=2
#           )

# `with` closes the file even if dump() fails; the bare open() used before
# never closed the handle explicitly.
with open(os.path.join(dest, 'tri_babel_results.pkl'), 'wb') as pkl_file:
    pickle.dump(babel_results, pkl_file, protocol=2)

In [794]:
# add new potential entities mined from BabelNet
for term, res  in babel_results.iteritems():
    for lemma in res:
        lemma = lemma.replace("_", " ")
        if lemma.lower() != term:
            continue
        words = lemma.split()
        # Require the first and the last word to be title-cased. For two-word
        # lemmas this is the same test as before; the former
        # reduce(lambda x, y: x.istitle() & y.istitle(), ...) raised
        # AttributeError on three-word lemmas (it called .istitle() on a bool),
        # i.e. on the tri-gram results that `babel_results` holds here.
        if words and words[0].istitle() and words[-1].istitle():
            potential_entities[term] = lemma
            break
    

In [799]:
# Number of potential entities: 1,550 before the BabelNet search ...
print len(potential_entities)

# ... and 3,070 once the BabelNet results are added.
# Display the mapping from lower-cased term to its cased surface form.
potential_entities

3070


{u'mixed nuts': u'Mixed Nuts',
 u'government printing office': u'Government Printing Office',
 u'shark attack': u'Shark Attack',
 u'rabbit island': u'Rabbit Island',
 u'crown colony': u'Crown Colony',
 u'slavic muslims': u'Slavic Muslims',
 u'holy joe': u'Holy Joe',
 u'alaska native': u'Alaska Native',
 u'dail eireann': u'Dail Eireann',
 u'full court': u'Full Court',
 u'commissioner v. sunnen': u'Commissioner v. Sunnen',
 u'born again': u'Born Again',
 u'northern alliance': u'Northern Alliance',
 u'forced marriage': u'Forced Marriage',
 u'paul v. virginia': u'Paul v. Virginia',
 u'opening act': u'Opening Act',
 u'air corps': u'Air Corps',
 u'model town': u'Model Town',
 u'st. jerome': u'St. Jerome',
 u'speed metal': u'Speed Metal',
 u'weight loss': u'Weight Loss',
 u'liquid courage': u'Liquid Courage',
 u'fermi paradox': u'Fermi Paradox',
 u'department of education': u'Department of Education',
 u'braided river': u'Braided River',
 u'speiser v. randall': u'Speiser v. Randall',
 u'memor

In [453]:
# Idea: set the token attributes of a lower-cased lexicalisation to the
# title-cased form if that form is the more likely one.
# Note: this will not work in all cases:
# "sue" will be more probable than "Sue" because "sue" refers to the verb
# rather than to the girl's name

# probability of every lexeme in the vocab, keyed by its orth id
probs = dict((lex.orth, lex.prob) for lex in nlp.vocab)
# title-cased or upper-cased lexemes that are more probable than their
# lower-cased counterpart (-10000 stands in for a missing entry)
usually_titled = []
for lex in nlp.vocab:
    if not (lex.is_title or lex.is_upper):
        continue
    if probs.get(lex.lower, -10000) < probs.get(lex.orth, -10000):
        usually_titled.append(lex)

In [540]:
# string-store lookups for the two surface forms; the trailing comments hold
# the values noted when the cell was run
nlp.vocab.strings['sue'] # 15556805105345623310L
nlp.vocab.strings['Sue'] # 6419625049723015959L
# is "hurricane" among the lexemes collected in `usually_titled`?
print 'hurricane' in[w.lower_ for w in usually_titled]
# `prob` attribute of the vocab entry for "grappa" (displayed as cell output)
nlp.vocab[nlp.vocab.strings['grappa']].prob



False


-17.643484115600586

In [584]:
# check how a pre-joined token such as "favourite_TV" passes through the
# pipeline: print text, lemma, POS tag, head and dependency label per token
for token in nlp(u'My favourite_TV program is The Simpsons.'):
    print token.text, token.lemma_, token.pos_, token.head, token.dep_

My -PRON- ADJ program poss
favourite_TV favourite_tv NOUN program compound
program program NOUN is nsubj
is be VERB is ROOT
The the DET Simpsons det
Simpsons simpsons PROPN is attr
. . PUNCT is punct
