In [3]:
import spacy
from spacy import displacy
from spacy.tokens import Doc, Token
from spacy.tokenizer import Tokenizer
from spacy.matcher import Matcher, PhraseMatcher

import numpy as np


In [4]:
# Load the spaCy model registered under the 'en' shortcut; later cells use
# this `nlp` object to parse text and to build matchers/tokenizers.
nlp = spacy.load('en')


In [None]:
# Parse an example sentence and print, for every token: text, lemma, POS tag,
# dependency label, word shape, and the is_alpha / is_stop flags.
doc = nlp(u'The tired traveller roamed the sandy desert, seeking food.')
# alternative example sentence, kept for reference
#doc = nlp(u'Procter and Gamble is looking at buying UK startup for € 1 billion')
for token in doc:
    print token.text, token.lemma_, token.pos_, token.dep_, token.shape_, token.is_alpha, token.is_stop

In [None]:
# get a human-readable explanation of a POS or dependency tag
spacy.explain("dobj")


In [None]:
# display the dependency parse tree of `doc` inline in the notebook;
# 'distance' is handed to displaCy as a rendering option
displacy.render(doc, style='dep', jupyter=True, options={'distance':120})

In [None]:
# list every named entity in `doc` with its character offsets and label
for ne in doc.ents:
    print ne.text, ne.start_char, ne.end_char, ne.label_

# render the same entities as inline highlighting
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# play with word vectors: for each word report whether it has a vector, the
# norm of that vector, and the is_oov flag
tokens = nlp(u'dog terrier banana apple')
for t in tokens:
    print t.text, t.has_vector, t.vector_norm, t.is_oov

# bare Python 2 print statement: emits an empty separator line
print 
# for every ordered pair of different words, print their similarity and the
# length of the second word's vector
for t1 in tokens:
    for t2 in tokens:
        if (t1.text == t2.text):
            continue
        print t1.text, t2.text, t1.similarity(t2), t2.vector.shape[0]
        

In [None]:
# normalise to unit vector: divide the second token's vector by its own norm
test_norm = tokens[1].vector / tokens[1].vector_norm
# the L2 norm (square root of the sum of squares) should now be equal to 1
print np.sum(test_norm ** 2) ** 0.5



In [None]:
# let's use some corpora that ship with NLTK
import nltk
from nltk.corpus import gutenberg


In [None]:
# list the file identifiers available through the Gutenberg corpus reader
gutenberg.fileids()

In [None]:
# get a sentence arbitrarily
s = gutenberg.sents('austen-sense.txt')[1201]

#gutenberg.raw('austen-sense.txt')

# let's try to load the entire raw corpus into a spaCy doc
#doc = nlp(gutenberg.raw('austen-sense.txt'))

#len(gutenberg.sents('austen-sense.txt')[:1000])
# keep only the first 100 sentences
sense100 = gutenberg.sents('austen-sense.txt')[:100]
# let's join the first 100 sentences of Sense & Sensibility into one large
# string: words are joined with spaces inside a sentence, and the sentences
# are then joined with spaces as well
sense = " ".join([" ".join(_s) for _s in sense100])

#corpus = list()


In [None]:
# peek at the first 200 characters of the raw text; the trailing expression
# displays the length of that same slice as the cell output
print gutenberg.raw('austen-sense.txt')[:200]
len(gutenberg.raw('austen-sense.txt')[:200])

#"lots of newlines\nlots of newlines\n\n".find("\n")
#"lots of newlines\nlots of newlines\n\n"[:16]
# generator that finds new-line indices in raw corpus
# double new-lines will be used as paragraph delimiters
def GenTest(raw):
    """Yield every index at which two consecutive newline characters start.

    Args:
        raw: indexable sequence of characters (e.g. the raw corpus string).

    Yields:
        Each position ``pos`` such that ``raw[pos]`` and ``raw[pos + 1]`` are
        both ``"\\n"``, in increasing order. A run of three newlines therefore
        yields two adjacent positions.
    """
    # the last index has no successor, so stop one short of the end
    for pos in range(len(raw) - 1):
        if raw[pos] == "\n" and raw[pos + 1] == "\n":
            yield pos

# Offsets of all double-newline positions in the raw text, materialised as a
# NumPy array so that later cells can filter them with boolean masks.
newlines = np.array(list(GenTest(gutenberg.raw('austen-sense.txt'))))
# length of Sense = 673022

In [None]:
# break a corpus into several sub-corpora
#
# Each chunk covers at least `interval` characters and is extended up to the
# next double-newline offset found in `newlines`, so a chunk does not end in
# the middle of a paragraph. The last chunk runs to the end of the text.
# Newlines inside a chunk are replaced by spaces.

interval = 50000
# read the raw text once instead of re-reading it on every iteration, and
# derive its length from the data instead of hard-coding it
raw_sense = gutenberg.raw('austen-sense.txt')
_max = len(raw_sense)
corpus = []
idx = 0
while idx < _max:
    prev_idx = idx
    # double-newline offsets at or beyond the minimum chunk end
    candidates = newlines[newlines >= idx + interval]
    if candidates.size:
        idx = int(candidates.min())
    else:
        # no paragraph break left: take everything up to the end of the text
        idx = _max

    corpus.append(raw_sense[prev_idx:idx].replace("\n", " "))
    # single pre-formatted argument: prints the same text as the original
    # `print prev_idx, idx` statement
    print("%d %d" % (prev_idx, idx))
    
#nlp(gutenberg.raw('austen-sense.txt')[0:50887].replace("\n", " "))


In [None]:
# display the sub-corpus stored at index 10
corpus[10]

In [None]:
# parse one sub-corpus: currently the one at index 10 (parsing the first
# sub-corpus is left commented out)
#doc = nlp(corpus[0])
doc = nlp(corpus[10])

In [None]:
#list(doc.sents)[60]
# print every noun chunk of `doc` with its label, lemma and root token
for chunk in doc.noun_chunks:
    print chunk.text, chunk.label_, chunk.lemma_, chunk.root
    

In [None]:
# find root of sentences: parse a two-sentence example and print, for each
# sentence, the root token's text, tag, lemma and dependency label
for i in nlp(u"The spacecraft was safely landed by the pilot on the surface of the new planet. The dog caught the frisbee in the park.").sents:    
    print i.root.text, i.root.tag_, i.root.lemma_, i.root.dep_

In [None]:
# print text, tag and dependency label for every token of `doc`
for token in doc:
    print token.text, token.tag_, token.dep_

print "---------------------"
# get noun chunks
for chunk in doc.noun_chunks:
    print chunk


In [None]:
# toy check of the nested join: flatten a list of token lists into a single
# space-separated string
# NOTE(review): this rebinds `s`, which an earlier cell assigned to a
# Gutenberg sentence
s = [["hello", "world", "."], ["good-bye", "universe", "."]]
" ".join([" ".join(_s) for _s in s])

In [None]:
# (re)install the default tokenizer on the pipeline before parsing
nlp.tokenizer = nlp.Defaults.create_tokenizer(nlp)
#doc2 = nlp(u"The tired travelers roamed the sandy desert, seeking food.")
doc2 = nlp(u'This is a ship-shipping ship, shipping shipping ships.')
# per noun chunk: chunk text | root lemma | root dependency | lemma of the root's head
for chunk in doc2.noun_chunks:
    print "%s | %s | %s | %s" \
    %(chunk.text, chunk.root.lemma_, chunk.root.dep_, chunk.root.head.lemma_)

print "-"*20
# per token: index; lemma/tag (POS) <-- dependency -- head lemma/head tag
# NOTE(review): "\(" and "\)" are not recognised string escapes, so the
# backslashes are printed literally — confirm that this is intended
for token in doc2:
    print "%d; %s/%s \(%s\) <-- %s --%s/%s" % (token.i, token.lemma_, token.tag_, token.pos_, token.dep_, token.head.lemma_, token.head.tag_)

# non-punctuation children of the token at index 3: all of them, then only
# the left-hand ones, then only the right-hand ones
print [word for word in doc2[3].children if word.pos_ != 'PUNCT']
print [word for word in doc2[3].lefts if word.pos_ != 'PUNCT']
print [word for word in doc2[3].rights if word.pos_ != 'PUNCT']

In [None]:
# experiment with custom tokeniser, ideal to use on pre-tokenised text
class WhitespaceTokenizer(object):
    """Tokenizer that splits text on single space characters only.

    Intended for text that has already been tokenised and re-joined with
    spaces. Assign an instance to ``nlp.tokenizer`` to use it.

    Args:
        vocab: the vocabulary object handed to every ``Doc`` that is built.
    """

    def __init__(self, vocab):
        self.vocab = vocab

    @staticmethod
    def _words_and_spaces(text):
        """Split ``text`` on ' ' and return ``(words, spaces)``.

        ``spaces[i]`` says whether ``words[i]`` is followed by a space.
        Concatenating each word plus its optional space reproduces ``text``
        exactly, and no word is ever the empty string.
        """
        words = text.split(' ')
        spaces = [True] * len(words)
        # Consecutive spaces (or a leading space) make split() return empty
        # strings; represent each as a one-space token that owns no
        # trailing space, so there are no zero-length tokens.
        for i, word in enumerate(words):
            if word == '':
                words[i] = ' '
                spaces[i] = False
        if words[-1] == ' ':
            # Text ended with a space (or was empty): that space is already
            # owned by the previous token, so drop the placeholder.
            words = words[:-1]
            spaces = spaces[:-1]
        else:
            # The final word is not followed by a space in the input.
            spaces[-1] = False
        return words, spaces

    def __call__(self, text):
        """Return a ``Doc`` whose tokens are the space-separated pieces of ``text``."""
        words, spaces = self._words_and_spaces(text)
        return Doc(self.vocab, words=words, spaces=spaces)

In [None]:
# use Whitespace tokenizer if my text has already been tokenized
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

# NOTE(review): `word_list` is only assigned in a later cell of this notebook,
# so on a fresh kernel this line raises NameError unless that cell ran first.
doc3 = nlp(" ".join(word_list))
# per token: text, lemma, POS tag, head token, dependency label
for token in doc3:
    print token.text, token.lemma_, token.pos_, token.head, token.dep_

#print "-"*30
#for chunk in doc3.noun_chunks:
#    print chunk.text, chunk.label_, chunk.lemma_, chunk.root


In [12]:
# (re)install the default tokenizer; another cell replaces nlp.tokenizer with
# the whitespace tokenizer
nlp.tokenizer = nlp.Defaults.create_tokenizer(nlp)

# example text: three sentences assembled by string concatenation
input_str = u"Al Pacino and Robert De Niro starred together in The Godfather Part II. " +\
             "De Niro and executive vice president Robert Evans disagreed on aspects of the film. "+\
             "Disagreeing is one thing but apples are another."


# alternative example text, kept for reference
#input_str = u"Lynching him was not a good idea.  A lynching is always exciting.  "+\
 #            "Vice president Silvio Berlusconi quipped that tendentiousness is a Concept."
doc3 = nlp(input_str)


# per token: index in the doc, character offset, text, lemma, POS tag, head
# token and dependency label
for token in doc3:
    print token.i, token.idx, token.text, token.lemma_, token.pos_, token.head, token.dep_

# create word_list to mock pre-tokenized strings
word_list = [token.text for token in doc3]
print " ".join(word_list)

# let's create n-grams from the string tokens
# NOTE(review): this import sits in the middle of the notebook rather than in
# the import cell at the top
from nltk.util import ngrams
# 3-grams over the non-punctuation tokens of doc3; nothing in this cell
# consumes `trigrams` (the print below is commented out)
trigrams = ngrams([token for token in doc3 if token.pos_ != 'PUNCT'], 3)
#print list(trigrams)

print "-"*50
#for ne in doc3.ents:
#    print ne.text, ne.start_char, ne.end_char, ne.label_
print u"\nPrinting entities"

# per named entity: text, label, start/end token indices, lower-cased text
for ents in doc3.ents:    
    print ents.text, ents.label_, ents.start, ents.end, ents.lower_
    
print "-"*50
print u"\nPrinting noun chunks"

# per noun chunk: text, label, start/end token indices
# (the loop variable is still called `ents` although it holds a noun chunk)
for ents in doc3.noun_chunks:    
    print ents.text, ents.label_, ents.start, ents.end
 



0 0 Al al PROPN Pacino compound
1 3 Pacino pacino PROPN starred nsubj
2 10 and and CCONJ Pacino cc
3 14 Robert robert PROPN Niro compound
4 21 De de PROPN Niro compound
5 24 Niro niro PROPN Pacino conj
6 29 starred star VERB starred ROOT
7 37 together together ADV starred advmod
8 46 in in ADP starred prep
9 49 The the DET II det
10 53 Godfather godfather PROPN II compound
11 63 Part part PROPN II compound
12 68 II ii PROPN in pobj
13 70 . . PUNCT starred punct
14 72 De de PROPN Niro compound
15 75 Niro niro PROPN disagreed nsubj
16 80 and and CCONJ Niro cc
17 84 executive executive ADJ president amod
18 94 vice vice NOUN president compound
19 99 president president NOUN Evans compound
20 109 Robert robert PROPN Evans compound
21 116 Evans evans PROPN Niro conj
22 122 disagreed disagree VERB disagreed ROOT
23 132 on on ADP disagreed prep
24 135 aspects aspect NOUN on pobj
25 143 of of ADP aspects prep
26 146 the the DET film det
27 150 film film NOUN of pobj
28 154 . . PUNCT disagreed 

In [6]:
# Module-level buffers filled by the on_match callback: matches whose pattern
# name starts with "entity" go to entity_spans, all others to concept_spans.
entity_spans = []
concept_spans = []
# not written to by any active code in this cell (its only use in on_match is
# commented out)
spans = []

def on_match(matcher, doc, i, matches):
    """Matcher callback: store the i-th match in the appropriate global list.

    Args:
        matcher: the matcher that fired (unused).
        doc: the document that was matched against.
        i: index of the current match inside ``matches``.
        matches: list of ``(match_id, start, end)`` triples.

    Side effects:
        Appends ``doc[start:end]`` to the global ``entity_spans`` when the
        pattern name (looked up via ``nlp.vocab.strings``) starts with
        "entity", otherwise to the global ``concept_spans``.
    """
    match_id, start, end = matches[i]
    span = doc[start:end]
    is_entity = nlp.vocab.strings[match_id].startswith("entity")
    target = entity_spans if is_entity else concept_spans
    target.append(span)


def create_patterns(**patterns):
    """Build two PhraseMatchers from named lists of phrases.

    Args:
        **patterns: maps a pattern name to a list of phrase strings. Every
            name must end in "2" or "3"; the suffix selects which of the two
            matchers receives the phrases.

    Returns:
        The tuple ``(matcher3, matcher2)`` — the matcher for names ending in
        "3" comes first.

    Raises:
        AssertionError: if a pattern name ends in neither "2" nor "3".
    """
    matcher2 = PhraseMatcher(nlp.vocab)
    matcher3 = PhraseMatcher(nlp.vocab)
    for name, phrases in patterns.items():
        phrase_docs = [nlp(phrase) for phrase in phrases]
        # single pre-formatted argument: same text as `print name, phrase_docs`
        print("%s %s" % (name, phrase_docs))
        assert name.endswith("2") or name.endswith("3")
        target = matcher2 if name.endswith("2") else matcher3
        target.add(name, on_match, *phrase_docs)

    return matcher3, matcher2

# Register the two custom boolean token attributes (default False) unless they
# already exist, so that re-running the cell does not try to register them twice.
# To start over: Token.remove_extension('is_input_ent')
for _ext_name in ('is_input_ent', 'is_input_concept'):
    if not Token.has_extension(_ext_name):
        Token.set_extension(_ext_name, default=False)

# fresh parse of the example text
doc3 = nlp(input_str)

#for t in doc3:
 #   print t.text, t.ent_iob_

# we can use a phrase matcher and create larger terminology lists
# also we will implement a call-back event listener to take an action
# when a match is found

# Terminology lists; the keyword names used below encode the kind of phrase
# (entity/concept) and the suffix that create_patterns uses to pick a matcher.
p1 = [u'Robert De Niro', u"John Fitzgerald Kennedy"]
p2 = [u'De Niro', u'Adolf Hitler']
p3 = [u'executive vice president']
p4 = [u'vice president', u'Vice president']

# create_patterns returns a tuple; keep the matchers as a list
matchers = list(
    create_patterns(entity3=p1, entity2=p2, concept3=p3, concept2=p4)
)
#nlp_patterns = [nlp(text) for text in p1]
#matcher = PhraseMatcher(nlp.vocab)
#matcher.add("entity2", on_match, *nlp_patterns)
#match = matcher(doc3)

concept3 [executive vice president]
concept2 [vice president, Vice president]
entity2 [De Niro, Adolf Hitler]
entity3 [Robert De Niro, John Fitzgerald Kennedy]


In [7]:
# Run every matcher over doc3. Calling a matcher triggers the on_match
# callback, which fills the global entity_spans / concept_spans lists.
for m in matchers:
    matches = m(doc3)
    print "Entities:"
    for s in entity_spans:
        print s
        # NOTE(review): presumably collapses the span into a single token
        # (spaCy 2.x Span.merge) — confirm against the installed version
        s.merge()
        # flag what is left of the span as a user-supplied entity
        for t in s:
            t._.is_input_ent = True
        
    
    print "Concepts:"    
    for s in concept_spans:
        print s 
        s.merge()
        # flag what is left of the span as a user-supplied concept
        for t in s:
            t._.is_input_concept = True
        
    # rebind both buffers to empty lists so the next matcher starts clean
    concept_spans = []
    entity_spans = []
    print "-"*50+"\n"

# list the named entities of doc3 (text, start/end token indices, label)
# after the merges above
for ne in doc3.ents:
    print ne.text, ne.start , ne.end, ne.label_

Entities:
Concepts:
--------------------------------------------------

Entities:
Concepts:
Vice president
--------------------------------------------------

Silvio Berlusconi 17 19 PERSON


In [8]:
def process_token(t, doc):
    """Return a function that maps one token of ``doc`` to an output string.

    Args:
        t: unused by the factory; the returned function takes its own token.
        doc: document whose ``ents`` are consumed, in order, each time a token
            tagged 'B' is processed.

    Returns:
        A one-argument callable. For a given token it returns:
          * the full text of the next entity in ``doc.ents`` (spaces replaced
            by underscores) when the token's ``ent_iob_`` is 'B';
          * ``None`` when ``ent_iob_`` is 'I' or ``pos_`` is 'PUNCT'/'SPACE';
          * the token text with underscores for spaces when
            ``_.is_input_ent`` is set;
          * the lower-cased token text with underscores for spaces when
            ``_.is_input_concept`` is set;
          * the token lemma otherwise.

    The callable keeps a running entity counter, so tokens are expected to be
    fed to it in document order.
    """
    next_ent = [0]  # one-element list so the closure can update the counter

    def _process_token(t):
        if t.ent_iob_ == 'B':
            # start of a named entity: emit the whole entity as one string
            merged = doc.ents[next_ent[0]].text.replace(" ", "_")
            next_ent[0] += 1
            return merged
        if t.ent_iob_ == 'I' or t.pos_ in ['PUNCT', 'SPACE']:
            # inside an entity already emitted, or punctuation/space: skip
            return None
        if t._.is_input_ent:
            # user-supplied entity: keep the case
            return t.text.replace(" ", "_")
        if t._.is_input_concept:
            # user-supplied concept: lower-case
            return t.text.lower().replace(" ", "_")
        # any other token: fall back to the lemma
        return t.lemma_

    return _process_token
   
#filter(None, map(process_token, [1,2,3]))


In [9]:
# Build one token normaliser for doc3, then turn each sentence into the list
# of strings it produces, dropping tokens that map to None or to an empty string.
pt = process_token(None, doc3)
string_list = [
    [piece for piece in (pt(tok) for tok in sent) if piece]
    for sent in doc3.sents
]

print(string_list)


[[u'lynch', u'-PRON-', u'be', u'not', u'a', u'good', u'idea'], [u'a', u'lynching', u'be', u'always', u'exciting'], [u'vice_president', u'Silvio_Berlusconi', u'quip', u'that', u'tendentiousness', u'be', u'a', u'concept']]


In [121]:


# Inline version of the token normalisation: one list of strings per sentence.
string_list = []
entity_counter = 0  # index into doc3.ents of the next entity to emit
for sent in doc3.sents:
    string = []
    for t in sent:
        if t.ent_iob_ == 'B':
            # start of a named entity: emit the whole entity as one string
            string.append(doc3.ents[entity_counter].text.replace(" ", "_"))
            entity_counter += 1
        elif t.ent_iob_ == 'I' or t.pos_ in ['PUNCT', 'SPACE']:
            # rest of an entity, punctuation or whitespace: nothing to emit
            pass
        elif t._.is_input_ent:
            # user-supplied entity: keep the case
            string.append(t.text.replace(" ", "_"))
        elif t._.is_input_concept:
            # user-supplied concept: lower-case
            string.append(t.text.lower().replace(" ", "_"))
        else:
            # ordinary token: use the lemma
            string.append(t.lemma_)

    string_list.append(string)

print(string_list)
    




[[u'lynch', u'-PRON-', u'be', u'not', u'a', u'good', u'idea'], [u'a', u'lynching', u'be', u'always', u'exciting'], [u'vice_president', u'Silvio_Berlusconi', u'quip', u'that', u'tendentiousness', u'be', u'a', u'concept']]


In [None]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)
# add match ID "HelloWorld" with no callback and one pattern: a token whose
# LOWER attribute is 'hello', then a token flagged IS_PUNCT, then a token
# whose LOWER attribute is 'world'
pattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]
matcher.add('HelloWorld', None, pattern)

doc = nlp(u'Hello world! Hello; world!')
# show each token with its POS tag
for tok in doc:
    print tok.text, tok.pos_

matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]  # the matched span
    # NOTE: under the Python 2 print statement used elsewhere in this
    # notebook, the parentheses below form a tuple, so a tuple is printed
    print(match_id, string_id, start, end, span.text)