In [2]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

import nltk
from nltk.stem.porter import *

from nltk.stem.snowball import SnowballStemmer

from spacy.matcher import Matcher


In [4]:
doc = nlp('"Let\'s go to N.Y.!"')
for token in doc:
    print(token.text)

"
Let
's
go
to
N.Y.
!
"


In [13]:
sentence = nlp(u"Google's monthly revenue is $6 million")

for token in sentence:
    print(token.text, end=' | ')

print('\n______________________________________________________')

for ent in sentence.ents:
    print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))

Google | 's | monthly | revenue | is | $ | 6 | million | 
______________________________________________________
Google - ORG - Companies, agencies, institutions, etc.
monthly - DATE - Absolute or relative dates or periods
$6 million - MONEY - Monetary values, including unit


In [14]:
doc = nlp(u'Over last few years USA generates $6 million revenue.')
displacy.render(doc, style='ent', jupyter=True)

In [18]:
document = nlp(u'Bangladesh is a beautiful country')
displacy.render(document, style='dep', jupyter=True, options={'distance': 110})

In [20]:
p_stemmer = PorterStemmer()
words = ['run','runner','running','ran','runs','easily','fairly']
for word in words:
    print(word+' --> '+p_stemmer.stem(word))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli


In [22]:
s_stemmer = SnowballStemmer(language='english')
words = ['run','runner','running','ran','runs','easily','fairly']
for word in words:
    print(word+' --> '+s_stemmer.stem(word))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fair


In [23]:

doc = nlp(u"He is a runner running in a competition because he loves to run since he ran today")

for token in doc:
    print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

He           PRON   1655312771067108281    he
is           AUX    10382539506755952630   be
a            DET    11901859001352538922   a
runner       NOUN   12640964157389618806   runner
running      VERB   12767647472892411841   run
in           ADP    3002984154512732771    in
a            DET    11901859001352538922   a
competition  NOUN   4661638505416061516    competition
because      SCONJ  16950148841647037698   because
he           PRON   1655312771067108281    he
loves        VERB   3702023516439754181    love
to           PART   3791531372978436496    to
run          VERB   12767647472892411841   run
since        SCONJ  10066841407251338481   since
he           PRON   1655312771067108281    he
ran          VERB   12767647472892411841   run
today        NOUN   11042482332948150395   today


In [24]:
print(nlp.Defaults.stop_words)

{'a', 'some', 'bottom', 'hers', 'around', 'the', 'made', 'these', 'used', 'get', 'latter', 'there', 'anyway', 'call', 'same', 'whether', 'whereby', 'together', 'all', 'done', 'often', 'about', 'top', 'part', 'thereupon', 'here', 'now', '‘re', 'most', 'did', 'should', 'nothing', 'one', 'well', "n't", 'back', "'re", 'herein', 'beforehand', 'none', 'were', 'except', 'because', 'whose', 'others', 'became', 'been', 'everyone', 'last', 'few', 'see', 'indeed', 'just', 'may', 'otherwise', 'via', 'upon', 'various', '’m', 'fifteen', 'twelve', 'due', '’d', 'between', 'and', 'an', 'your', 'using', 'before', 'no', 'someone', 'put', 'keep', 'thence', 'with', "'m", 'alone', 'out', 'him', 'every', 'herself', 'mostly', 'against', 'from', 'although', 'still', 'whole', '’ve', 'perhaps', 'further', 'after', 'behind', 'within', 'itself', 'meanwhile', 'onto', 'does', 'beyond', 'yet', 'empty', 'again', 'if', 'below', 'her', 'many', 'seem', 'above', 'ever', 'move', 'those', 'very', 'but', 'throughout', 'both'

In [27]:
nlp.vocab['myself'].is_stop

True

In [33]:
matcher = Matcher(nlp.vocab)
pattern_1 = [{'LOWER': 'solarpower'}]
pattern_2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
pattern_3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]

patterns = [pattern_1, pattern_2, pattern_3]
matcher.add('solarpower', patterns)

In [34]:
document = nlp(u'The Solar Power industry continues to grow as demand \
    for solarpower increases. Solar-power cars are gaining popularity.')
found_matches = matcher(document)
print(found_matches)

[(5703546853475899243, 1, 3), (5703546853475899243, 11, 12), (5703546853475899243, 14, 17)]


In [35]:
for match_id, start, end in found_matches:
    # get string representation
    string_id = nlp.vocab.strings[match_id]
    # get the matched span
    span = doc[start:end]       
    print(match_id, string_id, start, end, span.text)

5703546853475899243 solarpower 1 3 is a
5703546853475899243 solarpower 11 12 to
5703546853475899243 solarpower 14 17 he ran today
