## Stemming

In [2]:
import nltk

In [3]:
from nltk.stem.porter import PorterStemmer

In [4]:
p_stemmer = PorterStemmer()

In [14]:
# Sample words for the stemmer comparison: several forms of "run" plus
# "easily", "fairly" and "fairness".
words = ['run', 'runner', 'ran', 'runs', 'easily', 'fairly', 'fairness']

In [15]:
# Print every sample word next to the stem produced by the Porter stemmer.
for word in words:
    print(f'{word}------>{p_stemmer.stem(word)}')

run------>run
runner------>runner
ran------>ran
runs------>run
easily------>easili
fairly------>fairli
fairness------>fair


In [8]:
from nltk.stem.snowball import SnowballStemmer

In [10]:
# Snowball stemmer configured for English; used below on the same word list
# that was run through the Porter stemmer.
s_stemmer = SnowballStemmer(language='english')

In [16]:
# Print every sample word next to the stem produced by the Snowball stemmer.
for word in words:
    print(f'{word}----->{s_stemmer.stem(word)}')

run----->run
runner----->runner
ran----->ran
runs----->run
easily----->easili
fairly----->fair
fairness----->fair


In [17]:
# Second sample set. NOTE: this rebinds `words`, so re-running the earlier
# stemming cells after this one would iterate over this list instead.
words = ['generous', 'generation', 'generously', 'generate']

In [18]:
# Run the Snowball stemmer over the current `words` list, one row per word.
for word in words:
    print(f'{word}----->{s_stemmer.stem(word)}')

generous----->generous
generation----->generat
generously----->generous
generate----->generat


## Lemmatization

In [1]:
import spacy

In [2]:
# Load the 'en_core_web_sm' spaCy pipeline; `nlp` is reused by the
# lemmatization and stop-word cells that follow.
nlp = spacy.load('en_core_web_sm')

In [6]:
# Sample sentence containing several forms of "run" (runner, running, run, ran)
# so their lemmas can be compared.
doc = nlp(u"I am a runner running in a race because I love to run since I ran today.")

In [8]:
# One row per token: text, POS tag, numeric lemma value, lemma string.
# Fields are separated by space-tab-space.
for token in doc:
    print(token.text, token.pos_, token.lemma, token.lemma_, sep=' \t ')

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 VERB 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 ADP 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 ADP 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today
. 	 PUNCT 	 12646065887601541794 	 .


In [10]:
def show_lemmas(text):
    """Print one aligned row per token.

    Each row shows the token's text, its POS tag, its numeric lemma value
    (``token.lemma``) and its lemma string (``token.lemma_``).

    Args:
        text: Iterable of token-like objects exposing ``text``, ``pos_``,
            ``lemma`` and ``lemma_`` (in this notebook, the result of
            calling ``nlp`` on a string).

    Returns:
        None. Rows are written to standard output.
    """
    # Column widths: 12 for the text, 6 for the POS tag, 22 (explicitly
    # left-aligned, since it is a number) for the lemma value; the lemma
    # string comes last and is not padded.
    row_template = '{:12} {:6} {:<22} {}'
    for tok in text:
        print(row_template.format(tok.text, tok.pos_, tok.lemma, tok.lemma_))

In [12]:
show_lemmas(doc)

I            PRON   561228191312463089     -PRON-
am           VERB   10382539506755952630   be
a            DET    11901859001352538922   a
runner       NOUN   12640964157389618806   runner
running      VERB   12767647472892411841   run
in           ADP    3002984154512732771    in
a            DET    11901859001352538922   a
race         NOUN   8048469955494714898    race
because      ADP    16950148841647037698   because
I            PRON   561228191312463089     -PRON-
love         VERB   3702023516439754181    love
to           PART   3791531372978436496    to
run          VERB   12767647472892411841   run
since        ADP    10066841407251338481   since
I            PRON   561228191312463089     -PRON-
ran          VERB   12767647472892411841   run
today        NOUN   11042482332948150395   today
.            PUNCT  12646065887601541794   .


In [13]:
doc2 = nlp(u'I saw ten mice today.')

In [15]:
show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
ten          NUM    7970704286052693043    ten
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
.            PUNCT  12646065887601541794   .


## Stop Words

In [18]:
print(nlp.Defaults.stop_words)

{'sixty', 'done', 'full', 'again', 'whatever', 'about', 'doing', 'beyond', 'one', 'both', 'ten', 'two', 'get', 'would', 'you', 'was', 'anywhere', 'show', 'whither', 'must', 'therein', 'however', 'four', 'thereupon', 'down', 'further', 'less', 'same', 'with', 'some', 'seemed', 'at', 'made', 'other', 'noone', 'yours', 'for', 'should', 'these', 'himself', 'themselves', 'alone', 'except', 'formerly', 'does', 'has', 'since', 'their', 'towards', 'perhaps', 'everywhere', 'else', 'there', 'during', 'go', 'the', 'any', 'anything', 'if', 'meanwhile', 'top', 'hereby', 'am', 'also', 'three', 'or', 'then', 'anyone', 'nor', 'six', 'forty', 'nine', 'former', 'indeed', 'moreover', 'than', 'whereby', 'never', 'either', 'seeming', 'eleven', 'wherever', 'sometime', 'and', 'to', 'really', 'ours', 'neither', 'before', 'hence', 'above', 'mine', 'nowhere', 'on', 'myself', 'thru', 'under', 'each', 'whoever', 'make', 'five', 'yourselves', 'sometimes', 'something', 'because', 'much', 'after', 'otherwise', 'agai

In [19]:
len(nlp.Defaults.stop_words)

305

In [20]:
nlp.vocab['is']

<spacy.lexeme.Lexeme at 0x21b5f757728>

In [21]:
nlp.vocab['is'].is_stop

True

In [22]:
# Add 'btw' to the pipeline's default stop-word set (the set is modified in place).
nlp.Defaults.stop_words.add('btw')

In [23]:
# Also flag the 'btw' vocabulary entry itself as a stop word.
nlp.vocab['btw'].is_stop = True

In [24]:
len(nlp.Defaults.stop_words)

306

In [25]:
nlp.vocab['btw'].is_stop

True

In [26]:
# Drop 'beyond' from the default stop-word set. `discard` is used instead of
# `remove` so that re-running this cell in the same kernel is a no-op rather
# than a KeyError once the word is already gone.
nlp.Defaults.stop_words.discard('beyond')

In [28]:
# Clear the stop-word flag on the 'beyond' vocabulary entry.
nlp.vocab['beyond'].is_stop = False

In [29]:
len(nlp.Defaults.stop_words)

305

In [30]:
nlp.vocab['beyond'].is_stop

False