# Stemming

In [1]:
import nltk

In [2]:
from nltk.stem.porter import PorterStemmer

In [3]:
# Build a single Porter stemmer instance; the loop below reuses it for every word.
p_stemmer = PorterStemmer()

In [4]:
# Sample vocabulary: several forms related to "run" plus two "-ly" adverbs.
words = ['run','runner','ran','runs','easily','fairly']

In [7]:
# Print every sample word next to the stem the Porter stemmer assigns to it.
for word in words:
    print(word, p_stemmer.stem(word), sep=' ----> ')

run ----> run
runner ----> runner
ran ----> ran
runs ----> run
easily ----> easili
fairly ----> fairli


In [8]:
from nltk.stem.snowball import SnowballStemmer

In [9]:
# Snowball stemmer for English; applied below to the same words as the Porter stemmer.
s_stemmer = SnowballStemmer(language='english')

In [11]:
# Print every sample word next to the stem the Snowball stemmer assigns to it.
for word in words:
    print(word, s_stemmer.stem(word), sep=' ----> ')

run ----> run
runner ----> runner
ran ----> ran
runs ----> run
easily ----> easili
fairly ----> fair


In [15]:
# NOTE: rebinds `words` (the earlier "run" list is no longer reachable under this name).
# New sample: "generous"/"generously" alongside "generation"/"generate".
words = ['generous','generation','generously','generate']

In [16]:
# Stem the second word list with the Snowball stemmer, one "word ----> stem" line each.
for word in words:
    print(word, s_stemmer.stem(word), sep=' ----> ')

generous ----> generous
generation ----> generat
generously ----> generous
generate ----> generat


# Lemmatization

In [18]:
import spacy

In [19]:
# Load the 'en_core_web_sm' pipeline; the model package must be installed separately from spaCy itself.
nlp = spacy.load('en_core_web_sm')

In [20]:
# Run the pipeline over one sentence that contains several forms of "run".
doc1 = nlp("I am a runner running in a race because I love to run since I ran today")

In [21]:
# One row per token: surface text, coarse POS tag, numeric lemma id, lemma string.
# Columns are separated by a space-padded tab.
for token in doc1:
    print(token.text, token.pos_, token.lemma, token.lemma_, sep=' \t ')

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [22]:
def show_lemmas(text):
    """Print an aligned lemma table, one row per token.

    Args:
        text: An iterable of token-like objects exposing ``text``, ``pos_``,
            ``lemma`` and ``lemma_`` (for example the ``Doc`` returned by
            ``nlp(...)``).

    Returns:
        None. Rows are written to standard output.

    Column layout: token text padded to 12 characters, POS tag padded to 6,
    numeric lemma id left-aligned in 22, then the lemma string unpadded.
    """
    row_template = '{:12} {:6} {:<22} {}'
    for tok in text:
        print(row_template.format(tok.text, tok.pos_, tok.lemma, tok.lemma_))

In [23]:
show_lemmas(doc1)

I            PRON   561228191312463089     -PRON-
am           AUX    10382539506755952630   be
a            DET    11901859001352538922   a
runner       NOUN   12640964157389618806   runner
running      VERB   12767647472892411841   run
in           ADP    3002984154512732771    in
a            DET    11901859001352538922   a
race         NOUN   8048469955494714898    race
because      SCONJ  16950148841647037698   because
I            PRON   561228191312463089     -PRON-
love         VERB   3702023516439754181    love
to           PART   3791531372978436496    to
run          VERB   12767647472892411841   run
since        SCONJ  10066841407251338481   since
I            PRON   561228191312463089     -PRON-
ran          VERB   12767647472892411841   run
today        NOUN   11042482332948150395   today


# Stop words

In [24]:
# Dump the pipeline's default stop-word set. It is a set, so the printed
# order is arbitrary and can differ between runs.
print(nlp.Defaults.stop_words)

{'what', 'five', 'name', 'whoever', 'onto', 'everywhere', "'re", 'seemed', 'whereby', 'was', 'fifty', 'through', 'as', 'mostly', 'four', 'seeming', 'himself', 'ours', 'somehow', 'hers', 'in', 'they', 'whether', 'give', 'at', 'former', 'whole', 'same', 'last', 'serious', 'too', 'six', 'even', 'around', 'than', 'never', 'since', 'other', 'eight', 'always', 'hereby', 'those', 'side', 'a', 'third', 'whatever', 'more', 'that', 'anywhere', 'ca', 'still', 'nevertheless', 'without', 'using', 'meanwhile', 'does', 'afterwards', '‘ve', 'by', 'thereupon', 'should', 'thru', 'often', 'everyone', 'sixty', 'make', 'anything', 'once', 'least', 'her', 'three', "n't", 'n’t', 'therefore', 'yet', 'nothing', 'next', 'its', 'thereby', 'be', 'when', 'your', 'alone', 'over', 'hereafter', 'namely', 'others', 'hundred', 'nowhere', 'wherever', 'doing', 'have', 'wherein', 'whereupon', 'do', 'somewhere', 'another', 'but', 'whence', 'only', 'how', 'whereafter', 'please', 'take', 'about', '‘d', 'themselves', 'under',

In [26]:
nlp.vocab['is'].is_stop

True

In [27]:
# Register 'btw' as a stop word (use lowercase).
# Adding it to the defaults set is not enough on its own: a lexeme that is
# already cached in the vocab keeps its old is_stop flag, so set the flag
# explicitly as well.
nlp.Defaults.stop_words.add('btw')
nlp.vocab['btw'].is_stop = True

In [30]:
nlp.vocab['btw'].is_stop

True

In [31]:
len(nlp.Defaults.stop_words)

327

In [34]:
# Undo the custom stop word: drop 'btw' from the defaults set and clear the
# lexeme flag so the set and nlp.vocab['btw'].is_stop stay in agreement.
# discard() (rather than remove()) keeps this cell safe to re-run, since it
# does not raise KeyError when 'btw' is already gone.
nlp.Defaults.stop_words.discard('btw')
nlp.vocab['btw'].is_stop = False

In [35]:
len(nlp.Defaults.stop_words)

326