<h3>Stemming in NLTK</h3>

In [1]:
# Presumably a quick check that the kernel is up and executing cells.
print("hello")

hello


In [2]:
import nltk
import spacy

In [4]:
# Create the NLTK Porter stemmer instance that the following cell uses to stem a word list.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [8]:
# Sample vocabulary: several inflections of "eat" plus a few derived forms.
words = [
    "eating", "eats", "eat", "ate",
    "adjustable", "rafting", "ability", "meeting",
]

# Print every word next to its Porter stem.
# As the output shows, the stemmer makes mistakes: "ate" is kept as "ate"
# (maybe it should become "eat", maybe it should stay "ate"), and it would not
# work this out inside a sentence either.
# "ability" is reduced to "abil", which is also wrong.
for word in words:
    print(word, stemmer.stem(word), sep=" | ")

eating | eat
eats | eat
eat | eat
ate | ate
adjustable | adjust
rafting | raft
ability | abil
meeting | meet


In [10]:
# NLTK offers other stemmers as well, e.g. Snowball. For the sample words the result is
# the same as with Porter; there are probably differences between the two, but they are
# not needed here (author's note: NLTK stemming does not work well anyway).
from nltk.stem import SnowballStemmer
snowball = SnowballStemmer(language='english')

In [11]:
# Same sample words as in the Porter stemmer cell, so both outputs can be compared.
words = [
    "eating", "eats", "eat", "ate",
    "adjustable", "rafting", "ability", "meeting",
]

# Print every word next to its Snowball stem.
for word in words:
    print(f"{word} | {snowball.stem(word)}")

eating | eat
eats | eat
eat | eat
ate | ate
adjustable | adjust
rafting | raft
ability | abil
meeting | meet


<h3>Lemmatization in Spacy</h3>

In [12]:
import spacy

In [19]:
# `nlp` is the language-model object: spaCy's pretrained small English pipeline,
# loaded as-is (nothing is trained in this notebook).
nlp = spacy.load("en_core_web_sm")

# Lemmatize the sample words from the stemming cells, plus "better".
# The result looks right: e.g. "ate" -> "eat", and "ability" is left intact.
doc = nlp(
    "eating eats eat ate adjustable rafting ability meeting better"
)
for token in doc:
    print(token, token.lemma_, sep="  |  ")

eating  |  eat
eats  |  eat
eat  |  eat
ate  |  eat
adjustable  |  adjustable
rafting  |  raft
ability  |  ability
meeting  |  meeting
better  |  well


In [21]:
# spaCy keeps an internal mapping (hashing): each string is assigned an integer.
# `token.lemma_` is the lemma text and `token.lemma` is its integer ID; the ID is
# unique per lemma, so tokens that share a lemma print the same number.
for token in doc:
    print(token, token.lemma_, token.lemma, sep="  |  ")

eating  |  eat  |  9837207709914848172
eats  |  eat  |  9837207709914848172
eat  |  eat  |  9837207709914848172
ate  |  eat  |  9837207709914848172
adjustable  |  adjustable  |  6033511944150694480
rafting  |  raft  |  7154368781129989833
ability  |  ability  |  11565809527369121409
meeting  |  meeting  |  14798207169164081740
better  |  well  |  4525988469032889948


In [23]:
# Lemmatize a full sentence; the result looks right here too
# (e.g. "talked"/"talking" -> "talk", "became" -> "become").
doc = nlp(
    "Mando talked for 3 hours although talking isn't his thing he became talkative"
)
for token in doc:
    print(token, token.lemma_, sep="  |  ")

Mando  |  Mando
talked  |  talk
for  |  for
3  |  3
hours  |  hour
although  |  although
talking  |  talk
is  |  be
n't  |  not
his  |  his
thing  |  thing
he  |  he
became  |  become
talkative  |  talkative


<h3>Customizing lemmatizer</h3>

In [26]:
# Show the names of the components that make up the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [28]:
# "Bro" and "Brah" really mean "brother". We know that, but because they are slang
# the model does not: it leaves them as "bro" / "Brah".
# That is why you customize the pipeline and teach the pre-trained model extra rules yourself.
doc = nlp(
    "Bro, you wanna go? Brah, don't say no! I am exhausted"
)
for token in doc:
    print(token.text, token.lemma_, sep=" | ")

Bro | bro
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brah
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust


In [31]:
# To take a specific component (stage/step) out of the pipeline as an object, use get_pipe.
ar = nlp.get_pipe('attribute_ruler')

# Add a custom rule to the attribute ruler: tokens whose text is exactly "Bro" or "Brah"
# get the lemma "Brother". Matching is on TEXT, so other casings (e.g. "bro") are not covered.
# The guard keeps this cell idempotent: `add` appends a new rule on every call, so
# re-running the cell without it would pile up duplicate rules on the shared `nlp` object.
slang_lemma = {"LEMMA": "Brother"}
if not any(rule["attrs"] == slang_lemma for rule in ar.patterns):
    ar.add([[{"TEXT": "Bro"}], [{"TEXT": "Brah"}]], slang_lemma)

# Re-run the same sentence to confirm the new lemmas.
doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")
for token in doc:
    print(token.text, "|", token.lemma_)

Bro | Brother
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brother
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust


In [32]:
# Nothing was added to the pipeline as a component (stage/step);
# only an extra rule was added inside the existing attribute ruler.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [33]:
# Token at index 6 of the most recently processed doc (the output shows it is "Brah").
doc[6]

Brah

In [34]:
# Lemma of the token at index 6 ("Brah"); the custom attribute-ruler rule makes it 'Brother'.
doc[6].lemma_

'Brother'