In [4]:
import nltk
import spacy

In [5]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

In [11]:
words = ['eating', 'eats', 'ate', 'adjustable', 'rafting', 'ability', 'better']

for word in words:
    print(word, "|", stemmer.stem(word))

# see how ate is still ate!
# using stemmer can be efficient, it is faster, does not require lang knowledge.

eating | eat
eats | eat
ate | ate
adjustable | adjust
rafting | raft
ability | abil
better | better


In [12]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("eating eats eat ate adjustable rafting ability meeting better")

for token in doc:
    print(token, "|", token.lemma_)

eating | eat
eats | eat
eat | eat
ate | eat
adjustable | adjustable
rafting | raft
ability | ability
meeting | meet
better | well


In [15]:
doc = nlp("Ali literally talked for three damn hours about nutrition, chickens, and shit. Ain't nothing else!")

for token in doc:
    print(token, "-->", token.lemma_)

Ali --> Ali
literally --> literally
talked --> talk
for --> for
three --> three
damn --> damn
hours --> hour
about --> about
nutrition --> nutrition
, --> ,
chickens --> chicken
, --> ,
and --> and
shit --> shit
. --> .
Ai --> ai
n't --> not
nothing --> nothing
else --> else
! --> !


In [17]:
# let's customize the lemmatization here!

doc = nlp("Bro, do you wanna hang out? No brah I'm tired as hell.")

for token in doc:
    print(token, "-->", token.lemma_)

Bro --> Bro
, --> ,
do --> do
you --> you
wanna --> wanna
hang --> hang
out --> out
? --> ?
No --> no
brah --> brah
I --> I
'm --> be
tired --> tired
as --> as
hell --> hell
. --> .


In [20]:
ar = nlp.get_pipe("attribute_ruler")

ar.add([[{'TEXT':'Bro'}], [{'TEXT':'Brah'}]], {'LEMMA':'Brother' }) # here the magic happens, the customization!

doc = nlp("Bro, do you wanna hang out? No brah I'm tired as hell.")

for token in doc:
    print(token, "-->", token.lemma_)

Bro --> Brother
, --> ,
do --> do
you --> you
wanna --> wanna
hang --> hang
out --> out
? --> ?
No --> no
brah --> brah
I --> I
'm --> be
tired --> tired
as --> as
hell --> hell
. --> .


In [26]:
doc = nlp("Elon is a whore of Trump's, they both can suck my ass.")

for token in doc:
    print(token, '---->', token.pos_, "|", spacy.explain(token.pos_))

Elon ----> PROPN | proper noun
is ----> AUX | auxiliary
a ----> DET | determiner
whore ----> NOUN | noun
of ----> ADP | adposition
Trump ----> PROPN | proper noun
's ----> PART | particle
, ----> PUNCT | punctuation
they ----> PRON | pronoun
both ----> PRON | pronoun
can ----> AUX | auxiliary
suck ----> VERB | verb
my ----> PRON | pronoun
ass ----> NOUN | noun
. ----> PUNCT | punctuation


In [29]:
doc = nlp("Elon is a whore of Trump's, they both can suck my ass.")

for token in doc:
    print(token, '---->', token.pos_, "|", spacy.explain(token.pos_), "|", token.tag_, spacy.explain(token.tag_))

Elon ----> PROPN | proper noun | NNP noun, proper singular
is ----> AUX | auxiliary | VBZ verb, 3rd person singular present
a ----> DET | determiner | DT determiner
whore ----> NOUN | noun | NN noun, singular or mass
of ----> ADP | adposition | IN conjunction, subordinating or preposition
Trump ----> PROPN | proper noun | NNP noun, proper singular
's ----> PART | particle | POS possessive ending
, ----> PUNCT | punctuation | , punctuation mark, comma
they ----> PRON | pronoun | PRP pronoun, personal
both ----> PRON | pronoun | DT determiner
can ----> AUX | auxiliary | MD verb, modal auxiliary
suck ----> VERB | verb | VB verb, base form
my ----> PRON | pronoun | PRP$ pronoun, possessive
ass ----> NOUN | noun | NN noun, singular or mass
. ----> PUNCT | punctuation | . punctuation mark, sentence closer


In [38]:
thg_text = "The Hunger Games are a series of young adult dystopian novels written by American author Suzanne Collins. The series consists of a trilogy that follows teenage protagonist Katniss Everdeen, and two prequels. The Hunger Games universe is a dystopia set in Panem, a North American country consisting of the wealthy Capitol and 13 districts in varying states of poverty."

doc = nlp(thg_text)

for token in doc:
    if token.pos_ in ['SPACE', 'X', 'PUNCT']:
        print(token, '---->', token.pos_, "|", spacy.explain(token.pos_), "|", token.tag_, spacy.explain(token.tag_))

. ----> PUNCT | punctuation | . punctuation mark, sentence closer
, ----> PUNCT | punctuation | , punctuation mark, comma
. ----> PUNCT | punctuation | . punctuation mark, sentence closer
, ----> PUNCT | punctuation | , punctuation mark, comma
. ----> PUNCT | punctuation | . punctuation mark, sentence closer


In [41]:
# how to use the tagger to remove the punctuation marks?

doc = nlp(thg_text)

filtered_tokens = [] 

for token in doc:
    if token.pos_ in ['SPACE', 'X', 'PUNCT']:
        filtered_tokens.append(token) 

filtered_tokens

[., ,, ., ,, .]

- NER is used with searching.
