# Tokenization

In [22]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer

In [7]:
# Sample sentence used throughout the notebook. The literal is split across
# lines with implicit string concatenation purely for readability; the
# resulting value is a single sentence.
data = (
    "Of all the ways to eat eggs, my favourite is the most fussy: devilled, "
    "the art of scooping out hard-boiled eggs and re-stuffing them with "
    "a jazzed-up yolk mixture."
)
print(data)

Of all the ways to eat eggs, my favourite is the most fussy: devilled, the art of scooping out hard-boiled eggs and re-stuffing them with a jazzed-up yolk mixture.


In [8]:
# Lower-casing: normalise the text so that tokens differing only in
# capitalisation (e.g. "Of" / "of") collapse to the same word.
# NOTE: this rebinds `data`, so the original-case sentence is no longer
# available under that name after this cell.
data = str.lower(data)
print(data)

of all the ways to eat eggs, my favourite is the most fussy: devilled, the art of scooping out hard-boiled eggs and re-stuffing them with a jazzed-up yolk mixture.


In [19]:
# Token definition: tokenize the sentence with NLTK's English tokenizer and
# collapse the result into a set, so `dico` holds each distinct token once.
# The count printed below is therefore the number of *unique* tokens
# (punctuation marks included), not the total number of tokens.
dico = set(
    word_tokenize(text=data, language="english")
)
print(f"Number of tokens: {len(dico)}")
print(sorted(dico))

Number of tokens: 28
[',', '.', ':', 'a', 'all', 'and', 'art', 'devilled', 'eat', 'eggs', 'favourite', 'fussy', 'hard-boiled', 'is', 'jazzed-up', 'mixture', 'most', 'my', 'of', 'out', 're-stuffing', 'scooping', 'the', 'them', 'to', 'ways', 'with', 'yolk']


In [23]:
# Stop-word removal: drop every token that appears in NLTK's English
# stop-word list.
# `stopwords.words` returns a list, so testing membership against it scans the
# list for every token. A set is built once for the lookups; `en_stopwords`
# itself is kept as the original list so any other use of it is unaffected.
# Punctuation tokens (',', '.', ':') are not stop words and are kept.
en_stopwords = stopwords.words('english')
en_stopwords_set = set(en_stopwords)
trimmed_dico = [mot for mot in dico if mot not in en_stopwords_set]
print(f"Number of tokens: {len(trimmed_dico)}")
print(sorted(trimmed_dico))

Number of tokens: 16
[',', '.', ':', 'art', 'devilled', 'eat', 'eggs', 'favourite', 'fussy', 'hard-boiled', 'jazzed-up', 'mixture', 're-stuffing', 'scooping', 'ways', 'yolk']


In [25]:
# Stemming: reduce each remaining token to its stem with the English
# Snowball stemmer. The result is a list with one stem per input token, in
# the same order as `trimmed_dico`; it is sorted only for display.
stemmer = SnowballStemmer(language='english')
stemmed_dico = list(map(stemmer.stem, trimmed_dico))
print(sorted(stemmed_dico))

[',', '.', ':', 'art', 'devil', 'eat', 'egg', 'favourit', 'fussi', 'hard-boil', 'jazzed-up', 'mixtur', 're-stuf', 'scoop', 'way', 'yolk']
