In [2]:
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [4]:
# Load spaCy's small English pipeline; `nlp` is used later for the spaCy
# lemmatization cell.
nlp = spacy.load('en_core_web_sm')
# Fetch the NLTK resources the notebook relies on: the 'punkt' tokenizer
# models, the stop-word corpus and WordNet.
# NOTE(review): recent NLTK releases may look up 'punkt_tab' rather than
# 'punkt' inside word_tokenize -- confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
# Trailing expression of the cell, so its return value is what gets displayed.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/jorocca/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jorocca/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/jorocca/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Short two-sentence passage that the cells below run through the
# pre-processing steps. Adjacent literals are joined at compile time, so the
# value is a single string.
text = (
    'Text pre-processing is an essential task in Natural Language Processing (NLP). '
    'It helps to clean and prepare text data for further analysis.'
)

In [8]:
# Split the raw passage into word and punctuation tokens using NLTK's
# tokenizer (English is its default language, spelled out here).
tokens = word_tokenize(text, language='english')
print("Tokens: ", tokens)

Tokens:  ['Text', 'pre-processing', 'is', 'an', 'essential', 'task', 'in', 'Natural', 'Language', 'Processing', '(', 'NLP', ')', '.', 'It', 'helps', 'to', 'clean', 'and', 'prepare', 'text', 'data', 'for', 'further', 'analysis', '.']


In [9]:
# Fold every token to lower case so that later comparisons (e.g. against the
# stop-word list) are case-insensitive.
tokens = list(map(str.lower, tokens))
print("Lowercased Tokens: ", tokens)

Lowercased Tokens:  ['text', 'pre-processing', 'is', 'an', 'essential', 'task', 'in', 'natural', 'language', 'processing', '(', 'nlp', ')', '.', 'it', 'helps', 'to', 'clean', 'and', 'prepare', 'text', 'data', 'for', 'further', 'analysis', '.']


In [12]:
# NLTK's English stop-word list, held as a set for fast membership checks.
stop_words = {*stopwords.words('english')}
# Keep only the tokens that do not appear in the stop-word set; punctuation
# tokens are not filtered by this step.
tokens = list(filter(lambda tok: tok not in stop_words, tokens))
print('Tokens after stop word removal: ', tokens)

Tokens after stop word removal:  ['text', 'pre-processing', 'essential', 'task', 'natural', 'language', 'processing', '(', 'nlp', ')', '.', 'helps', 'clean', 'prepare', 'text', 'data', 'analysis', '.']


In [13]:
# Reduce each remaining token to its Porter stem. Stems are not necessarily
# dictionary words (the recorded output contains e.g. 'essenti').
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, tokens))
print("Stemmed Tokens: ", stemmed_tokens)

Stemmed Tokens:  ['text', 'pre-process', 'essenti', 'task', 'natur', 'languag', 'process', '(', 'nlp', ')', '.', 'help', 'clean', 'prepar', 'text', 'data', 'analysi', '.']


In [14]:
# Lemmatize the same filtered tokens with WordNet. No part-of-speech tag is
# passed, so the lemmatizer uses its default POS for every token
# (NOTE(review): that default is the noun reading per the NLTK docs, which
# would explain why 'processing' is left unchanged -- confirm).
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens))
print("Lemmatized Tokens: ", lemmatized_tokens)

Lemmatized Tokens:  ['text', 'pre-processing', 'essential', 'task', 'natural', 'language', 'processing', '(', 'nlp', ')', '.', 'help', 'clean', 'prepare', 'text', 'data', 'analysis', '.']


In [15]:
# Run the lower-cased passage through the spaCy pipeline, which tokenizes and
# lemmatizes in one pass.
doc = nlp(text.lower())
# Collect lemmas, skipping tokens whose text is an NLTK stop word and tokens
# that are not purely alphabetic (this is what removes punctuation here).
lemmatized_tokens_spacy = [
    tok.lemma_
    for tok in doc
    if not (tok.text in stop_words or not tok.is_alpha)
]
print('Lemmatized Tokens SpaCy:', lemmatized_tokens_spacy)

Lemmatized Tokens SpaCy: ['text', 'pre', 'processing', 'essential', 'task', 'natural', 'language', 'processing', 'nlp', 'help', 'clean', 'prepare', 'text', 'datum', 'analysis']
