In [28]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, TreebankWordTokenizer, TweetTokenizer
from nltk.tokenize.mwe import MWETokenizer
from textblob import TextBlob
import spacy
from gensim.utils import simple_preprocess
from tensorflow.keras.preprocessing.text import text_to_word_sequence
import string

In [29]:
# Fetch the NLTK data packages, in the same order as before.
for resource in ("punkt_tab", "punkt"):
    nltk.download(resource)

# Left as the cell's final bare expression so its return value is still displayed.
# NOTE(review): no tagger call appears in the cells shown here — confirm this
# package is still needed before removing it.
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [30]:
# Load the spaCy pipeline, installing it on first use.
SPACY_MODEL = "en_core_web_sm"
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # The model package is not installed yet. Install it with the interpreter
    # that runs this kernel (a bare "python" on PATH may be a different
    # environment), and fail loudly if the download does not succeed instead
    # of ignoring the exit status.
    import importlib
    import subprocess
    import sys

    subprocess.run(
        [sys.executable, "-m", "spacy", "download", SPACY_MODEL],
        check=True,
    )
    # Let the running process see the freshly installed package.
    importlib.invalidate_caches()
    nlp = spacy.load(SPACY_MODEL)

In [31]:
# Sample text shared by every tokenizer cell: it contains emoji, contractions
# and repeated punctuation.
text = (
    "Machine learning 🤖🔥 is evolving rapidly! "
    "It's not just about models, but data too. "
    "Don't ignore feature selection!!!"
)

Word Tokenization – Splits text into words; useful for text analysis, NLP tasks, and search engines.

In [32]:
# Split the sample text into word-level tokens with NLTK's word_tokenize.
word_tokens = word_tokenize(text)
print(f"Step 4 - Word Tokenization: {word_tokens}")

Step 4 - Word Tokenization: ['Machine', 'learning', '🤖🔥', 'is', 'evolving', 'rapidly', '!', 'It', "'s", 'not', 'just', 'about', 'models', ',', 'but', 'data', 'too', '.', 'Do', "n't", 'ignore', 'feature', 'selection', '!', '!', '!']


Sentence Tokenization – Segments text into sentences; crucial for sentiment analysis, chatbots, and summarization.

In [18]:
# Segment the sample text into sentences with NLTK's sent_tokenize.
sentence_tokens = sent_tokenize(text)
print(f"Step 5 - Sentence Tokenization: {sentence_tokens}")

Step 5 - Sentence Tokenization: ['Machine learning 🤖🔥 is evolving rapidly!', "It's not just about models, but data too.", "Don't ignore feature selection!!", '!']


Punctuation-based Tokenization – Extracts punctuation marks; helps in syntactic analysis and emotion detection.

In [19]:
# Collect, in order of appearance, every character of the sample text that is
# listed in string.punctuation (ASCII punctuation only; words are discarded).
punctuation_tokens = [symbol for symbol in text if symbol in string.punctuation]
print(f"Step 6 - Punctuation-based Tokenization: {punctuation_tokens}")


Step 6 - Punctuation-based Tokenization: ['!', "'", ',', '.', "'", '!', '!', '!']


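Treebank Word Tokenizer – Handles contractions and complex word forms; used in POS tagging and parsing. It expects one sentence at a time, so on multi-sentence text a mid-text period can stay attached to the preceding word (see 'too.' in the output below).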

In [20]:
# Tokenize the whole sample text with NLTK's TreebankWordTokenizer.
# Note: in the output recorded below, the mid-text period stays attached to
# the preceding word ('too.').
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(text)
print(f"Step 7 - Treebank Word Tokenization: {treebank_tokens}")

Step 7 - Treebank Word Tokenization: ['Machine', 'learning', '🤖🔥', 'is', 'evolving', 'rapidly', '!', 'It', "'s", 'not', 'just', 'about', 'models', ',', 'but', 'data', 'too.', 'Do', "n't", 'ignore', 'feature', 'selection', '!', '!', '!']


Tweet Tokenizer – Processes social media text, handling emojis and hashtags; valuable for sentiment analysis and trend tracking.

In [21]:
# Tokenize with NLTK's TweetTokenizer using its default settings.
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(text)
print(f"Step 8 - Tweet Tokenization: {tweet_tokens}")

Step 8 - Tweet Tokenization: ['Machine', 'learning', '🤖', '🔥', 'is', 'evolving', 'rapidly', '!', "It's", 'not', 'just', 'about', 'models', ',', 'but', 'data', 'too', '.', "Don't", 'ignore', 'feature', 'selection', '!', '!', '!']


Multi-Word Expression Tokenizer – Merges predefined multi-word phrases (here, "feature selection") into single tokens; useful in named entity recognition and domain-specific NLP.

In [22]:
# Re-join the registered phrase ("feature", "selection") into a single token.
# The tokenizer works on an already-tokenized list, so the text is passed
# through word_tokenize first.
mwe_tokenizer = MWETokenizer()
mwe_tokenizer.add_mwe(("feature", "selection"))
mwe_tokens = mwe_tokenizer.tokenize(word_tokenize(text))
print(f"Step 9 - Multi-Word Expression Tokenization: {mwe_tokens}")

Step 9 - Multi-Word Expression Tokenization: ['Machine', 'learning', '🤖🔥', 'is', 'evolving', 'rapidly', '!', 'It', "'s", 'not', 'just', 'about', 'models', ',', 'but', 'data', 'too', '.', 'Do', "n't", 'ignore', 'feature_selection', '!', '!', '!']


TextBlob Tokenizer – Provides simple tokenization; beneficial for quick prototyping in NLP projects.

In [23]:
# Wrap the sample text in a TextBlob and read its word list.
blob = TextBlob(text)
textblob_tokens = blob.words
print(f"Step 10 - TextBlob Word Tokenization: {textblob_tokens}")

Step 10 - TextBlob Word Tokenization: ['Machine', 'learning', '🤖🔥', 'is', 'evolving', 'rapidly', 'It', "'s", 'not', 'just', 'about', 'models', 'but', 'data', 'too', 'Do', "n't", 'ignore', 'feature', 'selection']


spaCy Tokenizer – Fast and efficient; used in large-scale NLP applications like chatbots and recommendation systems.

In [24]:
# Run the loaded spaCy pipeline and keep only each token's surface text.
doc = nlp(text)
spacy_tokens = [tok.text for tok in doc]
print(f"Step 11 - spaCy Tokenization: {spacy_tokens}")


Step 11 - spaCy Tokenization: ['Machine', 'learning', '🤖', '🔥', 'is', 'evolving', 'rapidly', '!', 'It', "'s", 'not', 'just', 'about', 'models', ',', 'but', 'data', 'too', '.', 'Do', "n't", 'ignore', 'feature', 'selection', '!', '!', '!']


Gensim Tokenizer – Optimized for topic modeling and word embedding preprocessing.

In [25]:
# Tokenize with gensim's simple_preprocess using its default settings.
gensim_tokens = simple_preprocess(text)
print(f"Step 12 - Gensim Tokenization: {gensim_tokens}")

Step 12 - Gensim Tokenization: ['machine', 'learning', 'is', 'evolving', 'rapidly', 'it', 'not', 'just', 'about', 'models', 'but', 'data', 'too', 'don', 'ignore', 'feature', 'selection']


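Keras Tokenizer – Splits text into a list of lowercased words with most punctuation stripped, a common first step before converting text to integer sequences for deep learning models; applied in text classification and sentiment analysis.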

In [33]:
# Split the sample text into a word list with Keras' text_to_word_sequence.
keras_tokens = text_to_word_sequence(text)
print(f"Step 13 - Keras Tokenization: {keras_tokens}")


Step 13 - Keras Tokenization: ['machine', 'learning', '🤖🔥', 'is', 'evolving', 'rapidly', "it's", 'not', 'just', 'about', 'models', 'but', 'data', 'too', "don't", 'ignore', 'feature', 'selection']
