In [16]:
# Importing Libraries

import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize import MWETokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [17]:
# Fetch the NLTK data packages used by this notebook ('punkt', 'wordnet', 'stopwords').
for resource_name in ('punkt', 'wordnet'):
    nltk.download(resource_name)
# Left as the cell's final bare expression so its return value is still displayed.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hrish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hrish\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hrish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
# Example text: the tokenizers below consume it directly, and the stemmers and
# lemmatizer work on tokens derived from it.
sample_sentence = (
    "NLTK is a leading platform for building Python programs "
    "to work with human language data."
)

### Tokenization

In [19]:
# Whitespace tokenizer: str.split() with no arguments splits on runs of whitespace,
# so punctuation stays attached to the neighbouring word (the last token is 'data.').
whitespace_tokens = sample_sentence.split()

In [20]:
# Punctuation-based tokenizer: keep every run of word characters as a token.
# Text the pattern does not match (such as the trailing full stop) is left out
# of the result.
word_run_pattern = r'\w+'
punct_tokenizer = nltk.RegexpTokenizer(word_run_pattern)
punct_tokens = punct_tokenizer.tokenize(sample_sentence)

In [21]:
# Treebank tokenizer: NLTK's TreebankWordTokenizer applied to the sample sentence.
# In the recorded output the sentence-final '.' comes out as a separate token.
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(sample_sentence)

In [22]:
# Tweet tokenizer: NLTK's TweetTokenizer constructed with its default settings.
# In the recorded output the sentence-final '.' comes out as a separate token.
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(sample_sentence)

In [23]:
# MWE (multi-word expression) tokenizer: register the pair ('Python', 'programs')
# so that those two consecutive tokens are merged into the single token
# 'Python_programs'.
mwe_tokenizer = MWETokenizer()
mwe_tokenizer.add_mwe(('Python', 'programs'))
# The input is the plain whitespace split, so the final token keeps its
# trailing period ('data.').
mwe_tokens = mwe_tokenizer.tokenize(sample_sentence.split())

### Stemming

In [24]:
# Porter stemmer: stem each token produced by the punctuation-based tokenizer.
porter_stemmer = PorterStemmer()
porter_stems = list(map(porter_stemmer.stem, punct_tokens))

In [25]:
# Snowball stemmer configured for English, run over the same
# punctuation-based tokens as the Porter stemmer.
snowball_stemmer = SnowballStemmer('english')
snowball_stems = list(map(snowball_stemmer.stem, punct_tokens))

### Lemmatization

In [26]:
# WordNet lemmatizer applied to each token from the punctuation-based tokenizer.
# NOTE(review): no `pos` argument is supplied, so the lemmatizer's default part
# of speech is used for every token - confirm this is intended for verb forms
# such as 'leading' and 'building'.
wordnet_lemmatizer = WordNetLemmatizer()
lemmas = list(map(wordnet_lemmatizer.lemmatize, punct_tokens))

### Print the Results

In [27]:
# Echo the input sentence, then print every result list grouped under its
# section heading. Each heading begins with "\n" so a blank line separates
# one section from the next.
print("Original Sentence:", sample_sentence)

report_sections = [
    ("\nTokenization Results:", [
        ("Whitespace Tokenizer:", whitespace_tokens),
        ("Punctuation-based Tokenizer:", punct_tokens),
        ("Treebank Tokenizer:", treebank_tokens),
        ("Tweet Tokenizer:", tweet_tokens),
        ("MWE Tokenizer:", mwe_tokens),
    ]),
    ("\nStemming Results:", [
        ("Porter Stemmer:", porter_stems),
        ("Snowball Stemmer:", snowball_stems),
    ]),
    ("\nLemmatization Results:", [
        ("WordNet Lemmatizer:", lemmas),
    ]),
]

for section_heading, labelled_results in report_sections:
    print(section_heading)
    for result_label, result_tokens in labelled_results:
        print(result_label, result_tokens)

Original Sentence: NLTK is a leading platform for building Python programs to work with human language data.

Tokenization Results:
Whitespace Tokenizer: ['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', 'to', 'work', 'with', 'human', 'language', 'data.']
Punctuation-based Tokenizer: ['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', 'to', 'work', 'with', 'human', 'language', 'data']
Treebank Tokenizer: ['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', 'to', 'work', 'with', 'human', 'language', 'data', '.']
Tweet Tokenizer: ['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', 'to', 'work', 'with', 'human', 'language', 'data', '.']
MWE Tokenizer: ['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python_programs', 'to', 'work', 'with', 'human', 'language', 'data.']

Stemming Results:
Porter Stemmer: ['nltk', 'is', 'a', 'lead', 'platform', 'for',

### ---------------------------

In [28]:
# Attribution line: prints who performed this exercise and their roll number.
print("Performed by: Hrishikesh Bari || Roll No: 68")

Performed by: Hrishikesh Bari || Roll No: 68
