### Importing NLTK Libraries for Text Processing 

In [1]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.corpus import conll2000


### Downloading Necessary NLTK Data and Models

In [16]:
# Fetch the NLTK resources this notebook relies on. nltk.download() reports
# "already up-to-date" and returns quickly when a package is present locally.
nltk.download('punkt')              # tokenizer models used by word_tokenize
nltk.download('maxent_ne_chunker')  # model behind ne_chunk
nltk.download('words')              # word list used alongside the NE chunker
nltk.download('stopwords')          # required by stopwords.words('english'); without it a fresh environment raises LookupError
# Kept last so the cell's displayed value is still this call's return value.
nltk.download('wordnet')            # backing data for WordNetLemmatizer


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kkrab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\kkrab\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\kkrab\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kkrab\AppData\Roaming\nltk_data...


True

### Converting Text to Lowercase

In [17]:
# Case-fold the sample sentence so later steps treat "Natural" and "natural" alike.
text = "Natural Language Processing is amazing!"
lower_text = str.lower(text)
print(lower_text)


natural language processing is amazing!


### Removing Punctuation from Text

In [18]:
import string

text = "Hello, world! Let's explore NLP in code ."
# Keep every character that is not ASCII punctuation; whitespace is left
# untouched, so the space that preceded the final "." survives.
no_punct_text = ''.join(ch for ch in text if ch not in string.punctuation)
print(no_punct_text)


Hello world Lets explore NLP in code 


###  Tokenizing Text into Words

In [19]:
from nltk.tokenize import word_tokenize

# Split the sentence into word-level tokens; punctuation becomes its own token.
text = "Tokenization is the first step in text processing."
tokens = word_tokenize(text, language="english")
print(tokens)


['Tokenization', 'is', 'the', 'first', 'step', 'in', 'text', 'processing', '.']


### Removing Stopwords from Tokenized Text

In [20]:
from nltk.corpus import stopwords

# A set gives constant-time membership checks for the filter below.
stop_words = set(stopwords.words('english'))
# Compare case-insensitively but keep each surviving token's original casing.
# Punctuation tokens such as '.' are not stopwords, so they pass through.
filtered_tokens = list(filter(lambda tok: tok.lower() not in stop_words, tokens))
print(filtered_tokens)


['Tokenization', 'first', 'step', 'text', 'processing', '.']


### Applying Stemming to Filtered Tokens

In [21]:
from nltk.stem import PorterStemmer

# Reduce each remaining token to its Porter stem (e.g. 'processing' -> 'process').
ps = PorterStemmer()
stemmed_words = list(map(ps.stem, filtered_tokens))
print(stemmed_words)


['token', 'first', 'step', 'text', 'process', '.']


###  Lemmatizing Filtered Tokens








In [22]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# pos="n" is the lemmatizer's default: every token is treated as a noun,
# which is why a form like 'processing' comes back unchanged.
lemmatized_words = [lemmatizer.lemmatize(token, pos="n") for token in filtered_tokens]
print(lemmatized_words)


['Tokenization', 'first', 'step', 'text', 'processing', '.']


### Text Preprocessing and Tokenization with NLTK on Gutenberg Corpus

In [23]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg

# Full pipeline on real text: the first 1000 characters of Austen's "Emma".
# Relies on `string`, `word_tokenize`, `stop_words` and `lemmatizer` from the
# cells above, and rebinds the intermediate names those cells introduced.
sample_text = gutenberg.raw('austen-emma.txt')[:1000]

# 1. lowercase
lower_text = sample_text.lower()
# 2. drop ASCII punctuation (hyphenated words are joined, e.g. "twenty-one" -> "twentyone")
no_punct_text = ''.join(ch for ch in lower_text if ch not in string.punctuation)
# 3. tokenize
tokens = word_tokenize(no_punct_text)
# 4. remove stopwords (tokens are already lowercase at this point)
filtered_tokens = list(filter(lambda tok: tok not in stop_words, tokens))
# 5. lemmatize with the default part of speech
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_tokens))

print(lemmatized_words[:50])

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\kkrab\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.


['emma', 'jane', 'austen', '1816', 'volume', 'chapter', 'emma', 'woodhouse', 'handsome', 'clever', 'rich', 'comfortable', 'home', 'happy', 'disposition', 'seemed', 'unite', 'best', 'blessing', 'existence', 'lived', 'nearly', 'twentyone', 'year', 'world', 'little', 'distress', 'vex', 'youngest', 'two', 'daughter', 'affectionate', 'indulgent', 'father', 'consequence', 'sister', 'marriage', 'mistress', 'house', 'early', 'period', 'mother', 'died', 'long', 'ago', 'indistinct', 'remembrance', 'caress', 'place', 'supplied']
