# Tokenization

In [22]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
# Fetch the 'punkt_tab' NLTK package; presumably the tokenizer data that
# word_tokenize / sent_tokenize rely on (confirm against the NLTK docs).
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [23]:
# Sample text reused by the tokenization, lowercasing and punctuation-removal cells.
text = "I love my pet Elephant. Most loved animal in the world."

In [24]:
# Split the text into word-level tokens; in the recorded output each '.' is
# emitted as a token of its own.
word_tokens = word_tokenize(text)
print(word_tokens)

['I', 'love', 'my', 'pet', 'Elephant', '.', 'Most', 'loved', 'animal', 'in', 'the', 'world', '.']


In [25]:
# Split the text into sentences; each sentence keeps its closing punctuation
# in the recorded output.
sent_tokens = sent_tokenize(text)
print(sent_tokens)

['I love my pet Elephant.', 'Most loved animal in the world.']


# Stop Words Removal

In [26]:
from nltk.corpus import stopwords
# Fetch the 'stopwords' package so stopwords.words(...) can be read in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
# English stop words held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Tokens are lowercased only for the comparison, so surviving tokens keep
# their original casing. Punctuation tokens are not stop words and pass through.
filtered_words = [
    token
    for token in word_tokens
    if token.lower() not in stop_words
]
print(filtered_words)

['love', 'pet', 'Elephant', '.', 'loved', 'animal', 'world', '.']


# Stemming

In [28]:
from nltk.stem import PorterStemmer
# Single stemmer instance, reused by the stemming cell that follows.
stemmer = PorterStemmer()

In [29]:
# Inflections of "run". Per the recorded output the irregular form 'ran' is
# left as-is by the stemmer, while the regular forms reduce to 'run'.
words = ['running', 'runs', 'ran', 'run']
stemmed_words = list(map(stemmer.stem, words))
print(stemmed_words)

['run', 'run', 'ran', 'run']


# Lemmatization

In [30]:
from nltk.stem import WordNetLemmatizer
# Fetch the 'wordnet' package; presumably the lexical database the
# WordNetLemmatizer looks words up in (confirm against the NLTK docs).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [31]:
lemmatizer = WordNetLemmatizer()

# No part-of-speech argument is passed, so the lemmatizer's default applies.
# In the recorded output 'running' and 'ran' come back unchanged.
# NOTE(review): passing pos='v' to lemmatize() is the usual way to have verb
# forms reduced; confirm against the NLTK docs before changing.
words = ['running', 'runs', 'ran', 'run']
lemmatized_words = list(map(lemmatizer.lemmatize, words))
print(lemmatized_words)

['running', 'run', 'ran', 'run']


# Lowercasing

In [32]:
# Lowercase the whole string; `text` itself is left unmodified.
lower_text = text.lower()
print(lower_text)

i love my pet elephant. most loved animal in the world.


# Punctuation Removal

In [33]:
import string

In [34]:
# Translation table that maps every character in string.punctuation to
# "delete"; applying it strips punctuation in a single pass over the text.
strip_punct_table = str.maketrans('', '', string.punctuation)
text_no_punct = text.translate(strip_punct_table)
print(text_no_punct)

I love my pet Elephant Most loved animal in the world


# Removing Special Characters and Numbers

In [35]:
import re

In [36]:
text_with_special = 'User123 scored 95% in the exam!'
# Keep only ASCII letters and whitespace. Digits and symbols are deleted
# without collapsing the spaces around them, so a double space can remain
# where a whole token (here "95%") disappears.
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
clean_text = non_letter_pattern.sub('', text_with_special)
print(clean_text)

User scored  in the exam


# Spelling Correction

In [37]:
from textblob import TextBlob

In [38]:
misspelled_text = "Natrual Langage Processin is intresting"
# str() turns the result of correct() back into a plain string.
# The correction is best-effort: in the recorded output "Processin" becomes
# "Procession" rather than "Processing".
corrected_text = str(TextBlob(misspelled_text).correct())
print(corrected_text)

Natural Language Procession is interesting


# Expanding Contractions

In [41]:
pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (118 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.3/118.3 kB[0m 

In [42]:
import contractions

In [43]:
text_with_contraction = "I can't go because I'm tired"
# contractions.fix rewrites contracted forms in full; in the recorded output
# "can't" becomes "cannot" and "I'm" becomes "I am".
expanded_text = contractions.fix(text_with_contraction)
print(expanded_text)

I cannot go because I am tired


# Removing URLs, Email Addresses and HTML Tags

In [44]:
from bs4 import BeautifulSoup

In [45]:
text_with_html = "<p>This is an example of <b>HTML</b> text.</p>"
# Parse the markup with the 'html.parser' backend and keep only the text
# content; the <p> and <b> tags are gone in the recorded output.
clean_html = BeautifulSoup(text_with_html, 'html.parser').get_text()
print(clean_html)

This is an example of HTML text.


In [47]:
# Matches http:// or https:// URLs and bare "www." hosts.
# - The scheme must be followed by "://", so ordinary words that merely start
#   with "http" (e.g. "httpd") are left alone.
# - The dot after "www" is escaped so it matches a literal "." only; unescaped
#   it matches any character and would strip words such as "wwwhat".
URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')

text_with_url = "Visit https://example.com for more info."
# Only the URL itself is deleted; the spaces on either side of it remain.
clean_url = URL_PATTERN.sub('', text_with_url)
print(clean_url)

Visit  for more info.


In [48]:
text_with_email = "Contact me at example@email.com"
# An "@" with a run of non-whitespace characters on both sides is removed as a
# whole; the space that preceded it stays, leaving a trailing space here.
email_pattern = re.compile(r'\S+@\S+')
clean_email = email_pattern.sub('', text_with_email)
print("Text Without Email:", clean_email)

Text Without Email: Contact me at 


# Text Normalization (Slang Conversion)

In [52]:
import string

# Lookup table from lowercase slang token to its standard spelling.
slang_dict = {"u": "you", "gr8": "great", "r": "are", "b4": "before"}
text_with_slang = "U r gr8!!"

# Lowercase first so tokens line up with the lowercase dictionary keys.
text_lower = text_with_slang.lower()

# Delete every character listed in string.punctuation.
slang_punct_table = str.maketrans("", "", string.punctuation)
text_clean = text_lower.translate(slang_punct_table)

# Swap each whitespace-separated token for its expansion; tokens with no
# entry in slang_dict pass through unchanged.
normalized_text = ' '.join(
    slang_dict.get(token, token) for token in text_clean.split()
)

print("Normalized Text:", normalized_text)


Normalized Text: you are great


# Part-of-Speech (POS) Tagging

In [57]:
import nltk
# Fetch the 'averaged_perceptron_tagger_eng' package; presumably the model
# that pos_tag loads for English (confirm against the NLTK docs).
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [58]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# NOTE(review): `text` and `word_tokens` are rebound here, replacing the values
# assigned in the Tokenization section. Re-running an earlier cell that reads
# them (e.g. stop-word removal) after this one will see this sentence instead.
text = "This is a sample sentence."
word_tokens = word_tokenize(text)
# Each token is paired with a tag string, e.g. ('This', 'DT') in the recorded output.
pos_tags = pos_tag(word_tokens)
print("POS Tags:", pos_tags)

POS Tags: [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'JJ'), ('sentence', 'NN'), ('.', '.')]
