In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [28]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
df = pd.read_csv('/content/drive/MyDrive/NLP/text_preprocessing/IMDB Dataset.csv')

In [30]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


<h3><b>Lower Casing</b></h3>

In [31]:
df['review'] = df['review'].str.lower()
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


<h3><b>HTML Tags Removal</b></h3>

In [32]:
import re
def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [33]:
text = "<div class=content> <h1>Welcome to <span style=color:blue;>My Website</span></h1> <p>This is a <strong>sample paragraph</strong> with <em>multiple</em> <a href=#>HTML</a> tags.</p> <p>Here is a list:</p> <ul> <li><a href=/item1>Item 1</a></li> <li><a href=/item2>Item 2</a></li> <li><a href=/item3><span style=font-weight:bold;>Item 3</span></a></li> </ul> <p>Another paragraph with a <code>&lt;code&gt;</code> tag and a <mark>highlighted</mark> word.</p> <footer> <p>Contact us at <a href=mailto:info@example.com>info@example.com</a></p> <p>&copy; 2025 My Website</p> </footer> </div>"

In [34]:
remove_html_tags(text)

' Welcome to My Website This is a sample paragraph with multiple HTML tags. Here is a list:  Item 1 Item 2 Item 3  Another paragraph with a &lt;code&gt; tag and a highlighted word.  Contact us at info@example.com &copy; 2025 My Website  '

In [35]:
df['review'] = df['review'].apply(remove_html_tags)
df['review'].sample(5)

Unnamed: 0,review
7384,"geez, another lifetime movie, but once again i..."
37589,"i don't know who sue kramer, the director of t..."
26584,bedrooms and hallways is the kind of film that...
35197,"first off, i knew nothing about 'mazes and mon..."
26223,why are the previews so blah for a movie that ...


<h3><b>URLs Removal</b></h3>

In [36]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return re.sub(url_pattern, '', text)

In [37]:
url1 = "Check out this link: https://www.google.com"
url2 = "Click here: https://www.example.com/about"
url3 = "Here is the link: https://www.github.com/openai/chatgpt"
remove_urls(url1)

'Check out this link: '

In [38]:
remove_urls(url2)

'Click here: '

In [39]:
remove_urls(url3)

'Here is the link: '

In [40]:
df['review'] = df['review'].apply(remove_urls)

In [41]:
df['review'].sample(10)

Unnamed: 0,review
34452,i have to agree with all the previous commente...
49280,this show disturbs me. it takes up slots on ni...
12640,this version of the charles dickens novel feat...
1241,"better than average world war ii-era ""who-dun-..."
25478,okay. this movie is a pure pleasure. it has th...
22436,"this is indeed a funny show, done in a creepy ..."
20334,in answer to the person who made the comment a...
31911,alfred hitchcock has made many brilliant thril...
47138,"dumb is as dumb does, in this thoroughly unint..."
2288,"as usual, hollywood stereotyped everyone in th..."


<h3><b>Punctuation Removal</b></h3>

In [42]:
tweets = pd.read_csv('/content/drive/MyDrive/NLP/text_preprocessing/test.csv')
tweets.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [43]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [44]:
# remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [45]:
tweets

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."
...,...,...
17192,49155,thought factory: left-right polarisation! #tru...
17193,49156,feeling like a mermaid ð #hairflip #neverre...
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...
17195,49158,"happy, at work conference: right mindset leads..."


In [46]:
tweets['tweet'].apply(remove_punctuation)

Unnamed: 0,tweet
0,studiolife aislife requires passion dedication...
1,user white supremacists want everyone to see ...
2,safe ways to heal your acne altwaystoheal h...
3,is the hp and the cursed child book up for res...
4,3rd bihday to my amazing hilarious nephew el...
...,...
17192,thought factory leftright polarisation trump u...
17193,feeling like a mermaid ð hairflip neverread...
17194,hillary campaigned today in ohioomg amp used w...
17195,happy at work conference right mindset leads t...


<h3><b>Chat Slangs Conversion</b></h3>

In [47]:
chat_slang = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laughing My A** Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A**",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A** Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F***",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don’t care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can’t stop laughing"
}

In [48]:
def slang_converter(text):
    converted_words = []
    for word in text.split():
        if word.upper() in chat_slang:
            converted_words.append(chat_slang[word.upper()])
        else:
            converted_words.append(word)

    return ' '.join(converted_words)

In [49]:
slang_converter("That joke was so bad I’m ROFL right now")

'That joke was so bad I’m Rolling On The Floor Laughing right now'

In [50]:
slang_converter("TTYL I’ve gotta head out for dinner")

'Talk To You Later I’ve gotta head out for dinner'

In [51]:
slang_converter("LOL that was so funny! BFF let's meet up ASAP")

"Laughing Out Loud that was so funny! Best friends forever let's meet up As Soon As Possible"

<h3><b>Spelling Correction</b></h3>

In [52]:
from textblob import TextBlob

In [53]:
text = "Python is a poplar programing langwage used for web devlopment, data scince, and a.i. Its sintax is simpl and eazy to undarstand, making it great for begginers and expertts alike."

In [54]:
textblb = TextBlob(text)
textblb.correct().string

'Python is a popular programming language used for web development, data since, and a.i. Its santa is simple and easy to understand, making it great for begginers and experts alike.'

<h3><b>Removing Stopwords</b></h3>

In [57]:
import nltk

In [58]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [59]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))  # Use a set for O(1) lookup
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

In [60]:
remove_stopwords('my name is John Wick and i love to eat pizza')

'name John Wick love eat pizza'

In [61]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [62]:
df['review'] = df['review'].apply(remove_stopwords)

In [63]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production. filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive


<h3><b>Handling Emojis</b></h3>

In [64]:
# removing
import re

def remove_emojis(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # Emoticons 😀-🙏
        u"\U0001F300-\U0001F5FF"  # Symbols & pictographs 🌀-🗿
        u"\U0001F680-\U0001F6FF"  # Transport & map symbols 🚀-🚧
        u"\U0001F1E0-\U0001F1FF"  # Flags 🇦-🇿
        u"\U00002700-\U000027BF"  # Dingbats ✀-➿
        u"\U0001F900-\U0001F9FF"  # Supplemental symbols 🤐-🧿
        u"\U00002600-\U000026FF"  # Misc symbols ☀-⛿
        u"\U0001FA70-\U0001FAFF"  # Symbols and pictographs extended 🩰-🛸
        u"\U000025A0-\U000025FF"  # Geometric Shapes ■-◿
        "]+", flags=re.UNICODE)

    return emoji_pattern.sub(r'', text)

In [65]:
remove_emojis('Just finished my Python script 🐍💻—feeling like a wizard!🪄')

'Just finished my Python script —feeling like a wizard!'

In [66]:
remove_emojis("Found the perfect Pandas tutorial 🐼🎓—finally making sense now")

'Found the perfect Pandas tutorial —finally making sense now'

In [76]:
# Replacing
!pip install emoji
import emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [77]:
emoji.demojize("Found the perfect Pandas tutorial 🐼🎓—finally making sense now")

'Found the perfect Pandas tutorial :panda::graduation_cap:—finally making sense now'

In [78]:
emoji.demojize('Just finished my Python script 🐍💻—feeling like a wizard!🪄')

'Just finished my Python script :snake::laptop:—feeling like a wizard!:magic_wand:'

<h3><b>Tokenization</b></h3>

**1. Using Split() ftn**

In [79]:
a = "Pandas is a powerful library, used for data manipulation, especially with tabular data."
b = "The DataFrame structure in pandas; is similar to a spreadsheet; with labeled rows and columns."
c = "You can read CSV files with pandas - clean them - and export them back to various formats."

In [80]:
a.split(',')

['Pandas is a powerful library',
 ' used for data manipulation',
 ' especially with tabular data.']

In [81]:
b.split(';')

['The DataFrame structure in pandas',
 ' is similar to a spreadsheet',
 ' with labeled rows and columns.']

In [82]:
c.split('-')

['You can read CSV files with pandas ',
 ' clean them ',
 ' and export them back to various formats.']

**2. Regular Expression**

In [83]:
import re
a = "Can you believe how powerful Python is? It’s fast, flexible, and fun to learn—no wonder everyone’s talking about it!"
tokens = re.findall("[\\w']+" ,a)
tokens

['Can',
 'you',
 'believe',
 'how',
 'powerful',
 'Python',
 'is',
 'It',
 's',
 'fast',
 'flexible',
 'and',
 'fun',
 'to',
 'learn',
 'no',
 'wonder',
 'everyone',
 's',
 'talking',
 'about',
 'it']

In [84]:
re.compile('[.!?—]').split(a)


['Can you believe how powerful Python is',
 ' It’s fast, flexible, and fun to learn',
 'no wonder everyone’s talking about it',
 '']

**3. NLTK**

In [88]:
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [89]:
word_tokenize(a)

['Can',
 'you',
 'believe',
 'how',
 'powerful',
 'Python',
 'is',
 '?',
 'It',
 '’',
 's',
 'fast',
 ',',
 'flexible',
 ',',
 'and',
 'fun',
 'to',
 'learn—no',
 'wonder',
 'everyone',
 '’',
 's',
 'talking',
 'about',
 'it',
 '!']

In [90]:
sent_tokenize(a)

['Can you believe how powerful Python is?',
 'It’s fast, flexible, and fun to learn—no wonder everyone’s talking about it!']

**4. Spacy**

In [92]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [95]:
print(a)
print(b)
print(c)

Can you believe how powerful Python is? It’s fast, flexible, and fun to learn—no wonder everyone’s talking about it!
The DataFrame structure in pandas; is similar to a spreadsheet; with labeled rows and columns.
You can read CSV files with pandas - clean them - and export them back to various formats.


In [96]:
doc1 = nlp(a)
doc2 = nlp(b)
doc3 = nlp(c)

In [97]:
for token in doc1:
  print(token)

Can
you
believe
how
powerful
Python
is
?
It
’s
fast
,
flexible
,
and
fun
to
learn
—
no
wonder
everyone
’s
talking
about
it
!


In [98]:
for token in doc2:
  print(token)

The
DataFrame
structure
in
pandas
;
is
similar
to
a
spreadsheet
;
with
labeled
rows
and
columns
.


In [99]:
for token in doc3:
  print(token)

You
can
read
CSV
files
with
pandas
-
clean
them
-
and
export
them
back
to
various
formats
.


<h2><b>Stemming</b></h2>

In [100]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem_words(text):
  return " ".join([ps.stem(word) for word in text.split()])

In [101]:
text = "The children are playing happily in the garden while their parents are watching them. They played games and enjoyed running, jumping, and laughing together."
stem_words(text)

'the children are play happili in the garden while their parent are watch them. they play game and enjoy running, jumping, and laugh together.'

In [104]:
stem_words("Jump jumped jumping jumps")

'jump jump jump jump'

<h2><b>Lemmetization</b></h2>

In [105]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [108]:
punctuations = "?:;,.!-_"

word_tokens = nltk.word_tokenize(text)

for word in word_tokens:
  if word in punctuations:
    word_tokens.remove(word)
word_tokens

print("{0:20}{1:20}".format("Word", "Lemma"))
for word in word_tokens:
  print("{0:20}{1:20}".format(word, WordNetLemmatizer().lemmatize(word, pos = 'v')))

Word                Lemma               
The                 The                 
children            children            
are                 be                  
playing             play                
happily             happily             
in                  in                  
the                 the                 
garden              garden              
while               while               
their               their               
parents             parent              
are                 be                  
watching            watch               
them                them                
They                They                
played              play                
games               game                
and                 and                 
enjoyed             enjoy               
running             run                 
jumping             jump                
and                 and                 
laughing            laugh               
together        