In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the IMDB reviews CSV into the DataFrame used by the cleaning steps below.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# cannot be re-run elsewhere until it points at a local copy of the file.
df = pd.read_csv('A:\\NLP\\text_preprocessing\\IMDB Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


<h3><b>Lower Casing</b></h3>

In [4]:
df['review'] = df['review'].str.lower()
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


<h3><b>HTML Tags Removal</b></h3>

In [5]:
import re
# Compiled once instead of on every call. `[^>]*` (unlike `.*?`) also crosses
# newlines, so a tag that is wrapped over several lines is still removed.
_HTML_TAG_RE = re.compile(r'<[^>]*>')


def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` tag deleted.

    Everything from a ``<`` up to the next ``>`` is dropped, including tags
    that span line breaks. Tags are replaced with the empty string, so the
    text on either side of a tag is joined directly. HTML entities such as
    ``&lt;`` or ``&copy;`` are left untouched.

    Parameters
    ----------
    text : str
        Raw text that may contain HTML markup.

    Returns
    -------
    str
        The text without tags.
    """
    return _HTML_TAG_RE.sub('', text)

In [6]:
text = "<div class=content> <h1>Welcome to <span style=color:blue;>My Website</span></h1> <p>This is a <strong>sample paragraph</strong> with <em>multiple</em> <a href=#>HTML</a> tags.</p> <p>Here is a list:</p> <ul> <li><a href=/item1>Item 1</a></li> <li><a href=/item2>Item 2</a></li> <li><a href=/item3><span style=font-weight:bold;>Item 3</span></a></li> </ul> <p>Another paragraph with a <code>&lt;code&gt;</code> tag and a <mark>highlighted</mark> word.</p> <footer> <p>Contact us at <a href=mailto:info@example.com>info@example.com</a></p> <p>&copy; 2025 My Website</p> </footer> </div>"

In [7]:
remove_html_tags(text)

' Welcome to My Website This is a sample paragraph with multiple HTML tags. Here is a list:  Item 1 Item 2 Item 3  Another paragraph with a &lt;code&gt; tag and a highlighted word.  Contact us at info@example.com &copy; 2025 My Website  '

In [8]:
df['review'] = df['review'].apply(remove_html_tags)
df['review'].sample(5)

12656    'capital city' fans rejoice! this first season...
43279    i have always been keen on watching hong kong ...
30873    i thought that this movie might be a good spoo...
27630    i'd have to say this is one of the best animat...
13667    i really enjoyed this movie as a young kid. at...
Name: review, dtype: object

<h3><b>URLs Removal</b></h3>

In [9]:
# `\b` stops "www." from matching inside a longer word (e.g. "awww.cute"), and
# IGNORECASE covers "HTTP://..." / "WWW...." in text that was not lower-cased.
_URL_RE = re.compile(r'\b(?:https?://|www\.)\S+', re.IGNORECASE)


def remove_urls(text):
    """Return ``text`` with web addresses deleted.

    A URL is anything that starts with ``http://``, ``https://`` or ``www.``
    at a word boundary and runs up to the next whitespace character. The
    surrounding whitespace is kept, so removing a URL in the middle of a
    sentence leaves two consecutive spaces.

    Parameters
    ----------
    text : str
        Text that may contain URLs.

    Returns
    -------
    str
        The text without URLs.
    """
    return _URL_RE.sub('', text)

In [10]:
url1 = "Check out this link: https://www.google.com"
url2 = "Click here: https://www.example.com/about"
url3 = "Here is the link: https://www.github.com/openai/chatgpt"
remove_urls(url1)

'Check out this link: '

In [11]:
remove_urls(url2)

'Click here: '

In [12]:
remove_urls(url3)

'Here is the link: '

In [13]:
df['review'] = df['review'].apply(remove_urls)

In [14]:
df['review'].sample(10)

13890    preston waters is off to a bad summer. besides...
39060    although i think the reviewers who hated this ...
40503    seven pioneer kids strive independently across...
22542    this is a film that has to be taken in context...
10674    the danish movie "slim slam slum" surprised me...
19726    i have read both the book and saw the movie to...
39983    i cannot believe i sat through this utter wast...
21367    a light, uplifting and engaging movie. watchin...
20780    i'm a fan of tv movies in general and this was...
1644     this was the most unrealistic movie i ever see...
Name: review, dtype: object

<h3><b>Punctuation Removal</b></h3>

In [15]:
# Load the tweets CSV used to demonstrate punctuation removal and preview it.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# cannot be re-run elsewhere until it points at a local copy of the file.
tweets = pd.read_csv('A:\\NLP\\text_preprocessing\\test.csv')
tweets.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [16]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
# Strip ASCII punctuation from a string.
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Removed characters are dropped outright (not replaced by a space), so
    ``"left-right"`` becomes ``"leftright"``.
    """
    # Mapping a character to None in a translation table deletes it.
    drop_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(drop_table)

In [18]:
tweets

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."
...,...,...
17192,49155,thought factory: left-right polarisation! #tru...
17193,49156,feeling like a mermaid ð #hairflip #neverre...
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...
17195,49158,"happy, at work conference: right mindset leads..."


In [19]:
tweets['tweet'].apply(remove_punctuation)

0        studiolife aislife requires passion dedication...
1         user white supremacists want everyone to see ...
2        safe ways to heal your acne    altwaystoheal h...
3        is the hp and the cursed child book up for res...
4          3rd bihday to my amazing hilarious nephew el...
                               ...                        
17192    thought factory leftright polarisation trump u...
17193    feeling like a mermaid ð hairflip neverread...
17194    hillary campaigned today in ohioomg amp used w...
17195    happy at work conference right mindset leads t...
17196    my   song so glad free download  shoegaze newm...
Name: tweet, Length: 17197, dtype: object

<h3><b>Chat Slang Conversion</b></h3>

In [20]:
# Lookup table from chat abbreviation to its expansion.
#
# Keys are written in upper case because slang_converter() upper-cases each
# whitespace-separated token before looking it up, which makes matching
# case-insensitive.
# NOTE(review): because of that, keys that are also ordinary English words
# (e.g. "TIME", "KISS", "GAL", "STATS", "U") will be expanded wherever the
# plain word appears in the text; confirm this is intended before applying
# the converter to general prose.
chat_slang = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laughing My A** Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A**",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A** Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F***",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don’t care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can’t stop laughing"
}

In [21]:
def slang_converter(text, slang_map=None):
    """Expand chat abbreviations found in ``text``.

    The text is split on whitespace and each token is looked up
    case-insensitively in ``slang_map``. A token is first tried as-is, so
    keys that contain punctuation (e.g. ``"QPSA?"``) still match. If that
    fails, leading and trailing non-alphanumeric characters are set aside,
    the remaining core is looked up, and the punctuation is re-attached
    around the expansion. This lets ``"LOL!"`` or ``"(asap)"`` be expanded,
    where previously only bare tokens were.

    Matching is case-insensitive, so a key that is also an ordinary word is
    expanded wherever that word occurs.

    Parameters
    ----------
    text : str
        Input sentence.
    slang_map : dict, optional
        Mapping of upper-case abbreviation to expansion. Defaults to the
        notebook-level ``chat_slang`` table.

    Returns
    -------
    str
        The tokens joined by single spaces, with abbreviations expanded.
        Runs of whitespace in the input are therefore collapsed.
    """
    if slang_map is None:
        slang_map = chat_slang

    converted_words = []
    for word in text.split():
        whole_key = word.upper()
        if whole_key in slang_map:
            converted_words.append(slang_map[whole_key])
            continue

        # Locate the alphanumeric core of the token, e.g. "(lol)!" -> "lol".
        start, end = 0, len(word)
        while start < end and not word[start].isalnum():
            start += 1
        while end > start and not word[end - 1].isalnum():
            end -= 1
        core_key = word[start:end].upper()

        if core_key in slang_map:
            converted_words.append(word[:start] + slang_map[core_key] + word[end:])
        else:
            converted_words.append(word)

    return ' '.join(converted_words)

In [22]:
slang_converter("That joke was so bad I’m ROFL right now")

'That joke was so bad I’m Rolling On The Floor Laughing right now'

In [23]:
slang_converter("TTYL I’ve gotta head out for dinner")

'Talk To You Later I’ve gotta head out for dinner'

In [24]:
slang_converter("LOL that was so funny! BFF let's meet up ASAP")

"Laughing Out Loud that was so funny! Best friends forever let's meet up As Soon As Possible"

<h3><b>Spelling Correction</b></h3>

In [26]:
from textblob import TextBlob

In [31]:
text = "Python is a poplar programing langwage used for web devlopment, data scince, and a.i. Its sintax is simpl and eazy to undarstand, making it great for begginers and expertts alike."

In [32]:
textblb = TextBlob(text)
textblb.correct().string

'Python is a popular programming language used for web development, data since, and a.i. Its santa is simple and easy to understand, making it great for begginers and experts alike.'

<h3><b>Removing Stopwords</b></h3>

In [35]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [48]:
# Lazily-built cache of the NLTK English stop-word set. Loading the list on
# every call means re-reading the corpus once per row under DataFrame.apply.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(text, stop_words=None):
    """Return ``text`` without stop words.

    The text is split on whitespace and every token that appears verbatim in
    the stop-word collection is dropped. The comparison is exact: it is
    case-sensitive, and a token with punctuation attached (``"the,"``) is
    not treated as a stop word.

    Parameters
    ----------
    text : str
        Input sentence.
    stop_words : collection of str, optional
        Words to remove. Defaults to NLTK's English stop-word list, which is
        loaded once and reused across calls.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

In [44]:
remove_stopwords('my name is John Wick and i love to eat pizza')

'name John Wick love eat pizza'

In [42]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [46]:
df['review'] = df['review'].apply(remove_stopwords)

In [47]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production. filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive


<h3><b>Handling Emojis</b></h3>

In [59]:
# removing
import re

def remove_emojis(text):
    """Delete emoji and pictographic symbols from ``text``.

    Any run of characters falling in one of the Unicode ranges listed below
    is replaced by the empty string; all other characters are kept as-is.
    """
    # (first, last) code point of each range that gets stripped.
    blocks = (
        ("\U0001F600", "\U0001F64F"),  # emoticons
        ("\U0001F300", "\U0001F5FF"),  # symbols & pictographs
        ("\U0001F680", "\U0001F6FF"),  # transport & map symbols
        ("\U0001F1E0", "\U0001F1FF"),  # flags (regional indicators)
        ("\U00002700", "\U000027BF"),  # dingbats
        ("\U0001F900", "\U0001F9FF"),  # supplemental symbols
        ("\U00002600", "\U000026FF"),  # miscellaneous symbols
        ("\U0001FA70", "\U0001FAFF"),  # symbols and pictographs extended
        ("\U000025A0", "\U000025FF"),  # geometric shapes
    )
    char_class = "".join(first + "-" + last for first, last in blocks)
    emoji_run = re.compile("[" + char_class + "]+", flags=re.UNICODE)
    return emoji_run.sub("", text)

In [52]:
remove_emojis('Just finished my Python script 🐍💻—feeling like a wizard!🪄')

'Just finished my Python script —feeling like a wizard!'

In [51]:
remove_emojis("Found the perfect Pandas tutorial 🐼🎓—finally making sense now")

'Found the perfect Pandas tutorial —finally making sense now'

In [1]:
# Replacing
import emoji

In [57]:
emoji.demojize("Found the perfect Pandas tutorial 🐼🎓—finally making sense now")

'Found the perfect Pandas tutorial :panda::graduation_cap:—finally making sense now'

In [58]:
emoji.demojize('Just finished my Python script 🐍💻—feeling like a wizard!🪄')

'Just finished my Python script :snake::laptop:—feeling like a wizard!:magic_wand:'

<h3><b>Tokenization</b></h3>

**1. Using the `split()` method**

In [79]:
a = "Pandas is a powerful library, used for data manipulation, especially with tabular data."
b = "The DataFrame structure in pandas; is similar to a spreadsheet; with labeled rows and columns."
c = "You can read CSV files with pandas - clean them - and export them back to various formats."

In [80]:
a.split(',')

['Pandas is a powerful library',
 ' used for data manipulation',
 ' especially with tabular data.']

In [81]:
b.split(';')

['The DataFrame structure in pandas',
 ' is similar to a spreadsheet',
 ' with labeled rows and columns.']

In [82]:
c.split('-')

['You can read CSV files with pandas ',
 ' clean them ',
 ' and export them back to various formats.']

**2. Regular Expression**

In [1]:
import re
# Word-level tokenisation: each run of word characters and straight
# apostrophes (') becomes one token; everything else acts as a separator.
# NOTE: the sample text uses the typographic apostrophe (’), which is neither
# \w nor ', so contractions such as "It’s" are split into two tokens.
a = "Can you believe how powerful Python is? It’s fast, flexible, and fun to learn—no wonder everyone’s talking about it!"
tokens = re.findall("[\\w']+" ,a)
tokens

['Can',
 'you',
 'believe',
 'how',
 'powerful',
 'Python',
 'is',
 'It',
 's',
 'fast',
 'flexible',
 'and',
 'fun',
 'to',
 'learn',
 'no',
 'wonder',
 'everyone',
 's',
 'talking',
 'about',
 'it']

In [2]:
re.compile('[.!?—]').split(a)


['Can you believe how powerful Python is',
 ' It’s fast, flexible, and fun to learn',
 'no wonder everyone’s talking about it',
 '']

**3. NLTK**

In [3]:
from nltk.tokenize import word_tokenize, sent_tokenize

  from scipy.stats import fisher_exact

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\AliRaza\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "c:\Users\AliRaza\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\AliRaza\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
 

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\AliRaza\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "c:\Users\AliRaza\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\AliRaza\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "c:\Users\AliRaza\anaconda3\Lib\s

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\AliRaza\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "c:\Users\AliRaza\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\AliRaza\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "c:\Users\AliRaza\anaconda3\Lib\s

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\AliRaza\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "c:\Users\AliRaza\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\AliRaza\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "c:\Users\AliRaza\anaconda3\Lib\s

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



In [7]:
word_tokenize(a)

['Can',
 'you',
 'believe',
 'how',
 'powerful',
 'Python',
 'is',
 '?',
 'It',
 '’',
 's',
 'fast',
 ',',
 'flexible',
 ',',
 'and',
 'fun',
 'to',
 'learn—no',
 'wonder',
 'everyone',
 '’',
 's',
 'talking',
 'about',
 'it',
 '!']

In [8]:
sent_tokenize(a)

['Can you believe how powerful Python is?',
 'It’s fast, flexible, and fun to learn—no wonder everyone’s talking about it!']

**4. Spacy**

In [9]:
import spacy

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject