In [None]:
# Text Preprocessing in NLP
# author: Muhammad Humayun Khan
# The mandatory steps in text preprocessing include:
# 1. Lowercasing
# 2. Removing HTML tags
# 3. Removing URLs  
# 4. Removing punctuation
# 5. Chat words treatment
# 6. Spelling correction
# 7. Removing stop words
# 8. Handling emojis
# 9. Tokenization
# 10. Stemming
# 11. Lemmatization

In [2]:
import pandas as pd


In [3]:
dataset_path = 'datasets/IMDB Dataset.csv'

df = pd.read_csv(dataset_path)

df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# 1. lowercase one of the record

df['review'][2].lower()

'i thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. the plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). while some may be disappointed when they realize this is not match point 2: risk addiction, i thought it was proof that woody allen is still fully in control of the style many of us have grown to love.<br /><br />this was the most i\'d laughed at one of woody\'s comedies in years (dare i say a decade?). while i\'ve never been impressed with scarlet johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.<br /><br />this may not be the crown jewel of his career, but it was wittier than "devil wears prada" and more interesting than "superman" a great comedy to go see with friends.'

In [7]:
# lowercase all the records in the review column
df['review'] = df['review'].str.lower()

In [8]:
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [None]:
# 2. remove the unimportant things such as html tags
import re
def remove_html_tags(text):
    """Remove HTML tags from a string."""
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

# apply the function to the review column
df['review'] = df['review'].apply(remove_html_tags)

df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [None]:
# 3. the URL links are not important and needs to be removed
def remove_urls(text):
    """Remove URLs from a string."""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return re.sub(url_pattern, '', text)

# apply the function to the review column
df['review'] = df['review'].apply(remove_urls)
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [None]:
# 4. Remove the punctuation marks
# first check the punctuation list
import string

string.punctuation



'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
# exclude the punctuation marks
exclude = string.punctuation

In [27]:
def remove_punctuation(text):
    """Remove punctuation from a string."""
    return text.translate(str.maketrans('', '', string.punctuation))
    

In [28]:
text = "String with punctuation! # @"

text = remove_punctuation(text)
text

'String with punctuation  '

In [30]:
# now want to apply the function to the review column
df['review'] = df['review'].apply(remove_punctuation)
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production the filming tech...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the time of money is a ...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    i am a catholic taught in parochial elementary...
49998    im going to have to disagree with the previous...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [32]:
# 6. Chatword treatment - short words like "u" for "you", "r" for "are", etc.

chatwords = {
    "u": "you",
    "r": "are",
    "ur": "your",
    "gr8": "great",
    "b4": "before",
    "l8r": "later",
    "pls": "please",
    "thx": "thanks",
    "btw": "by the way",
    "imo": "in my opinion",
    "idk": "I don't know",
    "lol": "laughing out loud",
    "omg": "oh my god",
    "brb": "be right back",
    "ttyl": "talk to you later",
    "cya": "see you",
    "w8": "wait",
    "xoxo": "hugs and kisses",
    "smh": "shaking my head",   
    "btw": "by the way",
    "imho": "in my humble opinion",
    "jk": "just kidding",
    "np": "no problem",
    "bff": "best friends forever",
    "lmao": "laughing",
    "tmi": "too much information",
    "sry": "sorry",
    "wbu": "what about you",
    "fomo": "fear of missing out",
    "yolo": "you only live once",
    "tbh": "to be honest",
    "smh": "shaking my head",
    "wyd": "what are you doing",
    "asap": "as soon as possible",
    "imo": "in my opinion",
    "idc": "I don't care",
    "lmk": "let me know",
    "bday": "birthday",
    "omw": "on my way",
    "gtg": "got to go",
    "fyi": "for your information",
    "cuz": "because",
    "thx": "thanks",    
    "k": "okay",
    "plz": "please",
    "wth": "what the hell",
    "tbh": "to be honest",
    "smh": "shaking my head",
    "b4n": "bye for now",
    "fml": "f*** my life",
    "tbh": "to be honest"
}
def chatword_treatment(text):        
    for word, replacement in chatwords.items():
        text = text.replace(word, replacement)
    
    return text

# apply the function to the review column
df['review'] = df['review'].apply(chatword_treatment)

In [35]:
# 5. Spelling correction
from textblob import TextBlob

text = "I havv a dreem that one day this nation will rise up and live out the true meaning of its creed: 'We hold these truths to be self-e"

def correct_spelling(text):
    """Correct spelling in a string."""
    return str(TextBlob(text).correct())

text = correct_spelling(text)
text

"I have a dream that one day this nation will rise up and live out the true meaning of its creed: 'He hold these truths to be self-e"

In [38]:
# 6. Removing stop words - use the nltk library. Avoid stop words when your task is POS tagging 
import nltk
nltk.download('stopwords')  # Download the stopwords dataset if not already downloaded

[nltk_data] Downloading package stopwords to C:\Users\Just
[nltk_data]     Bring\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [39]:
from nltk.corpus import stopwords   
stop_words = set(stopwords.words('english'))   # only english stop words. Others can be added as needed

def remove_stopwords(text):
    """Remove stop words from a string."""
    words = text.split()
    return ' '.join([word for word in words if word not in stop_words])
# apply the function to the review column
df['review'] = df['review'].apply(remove_stopwords)
df['review']

0        one otheare areevieweares mentioned afteare wa...
1        wondearefyoul little pareodyouction filming te...
2        thoyought wondearefyoul way spend time hot syo...
3        basically thearees family whearee little boy j...
4        petteare matteis love time money visyoually st...
                               ...                        
49995    thoyought movie areight good job wasnt careeat...
49996    bad plot bad dialogyoue bad acting idiotic dia...
49997    catholic tayought paareochial elementaarey sch...
49998    im going disagareee pareevioyous comment side ...
49999    one expects staare tareeokay movies high aaret...
Name: review, Length: 50000, dtype: object

In [41]:
# 7. Handling emojis - use the emoji library
import emoji

text = "I love Programming ❤️"
def handle_emojis(text):
    """Convert emojis to text."""
    return emoji.demojize(text)

text = handle_emojis(text)
text

'I love Programming :red_heart:'

In [None]:
# 8. Tokenization - 
# There are different techniques such as using the split function, regular expressions, use the nltk library and spacy library

# split function technique
text = "I love programming and natural language processing. It's amazing!"
tokens = text.split()
print(tokens)   # word tokenization

text_two = "Do you love programming and natural language processing? It's amazing! Isn't it? Yes, it is."
tokens_two = text_two.split('.')
print(tokens_two)   # sentence tokenization


['I', 'love', 'programming', 'and', 'natural', 'language', 'processing.', "It's", 'amazing!']
["Do you love programming and natural language processing? It's amazing! Isn't it? Yes, it is", '']


In [None]:
# regular expression technique
import re

text = "I love programming and natural language processing. It's amazing!"
tokens = re.findall(r'\b\w+\b', text)
print(tokens)   # word tokenization

text_two = "Do you love programming and natural language processing? It's amazing! Isn't it? Yes, it is."
tokens_two = re.split(r'[.!]', text_two)
print(tokens_two)   # sentence tokenization and issue is ? is missing and it is not identified as a token

['I', 'love', 'programming', 'and', 'natural', 'language', 'processing', 'It', 's', 'amazing']
["Do you love programming and natural language processing? It's amazing", " Isn't it? Yes, it is", '']


In [4]:
# the third technique is using the nltk library
# make sure to install the nltk library first
import nltk
import os
from nltk.tokenize import word_tokenize, sent_tokenize

# 1. Define the NLTK data path absolutely
script_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in locals() else os.getcwd()
nltk_data_path = os.path.join(script_dir, "venv", "nltk_data")

# 2. Ensure the folder exists
os.makedirs(nltk_data_path, exist_ok=True)

# 3. Explicitly set the NLTK_DATA environment variable
os.environ["NLTK_DATA"] = nltk_data_path

# 4. Add the path to NLTK's internal search path list
if nltk_data_path not in nltk.data.path:
    nltk.data.path.insert(0, nltk_data_path)

print(f"NLTK Data Path being used: {nltk_data_path}")
print(f"NLTK data paths (internal): {nltk.data.path}")


# 5. Download the 'punkt' package
try:
    print("Attempting to download 'punkt' package...")
    nltk.download("punkt", download_dir=nltk_data_path, quiet=False)
    print("'punkt' package download/check complete.")
except Exception as e:
    print(f"Error during NLTK punkt download/check: {e}")
    print("This might be okay if 'punkt' was already correctly downloaded.")

# NEW: 6. Download the 'punkt_tab' package
try:
    print("\nAttempting to download 'punkt_tab' package...")
    nltk.download("punkt_tab", download_dir=nltk_data_path, quiet=False)
    print("'punkt_tab' package download/check complete.")
except Exception as e:
    print(f"Error during NLTK punkt_tab download/check: {e}")
    print("This might be okay if 'punkt_tab' was already correctly downloaded.")


# 7. Verify that the 'punkt' model file exists at the expected location
expected_punkt_path = os.path.join(nltk_data_path, "tokenizers", "punkt", "english.pickle")
if not os.path.exists(expected_punkt_path):
    print(f"ERROR: Expected punkt file NOT found at: {expected_punkt_path}")
    print("Please check the contents of your venv/nltk_data folder manually.")
    print("It should contain 'tokenizers/punkt/english.pickle'.")
else:
    print(f"SUCCESS: punkt file found at: {expected_punkt_path}")

# NEW: Verify that the 'punkt_tab' model file exists
expected_punkt_tab_path = os.path.join(nltk_data_path, "tokenizers", "punkt_tab", "english.pickle")
if not os.path.exists(expected_punkt_tab_path):
    print(f"ERROR: Expected punkt_tab file NOT found at: {expected_punkt_tab_path}")
    print("Please check the contents of your venv/nltk_data folder manually.")
    print("It should contain 'tokenizers/punkt_tab/english.pickle'.")
else:
    print(f"SUCCESS: punkt_tab file found at: {expected_punkt_tab_path}")



NLTK Data Path being used: d:\Drive I\Work\Summer 2025 Personal Growth\natural_language_processing\venv\nltk_data
NLTK data paths (internal): ['C:\\Users\\Just Bring/nltk_data', 'd:\\Drive I\\Work\\Summer 2025 Personal Growth\\natural_language_processing\\venv\\nltk_data', 'd:\\Drive I\\Work\\Summer 2025 Personal Growth\\natural_language_processing\\venv\\share\\nltk_data', 'd:\\Drive I\\Work\\Summer 2025 Personal Growth\\natural_language_processing\\venv\\lib\\nltk_data', 'C:\\Users\\Just Bring\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']
Attempting to download 'punkt' package...
'punkt' package download/check complete.

Attempting to download 'punkt_tab' package...
'punkt_tab' package download/check complete.
SUCCESS: punkt file found at: d:\Drive I\Work\Summer 2025 Personal Growth\natural_language_processing\venv\nltk_data\tokenizers\punkt\english.pickle
ERROR: Expected punkt_tab file NOT found at: d:\Drive I\Work\Summer 2025 Personal Growth\natu

[nltk_data] Downloading package punkt to d:\Drive I\Work\Summer 2025
[nltk_data]     Personal
[nltk_data]     Growth\natural_language_processing\venv\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to d:\Drive I\Work\Summer
[nltk_data]     2025 Personal
[nltk_data]     Growth\natural_language_processing\venv\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [6]:
# 8. Perform tokenization
#text = "My interest in natural language processing is growing. My id is humayun.devv@gmail.com"
text = "I have PH.D in Large language models."
try:
    word_tokens = word_tokenize(text)
    sent_tokens = sent_tokenize(text)
    print("\nWord Tokens:", word_tokens)
    print("Sentence Tokens:", sent_tokens)
except LookupError as e:
    print(f"\nCaught LookupError during tokenization: {e}")
    print("This indicates NLTK still cannot find the required tokenizer.")
    print("Double-check the folder structure inside:")
    print(f"  {nltk_data_path}")
    print("It should contain 'tokenizers/punkt/english.pickle' and 'tokenizers/punkt_tab/english.pickle'.")
except Exception as e:
    print(f"\nAn unexpected error occurred: {e}")


Word Tokens: ['I', 'have', 'PH.D', 'in', 'Large', 'language', 'models', '.']
Sentence Tokens: ['I have PH.D in Large language models.']


In [12]:
# 4. Spacy library technique
# make sure to install the spacy library first
import spacy

nlp = spacy.load("en_core_web_sm") # python -m spacy download en_core_web_sm
text = "I have PH.D in Large language models."

doc = nlp(text)
# Extract tokens
tokens = [token.text for token in doc]
print("\nSpacy Tokens:", tokens)

text_two = "Do you love programming and natural language processing? It's amazing! Isn't it? Yes, it is."
doc_two = nlp(text_two)
# Extract sentences
sentences = [sent.text for sent in doc_two.sents]
print("Spacy Sentences:", sentences)




Spacy Tokens: ['I', 'have', 'PH.D', 'in', 'Large', 'language', 'models', '.']
Spacy Sentences: ['Do you love programming and natural language processing?', "It's amazing!", "Isn't it?", 'Yes, it is.']


In [None]:
# 10. Stemming - # Stemming is the process of reducing a word to its base or root form.
# It is often used in information retrieval and natural language processing tasks to reduce words to their base

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def stem_word(text):
    return ' '.join([stemmer.stem(word) for word in text.split()])

# Example usage
text = "run runs running"
text_two = "walk walks walked walking"
stem_word(text)

'run run run'

In [None]:
# stemming the text doesn't always brings the expected results as can be seen below in case of movie and probably
text = "Probably the best movie isn't release yet"
stem_word(text)

"probabl the best movi isn't releas yet"

In [None]:
# 11. Lemmatization - process of reducing a word to its base or dictionary form.
import nltk
from nltk.stem import WordNetLemmatizer

# Download required resources
nltk.download('wordnet')

# Create the lemmatizer
lemmatizer = WordNetLemmatizer()

# Sample words
words = ['running', 'flies', 'better', 'cars', 'geese']

# Lemmatize without POS (defaults to noun)
print("Lemmatizing as nouns (default):")
for word in words:
    print(f"{word} → {lemmatizer.lemmatize(word)}")

# Lemmatize with correct POS
print("\nLemmatizing with correct POS:")
print(f"running (verb) → {lemmatizer.lemmatize('running', pos='v')}")
print(f"better (adjective) → {lemmatizer.lemmatize('better', pos='a')}")


[nltk_data] Downloading package wordnet to C:\Users\Just
[nltk_data]     Bring\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Just
[nltk_data]     Bring\AppData\Roaming\nltk_data...


KeyboardInterrupt: 