In [59]:
import numpy as np
import pandas as pd

In [60]:
df = pd.read_csv(r"D:\Data Science\NLP\Day2_Text_preprocessing\IMDB Dataset.csv")

In [61]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [62]:
df.shape

(50000, 2)

# Lowercase

In [63]:
df['review'][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [64]:
df['review'][3].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [65]:
#to all
df['review'] = df['review'].str.lower()

In [66]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


# Removing the HTML tags

In [67]:
import re
def remove_html(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'',text)

In [68]:
raw_data = """
    "<p>This movie was <b>amazing</b>! I loved the plot and the visuals.</p>",
    "<div>Worst movie ever. <br>Don't waste your time.</div>",
    "<span style='color:red'>Absolutely loved</span> the characters!",
    "It was <i>okay</i>, not great, not terrible.",
    "Check out the trailer <a href='https://example.com'>here</a>."
"""

In [69]:
remove_html(raw_data)

'\n    "This movie was amazing! I loved the plot and the visuals.",\n    "Worst movie ever. Don\'t waste your time.",\n    "Absolutely loved the characters!",\n    "It was okay, not great, not terrible.",\n    "Check out the trailer here."\n'

In [70]:
#on whole dataset
df['review'] = df['review'].apply(remove_html)

In [71]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


# Removing the URLs

In [72]:
def remove_urls(text):
    # Regex to remove http, https, www, and bare domain names
    pattern = re.compile(
        r'(https?://\S+|www\.\S+|\S+\.(com|org|net|info|io|in|me|co)(\S*)?)',
        re.IGNORECASE
    )
    return pattern.sub('', text)

In [73]:
df['review'] = df['review'].apply(remove_urls)

In [74]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


# Removing the punctuation

In [75]:
import string,time
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [76]:
exclude = string.punctuation

In [77]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char,'')
    return text

In [78]:
text = 'string. with. punctuation?'

In [79]:
start = time.time()
print(remove_punc(text))
time1 = time.time() - start
print(time1)
#for one text it takes this much time what if we do for the entire dataset it will take more time and therefore it is slow
#below use another technique

string with punctuation
0.0


In [80]:
def remove_punc1(text):
    return text.translate(str.maketrans('','',exclude))

In [81]:
start = time.time()
print(remove_punc1(text))
time1 = time.time() - start
print(time1) #this is faster

string with punctuation
0.002017498016357422


# Chat word treatment

In [82]:
chat_words = {}

with open(r"D:\Data Science\NLP\Day2_Text_preprocessing\slang.txt", "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        if not line:
            continue  # skip empty lines

        # Handle lines like "LOL=Laugh Out Loud", "BFF: Best Friends Forever", or "TFW ‚Äì That feeling when"
        if '=' in line:
            key, val = line.split('=', 1)
        elif ':' in line:
            key, val = line.split(':', 1)
        elif '‚Äì' in line:
            key, val = line.split('‚Äì', 1)
        else:
            continue  # skip lines that don't match

        chat_words[key.strip()] = val.strip()

for k in list(chat_words)[:10]:
    print(f"{k}: {chat_words[k]}")


AFAIK: As Far As I Know
AFK: Away From Keyboard
ASAP: As Soon As Possible
ATK: At The Keyboard
ATM: At The Moment
A3: Anytime, Anywhere, Anyplace
BAK: Back At Keyboard
BBL: Be Back Later
BBS: Be Back Soon
BFN: Bye For Now


In [83]:
chat_words

{'AFAIK': 'As Far As I Know',
 'AFK': 'Away From Keyboard',
 'ASAP': 'As Soon As Possible',
 'ATK': 'At The Keyboard',
 'ATM': 'At The Moment',
 'A3': 'Anytime, Anywhere, Anyplace',
 'BAK': 'Back At Keyboard',
 'BBL': 'Be Back Later',
 'BBS': 'Be Back Soon',
 'BFN': 'Bye For Now',
 'B4N': 'Bye For Now',
 'BRB': 'Be Right Back',
 'BRT': 'Be Right There',
 'BTW': 'By The Way',
 'B4': 'Before',
 'CU': 'See You',
 'CUL8R': 'See You Later',
 'CYA': 'See You',
 'FAQ': 'Frequently Asked Questions',
 'FC': 'Fingers Crossed',
 'FWIW': "For What It's Worth",
 'FYI': 'For Your Information',
 'GAL': 'Get A Life',
 'GG': 'Good Game',
 'GN': 'Good Night',
 'GMTA': 'Great Minds Think Alike',
 'GR8': 'Great!',
 'G9': 'Genius',
 'IC': 'I See',
 'ICQ': 'I Seek you (also a chat program)',
 'ILU': 'ILU: I Love You',
 'IMHO': 'In My Honest/Humble Opinion',
 'IMO': 'In My Opinion',
 'IOW': 'In Other Words',
 'IRL': 'In Real Life',
 'KISS': 'Keep It Simple, Stupid',
 'LDR': 'Long Distance Relationship',
 'LM

In [84]:
def chat_conversion(text):
    new_text=[]
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [85]:
chat_conversion('IMHO  he is the best')

'In My Honest/Humble Opinion he is the best'

In [86]:
chat_conversion('FYI delhi is the best capital of india')

'For Your Information delhi is the best capital of india'

# Spelling correction

In [87]:
from textblob import TextBlob

In [88]:
spell_text = """
I reaaly enjoyd the movi last nite. The acters were fantstic and the storry was gripping.
Unfortunatly, the ending was abit disapointing, but overal it was worth waching.
I wud recomend it to anywon who likes thrillers or suspence films.
The cinamatography was also amzing, especaily the night scens.
Howevr, sum dialouges felt unnaturall and forced.
On the whole, it was an intresting experiance!
"""
txtblb = TextBlob(spell_text)
txtblb.correct().string

'\nI really enjoyed the move last note. The actors were fantastic and the story was gripping.\nUnfortunately, the ending was bit disappointing, but overall it was worth watching.\nI mud recommend it to anyone who likes tillers or suspense films.\nThe cinamatography was also amazing, especially the night scene.\nPower, sum dialogue felt unnatural and forced.\nIn the whole, it was an interesting experience!\n'

# Removing the stopwords

In [89]:
from nltk.corpus import stopwords

In [90]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [91]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [92]:
text = """
I really enjoyed the movie last night. The actors were fantastic and the story was gripping.
However, the ending was a bit disappointing, but overall it was worth watching.
"""
remove_stopwords(text)

'I really enjoyed  movie last night. The actors  fantastic   story  gripping. However,  ending   bit disappointing,  overall   worth watching.'

In [93]:
df['review'].apply(remove_stopwords)

KeyboardInterrupt: 

In [94]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


# Removing the emojis

In [95]:
#method 1
# Install if not already done: pip install emoji
import emoji

def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')  # removes emojis

In [96]:
text = "I love this! üòçüíñ So funny üòÇüòÇüòÇ but also a bit sad üò¢ :') XD :P"
remove_emojis(text)

"I love this!  So funny  but also a bit sad  :') XD :P"

In [97]:
#method 2
import re

def remove_emojis_and_emoticons(text):
    # Emoji ranges
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # Emoticons (smileys)
        u"\U0001F300-\U0001F5FF"  # Symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # Transport & map
        u"\U0001F1E0-\U0001F1FF"  # Flags
        u"\U00002702-\U000027B0"  # Dingbats
        u"\U000024C2-\U0001F251"  # Enclosed characters
        "]+", flags=re.UNICODE)

    # Emoticons like :-) :P :/ etc.
    emoticon_pattern = re.compile(
        r'(:\s?\)|:-\)|:\s?D|:-D|:\s?\(|:-\(|:\'\(|:-\/|:\/|:P|:-P|<3|:\||;\)|;-D|:-O|O_O|:\^\)|:-X|:3|:v|:\*|XD)',
        flags=re.IGNORECASE
    )

    text = emoji_pattern.sub('', text)
    text = emoticon_pattern.sub('', text)
    return text


In [98]:
remove_emojis_and_emoticons(text)

"I love this!  So funny  but also a bit sad  :')  "

In [99]:
#replacing with the word
print(emoji.demojize('I love this! üòçüíñ'))

I love this! :smiling_face_with_heart-eyes::sparkling_heart:


# Tokenization

#### 1.Using the split function

In [100]:
#word tokenization
sent1 = 'I am going to Delhi'
sent1.split()

['I', 'am', 'going', 'to', 'Delhi']

In [103]:
#Sentence tokenization
sent2 = 'I am going to delhi. I will stay there for 3 days. let\'s hope that trip will be good.'
sent2.split()

['I',
 'am',
 'going',
 'to',
 'delhi.',
 'I',
 'will',
 'stay',
 'there',
 'for',
 '3',
 'days.',
 "let's",
 'hope',
 'that',
 'trip',
 'will',
 'be',
 'good.']

In [104]:
#problem with split function
sent3 = 'I am going to delhi!'
sent3.split() #here delhi! comes togehter

['I', 'am', 'going', 'to', 'delhi!']

In [106]:
sent4 = 'Where do think I should go? I have 3 day holidays'
sent4.split('.') #here unable to split bcoz we are doing in full stop

['Where do think I should go? I have 3 day holidays']

#### 2. Regular Expression

In [107]:
import re
sent3 = 'I am going to delhi!'
tokens = re.findall("[\w']+",sent3) #exclamatory mark will be removed
tokens

  tokens = re.findall("[\w']+",sent3) #exclamatory mark will be removed


['I', 'am', 'going', 'to', 'delhi']

#### 3. NLTK

In [108]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [109]:
sent1 = 'I am going to delhi!'
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'delhi', '!']

In [111]:
sent2 = 'my mail id is kunal@gmail.com' #here it fails 
word_tokenize(sent2)

['my', 'mail', 'id', 'is', 'kunal', '@', 'gmail.com']

In [112]:
sent3 = 'A 5km ride cost $10.50'  #here also fails
word_tokenize(sent3)

['A', '5km', 'ride', 'cost', '$', '10.50']

#### 4. Spacy

In [113]:
!pip install spacy

Collecting spacy
  Downloading spacy-3.8.7-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.13-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.11-cp312-cp312-win_amd64.whl.metadata (8.8 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.10-cp312-cp312-win_amd64.whl.metadata (2.5 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.6-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.5.1-cp312-cp312-win_amd6

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
contourpy 1.2.0 requires numpy<2.0,>=1.20, but you have numpy 2.2.6 which is incompatible.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.2.6 which is incompatible.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have scipy 1.15.2 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.6 which is incompatible.
tensorflow 2.19.0 requires numpy<2.2.0,>=1.26.0, but you have numpy 2.2.6 which is incompatible.


In [130]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [131]:
doc2 = nlp(sent2)
for token in doc2:
    print(token)

my
mail
i
d
is
kunal@gmail.com


# Stemming

In [132]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [133]:
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [134]:
text = 'walk walked walks walking'
stem_words(text)

'walk walk walk walk'

# Lemmatization

In [142]:
text = "The striped bats are hanging on their feet for best"