In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [2]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

In [3]:
df.shape

(50000, 2)

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


**1. Lowercasing**

• What it is: Converting all characters in the text document to lowercase.

• Why it's done: To avoid issues where the same word is treated as different due to varying capitalization (e.g., "Basically" vs. "basically"), which adds unnecessary complexity to the model.

• How it's done: Typically using simple string methods like .lower() in Python.

In [5]:
df['review'][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [6]:
df['review'][3].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [7]:
#To convert entire dataframe to lower
df['review'] = df['review'].str.lower()

In [8]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


**2. Removing HTML Tags**
 
• What it is: Eliminating HTML tags ( \<p>, \<body>, \<br/>) from text data, which are often present when data is scraped from websites.

• Why it's done: These tags are for browser display and do not contribute to the meaning or sentiment of the text for NLP models; keeping them can confuse the model.

In [9]:
import re
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [10]:
text = "<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>"

In [11]:
remove_html_tags(text)

' Movie 1 Actor - Aamir Khan Click here to download'

In [12]:
df['review'] = df['review'].apply(remove_html_tags)

In [13]:
df['review'][5]

'probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times in the last 25 years. paul lukas\' performance brings tears to my eyes, and bette davis, in one of her very few truly sympathetic roles, is a delight. the kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. and the mother\'s slow awakening to what\'s happening in the world and under her own roof is believable and startling. if i had a dozen thumbs, they\'d all be "up" for this movie.'

**3. Removing URLs**
 
• What it is: Deleting URLs (e.g., http://, https://, www.) from the text.

• Why it's done: URLs usually don't contribute significantly to the sentiment or classification of text in many NLP tasks and can potentially confuse the model.

In [14]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

In [15]:
text1 = 'Check out my notebook https://www.kaggle.com/campusx/notebook8223fc1abb'
text2 = 'Check out my notebook http://www.kaggle.com/campusx/notebook8223fc1abb'
text3 = 'Google search here www.google.com'
text4 = 'For notebook click https://www.kaggle.com/campusx/notebook8223fc1abb to search check www.google.com'

In [16]:
remove_url(text4)

'For notebook click  to search check '

**4. Removing Punctuations**
   
• What it is: Removing punctuation marks (e.g., periods, commas, exclamation marks, question marks, percentages) from the text.

• Why it's done:
    ◦ To prevent them from being treated as separate words: When tokenizing, punctuations might become individual tokens, unnecessarily increasing document size and confusing the model.
    
    ◦ To prevent them from being part of words: Sometimes, punctuation attached to a word can make the system treat it as a unique word (e.g., "Hello!" vs. "Hello"), which also confuses the model.
    
    ◦ While usually beneficial, in very rare cases, punctuations might be needed.

    
• How it's done:
    ◦ A simple looping approach can be used, but it's very slow for large datasets.
    
    ◦ A more efficient and standard technique involves using str.translate and str.maketrans in Python along with the string.punctuation set, which is significantly faster.

In [17]:
import string,time
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
exclude = string.punctuation

In [19]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char,'')
    return text
        

In [20]:
text = 'string. With. Punctuation?'

In [21]:
start = time.time()
print(remove_punc(text))
time1 = time.time() - start
print(time1*50000)

string With Punctuation
65.86313247680664


In [22]:
def remove_punc1(text):
    return text.translate(str.maketrans('', '', exclude))

In [23]:
start = time.time()
remove_punc1(text)
time2 = time.time() - start
print(time2*50000)

4.5299530029296875


In [24]:
time1/time2

14.539473684210526

In [25]:
df['review'][5]

'probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times in the last 25 years. paul lukas\' performance brings tears to my eyes, and bette davis, in one of her very few truly sympathetic roles, is a delight. the kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. and the mother\'s slow awakening to what\'s happening in the world and under her own roof is believable and startling. if i had a dozen thumbs, they\'d all be "up" for this movie.'

In [26]:
remove_punc1(df['review'][5])

'probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie'

**5. Chat Word Treatment (Short Hand Expansions)**
   
• What it is: Expanding common chat abbreviations or "short-hands" (e.g., "IMHO" to "In My Humble Opinion", "ASAP" to "As Soon As Possible") to their full forms.

• Why it's done: When working with social media or messaging app data (e.g., building a chatbot), these short-hands are common. Expanding them helps the model understand the text more accurately.

• How it's done: By creating or utilizing a dictionary mapping short-hands to their full forms, and then replacing them in the text.

In [1]:
chat_words = {
    'WB': 'Welcome Back',
    'WTF': 'What the f***',
    'LOL': 'Laugh Out Loud',
    'BRB': 'Be Right Back',
    'TTYL': 'Talk To You Later',
    'OMG': 'Oh My God',
    'IDK': 'I Don’t Know',
    'TBH': 'To Be Honest',
    'FYI': 'For Your Information',
    'IMO': 'In My Opinion',
    'IMHO': 'In My Humble Opinion',
    'ROFL': 'Rolling On Floor Laughing',
    'LMAO': 'Laughing My A** Off',
    'SMH': 'Shaking My Head',
    'BTW': 'By The Way',
    'GTG': 'Got To Go',
    'BFF': 'Best Friends Forever',
    'DM': 'Direct Message',
    'TMI': 'Too Much Information',
    'NSFW': 'Not Safe For Work',
    'ICYMI': 'In Case You Missed It',
    'IKR': 'I Know, Right?',
    'NVM': 'Never Mind',
    'TYT': 'Take Your Time',
    'AFK': 'Away From Keyboard',
    'IDC': 'I Don’t Care',
    'JK': 'Just Kidding',
    'YOLO': 'You Only Live Once',
    'FOMO': 'Fear Of Missing Out',
    'GG': 'Good Game',
    'NP': 'No Problem',
    'TBA': 'To Be Announced',
    'TL;DR': 'Too Long; Didn’t Read',
    'BFFL': 'Best Friends For Life',
    'ILY': 'I Love You',
    'CU': 'See You',
    'TTYS': 'Talk To You Soon',
    'ASAP': 'As Soon As Possible',
    'OMW': 'On My Way',
    'GR8': 'Great',
    'THX': 'Thanks',
    'PLZ': 'Please',
    'GN': 'Good Night',
    'GM': 'Good Morning'
}


In [2]:
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [3]:
chat_conversion('IMHO he is the best')

'In My Humble Opinion he is the best'

In [4]:
chat_conversion('FYI delhi is the capital of india')

'For Your Information delhi is the capital of india'

**6. Spelling Correction**

• What it is: Correcting spelling mistakes in the text.

• Why it's done: Due to fast typing or "fat fingering" (accidental key presses on small smartphone keyboards), spelling errors are common, especially in social media text. These errors can lead the model to treat misspelled words as unique, causing performance degradation.

• How it's done: Libraries like TextBlob or custom spell checkers can be used. TextBlob is effective for common mistakes, but regional words or specific data might require more tailored solutions.

In [32]:
from textblob import TextBlob

In [33]:
incorrect_text = 'ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner.'

textBlb = TextBlob(incorrect_text)

textBlb.correct().string

'certain conditions during several generations are modified in the same manner.'

**7. Stop Word Removal**

• What it is: Eliminating "stop words" – common words in a language (like "the," "a," "is," "and") that contribute to sentence formation but generally do not add significant meaning to the sentence's core context.

• Why it's done: For tasks like sentiment analysis or document classification, removing stop words can reduce noise and improve model focus on more meaningful words.

• When not to remove: For tasks like Parts of Speech (POS) tagging or parsing, stop words are crucial for grammatical structure, so they are not removed.

• How it's done: The NLTK (Natural Language Toolkit) library provides pre-compiled lists of stop words for various languages, which can be easily used for removal.

In [None]:
from nltk.corpus import stopwords

In [None]:
stopwords.words('spanish')

In [None]:
def remove_stopwords(text):
    new_text = []
    
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [None]:
remove_stopwords('probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times')

In [None]:
df.head()

In [None]:
df['review'].apply(remove_stopwords)

**8. Handling Emojis**

• What it is: Processing emojis present in the text data.

• Why it's done: Emojis are important for expressing emotion in human communication but are not directly understood by machine learning algorithms.

• Options:
    ◦ Remove Emojis: Simple removal using Regular Expressions.
    ◦ Replace with Meaning: Convert emojis into their textual descriptions (e.g., "😂" to "face with tears of joy").
    
• How it's done: For replacement, a module like demojize can be used.

In [34]:
import re
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [35]:
remove_emoji("Loved the movie. It was 😘😘")

'Loved the movie. It was '

In [36]:
remove_emoji("Lmao 😂😂")

'Lmao '

In [37]:
import emoji
print(emoji.demojize('Python is 🔥'))

Python is :fire:


In [38]:
print(emoji.demojize('Loved the movie. It was 😘'))

Loved the movie. It was :face_blowing_a_kiss:


### 1. Using the split function

**9. Tokenization**
    
• What it is: The fundamental process of breaking down raw text into smaller, meaningful units called "tokens". These tokens can be words, sentences, or even subword units.

• Why it's important: It's a crucial precursor to feature engineering. Incorrect tokenization can confuse the model and lead to poor results, especially when trying to extract unique words or other features.

• Types:
    ◦ Word Tokenization: Breaking text into individual words.
    ◦ Sentence Tokenization: Breaking a paragraph into individual sentences.
    
• Challenges: Tokenization isn't always straightforward due to prefixes, suffixes, infixes, special characters (e.g., "20$" should be "20" and "$"), contractions (e.g., "I'm"), and proper nouns (e.g., "New York").


• Techniques:

    ◦ Python's split() function: Simple, works on spaces, but fails in complex scenarios (e.g., "Delhi!" splits as "Delhi!" not "Delhi" and "!").
    
    ◦ Regular Expressions: More sophisticated than split(), allows defining patterns for splitting, offering better control but can still be complex to handle all edge cases.
    
    ◦ Libraries (NLTK & spaCy): These are the recommended approach.
        ▪ NLTK: Provides word_tokenize and sent_tokenize functions with internal algorithms to handle many complexities. However, it can still struggle with certain cases like email IDs being split or incorrect handling of numbers.
        ▪ spaCy: Often provides the best tokenization results in observed cases, handling many difficult scenarios more effectively than NLTK (e.g., email IDs, numbers like "5km").

In [39]:
# word tokenization
sent1 = 'I am going to delhi'
sent1.split()

['I', 'am', 'going', 'to', 'delhi']

In [40]:
# sentence tokenization
sent2 = 'I am going to delhi. I will stay there for 3 days. Let\'s hope the trip to be great'
sent2.split('.')

['I am going to delhi',
 ' I will stay there for 3 days',
 " Let's hope the trip to be great"]

In [41]:
# Problems with split function
sent3 = 'I am going to delhi!'
sent3.split()

['I', 'am', 'going', 'to', 'delhi!']

In [42]:
sent4 = 'Where do think I should go? I have 3 day holiday'
sent4.split('.')

['Where do think I should go? I have 3 day holiday']

### 2. Regular Expression

In [43]:
import re
sent3 = 'I am going to delhi!'
tokens = re.findall("[\w']+", sent3)
tokens

['I', 'am', 'going', 'to', 'delhi']

In [44]:

text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry? 
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, 
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""
sentences = re.compile('[.!?] ').split(text)
sentences

['Lorem Ipsum is simply dummy text of the printing and typesetting industry',
 "\nLorem Ipsum has been the industry's standard dummy text ever since the 1500s, \nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."]

### 3. NLTK

In [45]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [46]:
sent1 = 'I am going to visit delhi!'
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'visit', 'delhi', '!']

In [47]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry? 
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, 
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""

sent_tokenize(text)

['Lorem Ipsum is simply dummy text of the printing and typesetting industry?',
 "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, \nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."]

In [48]:
sent5 = 'I have a Ph.D in A.I'
sent6 = "We're here to help! mail us at nks@gmail.com"
sent7 = 'A 5km ride cost $10.50'

word_tokenize(sent5)

['I', 'have', 'a', 'Ph.D', 'in', 'A.I']

In [49]:
word_tokenize(sent6)

['We',
 "'re",
 'here',
 'to',
 'help',
 '!',
 'mail',
 'us',
 'at',
 'nks',
 '@',
 'gmail.com']

In [50]:
word_tokenize(sent7)

['A', '5km', 'ride', 'cost', '$', '10.50']

### 4. Spacy

In [51]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [52]:
doc1 = nlp(sent5)
doc2 = nlp(sent6)
doc3 = nlp(sent7)
doc4 = nlp(sent1)

In [53]:
for token in doc4:
    print(token)

I
am
going
to
visit
delhi
!


**10. Stemming and Lemmatization**

These two techniques aim to reduce inflected words to their root forms.

• Inflexion: A grammatical modification of a word to express different categories like tense, case, voice, etc. (e.g., "walk," "walking," "walked," "walks" are inflections of "work").

• Stemming:
    ◦ What it is: The process of reducing inflected words to their "root" or "stem" form by removing suffixes or prefixes. The stem itself might not be a valid word in the language (e.g., "probably" stems to "probabl").
    
    ◦ Why it's used: Most useful in Information Retrieval Systems (like search engines) where searching for "fish," "fishing," or "fished" should all return relevant results related to "fish".
    
    ◦ How it's done: Using algorithms called "stemmers" (e.g., NLTK's Porter Stemmer for English, Snowball Stemmer for other languages).
    
    ◦ Caveat: Since the output might not be a real word, it's generally not suitable if the output needs to be displayed to users. It is, however, faster than lemmatization.


    
**• Lemmatization:**

    ◦ What it is: Similar to stemming, it reduces inflected words to their root form, but the crucial difference is that the output (called a "lemma") is always a valid word in the language (e.g., "better" lemmas to "good").
    
    ◦ How it differs from Stemming: Lemmatization is typically slower than stemming because it relies on a lexical dictionary (like WordNet in NLTK) to find the correct base form, whereas stemming uses algorithmic rules.
    
    ◦ Why it's used: Preferred when the output needs to be interpretable or displayed to users, or when linguistic accuracy is more important than speed.
    
    ◦ How it's done: Using tools like NLTK's WordNet Lemmatizer. It's important to specify the Part of Speech (POS) of the word for accurate lemmatization (e.g., whether "run" is a verb or a noun).

In [54]:
from nltk.stem.porter import PorterStemmer

In [55]:
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [56]:
sample = "walk walks walking walked"
stem_words(sample)

'walk walk walk walk'

In [57]:
text = 'probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie'
print(text)

probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie


In [58]:
stem_words(text)

'probabl my alltim favorit movi a stori of selfless sacrific and dedic to a nobl caus but it not preachi or bore it just never get old despit my have seen it some 15 or more time in the last 25 year paul luka perform bring tear to my eye and bett davi in one of her veri few truli sympathet role is a delight the kid are as grandma say more like dressedup midget than children but that onli make them more fun to watch and the mother slow awaken to what happen in the world and under her own roof is believ and startl if i had a dozen thumb theyd all be up for thi movi'

In [59]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations="?:!.,;"
sentence_words = nltk.word_tokenize(sentence)
for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)

sentence_words
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
    print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word,pos='v')))

Word                Lemma               
He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
same                same                
time                time                
He                  He                  
has                 have                
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
Sun                 Sun                 
