## Preprocessing Sample Dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the movie-review dataset from a CSV file.
# The path is relative, so the notebook must be run from the folder that contains the file.
df = pd.read_csv('imdb_movie_reviews.csv')

In [3]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(50000, 2)

In [4]:
# Peek at the first rows: one sentiment label and one review text per row.
df.head()

Unnamed: 0,label,review
0,negative,"In the ten years since Wildside aired, nothing..."
1,positive,This is a better-than-average entry in the Sai...
2,negative,"""The Mayor Of Hell"" has the feel of an early D..."
3,positive,This is a really great short from Hal Roach. T...
4,positive,A rather charming depiction of European union ...


In [5]:
# Count how many reviews carry each sentiment label (class-balance check).
df['label'].value_counts()

label
negative    25000
positive    25000
Name: count, dtype: int64

The dataset is evenly balanced: 25,000 positive and 25,000 negative reviews.

-----------------------------------------------------

### Removing Punctuation

In [6]:
import string

# The punctuation characters, as a single string; remove_punctuation() deletes these.
punctuation = string.punctuation

# Display the characters for reference.
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
def remove_punctuation(review, chars=None):
    """Return ``review`` with every punctuation character deleted.

    Characters are deleted, not replaced by a space, so words that were
    joined by punctuation are fused together: ``better-than-average``
    becomes ``betterthanaverage``.

    Parameters
    ----------
    review : str
        Text to clean.
    chars : str, optional
        The characters to delete. When omitted, the notebook-level
        ``punctuation`` string is used.

    Returns
    -------
    str
        ``review`` without any of the characters in ``chars``.
    """
    if chars is None:
        chars = punctuation
    # str.translate deletes the characters in one pass, instead of testing
    # every character against the punctuation string in a Python-level loop.
    return review.translate(str.maketrans('', '', chars))

In [8]:
# Strip punctuation from every review and keep the result in a new column.
df['review-no-punc'] = df['review'].apply(remove_punctuation)

In [9]:
# Compare the new 'review-no-punc' column with the original review text.
df.head()

Unnamed: 0,label,review,review-no-punc
0,negative,"In the ten years since Wildside aired, nothing...",In the ten years since Wildside aired nothing ...
1,positive,This is a better-than-average entry in the Sai...,This is a betterthanaverage entry in the Saint...
2,negative,"""The Mayor Of Hell"" has the feel of an early D...",The Mayor Of Hell has the feel of an early Dea...
3,positive,This is a really great short from Hal Roach. T...,This is a really great short from Hal Roach Th...
4,positive,A rather charming depiction of European union ...,A rather charming depiction of European union ...


### Tokenization (with text split)

A function that takes a review as an argument and returns a list of lowercase individual words.

In [10]:
def tokenize(review):
    """Lower-case ``review`` and split it on whitespace.

    Returns the resulting words as a list, in their original order.
    """
    lowered = review.lower()
    return lowered.split()

In [11]:
# Tokenize the punctuation-free text of every review.
df['review-tokens'] = df['review-no-punc'].apply(tokenize)

In [12]:
# Inspect the new 'review-tokens' column (one list of lower-case words per review).
df.head()

Unnamed: 0,label,review,review-no-punc,review-tokens
0,negative,"In the ten years since Wildside aired, nothing...",In the ten years since Wildside aired nothing ...,"[in, the, ten, years, since, wildside, aired, ..."
1,positive,This is a better-than-average entry in the Sai...,This is a betterthanaverage entry in the Saint...,"[this, is, a, betterthanaverage, entry, in, th..."
2,negative,"""The Mayor Of Hell"" has the feel of an early D...",The Mayor Of Hell has the feel of an early Dea...,"[the, mayor, of, hell, has, the, feel, of, an,..."
3,positive,This is a really great short from Hal Roach. T...,This is a really great short from Hal Roach Th...,"[this, is, a, really, great, short, from, hal,..."
4,positive,A rather charming depiction of European union ...,A rather charming depiction of European union ...,"[a, rather, charming, depiction, of, european,..."


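### Tokenization (with regex)

Removing punctuation before splitting fuses words that were joined by punctuation into one long token, for example ***better-than-average*** became ***betterthanaverage***. Tokenizing the original review text with a regex keeps those words separate.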

In [13]:
import re
def tokenize_regex(text):
    """Return every run of word characters found in ``text``, in order.

    Anything that is not a word character (spaces, hyphens, quotes, ...)
    acts as a separator and is dropped. The text is not lower-cased here.
    """
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(text)

In [14]:
# Tokenize the original review text (punctuation still present), lower-cased first.
df['review-tokens-regex'] = df['review'].map(str.lower).map(tokenize_regex)

In [15]:
# Compare 'review-tokens' with 'review-tokens-regex' side by side.
df.head()

Unnamed: 0,label,review,review-no-punc,review-tokens,review-tokens-regex
0,negative,"In the ten years since Wildside aired, nothing...",In the ten years since Wildside aired nothing ...,"[in, the, ten, years, since, wildside, aired, ...","[in, the, ten, years, since, wildside, aired, ..."
1,positive,This is a better-than-average entry in the Sai...,This is a betterthanaverage entry in the Saint...,"[this, is, a, betterthanaverage, entry, in, th...","[this, is, a, better, than, average, entry, in..."
2,negative,"""The Mayor Of Hell"" has the feel of an early D...",The Mayor Of Hell has the feel of an early Dea...,"[the, mayor, of, hell, has, the, feel, of, an,...","[the, mayor, of, hell, has, the, feel, of, an,..."
3,positive,This is a really great short from Hal Roach. T...,This is a really great short from Hal Roach Th...,"[this, is, a, really, great, short, from, hal,...","[this, is, a, really, great, short, from, hal,..."
4,positive,A rather charming depiction of European union ...,A rather charming depiction of European union ...,"[a, rather, charming, depiction, of, european,...","[a, rather, charming, depiction, of, european,..."


#### Minimum, maximum, and average number of words in the tokenized reviews

In [16]:
# Fewest tokens in any review (whitespace tokenization).
df['review-tokens'].map(len).min()

10

In [17]:
# Most tokens in any review (whitespace tokenization).
df['review-tokens'].map(len).max()

2469

In [18]:
# Average tokens per review (whitespace tokenization).
df['review-tokens'].map(len).mean()

231.99652

In [19]:
# Fewest tokens in any review (regex tokenization).
df['review-tokens-regex'].map(len).min()

10

In [20]:
# Most tokens in any review (regex tokenization).
df['review-tokens-regex'].map(len).max()

2525

In [21]:
# Average tokens per review (regex tokenization).
df['review-tokens-regex'].map(len).mean()

240.886

In [22]:
# Store the per-review token counts of both tokenizations as columns.
df['tokenized_len'] = df['review-tokens'].map(len)
df['tokenized_len_regex'] = df['review-tokens-regex'].map(len)


In [23]:
# Check the two new token-count columns.
df.head()

Unnamed: 0,label,review,review-no-punc,review-tokens,review-tokens-regex,tokenized_len,tokenized_len_regex
0,negative,"In the ten years since Wildside aired, nothing...",In the ten years since Wildside aired nothing ...,"[in, the, ten, years, since, wildside, aired, ...","[in, the, ten, years, since, wildside, aired, ...",250,258
1,positive,This is a better-than-average entry in the Sai...,This is a betterthanaverage entry in the Saint...,"[this, is, a, betterthanaverage, entry, in, th...","[this, is, a, better, than, average, entry, in...",107,113
2,negative,"""The Mayor Of Hell"" has the feel of an early D...",The Mayor Of Hell has the feel of an early Dea...,"[the, mayor, of, hell, has, the, feel, of, an,...","[the, mayor, of, hell, has, the, feel, of, an,...",464,482
3,positive,This is a really great short from Hal Roach. T...,This is a really great short from Hal Roach Th...,"[this, is, a, really, great, short, from, hal,...","[this, is, a, really, great, short, from, hal,...",179,186
4,positive,A rather charming depiction of European union ...,A rather charming depiction of European union ...,"[a, rather, charming, depiction, of, european,...","[a, rather, charming, depiction, of, european,...",142,144


### Stop Word Removal

In [None]:
pip install -U nltk

In [25]:
import nltk
from nltk.corpus import stopwords

# The stop-word corpus is downloaded separately from the nltk package. Fetch it
# here so this cell also runs on a machine that has never downloaded it;
# quiet=True keeps the downloader's progress messages out of the cell output.
nltk.download('stopwords', quiet=True)

# English stop words as a list of lower-case strings.
stop_words = stopwords.words('english')

In [26]:
# Display the loaded stop-word list.
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

A function that removes stop words from the tokenized reviews.

In [27]:
def remove_stopwords(review, custom_stop_words=None):
    """Return the tokens of ``review`` that are not stop words.

    Matching is exact and case-sensitive, so tokens should be lower-cased
    beforehand. Token order and duplicates are preserved.

    Parameters
    ----------
    review : iterable of str
        Tokens of one review.
    custom_stop_words : iterable of str, optional
        Words to drop. When omitted, the notebook-level ``stop_words`` list
        is used.

    Returns
    -------
    list of str
        The remaining tokens.
    """
    # A set gives constant-time membership tests; scanning the stop-word list
    # for every single token is needlessly slow on long reviews.
    lookup = set(stop_words if custom_stop_words is None else custom_stop_words)
    return [word for word in review if word not in lookup]

In [28]:
# Sanity check on a hand-made token list: 'is' and 'the' are in the stop-word
# list, so only the other two tokens should survive.
test_sentence = remove_stopwords(['hi', 'is', 'the', 'harman'])

# Show what is left after filtering.
test_sentence

['hi', 'harman']

In [29]:
# Drop stop words from the whitespace-tokenized reviews.
df['reviews-no-stop'] = df['review-tokens'].apply(remove_stopwords)

In [30]:
# Drop stop words from the regex-tokenized reviews.
df['reviews-no-stop-regex'] = df['review-tokens-regex'].apply(remove_stopwords)

In [31]:
# Inspect the two stop-word-free token columns.
df.head()

Unnamed: 0,label,review,review-no-punc,review-tokens,review-tokens-regex,tokenized_len,tokenized_len_regex,reviews-no-stop,reviews-no-stop-regex
0,negative,"In the ten years since Wildside aired, nothing...",In the ten years since Wildside aired nothing ...,"[in, the, ten, years, since, wildside, aired, ...","[in, the, ten, years, since, wildside, aired, ...",250,258,"[ten, years, since, wildside, aired, nothing, ...","[ten, years, since, wildside, aired, nothing, ..."
1,positive,This is a better-than-average entry in the Sai...,This is a betterthanaverage entry in the Saint...,"[this, is, a, betterthanaverage, entry, in, th...","[this, is, a, better, than, average, entry, in...",107,113,"[betterthanaverage, entry, saint, series, hold...","[better, average, entry, saint, series, holds,..."
2,negative,"""The Mayor Of Hell"" has the feel of an early D...",The Mayor Of Hell has the feel of an early Dea...,"[the, mayor, of, hell, has, the, feel, of, an,...","[the, mayor, of, hell, has, the, feel, of, an,...",464,482,"[mayor, hell, feel, early, dead, end, kids, fi...","[mayor, hell, feel, early, dead, end, kids, fi..."
3,positive,This is a really great short from Hal Roach. T...,This is a really great short from Hal Roach Th...,"[this, is, a, really, great, short, from, hal,...","[this, is, a, really, great, short, from, hal,...",179,186,"[really, great, short, hal, roach, two, main, ...","[really, great, short, hal, roach, two, main, ..."
4,positive,A rather charming depiction of European union ...,A rather charming depiction of European union ...,"[a, rather, charming, depiction, of, european,...","[a, rather, charming, depiction, of, european,...",142,144,"[rather, charming, depiction, european, union,...","[rather, charming, depiction, european, union,..."


#### Compare the minimum, maximum, and average number of words from the tokenized reviews to the tokenized reviews with stop words removed.

In [32]:
def _print_length_stats(column):
    """Print the min, max and mean token count of the list-valued ``df[column]``."""
    # Compute the per-review lengths once and reuse them for all three figures,
    # rather than re-running .apply(len) over the whole frame for each one.
    lengths = df[column].apply(len)
    print("The minimum number of words with stop words removed : ", lengths.min())
    print("The maximum number of words with stop words removed : ", lengths.max())
    print("The average number of words with stop words removed : ", lengths.mean())


_print_length_stats('reviews-no-stop')

print("\nWith Regex tokenized column")
_print_length_stats('reviews-no-stop-regex')

The minimum number of words with stop words removed :  6
The maximum number of words with stop words removed :  1449
The average number of words with stop words removed :  124.062

With Regex tokenized column
The minimum number of words with stop words removed :  6
The maximum number of words with stop words removed :  1455
The average number of words with stop words removed :  125.24076


Reflection: if each word of a review were treated as its own dimension, the dataframe would need 1449 columns (split tokens) or 1455 columns (regex tokens) to hold the longest review.

In [33]:
# Store the token counts after stop-word removal for both tokenizations.
df['tokenized_len_stop'] = df['reviews-no-stop'].map(len)
df['tokenized_len_regex_stop'] = df['reviews-no-stop-regex'].map(len)


In [34]:
# Compare token counts before and after stop-word removal.
df.head()

Unnamed: 0,label,review,review-no-punc,review-tokens,review-tokens-regex,tokenized_len,tokenized_len_regex,reviews-no-stop,reviews-no-stop-regex,tokenized_len_stop,tokenized_len_regex_stop
0,negative,"In the ten years since Wildside aired, nothing...",In the ten years since Wildside aired nothing ...,"[in, the, ten, years, since, wildside, aired, ...","[in, the, ten, years, since, wildside, aired, ...",250,258,"[ten, years, since, wildside, aired, nothing, ...","[ten, years, since, wildside, aired, nothing, ...",147,148
1,positive,This is a better-than-average entry in the Sai...,This is a betterthanaverage entry in the Saint...,"[this, is, a, betterthanaverage, entry, in, th...","[this, is, a, better, than, average, entry, in...",107,113,"[betterthanaverage, entry, saint, series, hold...","[better, average, entry, saint, series, holds,...",55,58
2,negative,"""The Mayor Of Hell"" has the feel of an early D...",The Mayor Of Hell has the feel of an early Dea...,"[the, mayor, of, hell, has, the, feel, of, an,...","[the, mayor, of, hell, has, the, feel, of, an,...",464,482,"[mayor, hell, feel, early, dead, end, kids, fi...","[mayor, hell, feel, early, dead, end, kids, fi...",282,278
3,positive,This is a really great short from Hal Roach. T...,This is a really great short from Hal Roach Th...,"[this, is, a, really, great, short, from, hal,...","[this, is, a, really, great, short, from, hal,...",179,186,"[really, great, short, hal, roach, two, main, ...","[really, great, short, hal, roach, two, main, ...",94,96
4,positive,A rather charming depiction of European union ...,A rather charming depiction of European union ...,"[a, rather, charming, depiction, of, european,...","[a, rather, charming, depiction, of, european,...",142,144,"[rather, charming, depiction, european, union,...","[rather, charming, depiction, european, union,...",80,81


### Stemming

In [None]:
# Import nltk and the Porter stemmer class used by the stemming cells.
import nltk
# NOTE(review): 'punkt' is downloaded here, but no cell visible in this notebook
# calls an nltk tokenizer (tokenization uses str.split and re.findall), so this
# download is presumably unnecessary -- confirm nothing later depends on it.
nltk.download('punkt')

from nltk.stem import PorterStemmer

In [36]:
# Create a single PorterStemmer; stem_text() uses this shared instance for every token.
stemmer = PorterStemmer()

In [44]:
def stem_text(review):
    """Return a new list holding the stem of every token in ``review``, in order.

    Each token is passed to the notebook-level ``stemmer``.
    """
    return list(map(stemmer.stem, review))

# Quick check of stem_text() on a few hand-picked words.
text = stem_text(['Helping', 'Playing', 'harman', 'fun', 'messaging'])
print(text)
    

['help', 'play', 'harman', 'fun', 'messag']


In [63]:
# Stem the stop-word-free tokens of both tokenizations.
df['reviews-no-stop-stemmed'] = df['reviews-no-stop'].apply(stem_text)
df['reviews-no-stop-regex-stemmed'] = df['reviews-no-stop-regex'].apply(stem_text)

In [46]:
# Inspect the stemmed token columns.
# NOTE(review): the output saved below shows only 'reviews-no-stop-stemmed'; it
# was produced before the cell above (In [63]) added the regex-stemmed column.
# Re-run the notebook top to bottom to refresh it.
df.head()

Unnamed: 0,label,review,review-no-punc,review-tokens,review-tokens-regex,tokenized_len,tokenized_len_regex,reviews-no-stop,reviews-no-stop-regex,tokenized_len_stop,tokenized_len_regex_stop,reviews-no-stop-stemmed
0,negative,"In the ten years since Wildside aired, nothing...",In the ten years since Wildside aired nothing ...,"[in, the, ten, years, since, wildside, aired, ...","[in, the, ten, years, since, wildside, aired, ...",250,258,"[ten, years, since, wildside, aired, nothing, ...","[ten, years, since, wildside, aired, nothing, ...",147,148,"[ten, year, sinc, wildsid, air, noth, realli, ..."
1,positive,This is a better-than-average entry in the Sai...,This is a betterthanaverage entry in the Saint...,"[this, is, a, betterthanaverage, entry, in, th...","[this, is, a, better, than, average, entry, in...",107,113,"[betterthanaverage, entry, saint, series, hold...","[better, average, entry, saint, series, holds,...",55,58,"[betterthanaverag, entri, saint, seri, hold, i..."
2,negative,"""The Mayor Of Hell"" has the feel of an early D...",The Mayor Of Hell has the feel of an early Dea...,"[the, mayor, of, hell, has, the, feel, of, an,...","[the, mayor, of, hell, has, the, feel, of, an,...",464,482,"[mayor, hell, feel, early, dead, end, kids, fi...","[mayor, hell, feel, early, dead, end, kids, fi...",282,278,"[mayor, hell, feel, earli, dead, end, kid, fil..."
3,positive,This is a really great short from Hal Roach. T...,This is a really great short from Hal Roach Th...,"[this, is, a, really, great, short, from, hal,...","[this, is, a, really, great, short, from, hal,...",179,186,"[really, great, short, hal, roach, two, main, ...","[really, great, short, hal, roach, two, main, ...",94,96,"[realli, great, short, hal, roach, two, main, ..."
4,positive,A rather charming depiction of European union ...,A rather charming depiction of European union ...,"[a, rather, charming, depiction, of, european,...","[a, rather, charming, depiction, of, european,...",142,144,"[rather, charming, depiction, european, union,...","[rather, charming, depiction, european, union,...",80,81,"[rather, charm, depict, european, union, begin..."


### Lemmatization

In [None]:
# Download the WordNet corpus, then import the lemmatizer class.
# Relies on `nltk` having been imported by an earlier cell.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

In [48]:
# Create a single WordNetLemmatizer; lemmatize_text() uses this shared instance for every token.
lemmater = WordNetLemmatizer()

In [61]:
def lemmatize_text(review, pos='v'):
    """Lemmatize every token in ``review`` with the notebook-level ``lemmater``.

    Parameters
    ----------
    review : iterable of str
        Tokens to lemmatize. Case matters: in this notebook's own check the
        capitalised tokens 'Helping' and 'Playing' come back unchanged while
        lower-case 'messaging' becomes 'message', so lower-case tokens first.
    pos : str, optional
        Part-of-speech tag handed to ``lemmater.lemmatize`` for every token.
        Defaults to ``'v'``, the value that used to be hard-coded.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    return [lemmater.lemmatize(word, pos) for word in review]

# Quick check of lemmatize_text() on a few hand-picked words.
# NOTE: in the saved output the capitalised words come back unchanged, while
# lower-case 'messaging' is reduced to 'message'.
text = lemmatize_text(['Helping', 'Playing', 'harman', 'fun', 'messaging'])
print(text)

['Helping', 'Playing', 'harman', 'fun', 'message']


In [64]:
# Lemmatize the stop-word-free tokens of both tokenizations, then preview the frame.
df['reviews-no-stop-lem'] = df['reviews-no-stop'].apply(lemmatize_text)
df['reviews-no-stop-regex-lem'] = df['reviews-no-stop-regex'].apply(lemmatize_text)
df.head()

Unnamed: 0,label,review,review-no-punc,review-tokens,review-tokens-regex,tokenized_len,tokenized_len_regex,reviews-no-stop,reviews-no-stop-regex,tokenized_len_stop,tokenized_len_regex_stop,reviews-no-stop-stemmed,reviews-no-stop-lem,reviews-no-stop-regex-stemmed,reviews-no-stop-regex-lem
0,negative,"In the ten years since Wildside aired, nothing...",In the ten years since Wildside aired nothing ...,"[in, the, ten, years, since, wildside, aired, ...","[in, the, ten, years, since, wildside, aired, ...",250,258,"[ten, years, since, wildside, aired, nothing, ...","[ten, years, since, wildside, aired, nothing, ...",147,148,"[ten, year, sinc, wildsid, air, noth, realli, ...","[ten, years, since, wildside, air, nothing, re...","[ten, year, sinc, wildsid, air, noth, realli, ...","[ten, years, since, wildside, air, nothing, re..."
1,positive,This is a better-than-average entry in the Sai...,This is a betterthanaverage entry in the Saint...,"[this, is, a, betterthanaverage, entry, in, th...","[this, is, a, better, than, average, entry, in...",107,113,"[betterthanaverage, entry, saint, series, hold...","[better, average, entry, saint, series, holds,...",55,58,"[betterthanaverag, entri, saint, seri, hold, i...","[betterthanaverage, entry, saint, series, hold...","[better, averag, entri, saint, seri, hold, int...","[better, average, entry, saint, series, hold, ..."
2,negative,"""The Mayor Of Hell"" has the feel of an early D...",The Mayor Of Hell has the feel of an early Dea...,"[the, mayor, of, hell, has, the, feel, of, an,...","[the, mayor, of, hell, has, the, feel, of, an,...",464,482,"[mayor, hell, feel, early, dead, end, kids, fi...","[mayor, hell, feel, early, dead, end, kids, fi...",282,278,"[mayor, hell, feel, earli, dead, end, kid, fil...","[mayor, hell, feel, early, dead, end, kid, fil...","[mayor, hell, feel, earli, dead, end, kid, fil...","[mayor, hell, feel, early, dead, end, kid, fil..."
3,positive,This is a really great short from Hal Roach. T...,This is a really great short from Hal Roach Th...,"[this, is, a, really, great, short, from, hal,...","[this, is, a, really, great, short, from, hal,...",179,186,"[really, great, short, hal, roach, two, main, ...","[really, great, short, hal, roach, two, main, ...",94,96,"[realli, great, short, hal, roach, two, main, ...","[really, great, short, hal, roach, two, main, ...","[realli, great, short, hal, roach, two, main, ...","[really, great, short, hal, roach, two, main, ..."
4,positive,A rather charming depiction of European union ...,A rather charming depiction of European union ...,"[a, rather, charming, depiction, of, european,...","[a, rather, charming, depiction, of, european,...",142,144,"[rather, charming, depiction, european, union,...","[rather, charming, depiction, european, union,...",80,81,"[rather, charm, depict, european, union, begin...","[rather, charm, depiction, european, union, be...","[rather, charm, depict, european, union, begin...","[rather, charm, depiction, european, union, be..."


In [79]:
# Print the stemmed and lemmatized tokens of the first review, for each tokenization.
# NOTE(review): 'Comaring' is a typo for 'Comparing'; the string is kept as-is
# so the saved output still matches the code.
print("Comaring the Stemming and Lemmatization Result\n")
print("For String split tokens")
print(df['reviews-no-stop-stemmed'].head(1).to_list())
print(df['reviews-no-stop-lem'].head(1).to_list())
print("\n------------------------------\n")
print("For Regex split tokens")
print(df['reviews-no-stop-regex-stemmed'].head(1).to_list())
print(df['reviews-no-stop-regex-lem'].head(1).to_list())

Comaring the Stemming and Lemmatization Result

For String split tokens
[['ten', 'year', 'sinc', 'wildsid', 'air', 'noth', 'realli', 'come', 'close', 'qualiti', 'local', 'product', 'includ', 'two', 'seri', 'enjoy', 'overr', 'underbelli', 'brought', 'life', 'event', 'recent', 'crimin', 'histori', 'sydney', 'melbourn', 'miniseri', 'blue', 'murder', 'also', 'star', 'toni', 'martin', 'someon', 'side', 'law', 'may', 'exceptionbr', 'br', 'wildsid', 'current', 'repeat', 'late', 'night', 'abc', 'watch', 'show', 'quit', 'im', 'still', 'impress', 'uncompromis', 'stori', 'line', 'human', 'charact', 'cast', 'excel', 'toni', 'martin', 'detect', 'haunt', 'disappear', 'son', 'rachael', 'blake', 'later', 'hook', 'martin', 'real', 'life', 'commun', 'worker', 'struggl', 'alcohol', 'alex', 'dimitriad', 'young', 'cop', 'whose', 'vice', 'gambl', 'equal', 'good', 'support', 'role', 'provid', 'aaron', 'pederson', 'jessica', 'napier', 'mari', 'cousta', 'ye', 'effi', 'young', 'abbi', 'cornishbr', 'br', 'abc', 