# Step 1: Read the source data

In [1]:
#import pandas module for creating dataframe
import pandas as pd

In [2]:
#read CSV into DataFrame
data = pd.read_csv("Review.csv")
data

Unnamed: 0,Review
0,I like this books very much!!! It is VERY INTE...
1,Do not like this book. so boring 2. Too length...


# Step 2: Remove punctuation and convert words to lowercase in the documents using the string library

In [3]:
#import string module for string manipulation
import string

In [4]:
#defining the function to remove punctuations in the documents
def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` removed.

    Parameters
    ----------
    text : str
        The document to clean.

    Returns
    -------
    str
        ``text`` without the ASCII punctuation characters listed in
        ``string.punctuation``. All other characters (letters, digits,
        whitespace, non-ASCII symbols) are kept unchanged and in order.
    """
    # A translation table whose third argument lists characters to delete
    # removes all punctuation in one pass, instead of rebuilding the result
    # string one character at a time.
    delete_punctuation = str.maketrans("", "", string.punctuation)
    return text.translate(delete_punctuation)

In [5]:
#applying the remove_punctuation function to the 'Review' column and storing the result in a new column 'clean_punctuation'
data['clean_punctuation']= data['Review'].apply(remove_punctuation)
data # Remove punctuation

Unnamed: 0,Review,clean_punctuation
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy


In [6]:
#to standardize the cases in the documents into lower case
data['clean_lower']= data['clean_punctuation'].str.lower()
data # Change all to lower case

Unnamed: 0,Review,clean_punctuation,clean_lower
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy


# Step 3: Remove numbers using re.sub ( ) in regular expression library

In [7]:
#import regular expression library
import re

In [8]:
#function to replace every digit (\d) or hyphen (-) in the documents with an empty string ''
def remove_numbers(text):
    """Return ``text`` with every digit and hyphen removed.

    Parameters
    ----------
    text : str
        The document to clean.

    Returns
    -------
    str
        ``text`` with each digit and each ``-`` replaced by an empty string.
        Surrounding whitespace is left untouched, so removing a standalone
        number leaves the spaces on both sides of it in place.
    """
    # Raw string: in a normal string literal "\d" is an invalid escape
    # sequence, which newer Python versions warn about.
    return re.sub(r"[\d-]", "", text)

In [9]:
#applying the remove_numbers function to the 'clean_lower' column and storing the result in a new column 'clean_number'
data['clean_number'] = data['clean_lower'].apply(remove_numbers)
data # Remove numbers

Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy


# Step 4: Break down the words in documents into tokens using nltk library

In [10]:
#import Natural Language Processing (NLP) library called 
#Natural Language Toolkit (NLTK)
import nltk
# Download the 'punkt' tokenizer data once; a second identical call only
# repeats the "already up-to-date" check.
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Hafizatul
[nltk_data]     A'fifah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Hafizatul
[nltk_data]     A'fifah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
# import the library for word tokenization
from nltk.tokenize import word_tokenize

In [12]:
#the word tokens in the document
data['token_data']= data['clean_number'].apply(word_tokenize)
data

Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number,token_data
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...,"[i, like, this, books, very, much, it, is, ver..."
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy,"[do, not, like, this, book, so, boring, too, l..."


In [13]:
from nltk.util import ngrams
# Generate bigrams and trigrams for each row in the DataFrame
data['bigrams'] = data['token_data'].apply(lambda x: list(ngrams(x, 2)))
data['trigrams'] = data['token_data'].apply(lambda x: list(ngrams(x, 3)))
data

Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number,token_data,bigrams,trigrams
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...,"[i, like, this, books, very, much, it, is, ver...","[(i, like), (like, this), (this, books), (book...","[(i, like, this), (like, this, books), (this, ..."
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy,"[do, not, like, this, book, so, boring, too, l...","[(do, not), (not, like), (like, this), (this, ...","[(do, not, like), (not, like, this), (like, th..."


# Step 5: Remove stopwords using nltk library

In [14]:
#download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Hafizatul
[nltk_data]     A'fifah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
#Get the list of English stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [16]:
#defining the function to remove stopwords from tokenized text
def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document (for example a row of ``token_data``).
    stop_words : iterable of str, optional
        Words to drop. When omitted, the notebook-level ``stopwords`` list
        (NLTK English stop words) is used, which is the original behaviour.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.

    Notes
    -----
    The comparison is exact and case-sensitive, so tokens should already be
    lower-cased. NOTE(review): with the default list, negations such as
    "not" are removed as well (see the second review in the output below);
    confirm that this is acceptable for the downstream analysis.
    """
    # Resolve the default at call time so one-argument calls keep using the
    # stop-word list defined earlier in the notebook.
    if stop_words is None:
        stop_words = stopwords
    # Set membership is O(1) per token; the NLTK list would be scanned
    # linearly for every token.
    stop_set = set(stop_words)
    return [token for token in text if token not in stop_set]

In [17]:
#Applying the remove_stopwords function to the 'token_data' column and storing the result in a new column 'clean_xstopwords'
data['clean_xstopwords'] = data['token_data'].apply(remove_stopwords)
data

Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number,token_data,bigrams,trigrams,clean_xstopwords
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...,"[i, like, this, books, very, much, it, is, ver...","[(i, like), (like, this), (this, books), (book...","[(i, like, this), (like, this, books), (this, ...","[like, books, much, interesting]"
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy,"[do, not, like, this, book, so, boring, too, l...","[(do, not), (not, like), (like, this), (this, ...","[(do, not, like), (not, like, this), (like, th...","[like, book, boring, lengthy]"


# Step 6: Perform word stemming using Porter/Snowball Stemmer in nltk library

Stemming strips suffixes such as 'ed' or 's', for example 'turned -> turn' and 'eats -> eat'.
The Snowball stemmer is generally more accurate than the Porter stemmer.

In [18]:
#importing the Stemming function from nltk library
from nltk.stem.porter import PorterStemmer

In [19]:
#defining the object for stemming
porter_stemmer = PorterStemmer()

In [20]:
#defining a function for stemming
def stemming(text, stemmer=None):
    """Return the stem of every token in ``text``.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to apply. When omitted, the notebook-level ``porter_stemmer``
        is used, which is the original behaviour.

    Returns
    -------
    list of str
        One stem per input token, in the same order.

    Notes
    -----
    Later cells in this notebook define functions with this same name for
    the Snowball and Lancaster stemmers, so once they have run, ``stemming``
    no longer refers to this version. Pass ``stemmer`` explicitly when the
    cells may be re-run out of order.
    """
    # Resolve the default at call time so one-argument calls keep using the
    # Porter stemmer created in the previous cell.
    if stemmer is None:
        stemmer = porter_stemmer
    return [stemmer.stem(word) for word in text]

In [21]:
#applying the stemming function to the 'clean_xstopwords' column and storing the result in a new column 'clean_stemmed_porter'
data['clean_stemmed_porter'] = data['clean_xstopwords'].apply(stemming)
data

Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number,token_data,bigrams,trigrams,clean_xstopwords,clean_stemmed_porter
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...,"[i, like, this, books, very, much, it, is, ver...","[(i, like), (like, this), (this, books), (book...","[(i, like, this), (like, this, books), (this, ...","[like, books, much, interesting]","[like, book, much, interest]"
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy,"[do, not, like, this, book, so, boring, too, l...","[(do, not), (not, like), (like, this), (this, ...","[(do, not, like), (not, like, this), (like, th...","[like, book, boring, lengthy]","[like, book, bore, lengthi]"


In [22]:
#importing the Stemming function from nltk library
from nltk.stem import SnowballStemmer

#defining the object for stemming
snowball_stemmer = SnowballStemmer('english')

#defining a function for stemming
def stemming(text, stemmer=None):
    """Return the stem of every token in ``text``.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to apply. When omitted, the ``snowball_stemmer`` created
        above is used, which is the original behaviour.

    Returns
    -------
    list of str
        One stem per input token, in the same order.

    Notes
    -----
    This definition replaces the Porter-based ``stemming`` defined in an
    earlier cell. Pass ``stemmer`` explicitly when the cells may be re-run
    out of order.
    """
    # Resolve the default at call time so one-argument calls keep using the
    # Snowball stemmer.
    if stemmer is None:
        stemmer = snowball_stemmer
    return [stemmer.stem(word) for word in text]

#applying the stemming function to the 'clean_xstopwords' column and storing the result in a new column 'clean_stemmed_snowball'
data['clean_stemmed_snowball'] = data['clean_xstopwords'].apply(stemming)
data

Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number,token_data,bigrams,trigrams,clean_xstopwords,clean_stemmed_porter,clean_stemmed_snowball
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...,"[i, like, this, books, very, much, it, is, ver...","[(i, like), (like, this), (this, books), (book...","[(i, like, this), (like, this, books), (this, ...","[like, books, much, interesting]","[like, book, much, interest]","[like, book, much, interest]"
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy,"[do, not, like, this, book, so, boring, too, l...","[(do, not), (not, like), (like, this), (this, ...","[(do, not, like), (not, like, this), (like, th...","[like, book, boring, lengthy]","[like, book, bore, lengthi]","[like, book, bore, lengthi]"


In [23]:
#importing the Stemming function from nltk library
from nltk.stem import LancasterStemmer

#defining the object for stemming
lancaster_stemmer = LancasterStemmer()

#defining a function for stemming
def stemming(text, stemmer=None):
    """Return the stem of every token in ``text``.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to apply. When omitted, the ``lancaster_stemmer`` created
        above is used, which is the original behaviour.

    Returns
    -------
    list of str
        One stem per input token, in the same order.

    Notes
    -----
    This definition replaces the ``stemming`` functions defined in the
    earlier Porter and Snowball cells. Pass ``stemmer`` explicitly when the
    cells may be re-run out of order.
    """
    # Resolve the default at call time so one-argument calls keep using the
    # Lancaster stemmer.
    if stemmer is None:
        stemmer = lancaster_stemmer
    return [stemmer.stem(word) for word in text]

#applying the stemming function to the 'clean_xstopwords' column and storing the result in a new column 'clean_stemmed_lancaster'
data['clean_stemmed_lancaster'] = data['clean_xstopwords'].apply(stemming)
data

Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number,token_data,bigrams,trigrams,clean_xstopwords,clean_stemmed_porter,clean_stemmed_snowball,clean_stemmed_lancaster
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...,"[i, like, this, books, very, much, it, is, ver...","[(i, like), (like, this), (this, books), (book...","[(i, like, this), (like, this, books), (this, ...","[like, books, much, interesting]","[like, book, much, interest]","[like, book, much, interest]","[lik, book, much, interest]"
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy,"[do, not, like, this, book, so, boring, too, l...","[(do, not), (not, like), (like, this), (this, ...","[(do, not, like), (not, like, this), (like, th...","[like, book, boring, lengthy]","[like, book, bore, lengthi]","[like, book, bore, lengthi]","[lik, book, bor, lengthy]"


# Step 7: Perform word lemmatization using WordNetLemmatizer( ) in nltk library
Change each word back to its root word (lemma). Words might become meaningless after stemming.

In [24]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Hafizatul
[nltk_data]     A'fifah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [25]:
#importing the Lemmatizer function from nltk library
from nltk.stem import WordNetLemmatizer

In [26]:
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

In [27]:
#defining the function for lemmatization
def lemmatizer(text, pos="n", lemmatizer_obj=None):
    """Return the lemma of every token in ``text``.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.
    pos : str, optional
        Part-of-speech tag passed to ``lemmatize`` for every token. The
        default ``"n"`` (noun) matches the original behaviour, under which
        'books' becomes 'book' while forms such as 'interesting' and
        'boring' are returned unchanged in the outputs below. Pass e.g.
        ``"v"`` to lemmatize the tokens as verbs instead.
    lemmatizer_obj : object with a ``lemmatize(word, pos=...)`` method, optional
        Lemmatizer to apply. When omitted, the notebook-level
        ``wordnet_lemmatizer`` is used, which is the original behaviour.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    # Resolve the default at call time so one-argument calls keep using the
    # WordNet lemmatizer created in the previous cell.
    if lemmatizer_obj is None:
        lemmatizer_obj = wordnet_lemmatizer
    return [lemmatizer_obj.lemmatize(word, pos=pos) for word in text]

In [28]:
#applying the lemmatizer function to the 'clean_stemmed_porter' column and storing the result in a new column 'clean_lemmatized_porter'
data['clean_lemmatized_porter']=data['clean_stemmed_porter'].apply(lemmatizer)
data

Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number,token_data,bigrams,trigrams,clean_xstopwords,clean_stemmed_porter,clean_stemmed_snowball,clean_stemmed_lancaster,clean_lemmatized_porter
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...,"[i, like, this, books, very, much, it, is, ver...","[(i, like), (like, this), (this, books), (book...","[(i, like, this), (like, this, books), (this, ...","[like, books, much, interesting]","[like, book, much, interest]","[like, book, much, interest]","[lik, book, much, interest]","[like, book, much, interest]"
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy,"[do, not, like, this, book, so, boring, too, l...","[(do, not), (not, like), (like, this), (this, ...","[(do, not, like), (not, like, this), (like, th...","[like, book, boring, lengthy]","[like, book, bore, lengthi]","[like, book, bore, lengthi]","[lik, book, bor, lengthy]","[like, book, bore, lengthi]"


In [29]:
#applying the lemmatizer function to the 'clean_xstopwords' column and storing the result in a new column 'clean_lemmatized_stopwords'
data['clean_lemmatized_stopwords']=data['clean_xstopwords'].apply(lemmatizer)
data

Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number,token_data,bigrams,trigrams,clean_xstopwords,clean_stemmed_porter,clean_stemmed_snowball,clean_stemmed_lancaster,clean_lemmatized_porter,clean_lemmatized_stopwords
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...,"[i, like, this, books, very, much, it, is, ver...","[(i, like), (like, this), (this, books), (book...","[(i, like, this), (like, this, books), (this, ...","[like, books, much, interesting]","[like, book, much, interest]","[like, book, much, interest]","[lik, book, much, interest]","[like, book, much, interest]","[like, book, much, interesting]"
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy,"[do, not, like, this, book, so, boring, too, l...","[(do, not), (not, like), (like, this), (this, ...","[(do, not, like), (not, like, this), (like, th...","[like, book, boring, lengthy]","[like, book, bore, lengthi]","[like, book, bore, lengthi]","[lik, book, bor, lengthy]","[like, book, bore, lengthi]","[like, book, boring, lengthy]"
