In [1]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#nltk.download(['punkt','averaged_perceptron_tagger','vader_lexicon','stopwords','wordnet'])

pd.options.mode.chained_assignment = None

In [2]:
# Load the cleaned review data set into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs where this exact directory layout exists — consider a
# configurable data directory.
book_dtf = pd.read_csv("C:/Users/joann/Downloads/newProject/Data_Cleaning/Cleaned_Data.csv")

In [3]:
book_dtf.shape

(2029087, 11)

In [4]:
# Remove the "Unnamed: 0" column (presumably a saved index from an earlier
# CSV export — TODO confirm). errors="ignore" makes the removal idempotent:
# `del book_dtf["Unnamed: 0"]` raised KeyError when the cell was re-run
# after the column had already been removed.
book_dtf = book_dtf.drop(columns=["Unnamed: 0"], errors="ignore")

# Preview the first rows ordered by book identifier.
book_dtf.sort_values(by=['Book_ID']).head()

Unnamed: 0.1,Unnamed: 0,Book_ID,User_Reviews,Stars_Ratings,Average_Ratings,Num_Ratings,Review,Combined_Likes_Count,Likes_Count,Ratio,Popularity
0,0,1,18,5,4.54,1713866.0,"""I am the Half-blood prince"". \n Read to find ...",7439,1,0.000134,0
15395,15395,1,112,5,4.54,1713866.0,a bucketful of tears and a grin (because of th...,7439,5,0.000672,0
15507,15507,1,20,5,4.54,1713866.0,I think this might have been my favorite out o...,7439,1,0.000134,0
15527,15527,1,21,5,4.54,1713866.0,"""And Harry remembered his first nightmarish tr...",7439,3,0.000403,0
15548,15548,1,1,5,4.54,1713866.0,So amazing that there is no justifiable descri...,7439,3,0.000403,0


## Explanation of the variables

- Book_ID   &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;                 = Unique identifier of each book
- User_Reviews &emsp;&emsp;&emsp;&emsp;&emsp;&emsp; = Total number of reviews created by each user
- Stars_Ratings &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&ensp; = Rating given to the book by each individual reviewer
- Average_Ratings     &emsp;&emsp;&emsp;&emsp;&ensp;&nbsp;       = Rating of the book (calculated based on the reviews given by each user)
- Num_Ratings         &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&nbsp;       = Total number of reviews received by each book
- Review              &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;       = The text review given by each user
- Combined_Likes_Count&emsp;&emsp;       = The total number of likes, received by all the reviews in a certain book
- Likes_Count         &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;       = Individual number of likes received by each review
- Ratio               &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;       = Measure of the relative popularity of each review compared to all the other reviews of the book (Likes_Count divided by Combined_Likes_Count)
- Popularity          &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;       = Indicates whether the book is popular or not

In [5]:
# Discard the columns that are not used in the rest of this notebook.
book_dtf = book_dtf.drop(
    ['Book_ID', 'Num_Ratings', 'Likes_Count', 'Ratio'],
    axis="columns",
)

In [6]:
book_dtf.head()

Unnamed: 0.1,Unnamed: 0,User_Reviews,Stars_Ratings,Average_Ratings,Review,Combined_Likes_Count,Popularity
0,0,18,5,4.54,"""I am the Half-blood prince"". \n Read to find ...",7439,0
1,1,18,5,4.47,** spoiler alert ** \n A little memorial for m...,23232,0
2,2,18,5,4.53,One of my fav books!!,11254,0
3,3,18,5,3.86,"** spoiler alert ** \n Oh my goodness, it was ...",4130,0
4,4,18,4,4.11,I thought this was a great and educational boo...,1449,0


In [7]:
# Diff_Ratings = the reviewer's own star rating minus the book's average
# rating (positive when the reviewer rated the book above its average).
book_dtf["Diff_Ratings"] = book_dtf["Stars_Ratings"].sub(book_dtf["Average_Ratings"])

In [8]:
book_dtf.head()

Unnamed: 0.1,Unnamed: 0,User_Reviews,Stars_Ratings,Average_Ratings,Review,Combined_Likes_Count,Popularity,Diff_Ratings
0,0,18,5,4.54,"""I am the Half-blood prince"". \n Read to find ...",7439,0,0.46
1,1,18,5,4.47,** spoiler alert ** \n A little memorial for m...,23232,0,0.53
2,2,18,5,4.53,One of my fav books!!,11254,0,0.47
3,3,18,5,3.86,"** spoiler alert ** \n Oh my goodness, it was ...",4130,0,1.14
4,4,18,4,4.11,I thought this was a great and educational boo...,1449,0,-0.11


In [9]:
# Average_Ratings has already been folded into Diff_Ratings, so drop it.
book_dtf = book_dtf.drop(columns='Average_Ratings')

In [10]:
book_dtf.head()

Unnamed: 0.1,Unnamed: 0,User_Reviews,Stars_Ratings,Review,Combined_Likes_Count,Popularity,Diff_Ratings
0,0,18,5,"""I am the Half-blood prince"". \n Read to find ...",7439,0,0.46
1,1,18,5,** spoiler alert ** \n A little memorial for m...,23232,0,0.53
2,2,18,5,One of my fav books!!,11254,0,0.47
3,3,18,5,"** spoiler alert ** \n Oh my goodness, it was ...",4130,0,1.14
4,4,18,4,I thought this was a great and educational boo...,1449,0,-0.11


In [11]:
# Quote is True when the review text contains a double-quote character (")
# and False otherwise. The match is a plain substring test, not a regex.
book_dtf["Quote"] = book_dtf["Review"].str.contains('"', regex=False)

In [12]:
book_dtf.head(100)

Unnamed: 0.1,Unnamed: 0,User_Reviews,Stars_Ratings,Review,Combined_Likes_Count,Popularity,Diff_Ratings,Quote
0,0,18,5,"""I am the Half-blood prince"". \n Read to find ...",7439,0,0.46,True
1,1,18,5,** spoiler alert ** \n A little memorial for m...,23232,0,0.53,False
2,2,18,5,One of my fav books!!,11254,0,0.47,False
3,3,18,5,"** spoiler alert ** \n Oh my goodness, it was ...",4130,0,1.14,False
4,4,18,4,I thought this was a great and educational boo...,1449,0,-0.11,False
...,...,...,...,...,...,...,...,...
95,95,464,4,How did I not review this yet? I'm... \n GIFSo...,5189,0,-0.08,False
96,96,464,4,Have you ever wondered what it would be like t...,5204,0,-0.03,False
97,97,464,3,This was completely different from what I was ...,2718,0,-0.62,False
98,98,464,3,Finally! Finally I have finished this book! \n...,4689,0,-0.98,False


In [13]:
book_dtf["Quote"].value_counts()

False    1516387
True      512700
Name: Quote, dtype: int64

In [14]:
# Split every review into sentences with NLTK's sentence tokenizer; each
# cell of the new column holds the list of sentence strings for that review.
book_dtf["Sentences_in_Reviews"] = book_dtf["Review"].map(nltk.tokenize.sent_tokenize)

# Number of sentences found in each review.
book_dtf["Num_Sentence"] = book_dtf["Sentences_in_Reviews"].map(len)

# nltk's word_tokenize splits each review into individual tokens.
# Only tokens made up entirely of alphabetic characters are kept
# (punctuation and numbers are dropped), and the kept words are lowercased.

def filtering_words(list):
    """Return the lowercased alphabetic tokens of a token sequence.

    Parameters
    ----------
    list : iterable of str
        Tokens to filter (for example the output of a word tokenizer).

    Returns
    -------
    list of str
        The tokens for which ``str.isalpha()`` is true, converted to
        lowercase and kept in their original order. Tokens containing
        digits, punctuation or whitespace are discarded.
    """
    return [token.lower() for token in list if token.isalpha()]
        
# Tokenize each review into words, then keep only the lowercased
# alphabetic tokens (see filtering_words).
book_dtf["Tokenized_Words"] = (
    book_dtf["Review"]
    .apply(nltk.tokenize.word_tokenize)
    .apply(filtering_words)
)

# Number of retained words in each review.
book_dtf["Num_Tokenized_Words"] = book_dtf["Tokenized_Words"].apply(len)

# Mean number of retained words per sentence of the review.
book_dtf["Average_WordsinSentence"] = book_dtf["Num_Tokenized_Words"].div(book_dtf["Num_Sentence"])

#Calculate number of letters in each tokenized words
def count_letters(list):
    """Return the combined length of all words in a sequence.

    Parameters
    ----------
    list : iterable of str
        Words whose lengths are added together.

    Returns
    -------
    int
        Sum of ``len(word)`` over every word; ``0`` for an empty sequence.
    """
    return sum(map(len, list))

# Total number of letters across the retained words of each review.
book_dtf["Num_Letters"] = book_dtf["Tokenized_Words"].apply(count_letters)

# Mean word length: letters per retained word.
book_dtf["Average_WordLength"] = book_dtf["Num_Letters"].div(book_dtf["Num_Tokenized_Words"])

In [15]:
# Save the enriched frame to NLTK.csv in the current working directory.
# NOTE(review): the row index is written as well (to_csv default), so reading
# this file back will yield an extra unnamed column — confirm downstream
# readers expect that.
# NOTE(review): the list-valued columns (Sentences_in_Reviews,
# Tokenized_Words) are stored as text in a CSV and will not come back as
# Python lists without explicit parsing.
book_dtf.to_csv('NLTK.csv', encoding='utf-8')

## Explanation of the variables

- User_Reviews &emsp;&emsp;&emsp;&emsp;&emsp;&emsp; = Total number of reviews created by each user
- Stars_Ratings &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&ensp; = Rating given to the book by each individual reviewer
- Review              &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;       = The text review given by each user
- Combined_Likes_Count&emsp;&emsp;       = The total number of likes, received by all the reviews in a certain book
- Popularity          &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;       = Indicates whether the book is popular or not
- Diff_Ratings &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; = Difference between rating given by individual user and overall book rating
- Quote &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; = Check if review contains a quote or not
- Sentences_in_Reviews &emsp;&emsp; = The sentences of each review, separated and stored in a list
- Num_Sentence &emsp;&emsp;&emsp;&emsp;&emsp;&emsp; = Calculate the number of sentences
- Tokenized_Words &emsp;&emsp;&emsp;&emsp;&emsp; = The words of each review, separated and stored in a list (alphabetic tokens only, lowercased)
- Num_Tokenized_Words &emsp;&emsp; = Calculate number of words
- Average_WordsinSentence &emsp; = Average number of words per sentence
- Num_Letters &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; = Total number of letters in the reviews
- Average_WordLength &emsp;&emsp;&emsp; = Average number of letters in a word