In [1]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import wordnet
#nltk.download(['punkt','averaged_perceptron_tagger','vader_lexicon','stopwords','wordnet'])

# Disable pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None

In [2]:
# Converters for read_csv: parse the list-valued columns back into Python
# lists instead of leaving them as their string representation.
import ast
generic = ast.literal_eval
conv = dict.fromkeys(['Sentences_in_Reviews', 'Tokenized_Words'], generic)

In [None]:
# Load the cleaned review data.
# `converters=conv` parses the list-valued columns (Sentences_in_Reviews,
# Tokenized_Words) back into Python lists. Without it they arrive as plain
# strings, and nltk.pos_tag / the stopword filter further down would iterate
# over single characters instead of tokens.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
book_dtf = pd.read_csv(
    "C:/Users/joann/Downloads/newProject/Data_Cleaning/NLTK.csv",
    converters=conv,
)

In [4]:
# Remove the "Unnamed: 0" column (presumably the index saved by an earlier
# to_csv export — confirm), then preview the first rows.
book_dtf = book_dtf.drop(columns=["Unnamed: 0"])
book_dtf.head()

Unnamed: 0,User_Reviews,Stars_Ratings,Review,Combined_Likes_Count,Popularity,Diff_Ratings,Quote,Sentences_in_Reviews,Num_Sentence,Tokenized_Words,Num_Tokenized_Words,Average_WordsinSentence,Num_Letters,Average_WordLength,posTAGS
0,3,5,So I just finished this book and I don't know ...,14137,0,0.79,False,[So I just finished this book and I don't know...,6,"[so, i, just, finished, this, book, and, i, do...",74,12.333333,288,3.891892,"[(so, RB), (i, JJ), (just, RB), (finished, VBN..."
1,3,5,Murakami is Love!,3549,0,1.18,False,[Murakami is Love!],1,"[murakami, is, love]",3,3.0,14,4.666667,"[(murakami, NN), (is, VBZ), (love, VB)]"
2,6,2,Dated and not very illuminating. Perhaps the t...,1977,0,-1.96,False,"[Dated and not very illuminating., Perhaps the...",2,"[dated, and, not, very, illuminating, perhaps,...",11,5.5,57,5.181818,"[(dated, VBN), (and, CC), (not, RB), (very, RB..."
3,6,4,Great read...my first tharoor book...read prim...,131,0,0.14,False,[Great read...my first tharoor book...read pri...,1,"[great, read, my, first, tharoor, book, read, ...",40,40.0,191,4.775,"[(great, JJ), (read, VB), (my, PRP$), (first, ..."
4,6,5,Staggering... Mindboggling....what a book...cl...,1179,0,0.83,False,[Staggering... Mindboggling....what a book...c...,1,"[staggering, mindboggling, what, a, book, clea...",29,29.0,141,4.862069,"[(staggering, VBG), (mindboggling, VBG), (what..."


In [None]:
# (rows, columns) of the loaded data.
book_dtf.shape

In [None]:
# Part-of-speech tagging: words spelled the same can play different grammatical
# roles, so every token is tagged according to its definition and context.
# For each review, nltk.pos_tag returns a list of (word, POS tag) tuples.
book_dtf["posTAGS"] = book_dtf["Tokenized_Words"].apply(nltk.pos_tag)

In [6]:
def count_posTAGS(posTAGS, TAGS):
    """Count the tagged tokens whose POS tag appears in ``TAGS``.

    posTAGS: iterable of (word, tag) pairs; only index 1 (the tag) is read.
    TAGS: container of tag strings to count.
    Returns the number of matching pairs (0 for an empty input).
    """
    return sum(1 for tagged_word in posTAGS if tagged_word[1] in TAGS)

# Number of verbs (all forms) in each review:
#   VB = base form, VBD = past tense, VBG = present participle,
#   VBN = past participle, VBP = singular present, VBZ = 3rd person singular present
verb_tags = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
book_dtf["Num_Verbs"] = book_dtf["posTAGS"].apply(count_posTAGS, TAGS=verb_tags)

# Number of nouns (all forms) in each review:
#   NN = singular noun, NNS = plural noun,
#   NNP = proper noun singular, NNPS = proper noun plural
noun_tags = ["NN", "NNS", "NNP", "NNPS"]
book_dtf["Num_Nouns"] = book_dtf["posTAGS"].apply(count_posTAGS, TAGS=noun_tags)

# Number of adjectives plus adverbs (all forms) in each review:
#   JJ = adjective, JJR = comparative adjective, JJS = superlative adjective
#   RB = adverb, RBR = comparative adverb, RBS = superlative adverb
adj_adverb_tags = ["JJ", "JJR", "JJS", "RB", "RBR", "RBS"]
book_dtf["Num_AdjAdverb"] = book_dtf["posTAGS"].apply(count_posTAGS, TAGS=adj_adverb_tags)

In [None]:
# Share of verbs, nouns and (adjectives + adverbs) among each review's tokens.
# The values are ratios (count / token count); they are not multiplied by 100.
token_totals = book_dtf["Num_Tokenized_Words"]
book_dtf["Percentage_Verbs"] = book_dtf["Num_Verbs"].div(token_totals)
book_dtf["Percentage_Nouns"] = book_dtf["Num_Nouns"].div(token_totals)
book_dtf["Percentage_AdjAdverb"] = book_dtf["Num_AdjAdverb"].div(token_totals)

In [None]:
# The raw counts were only needed to derive the Percentage_* columns.
book_dtf = book_dtf.drop(['Num_Verbs', 'Num_Nouns', 'Num_AdjAdverb'], axis="columns")

In [None]:
# Preview the frame after adding the Percentage_* columns.
book_dtf.head()

In [None]:
#Sentiment Analysis
# Analyzer instance used by count_SentimentScore.
# NOTE(review): presumably requires the 'vader_lexicon' NLTK resource listed in
# the commented-out nltk.download call — confirm it is installed.
sid = SentimentIntensityAnalyzer()

In [None]:
def count_SentimentScore(reviews):
    """Return the compound sentiment score of one review text.

    The score is the "compound" entry of ``sid.polarity_scores``. A positive
    value is read as a positive review; any other value as a negative one.
    """
    return sid.polarity_scores(reviews)["compound"]
    
# Score every raw review text and store the result in a new Sentiment_Score column.
book_dtf["Sentiment_Score"] = book_dtf["Review"].apply(count_SentimentScore)

In [None]:
# The sentence lists are not used by any later step in this notebook.
book_dtf = book_dtf.drop(['Sentences_in_Reviews'], axis="columns")

In [None]:
# Preview the frame after adding Sentiment_Score and dropping Sentences_in_Reviews.
book_dtf.head()

In [None]:
#Stopwords contain commonly used words in english
# NOTE(review): presumably requires the 'stopwords' NLTK resource listed in the
# commented-out nltk.download call — confirm it is installed.
stopwords = nltk.corpus.stopwords.words('english')


#Remove stopwords from Tokenized_Words
def filtered_tokenized_words(list):
    """Return the tokens of ``list`` that are not in ``stopwords``, order preserved.

    The parameter keeps its original name (it shadows the built-in ``list``)
    so existing keyword callers keep working.
    """
    return [token for token in list if token not in stopwords]


# Replace each review's token list with its stopword-free version.
# NOTE(review): Tokenized_Words is assigned again by the lemmatization step further down.
book_dtf["Tokenized_Words"] = book_dtf["Tokenized_Words"].apply(filtered_tokenized_words)

In [None]:
# WordNet-based lemmatizer used by get_pos_for_lemmatizer.
# NOTE(review): presumably requires the 'wordnet' NLTK resource listed in the
# commented-out nltk.download call — confirm it is installed.
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

In [None]:
def get_wordnet_pos(tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    Only the first character of ``tag`` is inspected: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag (including an
    empty string) falls back to noun.
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return prefix_to_pos.get(tag[:1], wordnet.NOUN)

def get_pos_for_lemmatizer(list):
    """Lemmatize a sequence of (word, POS tag) pairs.

    Each word (index 0) is lemmatized with the WordNet part of speech derived
    from its tag (index 1) via ``get_wordnet_pos``. Returns the lemmas as a
    new list, in input order.
    """
    return [
        lemmatizer.lemmatize(tagged_word[0], get_wordnet_pos(tagged_word[1]))
        for tagged_word in list
    ]


# Lemmatize from the POS-tagged tokens, skipping stopwords.
# posTAGS was computed on the unfiltered token lists, so assigning the lemmas
# of every tagged token here would silently undo the stopword removal applied
# to Tokenized_Words earlier. Filtering the (word, tag) pairs first keeps the
# tags derived from full context while still dropping the stopwords.
book_dtf["Tokenized_Words"] = book_dtf["posTAGS"].apply(
    lambda tagged_tokens: get_pos_for_lemmatizer(
        [pair for pair in tagged_tokens if pair[0] not in stopwords]
    )
)

In [None]:
#Dropping the data with NULL values
# Done before the cast below: astype(int) raises on missing values, so rows
# with a null Quote have to be removed first. When no values are missing the
# result is the same as casting first.
book_dtf = book_dtf.dropna()

#Changing the True or False of Quote to numeric number
#0 = False, 1 = True
book_dtf["Quote"] = book_dtf["Quote"].astype(int)

In [None]:
# Preview the frame after the Quote conversion and null removal.
book_dtf.head()

In [None]:
# Keep only the columns that go into the exported dataset, in this order.
selected_columns = [
    "Popularity",
    "User_Reviews",
    "Stars_Ratings",
    "Diff_Ratings",
    "Num_Tokenized_Words",
    "Average_WordLength",
    "Average_WordsinSentence",
    "Percentage_Verbs",
    "Percentage_Nouns",
    "Percentage_AdjAdverb",
    "Quote",
    "Sentiment_Score",
    "Tokenized_Words",
]
book_dtf = book_dtf[selected_columns]

In [None]:
# Preview the final column selection.
book_dtf.head()

In [None]:
# (rows, columns) of the final frame.
book_dtf.shape

In [None]:
# Write the final frame to NLTK.csv, resolved against the current working directory.
# The index is written as well (to_csv default), so it comes back as an
# "Unnamed: 0" column when the file is read again.
# NOTE(review): same file name as the input CSV — if the notebook runs from the
# input's folder, the input is overwritten; confirm this is intended.
book_dtf.to_csv('NLTK.csv', encoding='utf-8')