In [27]:
import pandas as pd
import numpy as np 



## Training Set for Sentiment Analysis and Topic Analysis

 The training set will have the following format: 

| sentence id | text | sentiment | topic |
| --- | --- | --- | --- |

- Arrange each dataset under its corresponding topic



## Book Reviews

### Kindle Reviews

In [28]:
# Load the preprocessed Kindle review dataset from disk into a DataFrame.
kindle_review_df = pd.read_csv(
    filepath_or_buffer='data/original/preprocessed_kindle_review.csv',
)

In [29]:
# Preview the first five rows to see which columns the raw file provides.
kindle_review_df.head(n=5)


Unnamed: 0.1,Unnamed: 0,rating,reviewText,summary
0,0,5,This book was the very first bookmobile book I...,50 + years ago...
1,1,1,"When I read the description for this book, I c...",Boring! Boring! Boring!
2,2,5,I just had to edit this review. This book is a...,Wiggleliscious/new toy ready/!!
3,3,5,I don't normally buy 'mystery' novels because ...,Very good read.
4,4,5,"This isn't the kind of book I normally read, a...",Great Story!


In [30]:
# Align the column names with the test set schema.
kindle_review_df.rename(
    columns={
        'Unnamed: 0': 'sentence id',
        'reviewText': 'text',
    },
    inplace=True,
)
# Every row in this dataset belongs to the 'book' topic.
kindle_review_df['topic'] = 'book'
# Derive the sentiment label from the star rating:
#   rating > 3 -> positive, rating < 3 -> negative, anything else -> neutral
kindle_review_df['sentiment'] = np.select(
    [kindle_review_df['rating'] > 3, kindle_review_df['rating'] < 3],
    ['positive', 'negative'],
    default='neutral',
)

In [31]:
# Remove the rating and summary columns; they are not part of the
# training-set format (sentence id, text, sentiment, topic).
kindle_review_df.drop(columns=['rating', 'summary'], inplace=True)
# Put the remaining columns in the training-set order. Selecting by name also
# leaves out any other leftover column from the raw file.
kindle_review_df = kindle_review_df[
    ['sentence id', 'text', 'sentiment', 'topic']
]

In [80]:
# Preview the first five rows of the reshaped Kindle frame.
kindle_review_df.head(n=5)

Unnamed: 0,sentence id,text,sentiment,topic
0,0,This book was the very first bookmobile book I...,positive,book
1,1,"When I read the description for this book, I c...",negative,book
2,2,I just had to edit this review. This book is a...,positive,book
3,3,I don't normally buy 'mystery' novels because ...,positive,book
4,4,"This isn't the kind of book I normally read, a...",positive,book


### Might add new book data eventually

### Book Topic Combined

In [32]:
# The book topic currently consists of the Kindle reviews only.
# Note: this binds a second name to the same DataFrame object; no copy is made,
# so in-place changes through either name are visible through both.
book_df = kindle_review_df

# TODO: check counts of each sentiment and make sure it is balanced esp. for positive and negative

## Movie Reviews

In [59]:
# Load the Rotten Tomatoes critic reviews and preview the first five rows.
rotten_review_df = pd.read_csv(
    filepath_or_buffer='data/original/rotten_tomatoes_critic_reviews.csv',
)
rotten_review_df.head(n=5)

Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content
0,m/0814255,Andrew L. Urban,False,Urban Cinefile,Fresh,,2010-02-06,A fantasy adventure that fuses Greek mythology...
1,m/0814255,Louise Keller,False,Urban Cinefile,Fresh,,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,m/0814255,,False,FILMINK (Australia),Fresh,,2010-02-09,With a top-notch cast and dazzling special eff...
3,m/0814255,Ben McEachen,False,Sunday Mail (Australia),Fresh,3.5/5,2010-02-09,Whether audiences will get behind The Lightnin...
4,m/0814255,Ethan Alter,True,Hollywood Reporter,Rotten,,2010-02-10,What's really lacking in The Lightning Thief i...


In [60]:
# Keep only reviews that have both a score and review text; a row missing
# either one is removed.
rotten_review_df.dropna(
    axis=0,
    subset=['review_score', 'review_content'],
    inplace=True,
)
# Discard the metadata columns that the training set does not use.
rotten_review_df.drop(
    columns=[
        'review_date',
        'top_critic',
        'review_type',
        'critic_name',
        'rotten_tomatoes_link',
        'publisher_name',
    ],
    inplace=True,
)
# Renumber the rows from 0 after the removals; the old index is discarded.
rotten_review_df.reset_index(drop=True, inplace=True)

In [61]:
def _parse_fraction_score(raw_score):
    """Convert a 'numerator/denominator' score string into a float ratio.

    For example '3/5' becomes 0.6.

    Returns NaN for every value that cannot be turned into a ratio: scores
    without a '/' (such as letter grades), malformed fractions like '3/' or
    'a/b', and fractions whose denominator is zero. Returning NaN instead of
    raising lets the caller drop all unusable rows in a single step.
    """
    numerator, slash, denominator = str(raw_score).partition('/')
    if not slash:
        return np.nan
    try:
        return float(numerator) / float(denominator)
    except (ValueError, ZeroDivisionError):
        return np.nan


# Build the movie training frame in one chained transform. Each step returns
# a new DataFrame, so no column is ever assigned onto a filtered slice
# (which is what triggers pandas' SettingWithCopyWarning).
rotten_review_df = (
    rotten_review_df
    # cast review_score to a float ratio; unusable scores become NaN
    .assign(
        review_score=lambda frame: frame['review_score'].apply(_parse_fraction_score)
    )
    # drop the rows whose score could not be converted
    .dropna(subset=['review_score'])
    .assign(
        # if review_score > 0.5, sentiment = positive ; if review_score < 0.5,
        # sentiment = negative ; if review_score = 0.5, sentiment = neutral
        sentiment=lambda frame: np.where(
            frame['review_score'] > 0.5,
            'positive',
            np.where(frame['review_score'] < 0.5, 'negative', 'neutral'),
        ),
        # every row in this dataset belongs to the 'movie' topic
        topic='movie',
    )
    # rename the columns so that it will match the test set
    .rename(columns={'review_content': 'text'})
    # sentence id is the surviving row label, so ids stay unique but are not
    # contiguous once rows have been filtered out
    .assign(**{'sentence id': lambda frame: frame.index})
    # reorder the columns
    .loc[:, ['sentence id', 'text', 'sentiment', 'topic']]
)

In [64]:
# Persist the reduced frame as its own CSV because the original file is too
# large; the row index is not written out.
rotten_review_df.to_csv(
    path_or_buf='data/original/rotten_tomatoes_critic_reviews_preprocessed.csv',
    index=False,
)

In [79]:
# Preview the first five rows of the reshaped movie review frame.
rotten_review_df.head(n=5)

Unnamed: 0,sentence id,text,sentiment,topic
0,0,Whether audiences will get behind The Lightnin...,positive,movie
1,1,Harry Potter knockoffs don't come more transpa...,negative,movie
2,2,"Percy Jackson isn't a great movie, but it's a ...",positive,movie
4,4,"Crammed with dragons, set-destroying fights an...",positive,movie
5,5,"This action-packed fantasy adventure, based on...",positive,movie


### Might add new movie data eventually

### Movie Topic Combined

In [None]:
# The movie topic currently consists of the Rotten Tomatoes reviews only.
# Note: this binds a second name to the same DataFrame object; no copy is made,
# so in-place changes through either name are visible through both.
movie_df = rotten_review_df
# TODO: check counts of each sentiment and make sure it is balanced esp. for positive and negative

## Restaurant Reviews

In [75]:
# Load the restaurant reviews and preview the first five rows.
restaurant_review_df = pd.read_csv(
    filepath_or_buffer='data/original/restaurant_reviews.csv',
)
restaurant_review_df.head(n=5)

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5,"1 Review , 2 Followers",5/25/2019 15:54,0
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5,"3 Reviews , 2 Followers",5/25/2019 14:20,0
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5,"2 Reviews , 3 Followers",5/24/2019 22:54,0
3,Beyond Flavours,Swapnil Sarkar,Soumen das and Arun was a great guy. Only beca...,5,"1 Review , 1 Follower",5/24/2019 22:11,0
4,Beyond Flavours,Dileep,Food is good.we ordered Kodi drumsticks and ba...,5,"3 Reviews , 2 Followers",5/24/2019 21:37,0


In [76]:
# drop columns: 'Restaurant','Reviewer','Metadata', 'Time', 'Pictures'
restaurant_review_df.drop(['Restaurant','Reviewer','Metadata', 'Time', 'Pictures'], axis=1, inplace=True)
restaurant_review_df.Rating.value_counts()




5       3832
4       2373
1       1735
3       1193
2        684
4.5       69
3.5       47
2.5       19
1.5        9
Like       1
Name: Rating, dtype: int64

In [77]:
# Build the restaurant training frame in one chained transform. Each step
# returns a new DataFrame, so no column is ever assigned onto a filtered slice
# (which is what triggers pandas' SettingWithCopyWarning).
restaurant_review_df = (
    restaurant_review_df
    # Cast rating to float. Any value that is not a number (e.g. 'Like')
    # becomes NaN instead of raising, so it can be dropped together with
    # ratings that were missing to begin with.
    .assign(Rating=lambda frame: pd.to_numeric(frame['Rating'], errors='coerce'))
    # A NaN rating fails both comparisons below and would otherwise be
    # silently labelled 'neutral', so rows without a usable rating are removed.
    # Rows without review text are removed too, as in the movie reviews cell.
    .dropna(subset=['Rating', 'Review'])
    .assign(
        # if rating > 3, sentiment = positive ; if rating < 3, sentiment = negative ;
        # if rating = 3, sentiment = neutral
        sentiment=lambda frame: np.where(
            frame['Rating'] > 3,
            'positive',
            np.where(frame['Rating'] < 3, 'negative', 'neutral'),
        ),
        # every row in this dataset belongs to the 'restaurant' topic
        topic='restaurant',
    )
    # rename the columns so that it will match the test set
    .rename(columns={'Review': 'text'})
    # sentence id is the surviving row label, so ids stay unique but are not
    # contiguous once rows have been filtered out
    .assign(**{'sentence id': lambda frame: frame.index})
    # reorder the columns
    .loc[:, ['sentence id', 'text', 'sentiment', 'topic']]
)
restaurant_review_df.head()

Unnamed: 0,sentence id,text,sentiment,topic
0,0,"The ambience was good, food was quite good . h...",positive,restaurant
1,1,Ambience is too good for a pleasant evening. S...,positive,restaurant
2,2,A must try.. great food great ambience. Thnx f...,positive,restaurant
3,3,Soumen das and Arun was a great guy. Only beca...,positive,restaurant
4,4,Food is good.we ordered Kodi drumsticks and ba...,positive,restaurant


### Might add new restaurant data eventually

### Restaurant Topic Combined

In [81]:
# The restaurant topic currently consists of this single review dataset.
# Note: this binds a second name to the same DataFrame object; no copy is made,
# so in-place changes through either name are visible through both.
restaurant_df = restaurant_review_df