## Import statements

In [1]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nnsplit import NNSplit
import pandas as pd 
import re

## Load data and get all the reviews

In [2]:
# Read the hotel-review CSV, using its first column as the row index.
df = pd.read_csv(
    "./data/RGeo_Hotel_Reviews.csv",
    index_col=0,
)

## Filter the columns and combine positive and negative reviews

In [3]:
# Take one frame per review type, each carrying the location columns, and
# stack them vertically. The stacked frame has both review columns; the one
# that does not apply to a given row is left as NaN by the concat.
df_negative_review = df.loc[:, ['Negative_Review', 'city', 'country']]
df_positive_review = df.loc[:, ['Positive_Review', 'city', 'country']]
df_uncleaned_reviews = pd.concat(
    [df_negative_review, df_positive_review],
    ignore_index=True,
)

In [4]:
# Collapse the two half-empty review columns into a single 'Review' column:
# take the negative text where present, otherwise fall back to the positive text.
df_uncleaned_reviews['Review'] = (
    df_uncleaned_reviews['Negative_Review']
    .combine_first(df_uncleaned_reviews['Positive_Review'])
)
# The per-type columns are redundant once 'Review' exists.
df_reviews = df_uncleaned_reviews.drop(['Negative_Review', 'Positive_Review'], axis=1)

## Shuffle and slice dataframe

In [5]:
# Shuffle all reviews, then KEEP only the first 20,000 rows.
# The slice used to be a bare expression: it was displayed but never assigned,
# so every later cell still processed the full shuffled frame.
# random_state pins the shuffle so a Restart & Run All selects the same sample.
# .copy() detaches the sample from the parent frame so that adding columns to it
# later does not trigger SettingWithCopyWarning.
df_reviews = (
    df_reviews
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .iloc[:20000]
    .copy()
)
df_reviews

Unnamed: 0,city,country,Review
0,el Parc i la Llacuna del Poblenou,Spain,Free room upgrade and awesome free use of the...
1,Wembley,United Kingdom,The best hotel I been here in England
2,Wembley,United Kingdom,breakfast
3,Blackheath,United Kingdom,Good location of the hotel for all visits in ...
4,Milan,Italy,No Negative
...,...,...,...
19995,Barri Gòtic,Spain,Bed not overly comfy and a lot of noise at ti...
19996,Bayswater,United Kingdom,No Negative
19997,Barnsbury,United Kingdom,The ambience and the staff and the proximity ...
19998,West End of London,United Kingdom,Really bad services not feeling safe and bags...


## Split reviews into sentences using NNSplit

In [None]:
# Splitter instances keyed by language, filled lazily by _get_splitter().
_NNSPLIT_MODELS = {}


def _get_splitter(language='en'):
    """Return the NNSplit instance for ``language``, constructing it on first use only."""
    if language not in _NNSPLIT_MODELS:
        _NNSPLIT_MODELS[language] = NNSplit(language)
    return _NNSPLIT_MODELS[language]


def split_review(reviews, splitter=None):
    """Split a single review text into a list of sentence strings.

    Parameters
    ----------
    reviews : str
        The text of one review. Leading/trailing whitespace is stripped before
        splitting. A non-string value (e.g. NaN or None from a missing cell) or
        a blank string yields an empty list instead of raising.
    splitter : object, optional
        Any object whose ``split(list_of_texts)`` returns, for each text, a
        list of sentences, where each sentence is an iterable of tokens that
        expose a ``.text`` attribute. Defaults to a shared ``NNSplit('en')``
        instance that is built once and reused, rather than once per review.

    Returns
    -------
    list of str
        One string per sentence, made of the sentence's token texts joined by
        single spaces and stripped of surrounding whitespace.
    """
    # ``.strip()`` would raise on a non-string cell, and a blank review has no
    # sentences, so both cases short-circuit without touching the splitter.
    if not isinstance(reviews, str):
        return []
    text = reviews.strip()
    if not text:
        return []

    if splitter is None:
        splitter = _get_splitter()

    # split() takes a batch of texts; we pass one, so only element 0 is relevant.
    sentences = splitter.split([text])[0]
    return [
        ' '.join(token.text for token in sentence).strip()
        for sentence in sentences
    ]
        
# Run the sentence splitter on every review; each 'sent_list' cell holds the
# list of sentences returned for that row's review.
df_reviews['sent_list'] = df_reviews['Review'].map(split_review)

## Split each sentence list into one dataframe row per sentence

In [None]:
# One sentence per entry, indexed by the row label of the review it came from.
# ``explode`` replaces the row-wise ``apply(pd.Series).stack()`` idiom, which
# builds a temporary Series for every review and is very slow on large frames.
# Reviews with an empty sentence list explode to NaN and are dropped here, which
# matches what ``stack()`` did with the all-NaN rows.
s = df_reviews['sent_list'].explode().dropna()

In [None]:
# join() needs the Series to carry the name of the column it will become.
s.name = 'sent_list'

# Swap the per-review list column for one row per sentence.
# - 'Review' is dropped here and the result is assigned; the trailing
#   ``.drop(...)`` used to be a bare expression whose result was discarded,
#   so the columns it named were never actually removed.
# - reset_index(drop=True) renumbers the rows without first materialising an
#   'index' column that then has to be dropped again.
df_reviews_sentence = (
    df_reviews
    .drop(columns=['sent_list', 'Review'])
    .join(s)
    .astype({'sent_list': object})
    .reset_index(drop=True)
)
df_reviews_sentence.head()

## Keep sentences with a length of at least 8 characters

In [None]:
# Boolean mask: True where the sentence text is 8 or more characters long.
# Rows whose 'sent_list' is missing compare as False and are filtered out too.
length = df_reviews_sentence['sent_list'].str.len().ge(8)
df_reviews_sentence = df_reviews_sentence[length]

## Use VADER to tag the sentiment

In [None]:
# Shared VADER analyser, created lazily by _get_analyser().
_vader_analyser = None


def _get_analyser():
    """Return the shared SentimentIntensityAnalyzer, constructing it on first use only."""
    global _vader_analyser
    if _vader_analyser is None:
        _vader_analyser = SentimentIntensityAnalyzer()
    return _vader_analyser


def get_polarity(sentence, analyser=None):
    """Classify a sentence as 'positive', 'neutral' or 'negative' using VADER.

    Parameters
    ----------
    sentence : str
        The text to score.
    analyser : object, optional
        Any object whose ``polarity_scores(text)`` returns a mapping with a
        ``'compound'`` entry. Defaults to a shared ``SentimentIntensityAnalyzer``
        that is built once and reused, rather than once per sentence.

    Returns
    -------
    str
        'positive' if the compound score is >= 0.05, 'neutral' if it lies
        strictly between -0.05 and 0.05, and 'negative' otherwise.
    """
    if analyser is None:
        analyser = _get_analyser()

    compound = analyser.polarity_scores(sentence)['compound']
    if compound >= 0.05:
        return 'positive'
    if -0.05 < compound < 0.05:
        return 'neutral'
    return 'negative'
    
# Tag every sentence with its VADER polarity.
# The sentence text lives in 'sent_list'; no 'review' column is ever created, and
# `df_reviews_sentiment` was never defined before being indexed. assign() builds
# the result as a new frame, leaving the filtered sentence table untouched.
df_reviews_sentiment = df_reviews_sentence.assign(
    polarity=df_reviews_sentence['sent_list'].apply(get_polarity)
)