## Import Statement

In [23]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nnsplit import NNSplit
import pandas as pd 
import re

## Load data and get all the reviews

In [24]:
# Load the raw hotel reviews; the first CSV column becomes the row index.
df = pd.read_csv(
    "./data/RGeo_Hotel_Reviews.csv",
    index_col=0,
)

## Select the relevant columns and combine the positive and negative reviews

In [25]:
# Take each review type together with its location columns, then stack the two
# frames on top of each other under a fresh 0..n-1 index.
df_negative_review = df.loc[:, ['Negative_Review', 'city', 'country']]
df_positive_review = df.loc[:, ['Positive_Review', 'city', 'country']]
df_uncleaned_reviews = pd.concat(
    [df_negative_review, df_positive_review],
    ignore_index=True,
)

In [26]:
# Preview the first five rows of the combined frame.
df_uncleaned_reviews.head(n=5)

Unnamed: 0,Negative_Review,city,country,Positive_Review
0,I am so angry that i made this post available...,Amsterdam,Netherlands,
1,No Negative,Amsterdam,Netherlands,
2,Rooms are nice but for elderly a bit difficul...,Amsterdam,Netherlands,
3,My room was dirty and I was afraid to walk ba...,Amsterdam,Netherlands,
4,You When I booked with your company on line y...,Amsterdam,Netherlands,


In [27]:
# Rows that came from the negative frame have no 'Positive_Review' value and vice
# versa, so take the negative text where present and fall back to the positive text.
df_uncleaned_reviews['Review'] = (
    df_uncleaned_reviews['Negative_Review']
    .combine_first(df_uncleaned_reviews['Positive_Review'])
)
# The two source columns are now redundant; keep only city, country and Review.
df_reviews = df_uncleaned_reviews.drop(
    ['Negative_Review', 'Positive_Review'],
    axis=1,
)

In [28]:
# Shuffle with a fixed seed so that Restart & Run All reproduces the same sample.
df_reviews = df_reviews.sample(frac=1, random_state=42).reset_index(drop=True)
# Keep the first 1000 shuffled reviews. The explicit copy makes the subset an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
df_reviews = df_reviews.iloc[:1000].copy()
df_reviews

Unnamed: 0,city,country,Review
0,Eixample,Spain,No Negative
1,Kensington,United Kingdom,No Negative
2,London,United Kingdom,Great location and nice hotel
3,Paris,France,No Negative
4,Milan,Italy,Nice hotel with spacious rooms Breakfast was ...
...,...,...,...
995,Vienna,Austria,Nothing
996,Wembley,United Kingdom,No Negative
997,Saint-Ouen,France,Nice quiet place in very busy neighborhood Cl...
998,Bayswater,United Kingdom,Can t find anything really


## Split reviews into sentences using NNSplit

In [29]:
_NNSPLIT_MODEL = None  # English NNSplit model, created on first use and then reused


def split_review(reviews, splitter=None):
    """Split one review text into a list of sentence strings using NNSplit.

    Parameters
    ----------
    reviews : str
        Text of a single review. Surrounding whitespace is stripped before
        splitting. A value that is not a string (for example a missing value)
        or a string containing only whitespace yields an empty list.
    splitter : optional
        Object whose ``split(list_of_texts)`` method returns, for each text, an
        iterable of sentences, each sentence being an iterable of tokens that
        expose a ``.text`` attribute. Defaults to a shared ``NNSplit('en')``
        instance that is built once and reused across calls.

    Returns
    -------
    list of str
        One string per detected sentence, with the token texts joined by
        single spaces.
    """
    global _NNSPLIT_MODEL

    if not isinstance(reviews, str):
        return []
    text = reviews.strip()
    if not text:
        return []

    if splitter is None:
        # Constructing the model is loop-invariant work; do it once instead of
        # once per review when this function is used with Series.apply.
        if _NNSPLIT_MODEL is None:
            _NNSPLIT_MODEL = NNSplit('en')
        splitter = _NNSPLIT_MODEL

    # split() takes a batch of texts; a one-element batch is sent, so only the
    # first element of the result is relevant.
    sentences = splitter.split([text])[0]
    return [
        ' '.join(token.text for token in sentence).strip()
        for sentence in sentences
    ]
        
# Sentence-split every review; each 'sent_list' cell holds the list that
# split_review returned for that row.
sentences_per_review = df_reviews['Review'].apply(split_review)
df_reviews['sent_list'] = sentences_per_review
df_reviews

Unnamed: 0,city,country,Review,sent_list
0,Eixample,Spain,No Negative,"[No, Negative]"
1,Kensington,United Kingdom,No Negative,"[No, Negative]"
2,London,United Kingdom,Great location and nice hotel,[Great location and nice hotel]
3,Paris,France,No Negative,"[No, Negative]"
4,Milan,Italy,Nice hotel with spacious rooms Breakfast was ...,[Nice hotel with spacious rooms Breakfast was ...
...,...,...,...,...
995,Vienna,Austria,Nothing,[Nothing]
996,Wembley,United Kingdom,No Negative,"[No, Negative]"
997,Saint-Ouen,France,Nice quiet place in very busy neighborhood Cl...,[Nice quiet place in very busy neighborhood Cl...
998,Bayswater,United Kingdom,Can t find anything really,[Can t find anything really]


## Split the list into different rows of sentences in the dataframe

In [30]:
# One row per sentence: explode() repeats each review's index label for every
# element of its list. Reviews with an empty list come out as NaN and are
# dropped explicitly, rather than relying on DataFrame.stack() to discard padding.
s = df_reviews['sent_list'].explode().dropna()

In [31]:
# Name the per-sentence Series so that join() adds it as a 'sent_list' column.
s.name = 'sent_list'
# Replace the list-valued column with one sentence per row; review-level columns
# are repeated for every sentence of the same review.
df_reviews_sentence = df_reviews.drop(columns=['sent_list']).join(s)
# 'sentence' is an object-dtype copy of 'sent_list'.
df_reviews_sentence['sentence'] = pd.Series(
    df_reviews_sentence['sent_list'],
    dtype=object,
)
# Move the (repeated) review position into an 'index' column and renumber rows.
df_reviews_sentence = df_reviews_sentence.reset_index()
# Display only: the result is not assigned, so 'index' and 'Review' stay in the frame.
df_reviews_sentence.drop(['index', 'Review'], axis=1)

Unnamed: 0,city,country,sent_list,sentence
0,Eixample,Spain,No,No
1,Eixample,Spain,Negative,Negative
2,Kensington,United Kingdom,No,No
3,Kensington,United Kingdom,Negative,Negative
4,London,United Kingdom,Great location and nice hotel,Great location and nice hotel
...,...,...,...,...
1905,Kensington,United Kingdom,hot,hot
1906,Kensington,United Kingdom,The furnishing a bit blunt The lighing in the ...,The furnishing a bit blunt The lighing in the ...
1907,Kensington,United Kingdom,ceiling spots right above the bed are blinding...,ceiling spots right above the bed are blinding...
1908,Kensington,United Kingdom,So we had to leave our valuables at the front ...,So we had to leave our valuables at the front ...


## Keep only sentences with at least 8 words

In [32]:
# Show the sentence column on its own.
df_reviews_sentence.loc[:, 'sentence']

0                                                      No
1                                                Negative
2                                                      No
3                                                Negative
4                           Great location and nice hotel
                              ...                        
1905                                                  hot
1906    The furnishing a bit blunt The lighing in the ...
1907    ceiling spots right above the bed are blinding...
1908    So we had to leave our valuables at the front ...
1909    The price is on the higher end of comparable p...
Name: sentence, Length: 1910, dtype: object

In [33]:
def number_words(sentence):
    """Count the words in ``sentence``.

    The argument is converted with ``str()`` first, and a word is any maximal
    run of ``\\w`` characters.
    """
    text = str(sentence)
    return sum(1 for _ in re.finditer(r'\w+', text))

# Boolean mask: True for rows whose sentence has at least 8 words.
length = df_reviews_sentence['sentence'].apply(number_words).ge(8)

In [34]:
# Keep only the rows flagged by the `length` mask. The explicit copy makes the
# result an independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
df_reviews_sentence = df_reviews_sentence.loc[length].copy()
df_reviews_sentence

Unnamed: 0,index,city,country,Review,sent_list,sentence
7,4,Milan,Italy,Nice hotel with spacious rooms Breakfast was ...,Nice hotel with spacious rooms Breakfast was g...,Nice hotel with spacious rooms Breakfast was g...
8,5,Bayswater,United Kingdom,There wasn t a restaurant in the hotel,There wasn t a restaurant in the hotel,There wasn t a restaurant in the hotel
13,8,el Raval,Spain,Amazing breakfast roof too terrace and snacks...,Amazing breakfast roof too terrace and snacks ...,Amazing breakfast roof too terrace and snacks ...
15,9,London,United Kingdom,Very modern hotel with an excellent central l...,Very modern hotel with an excellent central lo...,Very modern hotel with an excellent central lo...
20,13,London,United Kingdom,It was lovely modern room and huge bathroom,It was lovely modern room and huge bathroom,It was lovely modern room and huge bathroom
...,...,...,...,...,...,...
1899,999,Kensington,United Kingdom,The airconditioning made a the only disturbin...,Also the pillows are just okay but we had much...,Also the pillows are just okay but we had much...
1906,999,Kensington,United Kingdom,The airconditioning made a the only disturbin...,The furnishing a bit blunt The lighing in the ...,The furnishing a bit blunt The lighing in the ...
1907,999,Kensington,United Kingdom,The airconditioning made a the only disturbin...,ceiling spots right above the bed are blinding...,ceiling spots right above the bed are blinding...
1908,999,Kensington,United Kingdom,The airconditioning made a the only disturbin...,So we had to leave our valuables at the front ...,So we had to leave our valuables at the front ...


## Export the sentences to a CSV file

In [35]:
# Write the sentence-level frame (one row per sentence), not the per-review frame
# whose 'sent_list' column holds Python lists that would be serialised as their
# string representation.
df_reviews_sentence.to_csv('./data/sentence_hotelreviews.csv', index=False)

## Get polarity scores with VADER

In [None]:
_VADER_ANALYSER = None  # shared SentimentIntensityAnalyzer, created on first use


def get_polarity(sentence, analyser=None):
    """Classify the polarity of ``sentence`` from its VADER compound score.

    Parameters
    ----------
    sentence : str
        Text to score.
    analyser : optional
        Object with a ``polarity_scores(text)`` method returning a mapping that
        contains a ``'compound'`` entry. Defaults to a shared
        ``SentimentIntensityAnalyzer`` that is built once and reused across calls.

    Returns
    -------
    tuple
        ``(label, compound)`` where ``label`` is ``'positive'`` when
        ``compound >= 0.05``, ``'neutral'`` when ``-0.05 < compound < 0.05``
        and ``'negative'`` otherwise.
    """
    global _VADER_ANALYSER

    if analyser is None:
        # Building the analyser is loop-invariant work; do it once instead of
        # once per sentence when this function is used with Series.apply.
        if _VADER_ANALYSER is None:
            _VADER_ANALYSER = SentimentIntensityAnalyzer()
        analyser = _VADER_ANALYSER

    compound = analyser.polarity_scores(sentence)['compound']
    if compound >= 0.05:
        return ('positive', compound)
    # compound < 0.05 is already implied here, so only the lower bound is checked.
    if compound > -0.05:
        return ('neutral', compound)
    return ('negative', compound)
    
# Score every sentence; each 'polarity' cell is the (label, compound) tuple
# returned by get_polarity.
polarity_per_sentence = df_reviews_sentence['sentence'].apply(get_polarity)
df_reviews_sentence['polarity'] = polarity_per_sentence
df_reviews_sentence

In [None]:
# Print the Python type of the 'polarity' column object.
polarity_column = df_reviews_sentence['polarity']
print(type(polarity_column))

In [None]:
# 'polarity' holds (label, compound) tuples, so comparing the column itself to a
# string never matches. Compare the first element of each tuple (the label).
df_reviews_sentence[df_reviews_sentence['polarity'].str[0] == 'positive']