## Import Statement

In [1]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nnsplit import NNSplit
import pandas as pd 
import re

## Load data and get all the reviews

In [2]:
# Relative path of the raw hotel-review CSV.
reviews_csv_path = "./data/RGeo_Hotel_Reviews.csv"
# index_col=0 uses the file's first column as the row index.
df = pd.read_csv(reviews_csv_path, index_col=0)

## Select the review columns and combine the positive and negative reviews

In [3]:
# Location columns carried along with each review text.
location_columns = ['city', 'country']
df_negative_review = df[['Negative_Review'] + location_columns]
df_positive_review = df[['Positive_Review'] + location_columns]
# Stack both frames vertically; ignore_index=True renumbers the rows from 0.
df_uncleaned_reviews = pd.concat(
    [df_negative_review, df_positive_review],
    ignore_index=True,
)

In [4]:
# Preview the first rows of the stacked frame. Rows that came from the
# negative-review frame have no Positive_Review value, and vice versa.
df_uncleaned_reviews.head()

Unnamed: 0,Negative_Review,city,country,Positive_Review
0,I am so angry that i made this post available...,Amsterdam,Netherlands,
1,No Negative,Amsterdam,Netherlands,
2,Rooms are nice but for elderly a bit difficul...,Amsterdam,Netherlands,
3,My room was dirty and I was afraid to walk ba...,Amsterdam,Netherlands,
4,You When I booked with your company on line y...,Amsterdam,Netherlands,


In [5]:
# Each row has text in exactly one of the two columns, so take the negative
# text where present and fall back to the positive text otherwise.
negative_text = df_uncleaned_reviews['Negative_Review']
positive_text = df_uncleaned_reviews['Positive_Review']
df_uncleaned_reviews['Review'] = negative_text.combine_first(positive_text)

# The two source columns are now redundant; keep only the merged 'Review'.
source_columns = ['Negative_Review', 'Positive_Review']
df_reviews = df_uncleaned_reviews.drop(columns=source_columns)

In [6]:
# Draw a fixed-size random subset of the reviews. The seed makes the subset
# identical on every Restart & Run All; without it each run analysed a
# different set of reviews.
SAMPLE_SIZE = 20000
RANDOM_SEED = 42
df_reviews = (
    df_reviews
    # min() keeps the call valid when fewer than SAMPLE_SIZE reviews exist.
    .sample(n=min(SAMPLE_SIZE, len(df_reviews)), random_state=RANDOM_SEED)
    .reset_index(drop=True)
)
df_reviews

Unnamed: 0,city,country,Review
0,Dreta de l'Eixample,Spain,Pool ok but it is a fish pond Gym which is ve...
1,Kensington,United Kingdom,No Positive
2,Kensington,United Kingdom,Location is great close to Glouscester statio...
3,Amsterdam,Netherlands,The hotel is built inside an old tram depot a...
4,Milan,Italy,The shower cabin may be more comfortable and big
...,...,...,...
19995,Amsterdam,Netherlands,No Negative
19996,Bayswater,United Kingdom,location
19997,West End of London,United Kingdom,Road and Tube noise
19998,Kensington,United Kingdom,Little to far from central London than though...


## Split reviews into sentences using NNSplit

In [7]:
# NNSplit models keyed by language code, filled lazily by _get_splitter().
_NNSPLIT_MODELS = {}


def _get_splitter(language='en'):
    """Return the NNSplit model for `language`, constructing it on first use."""
    if language not in _NNSPLIT_MODELS:
        _NNSPLIT_MODELS[language] = NNSplit(language)
    return _NNSPLIT_MODELS[language]


def split_review(reviews, splitter=None):
    """Split one review text into a list of sentence strings.

    Parameters
    ----------
    reviews : str
        The text of a single review. Non-string values (for example NaN from
        an empty CSV cell) and blank strings yield an empty list.
    splitter : object, optional
        Object with a ``split(list_of_texts)`` method whose result, for each
        text, is an iterable of sentences, each an iterable of tokens that
        expose ``.text``. Defaults to a shared ``NNSplit('en')`` instance that
        is built once and reused, instead of once per review.

    Returns
    -------
    list of str
        One string per detected sentence, with tokens joined by single spaces.
    """
    if not isinstance(reviews, str):
        return []
    text = reviews.strip()
    if not text:
        return []
    if splitter is None:
        splitter = _get_splitter('en')
    # split() takes a batch of texts; only one is passed, so take result 0.
    sentences = splitter.split([text])[0]
    return [
        ' '.join(token.text for token in sentence).strip()
        for sentence in sentences
    ]
        
# Run the sentence splitter over every review and keep the resulting list of
# sentences alongside the original text.
sentence_lists = df_reviews['Review'].apply(split_review)
df_reviews['sent_list'] = sentence_lists
df_reviews

Unnamed: 0,city,country,Review,sent_list
0,Dreta de l'Eixample,Spain,Pool ok but it is a fish pond Gym which is ve...,"[Pool ok but it is a fish pond, Gym which is v..."
1,Kensington,United Kingdom,No Positive,"[No, Positive]"
2,Kensington,United Kingdom,Location is great close to Glouscester statio...,[Location is great close to Glouscester statio...
3,Amsterdam,Netherlands,The hotel is built inside an old tram depot a...,[The hotel is built inside an old tram depot a...
4,Milan,Italy,The shower cabin may be more comfortable and big,[The shower cabin may be more comfortable and ...
...,...,...,...,...
19995,Amsterdam,Netherlands,No Negative,"[No, Negative]"
19996,Bayswater,United Kingdom,location,[location]
19997,West End of London,United Kingdom,Road and Tube noise,[Road and Tube noise]
19998,Kensington,United Kingdom,Little to far from central London than though...,[Little to far from central London than though...


## Expand each sentence list into one dataframe row per sentence

In [8]:
# One entry per sentence, indexed by the row label of the review it came from.
# explode() emits a NaN entry for an empty list; dropna() removes it so a
# review without sentences contributes no rows.
s = df_reviews['sent_list'].explode().dropna()

  """Entry point for launching an IPython kernel.


In [9]:
# join() names the new column after the Series, so label it first.
s.name = 'sent_list'
df_reviews_sentence = (
    df_reviews
    .drop(columns='sent_list')   # swap the list column for one sentence per row
    .join(s)                     # index-aligned, so multi-sentence reviews repeat
    .assign(sentence=lambda frame: frame['sent_list'].astype(object))
    .reset_index()               # the old review row label becomes the 'index' column
)
# Display only: the result is not assigned, so 'index' and 'Review' remain in
# df_reviews_sentence.
df_reviews_sentence.drop(columns=['index', 'Review'])

Unnamed: 0,city,country,sent_list,sentence
0,Dreta de l'Eixample,Spain,Pool ok but it is a fish pond,Pool ok but it is a fish pond
1,Dreta de l'Eixample,Spain,Gym which is very small was also out of order,Gym which is very small was also out of order
2,Kensington,United Kingdom,No,No
3,Kensington,United Kingdom,Positive,Positive
4,Kensington,United Kingdom,Location is great close to Glouscester station,Location is great close to Glouscester station
...,...,...,...,...
38959,Bayswater,United Kingdom,location,location
38960,West End of London,United Kingdom,Road and Tube noise,Road and Tube noise
38961,Kensington,United Kingdom,Little to far from central London than thought...,Little to far from central London than thought...
38962,Kensington,United Kingdom,excellent Room but tired but ok Breakfast exce...,excellent Room but tired but ok Breakfast exce...


## Keep only sentences with at least 8 words

In [10]:
# Display the 'sentence' column to inspect the split sentences.
df_reviews_sentence['sentence']

0                            Pool ok but it is a fish pond
1            Gym which is very small was also out of order
2                                                       No
3                                                 Positive
4           Location is great close to Glouscester station
                               ...                        
38959                                             location
38960                                  Road and Tube noise
38961    Little to far from central London than thought...
38962    excellent Room but tired but ok Breakfast exce...
38963    The beds were comfortable and the staff on the...
Name: sentence, Length: 38964, dtype: object

In [11]:
def number_words(sentence):
    """Count the words in `sentence`.

    The value is converted with ``str()`` first, and a word is any maximal
    run of regex word characters, so punctuation and spaces act as separators.
    """
    text = str(sentence)
    return sum(1 for _ in re.finditer(r'\w+', text))

# Boolean mask: True for sentences containing at least 8 words.
word_counts = df_reviews_sentence['sentence'].apply(number_words)
length = word_counts >= 8

In [12]:
# Keep only the rows flagged by the `length` mask. The explicit copy makes the
# result an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
df_reviews_sentence = df_reviews_sentence.loc[length].copy()
df_reviews_sentence

Unnamed: 0,index,city,country,Review,sent_list,sentence
0,0,Dreta de l'Eixample,Spain,Pool ok but it is a fish pond Gym which is ve...,Pool ok but it is a fish pond,Pool ok but it is a fish pond
1,0,Dreta de l'Eixample,Spain,Pool ok but it is a fish pond Gym which is ve...,Gym which is very small was also out of order,Gym which is very small was also out of order
5,2,Kensington,United Kingdom,Location is great close to Glouscester statio...,Close to National museum Victoria Albert Very ...,Close to National museum Victoria Albert Very ...
6,3,Amsterdam,Netherlands,The hotel is built inside an old tram depot a...,The hotel is built inside an old tram depot an...,The hotel is built inside an old tram depot an...
7,3,Amsterdam,Netherlands,The hotel is built inside an old tram depot a...,We walked from the hotel to Van Gogh Museum Ri...,We walked from the hotel to Van Gogh Museum Ri...
...,...,...,...,...,...,...
38948,19988,Dreta de l'Eixample,Spain,Beds are comfortable and windows isolate nois...,Beds are comfortable and windows isolate noise...,Beds are comfortable and windows isolate noise...
38955,19993,Montrouge,France,Access to Metro Friendly staffs Working space...,Access to Metro Friendly staffs Working space ...,Access to Metro Friendly staffs Working space ...
38961,19998,Kensington,United Kingdom,Little to far from central London than though...,Little to far from central London than thought...,Little to far from central London than thought...
38962,19998,Kensington,United Kingdom,Little to far from central London than though...,excellent Room but tired but ok Breakfast exce...,excellent Room but tired but ok Breakfast exce...


## Export the sentences to a CSV file

In [17]:
# Write df_reviews_sentence to CSV; index=False leaves the row index out of
# the file.
df_reviews_sentence.to_csv('./data/sentence_hotelreviews.csv', index=False)

## Get polarity scores with VADER

In [None]:
# Shared VADER analyser, created lazily by _get_analyser().
_VADER_ANALYSER = None


def _get_analyser():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _VADER_ANALYSER
    if _VADER_ANALYSER is None:
        _VADER_ANALYSER = SentimentIntensityAnalyzer()
    return _VADER_ANALYSER


def get_polarity(sentence, analyser=None):
    """Label a sentence as positive, neutral or negative using VADER.

    Parameters
    ----------
    sentence : str
        Text to score.
    analyser : object, optional
        Object with a ``polarity_scores(text)`` method returning a mapping
        that contains a ``'compound'`` entry. Defaults to one shared
        ``SentimentIntensityAnalyzer`` that is constructed once and reused,
        rather than being rebuilt for every sentence.

    Returns
    -------
    tuple
        ``(label, compound)`` where ``label`` is ``'positive'`` when
        ``compound >= 0.05``, ``'neutral'`` when ``-0.05 < compound < 0.05``
        and ``'negative'`` otherwise.
    """
    if analyser is None:
        analyser = _get_analyser()
    compound = analyser.polarity_scores(sentence)['compound']
    if compound >= 0.05:
        return ('positive', compound)
    # Anything below 0.05 reaches this point, so only the lower bound is left.
    if compound > -0.05:
        return ('neutral', compound)
    return ('negative', compound)
    
# Score every sentence. get_polarity returns a (label, compound) tuple, so
# each cell of the new 'polarity' column holds that tuple.
polarity_per_sentence = df_reviews_sentence['sentence'].apply(get_polarity)
df_reviews_sentence['polarity'] = polarity_per_sentence
df_reviews_sentence

In [None]:
# Print the Python type of the object returned by selecting the 'polarity'
# column (the container type, not the type of the values stored in it).
print(type(df_reviews_sentence['polarity']))

In [None]:
# Each 'polarity' cell is a (label, compound) tuple, so compare the label
# (element 0). Comparing the whole tuple to 'positive' is never True and
# would always return an empty frame.
is_positive = df_reviews_sentence['polarity'].map(
    lambda polarity: polarity[0] == 'positive'
)
df_reviews_sentence[is_positive]