In [18]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('vader_lexicon')
from textblob import TextBlob
from sklearn.metrics import accuracy_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from helpers import label_reviews, remove_non_alpha



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [5]:
# Read the translated hotel-review dataset and list its columns.
DATASET_PATH = "Translated_London_Hotel_review_Dataset.csv"
df = pd.read_csv(DATASET_PATH)
print(df.columns)

Index(['Review Rating', 'Translated Review'], dtype='object')


In [6]:
# Preview the first five raw rows.
df.head(n=5)

Unnamed: 0,Review Rating,Translated Review
0,4,Wonderful stay Found the hotel via Tripadvisor...
1,5,Business Dinner!! Recently attended the Passen...
2,5,Very nice boutique hotel. Recommended although...
3,5,Amazing stay Very last minute overnight stay. ...
4,5,Fantastic Hotel!! I stayed at this hotel for t...


In [7]:
# Run every review through the project helper `remove_non_alpha`
# and store the cleaned text back in the same column.
cleaned_reviews = df['Translated Review'].apply(remove_non_alpha)
df['Translated Review'] = cleaned_reviews

In [8]:
# Set empty cells to NaN so that they can be dropped.
# The result is assigned back rather than using replace(..., inplace=True) on
# the selected column: the in-place form mutates a temporary Series, which
# leaves `df` untouched under pandas Copy-on-Write (and warns in pandas 2.2).
df['Translated Review'] = df['Translated Review'].replace('', np.nan)

In [9]:
# Count the missing values in each column.
missing_per_column = df.isna().sum()
print(str(missing_per_column))

Review Rating         0
Translated Review    12
dtype: int64


In [10]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [11]:
# Re-check the per-column missing counts after dropping rows.
remaining_missing = df.isna().sum()
print(str(remaining_missing))

Review Rating        0
Translated Review    0
dtype: int64


In [12]:
# Derive the reference sentiment label from the rating via the project
# helper `label_reviews`.
rating_labels = df['Review Rating'].apply(label_reviews)
df['label'] = rating_labels

In [13]:
# Preview the first five rows, now including the `label` column.
df.head(n=5)

Unnamed: 0,Review Rating,Translated Review,label
0,4,Wonderful stay Found the hotel via Tripadvisor...,pos
1,5,Business Dinner Recently attended the Passen...,pos
2,5,Very nice boutique hotel Recommended although...,pos
3,5,Amazing stay Very last minute overnight stay ...,pos
4,5,Fantastic Hotel I stayed at this hotel for t...,pos


In [19]:
# Shared analyzer, created on first use so the lexicon is not reloaded
# for every single review.
_VADER_ANALYZER = None


def sa_vader(review, sia=None):
  """Classify a review as "pos", "neg" or "neu" from its VADER compound score.

  Args:
    review: Text of the review to score.
    sia: Optional analyzer exposing ``polarity_scores(text=...)``. When
      omitted, one module-level ``SentimentIntensityAnalyzer`` is created
      lazily and reused across calls.

  Returns:
    "pos" if the compound score is >= 0.05, "neg" if it is <= -0.05,
    otherwise "neu".
  """
  global _VADER_ANALYZER

  if sia is None:
    if _VADER_ANALYZER is None:
      _VADER_ANALYZER = SentimentIntensityAnalyzer()
    sia = _VADER_ANALYZER

  score = sia.polarity_scores(text=review)['compound']

  if score >= 0.05:
    return "pos"
  elif score <= -0.05:
    return "neg"
  else:
    return "neu"

In [20]:
# Classify every cleaned review with VADER and keep the result as a column.
vader_labels = df['Translated Review'].apply(sa_vader)
df['vader_prediction'] = vader_labels

In [21]:
def sa_textblob(reviews):
  """Classify one review from the sign of its TextBlob polarity.

  Args:
    reviews: Text of a single review.

  Returns:
    "pos" when the polarity is above zero, "neu" when it is exactly zero,
    and "neg" in every other case.
  """
  polarity = TextBlob(reviews).sentiment.polarity

  if polarity == 0:
    return "neu"
  return "pos" if polarity > 0 else "neg"


In [22]:
# Classify every cleaned review with TextBlob and keep the result as a column.
textblob_labels = df['Translated Review'].apply(sa_textblob)
df['textblob_prediction'] = textblob_labels

In [None]:
def sa_bert(model, tokenizer, review, max_length=100):
  """Classify a review as "pos", "neg" or "neu" with a sequence classifier.

  Args:
    model: Callable taking the encoded tokens and returning an object with
      a ``logits`` tensor.
    tokenizer: Object whose ``encode`` turns the review into a tensor.
    review: Text of the review to classify.
    max_length: Maximum number of tokens kept; longer reviews are truncated.
      Defaults to 100.

  Returns:
    "neg" for a score of 1 or 2, "neu" for 3, "pos" for anything higher,
    where the score is the index of the largest logit plus one.
  """
  tokens = tokenizer.encode(review,
                            padding=True,
                            truncation=True,
                            max_length=max_length,
                            add_special_tokens=True,
                            return_tensors='pt')

  # Inference only: disabling autograd avoids building a graph on every
  # call, which saves memory and time when applied to a whole column.
  with torch.no_grad():
    result = model(tokens)

  # argmax is 0-based; +1 turns it into a 1-based rating
  # (presumably the star rating of the nlptown model - confirm).
  score = int(torch.argmax(result.logits)) + 1

  if score <= 2:
    return "neg"
  elif score == 3:
    return "neu"
  else:
    return "pos"

In [None]:
# NOTE: This might take close to an hour or more
# depending on the system capacity.
# sa_bert needs (model, tokenizer, review), so passing it straight to
# Series.apply (which supplies only the review) raises a TypeError.
# Load the tokenizer and model once here and bind them in the callback.
BERT_MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'
bert_tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
bert_model = AutoModelForSequenceClassification.from_pretrained(BERT_MODEL_NAME)
df['Bert_prediction'] = df['Translated Review'].apply(
    lambda review: sa_bert(bert_model, bert_tokenizer, review)
)

In [None]:
# Preview the first five rows together with the prediction columns.
df.head(n=5)

In [23]:
_BERT_MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'
_BERT_CACHE = {}


def _load_bert():
  """Return the (model, tokenizer) pair, loading it on first use only."""
  if 'model' not in _BERT_CACHE:
    # Load both into locals first so a failure cannot leave the cache
    # half-populated.
    tokenizer = AutoTokenizer.from_pretrained(_BERT_MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(_BERT_MODEL_NAME)
    _BERT_CACHE['tokenizer'] = tokenizer
    _BERT_CACHE['model'] = model
  return _BERT_CACHE['model'], _BERT_CACHE['tokenizer']


def check_sentiment(review):
  """Print the majority-vote sentiment of a review across the three models.

  Non-alphabetic characters are replaced by spaces before scoring. The
  individual votes (BERT, VADER, TextBlob - in that order) are printed,
  followed by the winning sentiment. If nothing alphabetic remains, an
  error message is printed instead.

  Args:
    review: Raw review text.

  Returns:
    None. All results are written to standard output.
  """
  # Extract alphabets from the review.
  review = re.sub("[^a-zA-Z]", " ", review).strip()

  if not review:
    print("Error: The review has NO english alphabets. Please check again")
    return

  # sa_vader takes the review text only; passing an analyzer as the first
  # positional argument (as before) raised a TypeError.
  vader_res = sa_vader(review)

  txtblob_res = sa_textblob(review)

  # Reuse the cached model/tokenizer instead of re-loading them per call.
  model, tokenizer = _load_bert()
  bert_res = sa_bert(model, tokenizer, review)

  result_list = [bert_res, vader_res, txtblob_res]

  print(f"{result_list}")

  # Choose the sentiment predicted/voted most.
  # NOTE: max() returns the first element with the highest count, so in
  # case of a tie the priority goes to the BERT model (1st entry in the list).
  max_vote = max(result_list, key=result_list.count)

  if max_vote == "pos":
    print('The sentiment of the review is Positive')
  elif max_vote == "neg":
    print("The sentiment of the review is Negative")
  else:
    print("The sentiment of the review is Neutral")




In [24]:
def individual_accuaracy(model_name, sentiment, data=None):
  """Percentage of reviews labelled `sentiment` that the model also predicted as `sentiment`.

  Args:
    model_name: Name of the prediction column to evaluate.
    sentiment: Class value to evaluate (the file uses "pos", "neg", "neu").
    data: Optional DataFrame holding a 'label' column and the prediction
      column. Defaults to the notebook-level `df`.

  Returns:
    The percentage rounded to two decimals, or NaN when no row carries the
    requested label (the ratio is undefined, and the plain division would
    raise ZeroDivisionError).
  """
  frame = df if data is None else data

  is_labelled = frame['label'] == sentiment
  total = int(is_labelled.sum())
  if total == 0:
    return float('nan')

  hits = int((is_labelled & (frame[model_name] == sentiment)).sum())
  return round(hits / total * 100, 2)

## VADER EVALUATION


In [300]:
# Share of all reviews whose VADER prediction equals the label, in percent.
vader_overall_accuracy = accuracy_score(df['label'], df['vader_prediction']) * 100
print(f"VADER Model Overall accuracy: {vader_overall_accuracy}")

VADER Model Overall accuracy: 90.46052631578947


In [301]:
# VADER hit rate restricted to reviews labelled "neu".
vader_neutral_accuracy = individual_accuaracy("vader_prediction", "neu")
print(f'VADER accuracy on neutral sentiments: {vader_neutral_accuracy}')

VADER accuracy on neutral sentiments: 1.2


In [302]:
# VADER hit rate restricted to reviews labelled "pos".
vader_positive_accuracy = individual_accuaracy("vader_prediction", "pos")
print(f'VADER accuracy on positive sentiments: {vader_positive_accuracy}')

VADER accuracy on positive sentiments: 99.25


In [303]:
# VADER hit rate restricted to reviews labelled "neg".
vader_negative_accuracy = individual_accuaracy("vader_prediction", "neg")
print(f'VADER accuracy on Negative sentiments: {vader_negative_accuracy}')

VADER accuracy on Negative sentiments: 48.66


## TEXTBLOB EVALUATION


In [295]:
# Share of all reviews whose TextBlob prediction equals the label, in percent.
textblob_overall_accuracy = accuracy_score(df['label'], df['textblob_prediction']) * 100
print(f"TEXTBLOB Model Overall accuracy: {textblob_overall_accuracy}")

TEXTBLOB Model Overall accuracy: 90.47783933518005


In [296]:
# TextBlob hit rate restricted to reviews labelled "neu".
textblob_neutral_accuracy = individual_accuaracy("textblob_prediction", "neu")
print(f'TEXTBLOB accuracy on neutral sentiments: {textblob_neutral_accuracy}')

TEXTBLOB accuracy on neutral sentiments: 0.13


In [297]:
# TextBlob hit rate restricted to reviews labelled "pos".
textblob_positive_accuracy = individual_accuaracy("textblob_prediction", "pos")
print(f'TEXTBLOB accuracy on positive sentiments: {textblob_positive_accuracy}')

TEXTBLOB accuracy on positive sentiments: 99.57


In [298]:
# TextBlob hit rate restricted to reviews labelled "neg".
textblob_negative_accuracy = individual_accuaracy("textblob_prediction", "neg")
print(f'TEXTBLOB accuracy on Negative sentiments: {textblob_negative_accuracy}')

TEXTBLOB accuracy on Negative sentiments: 44.54


## BERT EVALUATION

In [305]:
# Share of all reviews whose BERT prediction equals the label, in percent.
bert_overall_accuracy = accuracy_score(df['label'], df['Bert_prediction']) * 100
print(f"BERT Model Overall accuracy: {bert_overall_accuracy}")

BERT Model Overall accuracy: 90.83275623268699


In [284]:
# BERT hit rate restricted to reviews labelled "neu".
bert_neutral_accuracy = individual_accuaracy("Bert_prediction", "neu")
print(f'Bert accuracy on neutral sentiments: {bert_neutral_accuracy}')

Bert accuracy on neutral sentiments: 39.76


In [285]:
# BERT hit rate restricted to reviews labelled "pos".
bert_positive_accuracy = individual_accuaracy("Bert_prediction", "pos")
print(f'Bert accuracy on positive sentiments: {bert_positive_accuracy}')

Bert accuracy on positive sentiments: 94.96


In [287]:
# BERT hit rate restricted to reviews labelled "neg".
bert_negative_accuracy = individual_accuaracy("Bert_prediction", "neg")
print(f'Bert accuracy on Negative sentiments: {bert_negative_accuracy}')

Bert accuracy on Negative sentiments: 83.36
