# Sentiment Tagging with Vader

In [16]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nnsplit import NNSplit
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

import re
import pandas as pd

## Input unseen data

### New review

In [17]:
new_review = "You When I booked with your company on line you showed me pictures of a room I thought I was getting and paying for and then when we arrived that s room was booked and the staff told me we could only book the villa suite theough them directly Which was completely false advertising After being there we realised that you have grouped lots of rooms on the photos together leaving me the consumer confused and extreamly disgruntled especially as its my my wife s 40th birthday present Please make your website more clear through pricing and photos as again I didn t really know what I was paying for and how much it had wnded up being Your photos told me I was getting something I wasn t Not happy and won t be using you again "

### Split into sentences using NNSplit

In [18]:
# Load the English NNSplit model and segment the review. The review is
# passed as a one-element list, so the sentences of interest are in sent[0].
splitter = NNSplit("en")
sent = splitter.split([new_review.strip()])

# Rebuild each sentence from its tokens. Every token's text is followed by a
# single space, so each rebuilt sentence ends with a trailing space.
sent_list = [
    "".join(token.text + " " for token in sentence)
    for sentence in sent[0]
]

sent_list

['You When I booked with your company on line you showed me pictures of a room ',
 'I thought I was getting and paying for and then when we arrived that s room was booked and the staff told me we could only book the villa suite theough them directly ',
 'Which was completely false advertising ',
 'After being there we realised that you have grouped lots of rooms on the photos together ',
 'leaving me ',
 'the consumer confused and extreamly disgruntled especially as its my ',
 'my wife s 40th birthday ',
 'present ',
 'Please make your website more clear through pricing and photos ',
 'as again ',
 'I didn t really know what I was paying for and how much it had wnded up being ',
 'Your photos told me I was getting something I wasn t Not happy and won t be using you again ']

### Keep only sentences longer than 8 words

In [19]:
def number_words(sentence):
    r"""Count the word tokens in *sentence*.

    A word token is a maximal run of word characters (regex ``\w+``).
    The argument is converted with ``str()`` first, so non-string values
    are counted on their string representation.
    """
    word_runs = re.finditer(r'\w+', str(sentence))
    return sum(1 for _ in word_runs)

# Keep only the sentences with more than 8 word tokens; shorter fragments
# are dropped.
new_sent_list = [s for s in sent_list if number_words(s) > 8]


### Data Cleaning

In [34]:
def get_wordnet_pos(pos_tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Only the start of the tag is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls
    back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
    
def lemmatize_text(text):
    """Lower-case *text*, drop short tokens and lemmatize what is left.

    Whitespace-separated tokens of one or two characters are discarded.
    Each remaining token is passed to ``WordNetLemmatizer.lemmatize``
    without a POS argument. The space-joined result is printed and
    returned.
    """
    lemmatizer = WordNetLemmatizer()
    # NOTE(review): no POS tag is supplied here, which is presumably why the
    # recorded run shows "was" turned into "wa" -- confirm whether POS-aware
    # lemmatization (as done in clean_text) was intended.
    lemmas = []
    for token in text.lower().split():
        if len(token) > 2:
            lemmas.append(lemmatizer.lemmatize(token))
    result = " ".join(lemmas)
    print("Lemma text: ", result)
    return result
    
def stem_text(text):
    """Lower-case *text*, drop short tokens and Porter-stem what is left.

    Whitespace-separated tokens of one or two characters are discarded.
    The remaining tokens are stemmed with ``PorterStemmer``. The
    space-joined result is printed and returned.
    """
    kept_tokens = (tok for tok in text.lower().split() if len(tok) > 2)
    porter = PorterStemmer()
    stemmed = " ".join(map(porter.stem, kept_tokens))
    print("Stemmed text :", stemmed)
    return stemmed
    
def clean_text(text):
    """Normalise a sentence: lower-case, drop stop words, lemmatize with POS.

    Steps:
      1. lower-case the text and split it on whitespace;
      2. remove NLTK's English stop words;
      3. POS-tag the remaining tokens and lemmatize each one using the
         WordNet POS returned by ``get_wordnet_pos``;
      4. drop results of a single character and join with single spaces.

    Punctuation stripping and removal of tokens containing digits are not
    applied.

    Returns the cleaned sentence as a string.
    """
    # The corpus reader returns a list; a set makes each membership test O(1).
    stop_words = set(stopwords.words('english'))
    # str.split() without a separator never yields empty strings, so no
    # separate empty-token filter is needed.
    tokens = [tok for tok in text.lower().split() if tok not in stop_words]
    # One lemmatizer for the whole sentence instead of one per token.
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]
    # Drop one-letter leftovers before joining.
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)

In [35]:
# Run every kept sentence through the three normalisers and collect the
# results in three parallel lists.
clean_sent_list, lemma_sent_list, stem_sent_list = [], [], []
for sent in new_sent_list:
    print("sentence :", sent)
    clean_sent_list.append(clean_text(sent))
    # lemmatize_text and stem_text each print their own result line.
    lemma_sent_list.append(lemmatize_text(sent))
    stem_sent_list.append(stem_text(sent))
    print("---------------------------")

sentence : You When I booked with your company on line you showed me pictures of a room 
Lemma text:  you when booked with your company line you showed picture room
Stemmed text : you when book with your compani line you show pictur room
---------------------------
sentence : I thought I was getting and paying for and then when we arrived that s room was booked and the staff told me we could only book the villa suite theough them directly 
Lemma text:  thought wa getting and paying for and then when arrived that room wa booked and the staff told could only book the villa suite theough them directly
Stemmed text : thought wa get and pay for and then when arriv that room wa book and the staff told could onli book the villa suit theough them directli
---------------------------
sentence : After being there we realised that you have grouped lots of rooms on the photos together 
Lemma text:  after being there realised that you have grouped lot room the photo together
Stemmed text : after be

In [37]:
# Display the stemmed form of every kept sentence.
stem_sent_list

['you when book with your compani line you show pictur room',
 'thought wa get and pay for and then when arriv that room wa book and the staff told could onli book the villa suit theough them directli',
 'after be there realis that you have group lot room the photo togeth',
 'the consum confus and extreamli disgruntl especi it',
 'pleas make your websit more clear through price and photo',
 'didn realli know what wa pay for and how much had wnded be',
 'your photo told wa get someth wasn not happi and won use you again']

### Convert list to dataframe

In [41]:
# Build a one-column DataFrame ("sentence") from the lemmatized sentences;
# this column is what gets scored for polarity in the next section.
data = pd.DataFrame(lemma_sent_list, columns=["sentence"])
data

Unnamed: 0,sentence
0,you when booked with your company line you sho...
1,thought wa getting and paying for and then whe...
2,after being there realised that you have group...
3,the consumer confused and extreamly disgruntle...
4,please make your website more clear through pr...
5,didn really know what wa paying for and how mu...
6,your photo told wa getting something wasn not ...


### Get polarity

In [42]:
# Build the VADER analyser once. Constructing it inside get_polarity() would
# create a new analyser (and reload its lexicon) for every sentence scored.
_ANALYSER = SentimentIntensityAnalyzer()


def get_polarity(sentence):
    """Return VADER's compound polarity score for *sentence*.

    The compound score is the ``'compound'`` entry of the dict returned by
    ``SentimentIntensityAnalyzer.polarity_scores``.
    """
    return _ANALYSER.polarity_scores(sentence)['compound']


# Score every sentence and store the result in a new "polarity" column.
data['polarity'] = data['sentence'].apply(get_polarity)
data

Unnamed: 0,sentence,polarity
0,you when booked with your company line you sho...,0.0
1,thought wa getting and paying for and then whe...,0.0
2,after being there realised that you have group...,0.0
3,the consumer confused and extreamly disgruntle...,-0.3182
4,please make your website more clear through pr...,0.6361
5,didn really know what wa paying for and how mu...,0.0
6,your photo told wa getting something wasn not ...,-0.7181


### Aggregated polarity score

In [43]:
# Average the per-sentence compound scores and label the whole review:
# mean >= 0.05 is positive, a mean strictly between -0.05 and 0.05 is
# neutral, anything else is negative.
polarity = data['polarity'].mean()
if polarity >= 0.05:
    label = 'positive'
elif -0.05 < polarity < 0.05:
    label = 'neutral'
else:
    label = 'negative'
sentiment = (label, polarity)
print(sentiment)

('negative', -0.05717142857142856)


## Get Vader polarity score

In [None]:
# This cell repeats the get_polarity definition from the "Get polarity"
# section. The VADER analyser is built once here; constructing it inside
# get_polarity() would create a new analyser (and reload its lexicon) for
# every sentence scored.
_ANALYSER = SentimentIntensityAnalyzer()


def get_polarity(sentence):
    """Return VADER's compound polarity score for *sentence*.

    The compound score is the ``'compound'`` entry of the dict returned by
    ``SentimentIntensityAnalyzer.polarity_scores``.
    """
    return _ANALYSER.polarity_scores(sentence)['compound']


# Score every sentence and store the result in the "polarity" column.
data['polarity'] = data['sentence'].apply(get_polarity)
data

In [None]:
def number_words(sentence):
    r"""Count the word tokens in *sentence*.

    A word token is a maximal run of word characters (regex ``\w+``).
    The argument is converted with ``str()`` first, so non-string values
    are counted on their string representation.
    """
    word_runs = re.finditer(r'\w+', str(sentence))
    return sum(1 for _ in word_runs)

# Keep only rows whose sentence has more than one word token.
length = data['sentence'].apply(number_words) > 1
# .copy() makes the filtered frame independent of the frame it was selected
# from, so the column assignments in the following cells do not act on a
# selection of another frame (which can trigger SettingWithCopyWarning).
data = data.loc[length].copy()

## Aggregate the polarity by grouping sentences by review

In [None]:
# Give every row the mean polarity of all rows that share its 'reviews' value.
# NOTE(review): this needs a 'reviews' column, but the `data` frame built
# earlier in this notebook only has 'sentence' and 'polarity' -- confirm
# where the review-level frame is loaded before running this cell.
data['agg_polarity'] = data.groupby('reviews')['polarity'].transform('mean')

In [None]:
# data = data.drop_duplicates(subset="reviews", keep="first")
# data = data.drop(['sentence','polarity'], axis=1)

In [None]:
def _review_label(score):
    """Label an aggregated polarity as positive, neutral or negative."""
    if score >= 0.196725:
        return "positive"
    if score >= 0.096725:
        return "neutral"
    return "negative"


# Label every row from its review-level (aggregated) polarity.
data["review_sentiment"] = [_review_label(score) for score in data['agg_polarity']]
data

In [None]:
def _sentence_label(score):
    """Label a sentence-level polarity as positive, neutral or negative."""
    if score >= 0.1779:
        return "positive"
    if score >= 0.01779:
        return "neutral"
    return "negative"


# Label every row from its own sentence-level polarity.
data["sen_sentiment"] = [_sentence_label(score) for score in data['polarity']]
data

## Export 

In [None]:
# Write the tagged frame to CSV, omitting the index column.
# NOTE(review): the ./data directory presumably has to exist already --
# confirm before running on a fresh checkout.
data.to_csv('./data/tagged_sentence_data.csv', index=False)

## Settings to display all DataFrame rows

In [None]:
# view_data = data.loc[:, "sentence":"sentiment"]
# Sort in place so the highest aggregated polarity comes first.
data.sort_values(by=['agg_polarity'], axis=0, inplace=True, ascending=False)

# Lift pandas' display limits so the whole frame is shown untruncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
# None means "no limit". The former value -1 is deprecated since pandas 1.0
# and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
data
# Threshold: if polarity <= 0.1779: Negative
# NOTE(review): the sen_sentiment cell labels polarity in [0.01779, 0.1779)
# as "neutral", not negative -- confirm which rule is intended.

In [9]:
# Lift pandas' display limits so frames are shown untruncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
# None means "no limit". The former value -1 is deprecated since pandas 1.0
# and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

  This is separate from the ipykernel package so we can avoid doing imports until
