In [1]:
# Import Libraries
import nltk
import pandas as pd
import pickle as pk
from nltk.chunk import tree2conlltags

In [2]:
# Assigning variable to library elements
stopwords = nltk.corpus.stopwords.words("english")
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
wt = nltk.tokenize.WhitespaceTokenizer()

capital_alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

In [3]:
# Fixes the given dataset paragraphs into acceptable format
def adjust_para(text):
    text = str(text)
    tokens = wt.tokenize(text)
    ner = tree2conlltags(nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text))))
        
    for a in range(len(tokens)):
        if tokens[a][0] in capital_alphabets:
            if not (   ner[a][2][0] == "B"
                    or ner[a][2][0] == "I" ):
                tokens[a] = ". " + tokens[a] 
            
    text = " ".join(tokens).replace(" . ", ". ")

    if (". " == text[0:2]):
        text = text[2:]
        
    text = text + "."
        
    tokens = wt.tokenize(text)

    for word in range(len(tokens)):
        if tokens[word] == "s":
            tokens[word] = "'s"
        elif tokens[word] == "t":
            tokens[word] = "'t"
        elif tokens[word] == "d":
            tokens[word] = "'d"
        elif tokens[word] == "ll":
            tokens[word] = "'ll"
        elif tokens[word] == "m":
            tokens[word] = "'m"
        elif tokens[word] == "o":
            tokens[word] = "'o"
        elif tokens[word] == "re":
            tokens[word] = "'re"
        elif tokens[word] == "ve":
            tokens[word] = "'ve"
        elif tokens[word] == "y":
            tokens[word] = "'y"
    
    text = " ".join(tokens).replace(" '", "'")
    
    return text

In [4]:
# Break paragraph into sentences
def tokenize_para(text):
    text = adjust_para(text)
    sen_tokens = {}
    dict_id = 1
    
    for sentence in tokenizer.tokenize(text):
        sentence = str(sentence)
        sen_tokens[dict_id] = sentence
        dict_id += 1
        
#     doc = nlp(text)
#     for sentence in doc.sents:
#         sentence = str(sentence)
#         sen_tokens[dict_id] = sentence
#         dict_id += 1
           
    return sen_tokens

In [5]:
# Part-of-Speech Tagging using nltk library
def pos_tag(text):
    sen_pos = {}
    formatted_text = tokenize_para(text)
    for key,value in formatted_text.items():
        sen_pos[key] = nltk.pos_tag(nltk.word_tokenize(value))
        
    return sen_pos

In [6]:
# Aspect Extraction
def aspect_ext(text):
    prev_word = ""
    prev_tag = ""
    current_word = ""
    
    aspects = []
    pos_data = pos_tag(text)
    aspects_unique = {}
    
    # Gets all Noun and Noun Phrases from part of speech tagging
    for key,value in pos_data.items():
        for word, tag in value:
            if (tag == "NN" or tag == "NNP"):
                if (prev_tag == "NN" or prev_tag == "NNP"):
                    current_word = prev_word + " " + word
                else:
                    aspects.append(prev_word.lower())
                    current_word = word
                    
            prev_word = current_word
            prev_tag = tag
            
    # Avoid unesessary appends
    for aspect in aspects:
        if aspects_unique.keys() != aspect:
            aspects_unique[aspect] = aspects.count(aspect) 
            
    return aspects_unique

In [7]:
def word_sentiment(single_word):
    # Gets synonyms of a word from wordnet
    word_syn = nltk.corpus.wordnet.synsets(single_word)
    
    if len(word_syn)!=0:
        word = word_syn[0].name()
        # Gets sentiment word of a word in numerical value
        senti_score = nltk.corpus.sentiwordnet.senti_synset(word)
        
        if senti_score.pos_score() > senti_score.neg_score():
            return senti_score.pos_score()
        elif senti_score.pos_score() < senti_score.neg_score():
            return -(senti_score.neg_score())
        else:
            return 0
    return 0

In [8]:
def opinion_sentiment(text):
    reviews = pos_tag(text)      # Reviews
    aspects = aspect_ext(text)   # Aspects
    
    aspect_opinion_word = {}
    
    # Get aspect and occurrence from "aspects" array
    for aspect,no in aspects.items():
        
        # Tokenize the aspects
        # e.g. 'restaurant bar' --> ['restaurant', 'bar']
        aspect_tokens = nltk.word_tokenize(aspect)
        
        # For each sentence in "reviews" array, check which sub-aspect belongs to that sentence
        for key,value in reviews.items():
           
            for sub_word in aspect_tokens:
                if sub_word in str(value).lower():
                    
                    aspect_opinion_word.setdefault(aspect, [0,0])
                    
                    # For each aspect it get the opinion word like adverb, adjective, verbs, etc.
                    for word, tag in value:
                        if(    tag == 'JJ' 
                            or tag == 'JJR' 
                            or tag == 'JJS'
                            or tag == 'RB' 
                            or tag == 'RBR'
                            or tag == 'RBS' ):
                        
                            word_senti = word_sentiment(word)

                            if word_senti > 0: # Positive 
                                aspect_opinion_word[aspect][0] += word_senti
                            elif word_senti < 0: # Negative
                                aspect_opinion_word[aspect][1] += word_senti
    
    # Structure : {aspect: [Positive, Negative]}
    return aspect_opinion_word

In [9]:
# Import processed sample data from folder
infile = open("Stored Data/sample_review.pickle", "rb")
data = pk.load(infile)
infile.close()

data.head(10)

Unnamed: 0,Hotel,Reviews
0,Hotel Arena,Only the park outside of the hotel was beauti...
1,Hotel Arena,No real complaints the hotel was great great ...
2,Hotel Arena,Location was good and staff were ok It is cut...
3,Hotel Arena,Great location in nice surroundings the bar a...
4,Hotel Arena,Amazing location and building Romantic setting
5,Hotel Arena,Good restaurant with modern design great chil...
6,Hotel Arena,The room is spacious and bright The hotel is ...
7,Hotel Arena,Good location Set in a lovely park friendly s...
9,Hotel Arena,The room was big enough and the bed is good T...
10,Hotel Arena,Rooms were stunningly decorated and really sp...


In [10]:
data['Aspect_Sentiment'] = data['Reviews'].apply(lambda x: opinion_sentiment(x))
data.head()

Unnamed: 0,Hotel,Reviews,Aspect_Sentiment
0,Hotel Arena,Only the park outside of the hotel was beauti...,"{'park': [0.75, 0]}"
1,Hotel Arena,No real complaints the hotel was great great ...,"{'staff offer': [1.75, -1.0], 'staff': [0.875,..."
2,Hotel Arena,Location was good and staff were ok It is cut...,"{'staff': [0.5, 0], 'hotel': [0.5, 0], 'locati..."
3,Hotel Arena,Great location in nice surroundings the bar a...,"{'bar': [0.375, 0], 'restaurant': [0.375, 0], ..."
4,Hotel Arena,Amazing location and building Romantic setting,"{'building': [0.25, -0.25], 'location': [0.25,..."


In [12]:
# Store the aspect sample data
outfile = open("Stored Data/sample_review_w_aspects.pickle", "wb")
pk.dump(data, outfile)
outfile.close()

data.to_csv(r'Stored Data/sample_review_w_aspects.csv')