In [254]:
import pandas as pd
import os
import sys
import nltk
import ast
import re
import itertools
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import sentiwordnet
from nltk.tokenize import sent_tokenize

# Widen pandas' text output so printed frames are not truncated:
# up to 500 rows, up to 500 columns, and 1000 characters per line.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

In [282]:
# Extra words to drop from extracted phrases, on top of NLTK's English list.
custom_stop_words = ['ok', 'yes', 'everything', 'one', 'two', 'three']

# Full stop-word list: NLTK's English stop words followed by the custom ones.
stop_words = nltk.corpus.stopwords.words('english') + custom_stop_words

# Chunk grammar for noun phrases: an optional determiner, any number of
# adjectives, then one or more noun tags (any tag starting with NN).
grammar = r'NP: {<DT>? <JJ>* <NN.*>+}'

### Load the Data

In [283]:
path = 'data/'
reviews_file = '7282_1.csv'

# Number of reviews to keep, and the seed that makes the sample repeatable.
SAMPLE_SIZE = 50
RANDOM_SEED = 42

df = pd.read_csv(os.path.join(path, reviews_file))

# Sample some rows. The seed keeps the selection identical across kernel
# restarts, and the size is capped so a file with fewer rows does not raise.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_SEED)
print(df.head())
print(df.shape)

                      address                                         categories         city country   latitude   longitude                                        name postalCode                    province          reviews.date     reviews.dateAdded  reviews.doRecommend  reviews.id  reviews.rating                                       reviews.text              reviews.title reviews.userCity reviews.username reviews.userProvince
35526  1520 Candelaria Rd N E       Motels,Hotels,Bed & Breakfast & Inns,Lodging  Albuquerque      US  35.114822 -106.627610                  Ambassador Inn Albuquerque      87107  Los Ranchos De Albuquerque  2013-08-08T00:00:00Z  2016-06-22T19:06:57Z                  NaN         NaN             5.0  We stayed there for one night during August 20...                  Excellent       Sedgefield          Sarie V                   NM
12427  7380 Stage Road Hwy 64                                             Hotels      Memphis      US  35.204860  -89.808600  Suburb

In [284]:
# Keep only the review text, renamed to 'reviews'. The explicit copy makes the
# result an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
df = df[['reviews.text']].rename(columns={'reviews.text': 'reviews'}).copy()

print(df.head())
print(type(df))

                                                 reviews
35526  We stayed there for one night during August 20...
12427  The room was clean, few blemishes, but the plu...
12570  Rooms are very nice.  Service is good. I perso...
20449  This hotel is located conveniently off of I-20...
4058   I've already written a review for my stay at t...
<class 'pandas.core.frame.DataFrame'>


### Preprocessing the reviews

In [237]:
def preProcessing(df):
    """Split every review into sentences and every sentence into word tokens.

    Parameters
    ----------
    df : DataFrame
        Must contain a 'reviews' column holding the review text.

    Returns
    -------
    list
        One entry per row of ``df``, in row order. Each entry is a list of
        sentences, and each sentence is a list of word tokens. Rows whose
        review is not a string (e.g. NaN for a missing review) yield an empty
        list, so the result stays aligned with the rows of ``df``.
    """
    all_sentences = []
    for text in df['reviews']:
        # Missing reviews arrive as NaN/None; the tokenizer only accepts strings.
        if not isinstance(text, str):
            all_sentences.append([])
            continue
        # split in sentences, then split each sentence in words
        sentences = [nltk.word_tokenize(sent) for sent in sent_tokenize(text)]
        all_sentences.append(sentences)
    return all_sentences

In [76]:
def reviewTagging(reviews):
    """Part-of-speech tag every tokenized sentence of every review.

    ``reviews`` is a list of reviews, each a list of tokenized sentences.
    The result has the same nesting, with each sentence replaced by the
    output of ``nltk.pos_tag`` for that sentence.
    """
    return [
        [nltk.pos_tag(tokens) for tokens in review]
        for review in reviews
    ]


In [125]:
def getChunks(sentence):
    """Extract noun-phrase chunks from one POS-tagged sentence.

    Parameters
    ----------
    sentence : list of (word, tag) tuples
        A single sentence as produced by ``nltk.pos_tag``.

    Returns
    -------
    list
        A one-element list wrapping the list of chunk strings found in the
        sentence. Each chunk is lower-cased, has the words in ``stop_words``
        removed, and is dropped entirely if nothing remains.

    Notes
    -----
    Chunks are read from the ``NP`` subtrees of the parse tree, so two noun
    phrases that directly follow each other are reported as two separate
    chunks instead of being fused into a single phrase.
    """
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tree = chunker.parse(sentence)

    valid_chunks = []
    # The root node is also yielded by subtrees(); the label filter skips it.
    for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
        phrase = ' '.join(word.lower() for word, tag in subtree.leaves()
                          if word.lower() not in stop_words)
        # A chunk made only of stop words collapses to '' and is discarded.
        if phrase:
            valid_chunks.append(phrase)

    return [valid_chunks]

In [158]:
def aspectExtraction(reviews):
    """Collect the noun-phrase chunks of every review.

    ``reviews`` is a list of reviews, each a list of POS-tagged sentences.
    Returns one flat list of chunk strings per review, in sentence order.
    """
    all_aspects = []
    for review in reviews:
        # Each getChunks call returns a list of chunk lists; flatten them all
        # into a single list of phrases for this review.
        phrases = [phrase
                   for sentence in review
                   for chunk_list in getChunks(sentence)
                   for phrase in chunk_list]
        all_aspects.append(phrases)
    return all_aspects
    

In [285]:
# Tokenize the sampled reviews: one list per row of df, holding that review's
# sentences as lists of word tokens.
reviews = preProcessing(df)

In [286]:
# POS-tag every tokenized sentence; the nesting (review -> sentence) is unchanged.
tagged_reviews = reviewTagging(reviews)

In [287]:
# Extract the noun-phrase aspects: one flat list of phrases per review.
aspects = aspectExtraction(tagged_reviews)
# Prints the aspect lists of all reviews at once.
print(aspects)

[['night', 'august', 'weeks', 'road', 'every night', 'different low budget motel', 'little gem', 'reasonable rate', 'room'], ['room', 'blemishes', 'plumbing', 'water pressure', 'bed', 'comfy', 'air conditioner', 'room cool', 'fridge', 'drinks', 'food cold', 'next trip', 'memphis'], ['rooms', 'service', 'hilton', 'street', 'service'], ['hotel', 'i-20', 'tons', 'restaurants', 'staff', 'glitch', 'room', 'staff', 'room', 'places', 'hotel'], ['review', 'stay', 'hotel', 'easter', 'december', 'return visit', 'wife', 'company christmas party', 'bisbee', 'co-workers', 'eldorado suites', 'warm hospitality', 'staff', 'check', 'breeze'], ['kit', 'meeting', 'group', 'success', 'marriott suites', 'meeting events', 'nice hotel', 'boss', 'particular hotel'], ['room', 'door', 'next room', 'neighbor', 'night', 'multiple times', 'sound', 'door', 'room', 'door latch', 'inside', 'room', 'work', 'pieces', 'equipment', 'fitness center', 'piece', 'many hampton inn', 'different states', 'stay'], ['staff', 'bre

In [288]:
# Store the extracted aspects next to their reviews. `aspects` holds one list
# per row of df, built in row order, so a plain column assignment lines up.
df['aspects'] = aspects
print(df.head(30))
print(type(df))


                                                 reviews                                            aspects
35526  We stayed there for one night during August 20...  [night, august, weeks, road, every night, diff...
12427  The room was clean, few blemishes, but the plu...  [room, blemishes, plumbing, water pressure, be...
12570  Rooms are very nice.  Service is good. I perso...          [rooms, service, hilton, street, service]
20449  This hotel is located conveniently off of I-20...  [hotel, i-20, tons, restaurants, staff, glitch...
4058   I've already written a review for my stay at t...  [review, stay, hotel, easter, december, return...
31051  Working with Kit to schedule my meeting and ac...  [kit, meeting, group, success, marriott suites...
6713   We had a room with a door attaching to next ro...  [room, door, next room, neighbor, night, multi...
18513  Staff was lovely, breakfast was good. Air cond...                      [staff, breakfast, air, room]
12515  I made reservations 4