In [1]:
import numpy as np
import pandas as pd
import os
import datetime
import spacy
nlp = spacy.load('en_core_web_sm')

from preprocess import filter_pos, process_text, remove_nt, lemma_pattern, lemmatize_word, adv_to_adj
from vader import get_sentiment
from pain_points import get_frequent, get_negative_tokens, create_token_match_columns, process_token_df

from pymongo import MongoClient
from pycommon.warehouse.load_queries import acquire_all_review_data

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iechi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\iechi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\iechi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
#port = os.getenv("MONGO_PORT") if os.getenv("MONGO_PORT") is not None else 27017 # MONGO_PORT defines the port number. 
port= 27017
mongo_client = MongoClient('localhost', port) # mongo is always the host. Again, docker handles this dns resolution.

# And we're good! mongo is ready to be used. Most of the methods in pycommon/warehouse need you to 
# pass in the mongoclient. 

reviews = acquire_all_review_data(
        mongo_client, 
        datetime.datetime(2001,12,1,0,0).timestamp(), # from
        datetime.datetime(2018,12,1,0,0).timestamp(), # to
        "SimpangAsia",
        "Yelp"
    )

reviews_array = []
for review in reviews:
    reviews_array.append(review)

acquire all review data with skip and limit types: <class 'NoneType'> <class 'NoneType'>


In [3]:
d = {
    "timestamp": [reviews_array[i].timestamp for i in range(0,len(reviews_array))],
    "source_id": [reviews_array[i].source_id for i in range(0,len(reviews_array))],
    "business_id": [reviews_array[i].business_id for i in range(0,len(reviews_array))],
    "review_content": [reviews_array[i].content for i in range(0,len(reviews_array))],
    "review_rating": [reviews_array[i].rating for i in range(0,len(reviews_array))],
}

df = pd.DataFrame(data=d)

In [4]:
# retains only adjectives and adverbs for reviews
df['review_tokens'] = df['review_content'].apply(filter_pos)
# Makes lowercase, removes punctuation and stopwords, and lemmatizes remaining words
df['review_tokens'] = df['review_tokens'].apply(process_text)
# removes the word 'nt'
df['review_tokens'] = df['review_tokens'].apply(remove_nt)

In [5]:
# getting tokens
most_freq = get_frequent(df['review_tokens'],500)
neg_corp = get_negative_tokens(most_freq)

In [6]:
create_token_match_columns(neg_corp, df)
token_df = process_token_df(neg_corp, df)
token_df.sort_values(['df_len','token'], ascending = False, inplace=True)
token_df.reset_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  token_df['neg_sentence'] = token_df['review_content'].apply(lambda x: get_neg_sentence(neg_token_list[index], x))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  token_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  token_df['df_len'] = len(token_df)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the ca

In [7]:
token_df.drop(neg_corp, axis=1, inplace=True)
token_df.drop(['level_0', 'index','review_content'], axis=1, inplace=True)

In [8]:
token_df.head()

Unnamed: 0,timestamp,source_id,business_id,review_rating,review_tokens,neg_sentence,df_len,token
0,2018-08-06,Yelp,SimpangAsia,5,first everlasting Indonesia Malaysia happy yum...,It's insanely spicy! I felt the fire in my mo...,48,bad
1,2017-10-01,Yelp,SimpangAsia,5,favorite first new subsequent locality bad eas...,Let's start with the bad: parking,48,bad
2,2017-12-07,Yelp,SimpangAsia,4,great thorough tender bad fast hot enough odd ...,The bad side is that some dishes came out ver...,48,bad
3,2018-11-07,Yelp,SimpangAsia,1,worst ever bad thorough mild extra hot true le...,very bad services and food is not good,48,bad
4,2016-06-11,Yelp,SimpangAsia,1,absolute horrible sure easier bland reasonable...,"It was so bad, he filed a report with the LA ...",48,bad


In [9]:
def noun_adj_matcher_pos(sentence):
    # matches each noun in the setence to the adjective and returns a list of tuples containing [(noun, adj)] 
    doc = nlp(sentence)
    noun_adj_pairs = []
    for i,token in enumerate(doc):
        if token.pos_ not in ('NOUN','PROPN','VERB'):
            continue
        for j in range(i+1,len(doc)):
            if doc[j].pos_ == 'ADJ' or doc[j].pos_ == 'ADV':
                noun_adj_pairs.append((token.text,doc[j].text))
                break
    return noun_adj_pairs

In [10]:
def adj_noun_matcher_pos(sentence):
    # matches each noun in the setence to the adjective and returns a list of tuples containing [(noun, adj)] 
    doc = nlp(sentence)
    noun_adj_pairs = []
    for i,token in enumerate(doc):
        if token.pos_ not in ('ADJ', 'ADV'):
            continue
        for j in range(i+1,len(doc)):
            if doc[j].pos_ in ('NOUN','PROPN','VERB'):
                noun_adj_pairs.append((token.text,doc[j].text))
                break
    return noun_adj_pairs

In [11]:
# def verb_adv_matcher_pos(sentence):
    # matches each verb in the setence to the adverb and returns a list of tuples containing [(verb, adv)] 
#    doc = nlp(sentence)
#    verb_adv_pairs = []
#    for i,token in enumerate(doc):
#        if token.pos_ not in ('VERB'):
#            continue
#        for j in range(i+1,len(doc)):
#            if doc[j].pos_ == 'ADV':
#                verb_adv_pairs.append((token.text,doc[j].text))
#                break
#    return verb_adv_pairs

In [12]:
def match_pos(sentence):
    noun_list = noun_adj_matcher_pos(sentence)
    # verb_list = verb_adv_matcher_pos(sentence)
    # return noun_list+verb_list
    return noun_list+adj_noun_matcher_pos(sentence)

In [13]:
token_df.head()

Unnamed: 0,timestamp,source_id,business_id,review_rating,review_tokens,neg_sentence,df_len,token
0,2018-08-06,Yelp,SimpangAsia,5,first everlasting Indonesia Malaysia happy yum...,It's insanely spicy! I felt the fire in my mo...,48,bad
1,2017-10-01,Yelp,SimpangAsia,5,favorite first new subsequent locality bad eas...,Let's start with the bad: parking,48,bad
2,2017-12-07,Yelp,SimpangAsia,4,great thorough tender bad fast hot enough odd ...,The bad side is that some dishes came out ver...,48,bad
3,2018-11-07,Yelp,SimpangAsia,1,worst ever bad thorough mild extra hot true le...,very bad services and food is not good,48,bad
4,2016-06-11,Yelp,SimpangAsia,1,absolute horrible sure easier bland reasonable...,"It was so bad, he filed a report with the LA ...",48,bad


In [14]:
def get_word_described(sentence, token):
    # returns one tuple containing the description as well as the word being described from neg_sentence
    tuple_list = match_pos(sentence)
    for word, description in tuple_list:
        if description == token:
            return word, description
    # return None, token
    return 'Failed'

In [15]:
def create_word_described_cols(df):
    df['word_described'] = 'NA'
    for index in range(0, len(df)):
        df['word_described'][index] = get_word_described(df['neg_sentence'].apply(lambda x: lemmatize_word(adv_to_adj(lemma_pattern(x))))[index], df['token'][index])

In [16]:
# This method is not as effective
#
# def create_word_described_cols(df):
#    df['word_described'] = 'NA'
#    for index in range(0, len(df)):
#        df['word_described'][index] = get_word_described(df['review_tokens'][index], df['token'][index])

In [17]:
create_word_described_cols(token_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [18]:
token_df.head()

Unnamed: 0,timestamp,source_id,business_id,review_rating,review_tokens,neg_sentence,df_len,token,word_described
0,2018-08-06,Yelp,SimpangAsia,5,first everlasting Indonesia Malaysia happy yum...,It's insanely spicy! I felt the fire in my mo...,48,bad,Failed
1,2017-10-01,Yelp,SimpangAsia,5,favorite first new subsequent locality bad eas...,Let's start with the bad: parking,48,bad,"(let, bad)"
2,2017-12-07,Yelp,SimpangAsia,4,great thorough tender bad fast hot enough odd ...,The bad side is that some dishes came out ver...,48,bad,Failed
3,2018-11-07,Yelp,SimpangAsia,1,worst ever bad thorough mild extra hot true le...,very bad services and food is not good,48,bad,Failed
4,2016-06-11,Yelp,SimpangAsia,1,absolute horrible sure easier bland reasonable...,"It was so bad, he filed a report with the LA ...",48,bad,Failed


In [19]:
token_df['word_described'].value_counts()

Failed                    203
(parking, difficult)        4
(parking, hard)             4
('s, hard)                  3
(parking, terrible)         3
(was, terrible)             3
(service, bad)              2
(had, worst)                2
(was, weird)                2
('s, odd)                   2
(meat, tough)               1
(rames, horrible)           1
(ambience, terrible)        1
(boy, difficult)            1
(what, strange)             1
(be, hard)                  1
(strikes, strange)          1
(eat, sick)                 1
(valet, odd)                1
(brought, strange)          1
(have, low)                 1
(food, poor)                1
(is, worst)                 1
(were, bad)                 1
(based, bad)                1
(complaint, bad)            1
('d, bad)                   1
(service, worse)            1
(groceries, hard)           1
(place, empty)              1
                         ... 
(curry, hard)               1
(was, insane)               1
(found, di

In [20]:
len(token_df)

329

In [21]:
# Simple noun-adj / verb-adv matching does not work! Sentences are too complicated to match

In [22]:
# Let us check the accuracy of our results

In [23]:
def check_accuracy(row):
    print ('Token: ', token_df.iloc[row]['token'])
    print ('Sentence: ', token_df.iloc[row]['neg_sentence'])
    print ('Lemmatized: ', lemmatize_word(adv_to_adj(lemma_pattern(token_df.iloc[row]['neg_sentence']))))
    print ('Processed: ', process_text(token_df.iloc[row]['neg_sentence']))
    print ('Tuple: ', token_df.iloc[row]['word_described'])
    print ('Lemmatized Description: ', [word for word in nlp(lemmatize_word(adv_to_adj(lemma_pattern(token_df.iloc[row]['neg_sentence'])))) if word.pos_ in ('NOUN', 'PNOUN', 'VERB')])
    print ('Processed Description: ', [word for word in nlp(process_text(token_df.iloc[row]['neg_sentence'])) if word.pos_ in ('NOUN', 'PNOUN', 'VERB')])

In [24]:
check_accuracy(1)

Token:  bad
Sentence:  Let's start with the bad: parking
Lemmatized:  let's start with the bad: park
Processed:  let start bad park
Tuple:  ('let', 'bad')
Lemmatized Description:  [let, start, park]
Processed Description:  [let, start, park]


In [25]:
# HERE IS ONE PROBLEM!
doc = nlp('bad')
for token in doc:
    print (token.pos_)

ADJ


In [26]:
# PREVIOUS METHODS FAILED BECAUSE DESCRIPTION IS CHECKED ONLY AFTER THE WORD
# THIS IS SPACY's RULE BASED MATCHING

In [27]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Add match ID "HelloWorld" with no callback and one pattern
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", None, pattern)

doc = nlp(u"Hello, world! Hello world!")
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)

15578876784678163569 HelloWorld 0 3 Hello, world


In [28]:
# THIS IS NLTK'S ENTITY RECOGNITION

In [29]:
# GETTING ALL ENTITIES
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
 
def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
            if type(i) == Tree:
                    current_chunk.append(" ".join([token for token, pos in i.leaves()]))
            elif current_chunk:
                    named_entity = " ".join(current_chunk)
                    if named_entity not in continuous_chunk:
                            continuous_chunk.append(named_entity)
                            current_chunk = []
            else:
                    continue
    return continuous_chunk

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\iechi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\iechi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\iechi\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\iechi\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [30]:
# Extracting Label for Entity
def get_entity_label(sentence):
    for sent in nltk.sent_tokenize(sentence):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label'):
                print(chunk.label(), ' '.join(c[0] for c in chunk))

In [31]:
sentence = "food was very bad"

In [32]:
print(get_continuous_chunks(sentence))
get_entity_label(sentence)

[]


In [33]:
# SEEMS THAT USING ENTITY RECOGNITION DOES NOT WORK AS WELL