In [1]:
import numpy as np
import pandas as pd
import os
import datetime
import spacy
nlp = spacy.load('en_core_web_sm')

from preprocess import filter_pos, process_text, remove_nt, lemma_pattern, lemmatize_word, adv_to_adj
from vader import get_sentiment
from pain_points import get_frequent, get_negative_tokens, create_token_match_columns, process_token_df

from pymongo import MongoClient
from pycommon.warehouse.load_queries import acquire_all_review_data

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iechi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\iechi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\iechi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Connect to the MongoDB instance running on this machine.
# The port is fixed to 27017; the MONGO_PORT environment override that used to
# live here was disabled (and would have passed a str where an int is needed).
# NOTE(review): a containerised run would presumably need a different host
# than 'localhost' — confirm before deploying.
port = 27017
mongo_client = MongoClient('localhost', port)

# Most of the helpers in pycommon/warehouse expect the MongoClient to be
# passed in explicitly, as done below.

# Fetch the Yelp reviews stored for SimpangAsia between 2001-12-01 and
# 2018-12-01. The datetimes are naive, so .timestamp() interprets them in the
# local timezone of the machine running the notebook.
reviews = acquire_all_review_data(
    mongo_client,
    datetime.datetime(2001, 12, 1, 0, 0).timestamp(),  # from
    datetime.datetime(2018, 12, 1, 0, 0).timestamp(),  # to
    "SimpangAsia",
    "Yelp",
)

# Materialise the result once so later cells can index it and traverse it
# repeatedly.
reviews_array = list(reviews)

acquire all review data with skip and limit types: <class 'NoneType'> <class 'NoneType'>


In [3]:
# Column-oriented view of the fetched reviews: one list per output column,
# each pulled from the matching attribute of every review object.
d = {
    "timestamp": [r.timestamp for r in reviews_array],
    "source_id": [r.source_id for r in reviews_array],
    "business_id": [r.business_id for r in reviews_array],
    "review_content": [r.content for r in reviews_array],
    "review_rating": [r.rating for r in reviews_array],
}

# One row per review.
df = pd.DataFrame(data=d)

In [4]:
# Derive the 'review_tokens' column from the raw review text in three passes:
#   1. filter_pos   - retains only adjectives and adverbs
#   2. process_text - makes lowercase, removes punctuation and stopwords,
#                     and lemmatizes the remaining words
#   3. remove_nt    - removes the word 'nt'
df['review_tokens'] = (
    df['review_content']
    .apply(filter_pos)
    .apply(process_text)
    .apply(remove_nt)
)

In [5]:
# Token extraction.
# NOTE(review): 500 presumably caps how many frequent tokens get_frequent
# returns — confirm against pain_points.get_frequent.
review_tokens = df['review_tokens']
most_freq = get_frequent(review_tokens, 500)

# Narrow the frequent tokens down to the negative ones.
neg_corp = get_negative_tokens(most_freq)

In [6]:
# The return value is unused, so this helper presumably adds its match columns
# to df in place — TODO confirm in pain_points.
create_token_match_columns(neg_corp, df)

# Build the per-token frame, order it by 'df_len' then 'token' (both
# descending), and renumber the rows. reset_index() keeps the previous index
# as an extra column rather than discarding it.
token_df = (
    process_token_df(neg_corp, df)
    .sort_values(['df_len', 'token'], ascending=False)
    .reset_index()
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  token_df['neg_sentence'] = token_df['review_content'].apply(lambda x: get_neg_sentence(neg_token_list[index], x))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  token_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  token_df['df_len'] = len(token_df)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the ca

In [7]:
# Remove the columns labelled by neg_corp, the two index columns left over
# from earlier index resets, and the full review text.
token_df = (
    token_df
    .drop(neg_corp, axis=1)
    .drop(['level_0', 'index', 'review_content'], axis=1)
)

In [8]:
# Preview the first rows of token_df after the helper columns were dropped.
token_df.head()

Unnamed: 0,timestamp,source_id,business_id,review_rating,review_tokens,neg_sentence,df_len,token
0,2018-08-06,Yelp,SimpangAsia,5,first everlasting Indonesia Malaysia happy yum...,It's insanely spicy! I felt the fire in my mo...,48,bad
1,2017-10-01,Yelp,SimpangAsia,5,favorite first new subsequent locality bad eas...,Let's start with the bad: parking,48,bad
2,2017-12-07,Yelp,SimpangAsia,4,great thorough tender bad fast hot enough odd ...,The bad side is that some dishes came out ver...,48,bad
3,2018-11-07,Yelp,SimpangAsia,1,worst ever bad thorough mild extra hot true le...,very bad services and food is not good,48,bad
4,2016-06-11,Yelp,SimpangAsia,1,absolute horrible sure easier bland reasonable...,"It was so bad, he filed a report with the LA ...",48,bad


In [9]:
# Let us try to use Topic Modelling

In [11]:
# Dependencies for the topic-modelling experiment.
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
# numpy is already imported at the top of the notebook; this re-import is a no-op.
import numpy as np
# Seed NumPy's global random number generator so randomised steps repeat across runs.
np.random.seed(2018)
import nltk
# Make sure the WordNet corpus is available locally (downloads only if missing).
nltk.download('wordnet')

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\iechi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
# Pull out the 'neg_sentence' column as the text input for topic modelling.
topic = token_df['neg_sentence']

In [37]:
# Wrap the Series in a single-column DataFrame whose column is 'neg_sentence'.
topic_df = topic.to_frame(name='neg_sentence')

In [47]:
def process_sentence(x):
    """Turn one sentence into a list of processed tokens.

    Passes ``x`` through ``process_text`` and splits what comes back with a
    bare ``.split()`` call.

    Parameters
    ----------
    x
        The sentence to process; handed to ``process_text`` unchanged.
        (Presumably a str — confirm against ``preprocess.process_text``.)

    Returns
    -------
    The result of calling ``.split()`` on the output of ``process_text``
    (a list of whitespace-separated tokens when that output is a str).
    """
    cleaned = process_text(x)
    tokens = cleaned.split()
    return tokens

In [50]:
# Tokenise every negative sentence; each row becomes the result of
# process_sentence for that sentence.
neg_sentences = topic_df['neg_sentence']
processed_docs = neg_sentences.map(process_sentence)

In [51]:
# Display the tokenised sentences (one list of tokens per row).
processed_docs

0      [insane, spicy, feel, fire, mouth, till, next,...
1                                [let, start, bad, park]
2             [bad, side, dish, come, fast, hot, enough]
3                         [bad, service, food, thorough]
4      [bad, file, report, la, department, public, he...
5      [bad, eventual, live, culver, three, year, try...
6      [know, anythe, yelp, review, primary, goal, si...
7                            [kid, hungry, grubhub, bad]
8                                        [e, teler, bad]
9            [bad, service, horrible, operation, period]
10     [high, rave, yelp, fail, take, consideration, ...
11     [one, bad, change, cant, order, nasi, goreng, ...
12     [omg, three, hour, subsequent, give, true, bad...
13                             [way, slow, bad, service]
14                                           [bad, Asia]
15     [somethe, say, almost, drop, fork, spoon, take...
16     [fourth, food, awful, get, date, padang, sioma...
17                        [thor