In [39]:
import numpy as np
import pandas as pd
from preprocess import filter_pos, process_text, remove_nt
from vader import get_sentiment
from pain_points import get_frequent, get_negative_tokens, create_token_match_columns, process_token_df

In [40]:
import os
from pymongo import MongoClient
from pycommon.warehouse.load_queries import acquire_all_review_data
import datetime

In [41]:
#port = os.getenv("MONGO_PORT") if os.getenv("MONGO_PORT") is not None else 27017 # MONGO_PORT defines the port number. 
port= 27017
mongo_client = MongoClient('localhost', port) # mongo is always the host. Again, docker handles this dns resolution.

# And we're good! mongo is ready to be used. Most of the methods in pycommon/warehouse need you to 
# pass in the mongoclient. 

reviews = acquire_all_review_data(
        mongo_client, 
        datetime.datetime(2001,12,1,0,0).timestamp(), # from
        datetime.datetime(2018,12,1,0,0).timestamp(), # to
        "SimpangAsia",
        "Yelp"
    )

reviews_array = []
for review in reviews:
    reviews_array.append(review)

acquire all review data with skip and limit types: <class 'NoneType'> <class 'NoneType'>


In [42]:
len(reviews_array)

1275

In [43]:
d = {
    "timestamp": [reviews_array[i].timestamp for i in range(0,len(reviews_array))],
    "source_id": [reviews_array[i].source_id for i in range(0,len(reviews_array))],
    "review_id": [reviews_array[i].review_id for i in range(0,len(reviews_array))],
    "business_id": [reviews_array[i].business_id for i in range(0,len(reviews_array))],
    "review_content": [reviews_array[i].content for i in range(0,len(reviews_array))],
    "review_rating": [reviews_array[i].rating for i in range(0,len(reviews_array))],
}

df = pd.DataFrame(data=d)

In [44]:
df.head()

Unnamed: 0,timestamp,source_id,review_id,business_id,review_content,review_rating
0,2018-11-26,Yelp,b654548d-4783-4013-b247-ff5b5a833e8f,SimpangAsia,"Sat 10:30a, no wait for 5, parking lot w valet...",5
1,2018-10-25,Yelp,4b385f99-0613-441f-9b7a-eeb996074c77,SimpangAsia,I've heard a lot of good things about Simpang ...,5
2,2018-11-14,Yelp,35957733-3885-4542-894a-bc506111fdf7,SimpangAsia,Great and simple restaurant in West LA. The am...,4
3,2018-11-02,Yelp,eeab5b21-2ab0-4819-a101-f11c2c15cb28,SimpangAsia,Indonesia goodness! The fish was extremely cri...,4
4,2018-09-16,Yelp,49b15113-6d94-4de5-a790-8e076ed7e6a4,SimpangAsia,Not many Indonesian restaurants near where we ...,4


In [45]:
# retains only adjectives and adverbs for reviews
df['review_tokens'] = df['review_content'].apply(filter_pos)
# Makes lowercase, removes punctuation and stopwords, and lemmatizes remaining words
df['review_tokens'] = df['review_tokens'].apply(process_text)
# removes the word 'nt'
df['review_tokens'] = df['review_tokens'].apply(remove_nt)

In [46]:
df.head()

Unnamed: 0,timestamp,source_id,review_id,business_id,review_content,review_rating,review_tokens
0,2018-11-26,Yelp,b654548d-4783-4013-b247-ff5b5a833e8f,SimpangAsia,"Sat 10:30a, no wait for 5, parking lot w valet...",5,thorough incorrect real garlic bagu deliciou a...
1,2018-10-25,Yelp,4b385f99-0613-441f-9b7a-eeb996074c77,SimpangAsia,I've heard a lot of good things about Simpang ...,5,thorough long crazy complimentary absolute fla...
2,2018-11-14,Yelp,35957733-3885-4542-894a-bc506111fdf7,SimpangAsia,Great and simple restaurant in West LA. The am...,4,great simple slow nice great flavorful Green_P...
3,2018-11-02,Yelp,eeab5b21-2ab0-4819-a101-f11c2c15cb28,SimpangAsia,Indonesia goodness! The fish was extremely cri...,4,high crispy great considerable also dice Green...
4,2018-09-16,Yelp,49b15113-6d94-4de5-a790-8e076ed7e6a4,SimpangAsia,Not many Indonesian restaurants near where we ...,4,many Indonesia sure west dry embarrassedly pop...


In [47]:
# getting tokens
most_freq = get_frequent(df['review_tokens'],500)
neg_corp = get_negative_tokens(most_freq)
len(neg_corp)

33

In [48]:
create_token_match_columns(neg_corp, df)

In [49]:
df.head()

Unnamed: 0,timestamp,source_id,review_id,business_id,review_content,review_rating,review_tokens,bad,hard,disappoint,...,sorry,annoy,worse,negative,hide,disgust,sick,hesitant,nasty,weak
0,2018-11-26,Yelp,b654548d-4783-4013-b247-ff5b5a833e8f,SimpangAsia,"Sat 10:30a, no wait for 5, parking lot w valet...",5,thorough incorrect real garlic bagu deliciou a...,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2018-10-25,Yelp,4b385f99-0613-441f-9b7a-eeb996074c77,SimpangAsia,I've heard a lot of good things about Simpang ...,5,thorough long crazy complimentary absolute fla...,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2018-11-14,Yelp,35957733-3885-4542-894a-bc506111fdf7,SimpangAsia,Great and simple restaurant in West LA. The am...,4,great simple slow nice great flavorful Green_P...,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,2018-11-02,Yelp,eeab5b21-2ab0-4819-a101-f11c2c15cb28,SimpangAsia,Indonesia goodness! The fish was extremely cri...,4,high crispy great considerable also dice Green...,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2018-09-16,Yelp,49b15113-6d94-4de5-a790-8e076ed7e6a4,SimpangAsia,Not many Indonesian restaurants near where we ...,4,many Indonesia sure west dry embarrassedly pop...,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [50]:
token_df = process_token_df(neg_corp, df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  token_df['neg_sentence'] = token_df['review_content'].apply(lambda x: get_neg_sentence(neg_token_list[index], x))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  token_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  token_df['df_len'] = len(token_df)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the ca

In [51]:
token_df.sort_values(['df_len','token'], ascending = False, inplace=True)
token_df.reset_index(inplace=True)

In [52]:
# getting the top 10 tokens
# NOT USED
top_ten = token_df['token'].unique()[:10]
top_ten
top_eleven = token_df['token'].unique()[:11]
top_eleven

array(['bad', 'hard', 'disappoint', 'unfortunate', 'difficult',
       'terrible', 'weird', 'serious', 'poor', 'worst', 'tough'],
      dtype=object)

In [53]:
# getting the index number for the top 10 tokens up till the last evidence (neg_sentence)
# NOT USED
index = token_df[token_df['token'] == top_eleven[-1]].index.values[0]
index

215

In [54]:
# seeing the top 10 negative tokens and their evidences
# NOT USED
token_df[:index]

Unnamed: 0,level_0,index,timestamp,source_id,review_id,business_id,review_content,review_rating,review_tokens,bad,...,negative,hide,disgust,sick,hesitant,nasty,weak,neg_sentence,df_len,token
0,0,9,2018-08-06,Yelp,3b3854d0-957e-4f01-8ae8-98dfca70d307,SimpangAsia,OMG! First time in this place after searching ...,5,first everlasting Indonesia Malaysia happy yum...,True,...,False,False,False,False,False,False,False,It's insanely spicy! I felt the fire in my mo...,48,bad
1,1,51,2017-10-01,Yelp,17e90eec-b87e-466a-9a9d-cef9c1fb89e2,SimpangAsia,Simpang was my favorite discovery when I moved...,5,favorite first new subsequent locality bad eas...,True,...,False,False,False,False,False,False,False,Let's start with the bad: parking,48,bad
2,2,66,2017-12-07,Yelp,09748bb3-2e5e-4485-8b29-f756557fa222,SimpangAsia,Simpang Asia serves some great dishes. The goo...,4,great thorough tender bad fast hot enough odd ...,True,...,False,False,False,False,False,False,False,The bad side is that some dishes came out ver...,48,bad
3,3,69,2018-11-07,Yelp,71f4192f-65c9-4927-bb56-f77c153700bd,SimpangAsia,Worst place ever.. very bad services and food ...,1,worst ever bad thorough mild extra hot true le...,True,...,False,False,False,False,False,False,False,very bad services and food is not good,48,bad
4,4,205,2016-06-11,Yelp,f8b5e6c3-20e8-4b33-a91d-4e0b8075a82b,SimpangAsia,The service was absolutely horrible. I'm sure...,1,absolute horrible sure easier bland reasonable...,True,...,False,False,False,True,False,False,False,"It was so bad, he filed a report with the LA ...",48,bad
5,5,235,2018-03-06,Yelp,41af5769-bc4f-4ea0-a1f8-feaec25b458e,SimpangAsia,"Food is good with good price, very like tai fo...",4,thorough thorough bad eventual earlier,True,...,False,False,False,False,False,False,False,Too bad I finally have this after I lived in ...,48,bad
6,6,252,2018-04-08,Yelp,72e42ccc-0efd-44f8-a068-218bec430c60,SimpangAsia,If you know anything about my Yelp reviews - m...,5,primary bad legit favorite ruen flavorful auth...,True,...,False,False,False,False,False,False,False,If you know anything about my Yelp reviews - m...,48,bad
7,7,280,2018-06-16,Yelp,c654fd5e-7f52-4b9f-8784-2fd68d67ded8,SimpangAsia,Ordered from yelp . After 1 hour when I left t...,1,hungry bad,True,...,False,False,False,False,False,False,False,Kids are so hungry and Grubhub so bad,48,bad
8,8,338,2014-07-22,Yelp,3d9d0d16-b534-4baa-ae23-9635cfe83475,SimpangAsia,If you want to sample what Indonesian food tas...,4,Indonesia proper Indonesia Indonesia considera...,True,...,False,False,False,False,False,False,False,Their Es Teler is bad,48,bad
9,9,339,2016-10-21,Yelp,4d27c7bb-8dde-4e5f-82e0-9866ca1cb165,SimpangAsia,"Bad service, horrible operations, period.We go...",1,bad horrible waitres many sure subsequent subs...,True,...,False,False,False,False,False,False,False,"Bad service, horrible operations, period",48,bad


In [55]:
# from pycommon.warehouse.store_insights import store_painpoint_insights
# from pycommon.warehouse.objects import painpoint_object

# TODO: convert all the stuff above to a list of painpoint_object classes. Each list is one painpoint.

# NOTE: ching I changed it. Pls follow the format below:
# class painpoint_object:
#    def __init__ (self, title: str, description: str, solution: str, source_id: str, business_id: str, example_review_ids: List[str], timestamp: int):

# e.g. a painpoint object for the 'bad' reviews



In [67]:
evid= {}

evid[1094] = 'A'

In [68]:
evid

{1094: 'A'}

In [82]:
review_id_evidence = {}
review_ids = token_df[token_df['token'] == 'bad']['review_id']
for review_id in review_ids:

    evidence = list(token_df[(token_df['review_id'] == review_id) & (token_df['token'] == 'bad')]['neg_sentence'])
    review_id_evidence[review_id] = evidence

In [83]:
review_id_evidence

{'3b3854d0-957e-4f01-8ae8-98dfca70d307': [" It's insanely spicy! I felt the fire in my mouth till the next day that's how bad it was!!! But now I m ow for next time! Can't wait go back!!!"],
 '17e90eec-b87e-466a-9a9d-cef9c1fb89e2': ["Let's start with the bad: parking"],
 '09748bb3-2e5e-4485-8b29-f756557fa222': [' The bad side is that some dishes came out very fast and were not hot enough'],
 '71f4192f-65c9-4927-bb56-f77c153700bd': [' very bad services and food is not good'],
 'f8b5e6c3-20e8-4b33-a91d-4e0b8075a82b': [' It was so bad, he filed a report with the LA Department of Public Health'],
 '41af5769-bc4f-4ea0-a1f8-feaec25b458e': [' Too bad I finally have this after I lived in culver for three years, I should try it earlier'],
 '72e42ccc-0efd-44f8-a068-218bec430c60': ['If you know anything about my Yelp reviews - my primary goal for being on this site is to read all the bad reviews and respond to them in my review to assure you that this place is legit'],
 'c654fd5e-7f52-4b9f-8784-2

In [64]:
# final_tokens = list(pd.unique(token_df['token']))
# for token in final_tokens:
    
#     review_id_evidence = {}
    
#     review_ids = token_df[token_df['token'] == token]['review_id']
#     for review_id in review_ids:

#         evidence = list(token_df[(token_df['review_id'] == review_id) & (token_df['token'] == token)]['neg_sentence'])
#         review_id_evidence[review_id] = [evidence]

In [74]:
# moving the processed data into the dataframe
# ALL NEGATIVE TOKENS AND THEIR EVIDENCE WILL BE ADDED TO THE DATAFRAME; NOT JUST THE TOP 10 NEGATIVE TOKENS
# final_tokens = list(pd.unique(token_df['token']))
# list of painpoint_object objects
# all_objects = []

# for token in final_tokens:
#    review_id_evidence = {}
    
#    review_ids = token_df[token_df['token'] == token]['review_id']
#    for review_id in review_ids:

#        evidence = list(token_df[(token_df['review_id'] == review_id) & (token_df['token'] == token)]['neg_sentence'])
#        review_id_evidence[review_id] = [evidence]
    
    # creates a painpoint_object object
#    pain_point = painpoint_object(token, "Description", "Solution", "Yelp", "SimpangAsia", review_id_evidence, datetime.datetime(2018,12,1,0,0).timestamp())
#    all_objects.append(pain_point)
    
# adding to the db and checking if successful
#store_success = store_painpoint_insights(mongo_client, all_objects)
#if store_success:
#    print("yay its done")
#else:
#    print("riperdoodles")