In [1]:
import datetime 
from dateutil.relativedelta import *
import os
import re 
import string
import nltk
import pandas as pd 
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))

import matplotlib.pyplot as plt 

from collections import OrderedDict
import langdetect 
from langdetect import DetectorFactory , detect_langs # for dermining language  
DetectorFactory.seed = 0

from google_trans_new import google_translator  # translating words
translator = google_translator()  

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation


from wordcloud import WordCloud 
%matplotlib inline

In [2]:
'''Utility Functions'''
def concat_all_dataframes():
    ''' 
    Here we are going to combine all of the training csv's
    Into 1 dataframe that we can use for preprocessing
    
    adjust the path to whereever you are holding the data
    '''
#     path = os.path.join(os.getcwd(),'google_scraping','data')
    path = os.path.join(os.getcwd(),'Scrape_data')
    csvs = os.listdir(path)
    output = pd.DataFrame()
    
    count = 0
    for csv in csvs:
        if csv == '.ipynb_checkpoints':
            continue
        temp = pd.read_csv(os.path.join(path,csv))
        count += 1 
        
        if output.empty:
            output = temp        
        else:
            output = pd.concat([output,temp],axis=0)

    output.reset_index(inplace=True)        
    return output

def clean_translated_reviews(review):
    if 'Google' in review:
        res = review.split('(Original)')[0][23:]
        return res
    else:
        return review

def preprocessing_text(df):
    '''
    Here we are going to clean the text to be in a good format for further analysis 
    
    input: df ; DataFrame object that contains all of the reviews. 
    '''  
    lemm = WordNetLemmatizer()
    df = df.loc[~df.caption.isnull()] # here we are going to only process reviews 
    
    df.caption =  df.caption.apply(lambda review: clean_translated_reviews(review) if 'Google' in review else review)
        
    df.loc[:,'caption'] = df.loc[:,'caption'].apply(str.lower)
    df.loc[:,'caption'] = df.loc[:,'caption'].apply(lambda review: re.sub(r'[^\w\s]','',review)) # removes punctuation
    df.loc[:,'caption'] = df.loc[:,'caption'].apply(lambda review: re.sub('\([^()]*\)','',review)) # removes words within parenthesis 
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    
    df.loc[:,'caption'] = df.loc[:,'caption'].apply(lambda review: emoji_pattern.sub(r'', review)) 
    df.loc[:,'caption_split'] = df.loc[:,'caption'].apply(lambda review: review.split(' '))
    
    clean_reviews = []
    for review_idx , review in df.caption_split.items():
        temp = []
        for idx, word in enumerate(review):
            if word in stop_words:
                continue
            else:
                temp.append(lemm.lemmatize(word))
        
        review_  = ' '.join(temp)
        clean_reviews.append(review_)
    
    df.loc[:,'caption'] = clean_reviews
    df.drop(['Unnamed: 0'],axis=1,inplace=True)
    

    return df


def rating_by_loc(df):
    # here we are going to return the locations that the 
    temp = pd.DataFrame(df.groupby(['address']).rating.value_counts(normalize=True)).sort_index(ascending=True).unstack()
    temp.fillna(0,inplace=True)
    output = {}
    for row in temp.iterrows():
        output[row[0]] = row[1]['rating']
    
    return output
        
def plot_rating_by_loc(d,key):
    # given a key, we are going to be able to plot the rating distribution for that location.
    # the format of the key is 
    d[key].plot(kind='bar');
    plt.title(f'Distribution of ratings from BJs {key} Google Reviews');
    plt.xlabel('rating score');
    plt.xticks(rotation=0)
    plt.ylabel('percent');
    
 
    
def create_language_col(df):
    # Here are we going to make a column of the language of the name
    df['language'] = None 
    for row in df.iterrows():
#         print(row[0])
        if (row[0] in [150,415,987,1120,9780,28028,32711]):
            df.loc[row[0],'language'] = 'en'
        else:
            
            lang = detect_langs(row[1]['caption'].split('(Original) ')[-1]).pop().__repr__()[:2]
            df.loc[row[0],'language'] = lang

    return df


def creating_word_clouds(n_grams_from_reviews,review_score):
    d_n_grams = {gram:freq for gram,freq in zip(n_grams_from_reviews['n-gram'],n_grams_from_reviews['value_count'])}
    plt.figure(figsize=(10,8))
    wordcloud = WordCloud(width=1600, height=800,stopwords=stopwords, background_color="white").generate_from_frequencies(d_n_grams)
    # Display the generated image:
    # the matplotlib way:
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig(f'wordcloud{review_score}.png', facecolor='k', bbox_inches='tight')

    
def convert_relative_to_datetime(df):
    ''' Here we are going to convert relative dates given from google into estimated dates
    
    Input:
        df: The dataframe that is going to contain the relative and retrieval dates
    '''
    for idx , row in df.iterrows():
        print(idx)
        date_scraped = datetime.datetime.strptime(df.loc[idx,'retrieval_date'][:10],'%Y-%m-%d')
        
        relative_date = df.loc[idx,'relative_date'].split()

        if 'hour' in relative_date[1]:
                if relative_date[1] == 'hour':
                    res = date_scraped - relativedelta(hours=1)
                else:
                    res = date_scraped - relativedelta(hours=int(relative_date[0]))
                    
        if 'day' in relative_date[1]:
                if relative_date[1] == 'day':
                    res = date_scraped - relativedelta(days=1)
                else:
                    res = date_scraped - relativedelta(days=int(relative_date[0]))
                    
        if 'week' in relative_date[1]:
                if relative_date[1] == 'week':
                    res = date_scraped - relativedelta(weeks=1)
                else:
                    res = date_scraped - relativedelta(weeks=int(relative_date[0]))
                    
        if 'month' in relative_date[1]:
                if relative_date[1] == 'month':
                    res = date_scraped - relativedelta(months=1)
                else:
                    res = date_scraped - relativedelta(months=int(relative_date[0]))
        elif 'year' in relative_date[1]:
            if relative_date[1] == 'year':
                res = date_scraped - relativedelta(years=1)
            else:
                res = date_scraped - relativedelta(years=int(relative_date[0]))

        df.loc[idx,'relative_date'] = res
        
    return df 


def save_ppl_data(df):
    
    res = df[['relative_date','username','address']]
    res.to_csv('data_for_george.csv')

def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

def ngrams_and_topic_modeling(df,rating_score,return_tfidf = False):
    ''' Here we are going to make a function that print out the topics and the ngrams 
        of a given model based on 
    '''
    # Here we are going to be initializing the vectorizers 
    tfidf_ngram = TfidfVectorizer(max_features=50,ngram_range=(3,5),stop_words='english')
    tfidf_lda = TfidfVectorizer(max_features=50,stop_words='english')
    
    # Here we are going to be creating our df by rewview rating score 
    df_score = df.loc[df['star rating'] == rating_score,'caption']
    df_score_copy = df_score.copy(deep=True)
    
    df_score_transformed = tfidf_ngram.fit_transform(df_score)
    df_score_copy_transformed = tfidf_lda.fit_transform(df_score_copy)

    # getting ngrams and sorting in descending order
    tfidf_series = pd.Series(tfidf_ngram.vocabulary_)
    tfidf_series=tfidf_series.to_frame()
    tfidf_series['n-gram'] = tfidf_series.index
    tfidf_series.index = range(len(tfidf_series.index))
    tfidf_series.rename({0:'value_count'},axis=1,inplace=True)

    top_n_grams = tfidf_series.loc[~tfidf_series['n-gram'].str.contains('google')].sort_values('n-gram',ascending = False)
    print('done with ngrams')
    print('starting lda...')
    # Next we are going to do the Topic Modelling.
#     search_params = {'n_components': list(range(3,8)), 'learning_decay': [.5, .7, .9],'random_state':[23]}
    lda = LatentDirichletAllocation(n_components=3,learning_decay=.5,random_state=23)
#     grid = GridSearchCV(lda,param_grid=search_params,n_jobs=3)
    
    lda.fit(df_score_copy_transformed)
    
    tfidf_lda = pd.DataFrame(df_score_copy_transformed.toarray(),columns=tfidf_lda.get_feature_names())
    topics = display_topics(lda,tfidf_lda.columns,10)
    
    return top_n_grams , topics

In [3]:
samsclub = pd.read_csv('Scrape_data/samsclub_reviews_all.csv')
samsclub.rename(columns={'review':'caption'},inplace=True)
samsclub = preprocessing_text(samsclub)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.

In [5]:
ngrams_sams_5 , topics_5 = ngrams_and_topic_modeling(samsclub,rating_score=5)

done with ngrams
starting lda...


In [10]:
ngrams_sams_5

Unnamed: 0,value_count,n-gram
43,49,thank sam club
49,48,store clean organized
21,47,staff friendly helpful
11,46,shopping sam club
30,45,shop sam club
8,44,service great price
34,43,sam club need
44,42,sam club location
42,41,sam club great place
14,40,sam club great


In [11]:
topics_5

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights
0,store,1068.2,great,2234.2,good,1995.3
1,service,1021.0,price,1292.4,sam,1470.4
2,clean,950.8,place,968.9,club,1106.0
3,friendly,773.0,excellent,881.7,love,985.7
4,staff,702.0,need,660.1,nice,686.3
5,helpful,669.1,love,493.3,best,572.6
6,customer,511.3,experience,488.3,item,541.8
7,time,471.9,like,471.4,scan,537.6
8,shop,435.0,awesome,441.0,line,522.4
9,stocked,399.1,product,422.2,app,441.6


In [6]:
ngrams_sams_4 , topics_4 = ngrams_and_topic_modeling(samsclub,rating_score=4)

done with ngrams
starting lda...


In [12]:
ngrams_sams_4

Unnamed: 0,value_count,n-gram
11,49,visit sam club
6,48,use scan app
40,47,typical sam club
9,46,toilet paper paper towel
7,45,toilet paper paper
33,44,store friendly staff
49,43,store clean stocked
0,42,store clean organized
42,41,staff friendly helpful
13,40,shopping sam club


In [13]:
topics_4

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights
0,store,383.3,price,423.7,good,687.8
1,clean,303.0,need,290.9,sam,478.8
2,great,297.8,item,252.6,club,368.7
3,line,258.7,love,240.9,place,354.1
4,staff,217.8,selection,209.8,nice,280.7
5,time,216.6,great,187.4,service,247.9
6,friendly,211.6,lot,187.2,like,243.0
7,thing,194.3,product,182.2,shop,206.1
8,helpful,174.9,good,168.2,busy,185.2
9,experience,163.1,people,160.9,bulk,182.1


In [7]:
ngrams_sams_3 , topics_3 = ngrams_and_topic_modeling(samsclub,rating_score=3)

done with ngrams
starting lda...


In [14]:
ngrams_sams_3

Unnamed: 0,value_count,n-gram
6,49,went sam club
13,48,wait long line
8,47,typical sam club
37,46,towel toilet paper
46,45,toilet paper paper
26,44,standard sam club
43,43,smaller sam club
4,42,shopping sam club
24,41,service need work
21,40,self checkout line


In [15]:
topics_3

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights
0,good,181.3,product,87.9,sam,179.9
1,price,139.4,line,77.3,store,177.0
2,time,111.8,cashier,73.3,club,134.4
3,service,108.9,people,72.8,need,108.7
4,great,98.9,ok,72.4,like,99.6
5,thing,85.5,employee,67.4,item,93.4
6,customer,83.7,checkout,66.6,dont,89.8
7,went,61.4,lot,63.1,long,81.9
8,selection,56.9,busy,62.7,place,78.4
9,today,56.0,didnt,62.1,shopping,64.8


In [18]:
ngrams_sams_2 , topics_2 = ngrams_and_topic_modeling(samsclub,rating_score=2)

done with ngrams
starting lda...


In [19]:
ngrams_sams_2

Unnamed: 0,value_count,n-gram
29,49,worst sam club ive
1,48,worst sam club
16,47,worst customer service
45,46,working customer service
41,45,went sam club
13,44,waited 20 minute
38,43,wait long time
33,42,used sam club
9,41,took half hour
36,40,toilet paper paper towel


In [20]:
topics_2

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights
0,service,126.3,item,79.2,store,124.9
1,customer,115.6,dont,65.4,sam,119.1
2,tire,49.9,need,62.4,club,83.4
3,help,49.5,employee,61.2,line,61.3
4,minute,42.2,people,59.1,time,54.8
5,wait,41.8,price,58.8,check,52.7
6,today,41.2,cashier,46.4,membership,52.4
7,good,37.0,rude,44.5,long,52.0
8,told,35.2,lot,41.1,cart,41.1
9,went,34.7,didnt,40.1,staff,40.5


In [21]:
ngrams_sams_1 , topics_1 = ngrams_and_topic_modeling(samsclub,rating_score=1)

done with ngrams
starting lda...


In [22]:
ngrams_sams_1

Unnamed: 0,value_count,n-gram
41,49,worst sam club ive
30,48,worst sam club
23,47,worst customer service ive
13,46,worst customer service
33,45,worse customer service
6,44,went sam club
46,43,went customer service
45,42,waited 15 minute
35,41,tire battery department
42,40,tire battery center


In [23]:
topics_1

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights
0,store,270.4,sam,350.9,service,415.8
1,tire,242.0,club,293.3,customer,393.5
2,went,179.2,people,179.6,dont,210.3
3,item,154.4,membership,170.0,bad,165.5
4,hour,143.3,time,136.0,manager,151.5
5,told,139.8,staff,134.5,phone,131.6
6,minute,133.3,need,133.2,cashier,122.8
7,card,128.7,member,131.1,terrible,122.4
8,line,116.9,rude,129.4,check,116.4
9,shop,110.7,employee,122.5,like,115.2
