In [1]:
import datetime 
from dateutil.relativedelta import *
import os
import re 
import string
import nltk
import pandas as pd 
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))

import matplotlib.pyplot as plt 

from collections import OrderedDict
import langdetect 
from langdetect import DetectorFactory , detect_langs # for dermining language  
DetectorFactory.seed = 0

from google_trans_new import google_translator  # translating words
translator = google_translator()  

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation


from wordcloud import WordCloud 
%matplotlib inline

In [2]:
'''Utility Functions'''
def concat_all_dataframes():
    ''' 
    Here we are going to combine all of the training csv's
    Into 1 dataframe that we can use for preprocessing
    
    adjust the path to whereever you are holding the data
    '''
#     path = os.path.join(os.getcwd(),'google_scraping','data')
    path = os.path.join(os.getcwd(),'Scrape_data')
    csvs = os.listdir(path)
    output = pd.DataFrame()
    
    count = 0
    for csv in csvs:
        if csv == '.ipynb_checkpoints':
            continue
        temp = pd.read_csv(os.path.join(path,csv))
        count += 1 
        
        if output.empty:
            output = temp        
        else:
            output = pd.concat([output,temp],axis=0)

    output.reset_index(inplace=True)        
    return output

def clean_translated_reviews(review):
    if 'Google' in review:
        res = review.split('(Original)')[0][23:]
        return res
    else:
        return review

def preprocessing_text(df):
    '''
    Here we are going to clean the text to be in a good format for further analysis 
    
    input: df ; DataFrame object that contains all of the reviews. 
    '''  
    lemm = WordNetLemmatizer()
    df = df.loc[~df.caption.isnull()] # here we are going to only process reviews 
    
    df.caption =  df.caption.apply(lambda review: clean_translated_reviews(review) if 'Google' in review else review)
        
    df.loc[:,'caption'] = df.loc[:,'caption'].apply(str.lower)
    df.loc[:,'caption'] = df.loc[:,'caption'].apply(lambda review: re.sub(r'[^\w\s]','',review)) # removes punctuation
    df.loc[:,'caption'] = df.loc[:,'caption'].apply(lambda review: re.sub('\([^()]*\)','',review)) # removes words within parenthesis 
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    
    df.loc[:,'caption'] = df.loc[:,'caption'].apply(lambda review: emoji_pattern.sub(r'', review)) 
    df.loc[:,'caption_split'] = df.loc[:,'caption'].apply(lambda review: review.split(' '))
    
    clean_reviews = []
    for review_idx , review in df.caption_split.items():
        temp = []
        for idx, word in enumerate(review):
            if word in stop_words:
                continue
            else:
                temp.append(lemm.lemmatize(word))
        
        review_  = ' '.join(temp)
        clean_reviews.append(review_)
    
    df.loc[:,'caption'] = clean_reviews
    df.drop(['Unnamed: 0'],axis=1,inplace=True)
    

    return df


def rating_by_loc(df):
    # here we are going to return the locations that the 
    temp = pd.DataFrame(df.groupby(['address']).rating.value_counts(normalize=True)).sort_index(ascending=True).unstack()
    temp.fillna(0,inplace=True)
    output = {}
    for row in temp.iterrows():
        output[row[0]] = row[1]['rating']
    
    return output
        
def plot_rating_by_loc(d,key):
    # given a key, we are going to be able to plot the rating distribution for that location.
    # the format of the key is 
    d[key].plot(kind='bar');
    plt.title(f'Distribution of ratings from BJs {key} Google Reviews');
    plt.xlabel('rating score');
    plt.xticks(rotation=0)
    plt.ylabel('percent');
    
 
    
def create_language_col(df):
    # Here are we going to make a column of the language of the name
    df['language'] = None 
    for row in df.iterrows():
#         print(row[0])
        if (row[0] in [150,415,987,1120,9780,28028,32711]):
            df.loc[row[0],'language'] = 'en'
        else:
            
            lang = detect_langs(row[1]['caption'].split('(Original) ')[-1]).pop().__repr__()[:2]
            df.loc[row[0],'language'] = lang

    return df


def creating_word_clouds(n_grams_from_reviews,review_score):
    d_n_grams = {gram:freq for gram,freq in zip(n_grams_from_reviews['n-gram'],n_grams_from_reviews['value_count'])}
    plt.figure(figsize=(10,8))
    wordcloud = WordCloud(width=1600, height=800,stopwords=stopwords, background_color="white").generate_from_frequencies(d_n_grams)
    # Display the generated image:
    # the matplotlib way:
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig(f'wordcloud{review_score}.png', facecolor='k', bbox_inches='tight')

    
def convert_relative_to_datetime(df):
    ''' Here we are going to convert relative dates given from google into estimated dates
    
    Input:
        df: The dataframe that is going to contain the relative and retrieval dates
    '''
    for idx , row in df.iterrows():
        print(idx)
        date_scraped = datetime.datetime.strptime(df.loc[idx,'retrieval_date'][:10],'%Y-%m-%d')
        
        relative_date = df.loc[idx,'relative_date'].split()

        if 'hour' in relative_date[1]:
                if relative_date[1] == 'hour':
                    res = date_scraped - relativedelta(hours=1)
                else:
                    res = date_scraped - relativedelta(hours=int(relative_date[0]))
                    
        if 'day' in relative_date[1]:
                if relative_date[1] == 'day':
                    res = date_scraped - relativedelta(days=1)
                else:
                    res = date_scraped - relativedelta(days=int(relative_date[0]))
                    
        if 'week' in relative_date[1]:
                if relative_date[1] == 'week':
                    res = date_scraped - relativedelta(weeks=1)
                else:
                    res = date_scraped - relativedelta(weeks=int(relative_date[0]))
                    
        if 'month' in relative_date[1]:
                if relative_date[1] == 'month':
                    res = date_scraped - relativedelta(months=1)
                else:
                    res = date_scraped - relativedelta(months=int(relative_date[0]))
        elif 'year' in relative_date[1]:
            if relative_date[1] == 'year':
                res = date_scraped - relativedelta(years=1)
            else:
                res = date_scraped - relativedelta(years=int(relative_date[0]))

        df.loc[idx,'relative_date'] = res
        
    return df 


def save_ppl_data(df):
    
    res = df[['relative_date','username','address']]
    res.to_csv('data_for_george.csv')

def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

def ngrams_and_topic_modeling(df,rating_score,return_tfidf = False):
    ''' Here we are going to make a function that print out the topics and the ngrams 
        of a given model based on 
    '''
    # Here we are going to be initializing the vectorizers 
    tfidf_ngram = TfidfVectorizer(max_features=50,ngram_range=(3,5),stop_words='english')
    tfidf_lda = TfidfVectorizer(max_features=50,stop_words='english')
    
    # Here we are going to be creating our df by rewview rating score 
    df_score = df.loc[df['star rating'] == rating_score,'caption']
    df_score_copy = df_score.copy(deep=True)
    
    df_score_transformed = tfidf_ngram.fit_transform(df_score)
    df_score_copy_transformed = tfidf_lda.fit_transform(df_score_copy)

    # getting ngrams and sorting in descending order
    tfidf_series = pd.Series(tfidf_ngram.vocabulary_)
    tfidf_series=tfidf_series.to_frame()
    tfidf_series['n-gram'] = tfidf_series.index
    tfidf_series.index = range(len(tfidf_series.index))
    tfidf_series.rename({0:'value_count'},axis=1,inplace=True)

    top_n_grams = tfidf_series.loc[~tfidf_series['n-gram'].str.contains('google')].sort_values('n-gram',ascending = False)
    print('done with ngrams')
    print('starting lda...')
    # Next we are going to do the Topic Modelling.
#     search_params = {'n_components': list(range(3,8)), 'learning_decay': [.5, .7, .9],'random_state':[23]}
    lda = LatentDirichletAllocation(n_components=3,learning_decay=.5,random_state=23)
#     grid = GridSearchCV(lda,param_grid=search_params,n_jobs=3)
    
    lda.fit(df_score_copy_transformed)
    
    tfidf_lda = pd.DataFrame(df_score_copy_transformed.toarray(),columns=tfidf_lda.get_feature_names())
    topics = display_topics(lda,tfidf_lda.columns,10)
    
    return top_n_grams , topics

In [3]:
walmart = pd.read_csv('Scrape_data/walmart_reviews_all.csv')
walmart.rename(columns={'review':'caption'},inplace=True)
walmart = preprocessing_text(walmart)

In [20]:
walmart.loc[walmart.caption.str.contains('english') == True]

Unnamed: 0,star rating,name,caption,date,location_code,caption_split
362,2,David R.F,first walmart see staff dont speak english sho...,2021-02-01,1.0,"[first, walmart, i, see, where, some, staff, d..."
365,5,J P,nobody speaks word english,2021-05-01,1.0,"[nobody, speaks, a, word, of, english]"
381,3,richard kosta,could find many employee speaking english coul...,2021-02-01,1.0,"[could, not, find, many, employees, speaking, ..."
395,3,Pamela Watson,happy could purchase grocery everything else n...,2021-05-01,1.0,"[was, happy, i, could, purchase, groceries, an..."
416,4,Orkhan Rza,one thing like wallmart parking space store go...,2021-05-01,1.0,"[one, thing, i, like, about, this, wallmart, i..."
...,...,...,...,...,...,...
230778,5,Henry Morin,excellent walmart clean organized well staffe...,2021-05-01,180.0,"[excellent, walmart, very, clean, and, organiz..."
230781,5,james nicolo,nice go one english spoken,2021-05-01,180.0,"[so, nice, to, go, to, one, where, english, is..."
230841,3,Richard Hard,looked help walmart couldnt find single person...,2021-02-01,180.0,"[i, looked, for, help, at, this, walmart, and,..."
231515,5,james nicolo,nice go one english spoken,2021-05-01,180.0,"[so, nice, to, go, to, one, where, english, is..."


In [4]:
ngrams_walmart_5 , topics_5 = ngrams_and_topic_modeling(walmart,rating_score=5)

done with ngrams
starting lda...


In [9]:
ngrams_walmart_5

Unnamed: 0,value_count,n-gram
26,49,walmart super center
16,48,walmart great price
27,47,walmart great place
45,46,walmart good price
11,45,walmart favorite store
43,44,walmart clean organized
44,43,store great price
39,42,store friendly staff
48,41,store clean stocked
42,40,store clean organized


In [10]:
topics_5

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights
0,great,8563.0,good,7019.5,store,6334.8
1,walmart,6845.9,love,6050.8,clean,4728.2
2,price,4611.5,need,5462.8,shopping,3193.4
3,place,4259.4,service,3818.2,helpful,3080.5
4,shop,3104.0,walmart,2626.0,friendly,3062.9
5,best,2812.4,needed,2287.5,staff,2899.7
6,nice,2446.7,customer,2213.4,stocked,2211.0
7,like,2132.9,looking,1753.2,employee,2077.4
8,time,2126.6,excellent,1430.2,easy,1768.2
9,people,1937.6,help,1322.0,nice,1758.9


In [5]:
ngrams_walmart_4 , topics_4 = ngrams_and_topic_modeling(walmart,rating_score=4)

done with ngrams
starting lda...


In [11]:
ngrams_walmart_4

Unnamed: 0,value_count,n-gram
26,49,walmart super center
39,48,walmart great price
31,47,walmart great place
15,46,walmart good price
37,45,usually im looking
30,44,use self checkout
38,43,store friendly staff
0,42,store clean stocked
27,41,store clean staff
23,40,store clean organized


In [12]:
topics_4

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights
0,walmart,5872.4,good,3130.1,need,2842.7
1,price,2530.7,nice,2249.8,staff,1645.8
2,place,2151.1,great,1951.9,like,1556.1
3,time,1458.9,item,1658.1,shopping,1544.9
4,good,1373.8,clean,1635.5,stocked,1500.1
5,shop,1304.7,service,1621.5,helpful,1498.1
6,store,1222.3,store,1354.2,friendly,1442.5
7,line,1217.9,needed,1323.3,better,1288.5
8,thing,1215.7,selection,1287.4,love,1201.5
9,checkout,1000.1,busy,1256.9,store,1174.2


In [6]:
ngrams_walmart_3 , topics_3 = ngrams_and_topic_modeling(walmart,rating_score=3)

done with ngrams
starting lda...


In [13]:
ngrams_walmart_3

Unnamed: 0,value_count,n-gram
19,49,walmart super center
0,48,walmart good price
25,47,walmart dont expect
34,46,walmart customer service
4,45,use self checkout
8,44,use self check
40,43,staff friendly helpful
38,42,self checkout register
20,41,self checkout open
1,40,self checkout line


In [14]:
topics_3

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights
0,walmart,4931.8,need,1591.2,good,1724.8
1,store,1837.4,line,1510.8,customer,1120.9
2,item,1311.2,like,1302.4,service,1092.6
3,shelf,1268.4,cashier,1194.1,staff,1024.5
4,price,1073.0,employee,1191.0,place,1023.3
5,thing,916.9,dont,1077.7,lot,1013.6
6,stocked,844.0,open,1049.7,crowded,896.4
7,time,766.8,long,1036.3,clean,840.7
8,selection,658.6,register,898.5,busy,823.6
9,help,625.0,check,867.8,nice,757.4


In [7]:
ngrams_walmart_2 , topics_2 = ngrams_and_topic_modeling(walmart,rating_score=2)

done with ngrams
starting lda...


In [15]:
ngrams_walmart_2

Unnamed: 0,value_count,n-gram
7,49,worst walmart ive
24,48,waited 30 minute
38,47,waited 20 minute
35,46,use self checkout
48,45,use self check
5,44,terrible customer service
18,43,self checkout register
21,42,self checkout open
15,41,self checkout line
26,40,register open self


In [16]:
topics_2

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights
0,walmart,1736.7,line,1061.9,store,1281.6
1,shelf,954.7,people,890.7,customer,881.0
2,time,817.2,cashier,828.3,service,803.2
3,item,724.9,open,770.9,stock,620.0
4,help,634.4,dont,756.7,staff,563.3
5,good,528.1,long,674.8,employee,538.5
6,stocked,500.1,register,666.4,lot,516.9
7,thing,473.4,place,634.7,dirty,408.4
8,price,469.6,check,596.4,crowded,404.5
9,rude,463.8,need,586.5,cart,375.6


In [8]:
ngrams_walmart_1 , topics_1 = ngrams_and_topic_modeling(walmart,rating_score=1)

done with ngrams
starting lda...


In [17]:
ngrams_walmart_1

Unnamed: 0,value_count,n-gram
5,49,worst walmart ive
10,48,worst customer service
38,47,went customer service
43,46,walmart customer service
17,45,waited 30 minute
34,44,waited 20 minute
46,43,wait 20 minute
19,42,use self checkout
32,41,use self check
12,40,terrible customer service


In [18]:
topics_1

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights
0,walmart,3574.1,customer,2467.4,employee,2152.7
1,store,1448.7,service,2405.0,store,1628.3
2,shelf,1350.6,line,1479.8,dont,1625.3
3,item,1336.1,people,1109.5,rude,1515.6
4,time,1239.1,horrible,1076.8,open,1433.4
5,went,1206.9,need,1041.6,cashier,1414.0
6,hour,978.7,minute,1015.6,place,1381.0
7,like,958.5,long,993.3,register,1218.1
8,worst,848.3,phone,922.7,check,1036.5
9,shopping,814.6,going,798.6,staff,961.7
