# Groupon

## Imports and Packages

In [4]:
import datetime 
from dateutil.relativedelta import *
import os
import re 
import string
import nltk
import pandas as pd 
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))

import matplotlib.pyplot as plt 

from collections import OrderedDict
import langdetect 
from langdetect import DetectorFactory , detect_langs # for dermining language  
DetectorFactory.seed = 0

from google_trans_new import google_translator  # translating words
translator = google_translator()  

from sklearn.feature_extraction.text import TfidfVectorizer

from wordcloud import WordCloud 
%matplotlib inline

## Data Loading

In [5]:
# Read each store's Groupon reviews from its CSV file into its own pandas DataFrame
bjs, sams = (
    pd.read_csv(csv_name)
    for csv_name in ("Groupon_BJs.csv", "Groupon_SamsClub.csv")
)

In [6]:
# Tag every review with its source store and keep its row index within that store's frame
bjs = bjs.assign(store="BJs", store_index=bjs.index)
sams = sams.assign(store="Sams", store_index=sams.index)

In [7]:
# Stack BJ's and Sam's reviews into one DataFrame with a fresh 0..n-1 index
groupon = pd.concat(
    objs=(bjs, sams),
    ignore_index=True,
)

## Data Description

In [8]:
# Preview five random reviews; the fixed random_state keeps the preview identical across re-runs.
# To preview a single store instead, filter first, e.g.
#   groupon.loc[groupon.store == "BJs"].sample(5, random_state=0)
#   groupon.loc[groupon.store == "Sams"].sample(5, random_state=0)
groupon.sample(5, random_state=0)

Unnamed: 0,name,text,rating,review_count,top_reviewer,helpful_reviewer,date_published,date_scraped,store,store_index
23379,Teresa,I love shopping at Sam’s club. Thank you Group...,5,0,0,0,02/24/2018,04/07/2021 09:54:09,Sams,18004
19511,Roxanne,"After some initial problems, got a lot of help...",5,0,0,0,10/19/2018,04/07/2021 09:30:17,Sams,14136
1418,Roxanne,Clean facilities great parking prices are reas...,4,0,1,0,04/20/2018,04/07/2021 13:18:12,BJs,1418
5291,Ken,Very knowledgeable and friendly lady who worke...,5,0,0,0,12/16/2013,04/07/2021 13:42:42,BJs,5291
5314,Martha,I love the promotional $25 dollar gift card wi...,5,0,0,0,12/16/2013,04/07/2021 13:42:49,BJs,5314


In [9]:
# Column dtypes and non-null counts for the combined frame.
# Per-store variants, kept for reference:
# groupon.loc[groupon.store=="BJs",].info()
# groupon.loc[groupon.store=="Sams",].info()
groupon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33915 entries, 0 to 33914
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   name              33915 non-null  object
 1   text              33914 non-null  object
 2   rating            33915 non-null  int64 
 3   review_count      33915 non-null  int64 
 4   top_reviewer      33915 non-null  int64 
 5   helpful_reviewer  33915 non-null  int64 
 6   date_published    33915 non-null  object
 7   date_scraped      33915 non-null  object
 8   store             33915 non-null  object
 9   store_index       33915 non-null  int64 
dtypes: int64(5), object(5)
memory usage: 2.6+ MB


In [10]:
# Summary statistics of the combined frame, transposed so each column becomes a row.
# For a single store, run the same call on groupon.loc[groupon.store == "BJs"]
# or groupon.loc[groupon.store == "Sams"].
groupon.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rating,33915.0,3.828483,1.635537,0.0,2.0,5.0,5.0,5.0
review_count,33915.0,0.002595,0.182518,0.0,0.0,0.0,0.0,28.0
top_reviewer,33915.0,0.274687,0.446363,0.0,0.0,0.0,1.0,1.0
helpful_reviewer,33915.0,0.094354,0.292324,0.0,0.0,0.0,0.0,1.0
store_index,33915.0,12433.853899,8683.066342,0.0,4239.0,11582.0,20060.5,28539.0


## Null Values

In [11]:
# Count how many review texts are missing versus present
groupon["text"].isna().value_counts()

False    33914
True         1
Name: text, dtype: int64

There is one null value; let's find it.

In [12]:
# Show the row(s) whose review text is missing
groupon[groupon["text"].isna()]

Unnamed: 0,name,text,rating,review_count,top_reviewer,helpful_reviewer,date_published,date_scraped,store,store_index
8262,Christine,,5,0,0,0,11/19/2020,04/07/2021 01:12:05,Sams,2887


Let's replace the NaN value with a single-space string `" "`.

In [13]:
# Fill the missing review text with a single space so every row holds a string
groupon["text"] = groupon["text"].fillna(" ")

# Confirm that no missing review texts remain
groupon["text"].isna().value_counts()

False    33915
Name: text, dtype: int64

## Exploratory Data Analysis

In [14]:
def plot_rating(df, name):
    """Print how many ratings ``df`` holds and bar-plot their percentage distribution.

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews to summarise; must expose a ``rating`` column.
    name : str
        Label of the review subset, interpolated into the plot title.

    Returns
    -------
    None
        The function only prints the count and draws the bar chart.
    """
    # count() is the number of ratings; sum() would add up the scores themselves
    print(f'Number of Reviews Ratings are:\t{df.rating.count()}')

    # Share of each rating score in percent, ordered by score
    rating_pct = df.rating.value_counts(normalize=True).sort_index() * 100

    rating_pct.plot(kind='bar')
    plt.title(f'Rating Frequency of {name}')
    plt.xlabel('Rating Score')
    plt.xticks(rotation=0)  # keep the score labels horizontal
    plt.ylabel('Percent (%)')

In [125]:
# Rating distribution across both stores
plot_rating(df=groupon, name="All Groupon Reviews")

In [126]:
# Rating distribution for BJ's reviews only
plot_rating(groupon[groupon["store"] == "BJs"], "BJ's Reviews")

In [127]:
# Rating distribution for Sam's reviews only
plot_rating(groupon[groupon["store"] == "Sams"], "Sam's Reviews")

In [16]:
# Word count of each review: number of whitespace-separated tokens in its text
groupon["review_len"] = [len(review.split()) for review in groupon["text"]]

In [27]:
# Mean number of words per review, overall and for each store
avg_review_len_all = groupon.review_len.mean()
avg_review_len_bjs = groupon.loc[groupon.store == "BJs", "review_len"].mean()
avg_review_len_sams = groupon.loc[groupon.store == "Sams", "review_len"].mean()

# Each line names the subset whose average it reports
print(f"Average word count per review for all reviews:\t{avg_review_len_all}")
print(f"Average word count per review for BJ's reviews:\t{avg_review_len_bjs}")
print(f"Average word count per review for Sam's reviews:\t{avg_review_len_sams}")

Average word count per review for all reviews:	19.04390387734041
Average word count per review for all reviews:	15.445581395348837
Average word count per review for all reviews:	19.721583742116326


## Natural Language Processing