In [1]:
import os
import sys
import pandas as pd
import numpy as np
module_path = os.path.abspath(os.path.join('../../data/..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
%matplotlib inline

In [2]:
#Full Check in data, separate hours
df = pd.read_csv("../../data/raw/yelp_checkin.csv") 

#group into businesses and add to make total number of checkins for each business
check = df.groupby("business_id")['checkins'].sum().sort_values(ascending=False)

In [3]:
checkins = pd.DataFrame(check) #read in csv file

checkins.reset_index(level = 0, inplace = True)
checkins


Unnamed: 0,business_id,checkins
0,FaHADZARwnY4yvlvpnsfGA,131958
1,JmI9nslLD7KZqRr__Bg6NQ,112590
2,yQab5dxZzgBLTEHCw9V7_w,49934
3,5LNZ67Yw9RD6nf4_UhXOjw,43995
4,SMPbvZLSMMb7KU76YNYMGg,32603
5,IZivKqtHyz4-ts8KsnvMrA,32393
6,Wxxvi3LZbHNIDwJ-ZimtnA,30583
7,na4Th5DrNauOv-c43QQFvA,29271
8,El4FC8jcawUVgw_0EIcbaQ,28272
9,VyjyHoBg3KC5BSFRlD0ZPQ,27306


In [4]:

biz = pd.read_csv("../../data/raw/yelp_business.csv") #read in FULL business file
biz = biz.filter(['business_id','name','city' 'stars', 'review_count', 'is_open', 'postal_code',\
                     'categories']) #filter relevant columns



In [5]:

#merges tables together, 'NaN' where there was no match for business_id.
bizWithCheckins = biz.merge(checkins,on='business_id', how = 'outer')
#We replace NaN with 0, because we know that NaN arises because businesses with 0 checkins do not appear in the checkin.csv
bizWithCheckins['checkins'] = bizWithCheckins['checkins'].fillna(value = 0)


In [6]:
bizWithCheckins.head(10)

Unnamed: 0,business_id,name,review_count,is_open,postal_code,categories,checkins
0,FYWN1wneV18bWNgQjJ2GNg,"""Dental by Design""",22,1,85044,Dentists;General Dentistry;Health & Medical;Or...,39.0
1,He-G7vWjzVUysIKrfNbPUQ,"""Stephen Szabo Salon""",11,1,15317,Hair Stylists;Hair Salons;Men's Hair Salons;Bl...,15.0
2,KQPW8lFf1y5BT2MxiSZ3QA,"""Western Motor Vehicle""",18,1,85017,Departments of Motor Vehicles;Public Services ...,6.0
3,8DShNS-LuFqpEWIp0HxijA,"""Sports Authority""",9,0,85282,Sporting Goods;Shopping,120.0
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",116,1,44221,American (New);Nightlife;Bars;Sandwiches;Ameri...,263.0
5,o9eMRCWt5PkpLDE0gOPtcQ,"""Messina""",5,1,70567,Italian;Restaurants,1.0
6,kCoE3jvEtg6UVz5SOD3GVw,"""BDJ Realty""",5,1,89128,Real Estate Services;Real Estate;Home Services...,0.0
7,OD2hnuuTJI9uotcKycxg1A,"""Soccer Zone""",9,1,89128,Shopping;Sporting Goods,27.0
8,EsMcGiZaQuG1OOvL9iUFug,"""Any Given Sundae""",15,1,15090,Coffee & Tea;Ice Cream & Frozen Yogurt;Food,15.0
9,TGWhGNusxyMaA4kQVBNeew,"""Detailing Gone Mobile""",7,1,89014,Automotive;Auto Detailing,1.0


 ### Adding tip count totals
 - Read in CSV file
 - Read in filtered business file (only useful info)
 - Find the businesses that have no tips
 - Set their tip total to 0
 - Combine the business info and tip totals to one dataframe

In [11]:
tips = pd.read_csv("../../data/raw/yelp_tip.csv")

In [21]:
tipCountGroup = tips.groupby("business_id").size().sort_values(ascending=False)
tipCountDf = pd.DataFrame(tipCountGroup)
tipCountDf.head()

Unnamed: 0_level_0,0
business_id,Unnamed: 1_level_1
FaHADZARwnY4yvlvpnsfGA,3517
JmI9nslLD7KZqRr__Bg6NQ,2382
DkYS3arLOhA8si5uUEmHOw,1474
5LNZ67Yw9RD6nf4_UhXOjw,1436
K7lWdNUhCbcnEvI0NhGewg,1346


In [22]:
tipCountDf.reset_index(level = 0, drop = False, inplace = True)
tipCountDf.columns = ['business_id', 'tipcount']
tipCountDf.head()

Unnamed: 0,business_id,tipcount
0,FaHADZARwnY4yvlvpnsfGA,3517
1,JmI9nslLD7KZqRr__Bg6NQ,2382
2,DkYS3arLOhA8si5uUEmHOw,1474
3,5LNZ67Yw9RD6nf4_UhXOjw,1436
4,K7lWdNUhCbcnEvI0NhGewg,1346


In [25]:
bizComplete = bizWithCheckins.merge(tipCountDf, on = 'business_id', how = 'outer')
bizComplete['tipcount'] = bizComplete['tipcount'].fillna(0)
bizComplete.head()

Unnamed: 0,business_id,name,review_count,is_open,postal_code,categories,checkins,tipcount
0,FYWN1wneV18bWNgQjJ2GNg,"""Dental by Design""",22,1,85044,Dentists;General Dentistry;Health & Medical;Or...,39.0,5.0
1,He-G7vWjzVUysIKrfNbPUQ,"""Stephen Szabo Salon""",11,1,15317,Hair Stylists;Hair Salons;Men's Hair Salons;Bl...,15.0,1.0
2,KQPW8lFf1y5BT2MxiSZ3QA,"""Western Motor Vehicle""",18,1,85017,Departments of Motor Vehicles;Public Services ...,6.0,0.0
3,8DShNS-LuFqpEWIp0HxijA,"""Sports Authority""",9,0,85282,Sporting Goods;Shopping,120.0,3.0
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",116,1,44221,American (New);Nightlife;Bars;Sandwiches;Ameri...,263.0,17.0


# Postal Code Processing

In [26]:
ZIPmask = bizComplete['postal_code'].notnull()
bizComplete[ZIPmask].count()

business_id     174567
name            174567
review_count    174567
is_open         174567
postal_code     174567
categories      174567
checkins        174567
tipcount        174567
dtype: int64

Function to clean zip codes like Canadian ones. If a postal code is made up of 2 separated strings, the first string is returned.

In [27]:
def cleanZip(string):
    if string is None:
        return None
    stringList = string.split(" ")
    return stringList[0]
    

In [28]:
#received float errors for 'string.split("") in the cleanZip function.
bizComplete['postal_code'] = bizComplete['postal_code'].astype('str')

In [30]:
bizComplete['postal_code'] = bizComplete['postal_code'].apply(cleanZip)
bizComplete.head(10)

Unnamed: 0,business_id,name,review_count,is_open,postal_code,categories,checkins,tipcount
0,FYWN1wneV18bWNgQjJ2GNg,"""Dental by Design""",22,1,85044,Dentists;General Dentistry;Health & Medical;Or...,39.0,5.0
1,He-G7vWjzVUysIKrfNbPUQ,"""Stephen Szabo Salon""",11,1,15317,Hair Stylists;Hair Salons;Men's Hair Salons;Bl...,15.0,1.0
2,KQPW8lFf1y5BT2MxiSZ3QA,"""Western Motor Vehicle""",18,1,85017,Departments of Motor Vehicles;Public Services ...,6.0,0.0
3,8DShNS-LuFqpEWIp0HxijA,"""Sports Authority""",9,0,85282,Sporting Goods;Shopping,120.0,3.0
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",116,1,44221,American (New);Nightlife;Bars;Sandwiches;Ameri...,263.0,17.0
5,o9eMRCWt5PkpLDE0gOPtcQ,"""Messina""",5,1,70567,Italian;Restaurants,1.0,1.0
6,kCoE3jvEtg6UVz5SOD3GVw,"""BDJ Realty""",5,1,89128,Real Estate Services;Real Estate;Home Services...,0.0,0.0
7,OD2hnuuTJI9uotcKycxg1A,"""Soccer Zone""",9,1,89128,Shopping;Sporting Goods,27.0,3.0
8,EsMcGiZaQuG1OOvL9iUFug,"""Any Given Sundae""",15,1,15090,Coffee & Tea;Ice Cream & Frozen Yogurt;Food,15.0,1.0
9,TGWhGNusxyMaA4kQVBNeew,"""Detailing Gone Mobile""",7,1,89014,Automotive;Auto Detailing,1.0,0.0
