# Combining and Cleaning our Business Information
This notebook combines the checkin, tip and business data into a single table.
The second part of the notebook cleans the postcodes in the table.

In [1]:
import os
import sys
import pandas as pd
import numpy as np
module_path = os.path.abspath(os.path.join('../../data/..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
%matplotlib inline

In [2]:
#Full Check in data, separate hours
df = pd.read_csv("../../data/raw/yelp_checkin.csv") 

#group into businesses and add to make total number of checkins for each business
check = df.groupby("business_id")['checkins'].sum().sort_values(ascending=False)

Total check in counts for each business:

In [1]:
checkins = pd.DataFrame(check) #read in csv file

checkins.reset_index(level = 0, inplace = True)
checkins.head()

NameError: name 'pd' is not defined

Combine with our business table

In [4]:
biz = pd.read_csv("../../data/raw/yelp_business.csv") #read in FULL business file
biz = biz.filter(['business_id','name','city','state','stars', 'review_count', 'is_open', 'postal_code',\
                     'categories']) #filter relevant columns

In [5]:
#merges tables together, 'NaN' where there was no match for business_id.
bizWithCheckins = biz.merge(checkins,on='business_id', how = 'outer')
#We replace NaN with 0, because we know that NaN arises because businesses with 0 checkins do not appear in the checkin.csv
bizWithCheckins['checkins'] = bizWithCheckins['checkins'].fillna(value = 0)

In [6]:
bizWithCheckins.head(10)

Unnamed: 0,business_id,name,city,state,stars,review_count,is_open,postal_code,categories,checkins
0,FYWN1wneV18bWNgQjJ2GNg,"""Dental by Design""",Ahwatukee,AZ,4.0,22,1,85044,Dentists;General Dentistry;Health & Medical;Or...,39.0
1,He-G7vWjzVUysIKrfNbPUQ,"""Stephen Szabo Salon""",McMurray,PA,3.0,11,1,15317,Hair Stylists;Hair Salons;Men's Hair Salons;Bl...,15.0
2,KQPW8lFf1y5BT2MxiSZ3QA,"""Western Motor Vehicle""",Phoenix,AZ,1.5,18,1,85017,Departments of Motor Vehicles;Public Services ...,6.0
3,8DShNS-LuFqpEWIp0HxijA,"""Sports Authority""",Tempe,AZ,3.0,9,0,85282,Sporting Goods;Shopping,120.0
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",Cuyahoga Falls,OH,3.5,116,1,44221,American (New);Nightlife;Bars;Sandwiches;Ameri...,263.0
5,o9eMRCWt5PkpLDE0gOPtcQ,"""Messina""",Stuttgart,BW,4.0,5,1,70567,Italian;Restaurants,1.0
6,kCoE3jvEtg6UVz5SOD3GVw,"""BDJ Realty""",Las Vegas,NV,4.0,5,1,89128,Real Estate Services;Real Estate;Home Services...,0.0
7,OD2hnuuTJI9uotcKycxg1A,"""Soccer Zone""",Las Vegas,NV,1.5,9,1,89128,Shopping;Sporting Goods,27.0
8,EsMcGiZaQuG1OOvL9iUFug,"""Any Given Sundae""",Wexford,PA,5.0,15,1,15090,Coffee & Tea;Ice Cream & Frozen Yogurt;Food,15.0
9,TGWhGNusxyMaA4kQVBNeew,"""Detailing Gone Mobile""",Henderson,NV,5.0,7,1,89014,Automotive;Auto Detailing,1.0


Adding tip count totals

In [7]:
tips = pd.read_csv("../../data/raw/yelp_tip.csv")

In [8]:
tipCountGroup = tips.groupby("business_id").size().sort_values(ascending=False)
tipCountDf = pd.DataFrame(tipCountGroup)
tipCountDf.head()

Unnamed: 0_level_0,0
business_id,Unnamed: 1_level_1
FaHADZARwnY4yvlvpnsfGA,3517
JmI9nslLD7KZqRr__Bg6NQ,2382
DkYS3arLOhA8si5uUEmHOw,1474
5LNZ67Yw9RD6nf4_UhXOjw,1436
K7lWdNUhCbcnEvI0NhGewg,1346


In [9]:
tipCountDf.reset_index(level = 0, drop = False, inplace = True)
tipCountDf.columns = ['business_id', 'tipcount']
tipCountDf.head()

Unnamed: 0,business_id,tipcount
0,FaHADZARwnY4yvlvpnsfGA,3517
1,JmI9nslLD7KZqRr__Bg6NQ,2382
2,DkYS3arLOhA8si5uUEmHOw,1474
3,5LNZ67Yw9RD6nf4_UhXOjw,1436
4,K7lWdNUhCbcnEvI0NhGewg,1346


Final table detailing all checkin and tip info in one table:

In [10]:
bizComplete = bizWithCheckins.merge(tipCountDf, on = 'business_id', how = 'outer')
bizComplete['tipcount'] = bizComplete['tipcount'].fillna(0)
bizComplete.head()

Unnamed: 0,business_id,name,city,state,stars,review_count,is_open,postal_code,categories,checkins,tipcount
0,FYWN1wneV18bWNgQjJ2GNg,"""Dental by Design""",Ahwatukee,AZ,4.0,22,1,85044,Dentists;General Dentistry;Health & Medical;Or...,39.0,5.0
1,He-G7vWjzVUysIKrfNbPUQ,"""Stephen Szabo Salon""",McMurray,PA,3.0,11,1,15317,Hair Stylists;Hair Salons;Men's Hair Salons;Bl...,15.0,1.0
2,KQPW8lFf1y5BT2MxiSZ3QA,"""Western Motor Vehicle""",Phoenix,AZ,1.5,18,1,85017,Departments of Motor Vehicles;Public Services ...,6.0,0.0
3,8DShNS-LuFqpEWIp0HxijA,"""Sports Authority""",Tempe,AZ,3.0,9,0,85282,Sporting Goods;Shopping,120.0,3.0
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",Cuyahoga Falls,OH,3.5,116,1,44221,American (New);Nightlife;Bars;Sandwiches;Ameri...,263.0,17.0


# Postal Code Processing

In [11]:
ZIPmask = bizComplete['postal_code'].notnull()
bizComplete[ZIPmask].count()

business_id     173944
name            173944
city            173943
state           173943
stars           173944
review_count    173944
is_open         173944
postal_code     173944
categories      173944
checkins        173944
tipcount        173944
dtype: int64

Function to clean zip codes like Canadian ones. If a postal code is made up of 2 separated strings, the first string is returned.

In [12]:
def cleanZip(string):
    if string is None:
        return None
    stringList = string.split(" ")
    return stringList[0]
    

In [13]:
#received float errors for 'string.split("") in the cleanZip function.
bizComplete['postal_code'] = bizComplete['postal_code'].astype('str')

In [14]:
bizComplete['postal_code'] = bizComplete['postal_code'].apply(cleanZip)
bizComplete.head(10)

Unnamed: 0,business_id,name,city,state,stars,review_count,is_open,postal_code,categories,checkins,tipcount
0,FYWN1wneV18bWNgQjJ2GNg,"""Dental by Design""",Ahwatukee,AZ,4.0,22,1,85044,Dentists;General Dentistry;Health & Medical;Or...,39.0,5.0
1,He-G7vWjzVUysIKrfNbPUQ,"""Stephen Szabo Salon""",McMurray,PA,3.0,11,1,15317,Hair Stylists;Hair Salons;Men's Hair Salons;Bl...,15.0,1.0
2,KQPW8lFf1y5BT2MxiSZ3QA,"""Western Motor Vehicle""",Phoenix,AZ,1.5,18,1,85017,Departments of Motor Vehicles;Public Services ...,6.0,0.0
3,8DShNS-LuFqpEWIp0HxijA,"""Sports Authority""",Tempe,AZ,3.0,9,0,85282,Sporting Goods;Shopping,120.0,3.0
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",Cuyahoga Falls,OH,3.5,116,1,44221,American (New);Nightlife;Bars;Sandwiches;Ameri...,263.0,17.0
5,o9eMRCWt5PkpLDE0gOPtcQ,"""Messina""",Stuttgart,BW,4.0,5,1,70567,Italian;Restaurants,1.0,1.0
6,kCoE3jvEtg6UVz5SOD3GVw,"""BDJ Realty""",Las Vegas,NV,4.0,5,1,89128,Real Estate Services;Real Estate;Home Services...,0.0,0.0
7,OD2hnuuTJI9uotcKycxg1A,"""Soccer Zone""",Las Vegas,NV,1.5,9,1,89128,Shopping;Sporting Goods,27.0,3.0
8,EsMcGiZaQuG1OOvL9iUFug,"""Any Given Sundae""",Wexford,PA,5.0,15,1,15090,Coffee & Tea;Ice Cream & Frozen Yogurt;Food,15.0,1.0
9,TGWhGNusxyMaA4kQVBNeew,"""Detailing Gone Mobile""",Henderson,NV,5.0,7,1,89014,Automotive;Auto Detailing,1.0,0.0


In [15]:
bizComplete.to_pickle("../../data/prep/businessPrepped.pkl")