# Cleaning the Data

In [1]:
# Import libraries used for cleaning
import pandas as pd
import numpy as np


In [2]:
# Convert json file to a csv file

business_json_path = 'Data/yelp_business.json'

business_df = pd.read_json(business_json_path, lines=True)
business_df

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,3388 Gateway Blvd,Edmonton,AB,T6J 5H2,53.468419,-113.492054,3.0,13,1,"{'ByAppointmentOnly': 'False', 'RestaurantsPri...","Nail Salons, Beauty & Spas","{'Monday': '10:0-19:30', 'Tuesday': '10:0-19:3..."
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,2813 Bransford Ave,Nashville,TN,37204,36.115118,-86.766925,4.0,5,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Pets, Nurseries & Gardening, Pet Stores, Hobby...","{'Monday': '9:30-17:30', 'Tuesday': '9:30-17:3..."
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,"6020 E 82nd St, Ste 46",Indianapolis,IN,46250,39.908707,-86.065088,3.5,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Shopping, Jewelry, Piercing, Toy Stores, Beaut...",
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,2472 Troy Rd,Edwardsville,IL,62025,38.782351,-89.950558,4.0,24,1,"{'BusinessParking': '{'garage': False, 'street...","Fitness/Exercise Equipment, Eyewear & Optician...","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ..."


In [3]:
#### General Cleaning#####
# Initial shape of data
print("Shape of data initially " + str(business_df.shape))

# Number of null values
print("This is the total number of null values in the dataframe " + str(business_df.isna().sum().sum()))

# Drop rows where restaraunt is not open
business_df = business_df[business_df.is_open != 0]

# Drop column 'is_open' as not needed anymore
business_df = business_df.drop(['is_open'], axis=1)

# Drop rows that have that do not have proper star rating and have "yes"
business_df = business_df[business_df['stars'] != 'Yes']
# Filter catagories that contain "Restaurants" and #Food
business_df = business_df[business_df['categories'].str.contains('Restaurants|Food', case =False, na=False)]



print("New shape of data " + str(business_df.shape))

Shape of data initially (150346, 14)
This is the total number of null values in the dataframe 37070
New shape of data (44582, 13)


In [4]:
# Create column that counts the number of null values in each row
business_df["Null Count"] = business_df.isnull().sum(axis=1)
business_df

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories,hours,Null Count
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",0
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2...",0
5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,"{'BusinessParking': 'None', 'BusinessAcceptsCr...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '...",0
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.768170,1.5,10,"{'RestaurantsAttire': ''casual'', 'Restaurants...","Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-21:0', '...",0
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.456320,4.0,10,"{'Alcohol': ''none'', 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","{'Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150325,l9eLGG9ZKpLJzboZq-9LRQ,Wawa,19 N Bishop Ave,Clifton Heights,PA,19018,39.925656,-75.310344,3.0,11,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Restaurants, Sandwiches, Convenience Stores, C...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",0
150327,cM6V90ExQD6KMSU3rRB5ZA,Dutch Bros Coffee,1181 N Milwaukee St,Boise,ID,83704,43.615401,-116.284689,4.0,33,"{'WiFi': ''free'', 'RestaurantsGoodForGroups':...","Cafes, Juice Bars & Smoothies, Coffee & Tea, R...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-17:0', '...",0
150328,1jx1sfgjgVg0nM6n3p0xWA,Savaya Coffee Market,11177 N Oracle Rd,Oro Valley,AZ,85737,32.409552,-110.943073,4.5,41,"{'BusinessParking': '{'garage': False, 'street...","Specialty Food, Food, Coffee & Tea, Coffee Roa...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-14:0', '...",0
150336,WnT9NIzQgLlILjPT0kEcsQ,Adelita Taqueria & Restaurant,1108 S 9th St,Philadelphia,PA,19147,39.935982,-75.158665,4.5,35,"{'WheelchairAccessible': 'False', 'Restaurants...","Restaurants, Mexican","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",0


In [5]:
# Separate the days of the week open from the hours column
business_df = pd.concat([business_df, business_df["hours"].apply(pd.Series)], axis=1)
business_df

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,categories,hours,Null Count,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",0,7:0-20:0,7:0-20:0,7:0-20:0,7:0-20:0,7:0-21:0,7:0-21:0,7:0-21:0
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,...,"Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2...",0,,,14:0-22:0,16:0-22:0,12:0-22:0,12:0-22:0,12:0-18:0
5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,...,"Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '...",0,0:0-0:0,6:0-22:0,6:0-22:0,6:0-22:0,9:0-0:0,9:0-22:0,8:0-22:0
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.768170,1.5,10,...,"Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-21:0', '...",0,0:0-0:0,6:0-21:0,6:0-21:0,6:0-16:0,6:0-16:0,6:0-17:0,6:0-21:0
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.456320,4.0,10,...,"Vietnamese, Food, Restaurants, Food Trucks","{'Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",0,11:0-14:0,11:0-14:0,11:0-14:0,11:0-14:0,11:0-14:0,5:0-10:0,15:0-18:0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150325,l9eLGG9ZKpLJzboZq-9LRQ,Wawa,19 N Bishop Ave,Clifton Heights,PA,19018,39.925656,-75.310344,3.0,11,...,"Restaurants, Sandwiches, Convenience Stores, C...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",0,0:0-0:0,0:0-0:0,0:0-0:0,0:0-0:0,0:0-0:0,0:0-0:0,0:0-0:0
150327,cM6V90ExQD6KMSU3rRB5ZA,Dutch Bros Coffee,1181 N Milwaukee St,Boise,ID,83704,43.615401,-116.284689,4.0,33,...,"Cafes, Juice Bars & Smoothies, Coffee & Tea, R...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-17:0', '...",0,0:0-0:0,0:0-17:0,0:0-0:0,0:0-0:0,0:0-0:0,0:0-0:0,0:0-0:0
150328,1jx1sfgjgVg0nM6n3p0xWA,Savaya Coffee Market,11177 N Oracle Rd,Oro Valley,AZ,85737,32.409552,-110.943073,4.5,41,...,"Specialty Food, Food, Coffee & Tea, Coffee Roa...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-14:0', '...",0,0:0-0:0,6:0-14:0,6:0-14:0,6:0-14:0,6:0-14:0,6:0-15:0,6:0-15:0
150336,WnT9NIzQgLlILjPT0kEcsQ,Adelita Taqueria & Restaurant,1108 S 9th St,Philadelphia,PA,19147,39.935982,-75.158665,4.5,35,...,"Restaurants, Mexican","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",0,11:0-22:0,11:0-22:0,,11:0-22:0,11:0-2:0,11:0-2:0,11:0-22:0


In [6]:
# Replace all NaN values in all the day columns to have 0 to represent not being open that day 
business_df[['Monday','Tuesday','Wednesday','Thursday', 'Friday', 'Saturday','Sunday']] = business_df[['Monday','Tuesday','Wednesday','Thursday', 'Friday', 'Saturday','Sunday']].replace(np.nan,0)

In [7]:
# Function created to check if restaraunt is open on the weekend
def open_weekend_conditions(x):
    if (x['Saturday'] != 0) & (x['Sunday'] != 0):
        return 1
    else:
        return 0
# Apply Function to new column open_weekend in the dataframe
business_df["open_weekends"] = business_df.apply(open_weekend_conditions,axis=1)
business_df

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,hours,Null Count,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,open_weekends
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,"{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",0,7:0-20:0,7:0-20:0,7:0-20:0,7:0-20:0,7:0-21:0,7:0-21:0,7:0-21:0,1
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,...,"{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2...",0,0,0,14:0-22:0,16:0-22:0,12:0-22:0,12:0-22:0,12:0-18:0,1
5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,...,"{'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '...",0,0:0-0:0,6:0-22:0,6:0-22:0,6:0-22:0,9:0-0:0,9:0-22:0,8:0-22:0,1
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.768170,1.5,10,...,"{'Monday': '0:0-0:0', 'Tuesday': '6:0-21:0', '...",0,0:0-0:0,6:0-21:0,6:0-21:0,6:0-16:0,6:0-16:0,6:0-17:0,6:0-21:0,1
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.456320,4.0,10,...,"{'Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",0,11:0-14:0,11:0-14:0,11:0-14:0,11:0-14:0,11:0-14:0,5:0-10:0,15:0-18:0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150325,l9eLGG9ZKpLJzboZq-9LRQ,Wawa,19 N Bishop Ave,Clifton Heights,PA,19018,39.925656,-75.310344,3.0,11,...,"{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",0,0:0-0:0,0:0-0:0,0:0-0:0,0:0-0:0,0:0-0:0,0:0-0:0,0:0-0:0,1
150327,cM6V90ExQD6KMSU3rRB5ZA,Dutch Bros Coffee,1181 N Milwaukee St,Boise,ID,83704,43.615401,-116.284689,4.0,33,...,"{'Monday': '0:0-0:0', 'Tuesday': '0:0-17:0', '...",0,0:0-0:0,0:0-17:0,0:0-0:0,0:0-0:0,0:0-0:0,0:0-0:0,0:0-0:0,1
150328,1jx1sfgjgVg0nM6n3p0xWA,Savaya Coffee Market,11177 N Oracle Rd,Oro Valley,AZ,85737,32.409552,-110.943073,4.5,41,...,"{'Monday': '0:0-0:0', 'Tuesday': '6:0-14:0', '...",0,0:0-0:0,6:0-14:0,6:0-14:0,6:0-14:0,6:0-14:0,6:0-15:0,6:0-15:0,1
150336,WnT9NIzQgLlILjPT0kEcsQ,Adelita Taqueria & Restaurant,1108 S 9th St,Philadelphia,PA,19147,39.935982,-75.158665,4.5,35,...,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",0,11:0-22:0,11:0-22:0,0,11:0-22:0,11:0-2:0,11:0-2:0,11:0-22:0,1


In [8]:
# Find out if restaraunt is open monday and then drop columns that are not needed anymore
# Replace all null values in Monday to be 0 
business_df['Monday'] = business_df['Monday'].replace(np.nan,0)
# change 0:0 -0:0 to 0s 
# Function to convert monday 0 or 0:0-0:0 values to 0 and the rest be 1 to represent open.
business_df['Monday'] = business_df['Monday'].replace('0:0-0:0',0,regex=True)

# Function to convert open to 1 and 0 for restaraunts that are not open on monday
def open_on_monday(x):
    if (x['Monday'] != 0):
        return 1
    else:
        return 0

# Apply the function to create open_Monday column
business_df["open_monday"] = business_df.apply(open_on_monday,axis=1)

# Drop days of the week columns not needed anymore 
business_df = business_df.drop(business_df.iloc[:,14:21],axis=1)

# Drop hours column as not needed anymore
business_df = business_df.drop('hours', axis=1)

In [9]:
# Extract the attributes column to its own column
business_df = pd.concat([business_df, business_df["attributes"].apply(pd.Series)], axis=1)
business_df.head()
# Make all attributes columns that are nan to 0 
business_df= business_df.fillna(0)

# Make the 'None' type values into 0 throughout the dataframe

# Replace String using apply() function with lambda.
business_df = business_df.apply(lambda x: x.replace({"none": 0, "None": 0,"False": 0,"u'none'": 0," u 'no'": 0,'no': 0,}, regex=True))
business_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,BestNights,BYOB,Corkage,BYOBCorkage,AcceptsInsurance,RestaurantsCounterService,Open24Hours,AgesAllowed,DietaryRestrictions,HairSpecializesIn
3,MTSW4McQd7CbVtyjqoe9mw,0,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,0,0,0,0,0,0,0,0,0,0
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,...,0,0,0,0,0,0,0,0,0,0
5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,...,0,0,0,0,0,0,0,0,0,0
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.76817,1.5,10,...,0,0,0,0,0,0,0,0,0,0
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Columns that can be taken out of dataFrame AgesAllowed HairSpecializesIn AcceptsInsurance BYOBCorkage GoodForDancing BestNights BYOB ByAppointmentOnly BusinessParking RestaurantsAttire Ambience CoatCheck GoodForMeal BusinessAcceptsBitcoin 
business_df = business_df.drop(['AgesAllowed','DietaryRestrictions','HairSpecializesIn', 'Music','AcceptsInsurance','Open24Hours','BYOBCorkage', 'Corkage','GoodForDancing','BestNights','BYOB','ByAppointmentOnly','BusinessParking','RestaurantsAttire','Ambience', 'CoatCheck', 'GoodForMeal', 'BusinessAcceptsBitcoin','RestaurantsCounterService'], axis = 1)

# Clean Wifi column
# Function to change having wifi to 1 and rest to 0
def clean_wifi(x):
    if (x['WiFi'] != 0):
        return 1
    else:
        return 0
business_df["WiFi"] = business_df.apply(clean_wifi,axis=1)

# Drop the attribiutes colums as all data is extracted from it 
business_df = business_df.drop(columns= 'attributes')

### clean the u before the string

In [12]:
# Function that will clean the 'u before data values that is messing up for making into ordinal values
#business_df[['NoiseLevel','Alcohol','Smoking']] = business_df[['NoiseLevel','Alcohol','Smoking']].str.replace("^u", "")
#business_df[['NoiseLevel','Alcohol','Smoking']] = business_df[['NoiseLevel','Alcohol','Smoking']].str.replace("\'", "")


In [13]:
# Remove the starting u', ', and the ending ' for NoiseLevel
business_df['NoiseLevel'] = business_df['NoiseLevel'].str.replace("^u", "")
business_df['NoiseLevel'] = business_df['NoiseLevel'].str.replace("\'", "")
business_df['NoiseLevel'].value_counts()

  business_df['NoiseLevel'] = business_df['NoiseLevel'].str.replace("^u", "")


average      16905
quiet         4001
loud          1660
very_loud      506
Name: NoiseLevel, dtype: int64

In [14]:
# Remove the starting u', ', and the ending ' for Smoking
business_df['Smoking'] = business_df['Smoking'].str.replace("^u", "")
business_df['Smoking'] = business_df['Smoking'].str.replace("\'", "")
business_df['Smoking'].value_counts()

  business_df['Smoking'] = business_df['Smoking'].str.replace("^u", "")


outdoor    1065
yes         130
Name: Smoking, dtype: int64

In [15]:
# Remove the starting u', ', and the ending ' for Alcohol
business_df['Alcohol'] = business_df['Alcohol'].str.replace("^u", "")
business_df['Alcohol'] = business_df['Alcohol'].str.replace("\'", "")
business_df['Alcohol'].value_counts()

  business_df['Alcohol'] = business_df['Alcohol'].str.replace("^u", "")


full_bar         8544
beer_and_wine    3827
Name: Alcohol, dtype: int64

In [16]:
# Simplify to- Change all "False" values to 0 and "True" values to 1       
business_df = business_df.replace({'False': '0', 'True': '1'})

In [17]:
## ordinal value converison work
###Make Noise Level column into ordinal values###
# change the null values to 0
business_df["NoiseLevel"] = business_df["NoiseLevel"].replace(np.nan,0)
# Encode NoiseLevel values in column
scale_mapper = {"quiet":1, "average":2, "loud":3, "very_loud":4}
business_df["NoiseLevel"] = business_df["NoiseLevel"].replace(scale_mapper)

## Make Alcohol column into ordinal values
business_df["Alcohol"] = business_df["Alcohol"].replace(np.nan,0)
alcohol_mapper = {"full_bar":1, "beer_and_wine":1}
business_df["Alcohol"] = business_df["Alcohol"].replace(alcohol_mapper)

## Clean smoking column into ordinal values
business_df["Smoking"] = business_df["Smoking"].replace(np.nan,0)
smoking_mapper = {"outdoor":1, "yes":1}
business_df["Smoking"] = business_df["Smoking"].replace(smoking_mapper)

In [18]:
business_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,GoodForKids,RestaurantsReservations,DogsAllowed,RestaurantsTableService,RestaurantsGoodForGroups,HasTV,HappyHour,DriveThru,NoiseLevel,Smoking
3,MTSW4McQd7CbVtyjqoe9mw,0,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,0,0,0,0,0,0,0,0,0,0
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,...,1,0,0,0,0,0,0,0,0,0
5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,...,1,0,0,0,1,1,0,1,0,0
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.76817,1.5,10,...,1,0,0,0,0,1,0,1,0,0
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,...,0,0,0,0,0,0,0,0,0,0


In [0]:
# Export Cleaned Data
business_df.to_csv('Data/yelp_business_cleaned.csv',
                    index = False)



In [0]:
import os
os.listdir('Data')

### Create ordinal/dummy variables of your columns:

Best Code Source for dummy variables: https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html

pd.dummy()

Overview of dummy variable concept: https://machinelearningmastery.com/one-hot-encoding-for-categorical-data/

Split out the columns of attributes and categories into dummy variables

In [0]:
#dummies = pd.get_dummies(business_df["state"])
#pd.concat([business_df, dummies])

In [0]:
business_df.head()

In [0]:
dummies = business_df['categories'].str.get_dummies(',')
business_df = pd.concat([business_df, dummies])
business_df

In [0]:
ethnic_food_category = ["American", "Vietnamese", "Korean", "Italian", "Mexican", "Japanese", "Chinese"]

In [0]:
business_df['Vietnamese'].value_counts()

In [0]:
business_df["ethnic food category"] = ""

business_df["B"][business_df['A'].str.contains("BULL")] = "Long"

In [0]:
# EDA
# Compare rating by food category (American, Mexican, Indian, etc) by state

In [0]:
n = int(input())
arr = list(map(int, input().split()))
print(arr)

In [0]:
# city, state to dummy variable

# change the true false to 0 and 1


# Try 3 algorithms: Linear regression, Naive Bayes, XGBoost


# 5 fold cross validation

# https://machinelearningmastery.com/k-fold-cross-validation/



<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=fcd92ccc-2af6-49a1-bc78-ae9f0a350628' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>