In [2]:
# import libraries
import pandas as pd

In [3]:
# Load the raw Zomato listings; the file is expected next to the notebook.
ZOMATO_CSV = 'zomato.csv'
data = pd.read_csv(ZOMATO_CSV)

Understanding Data

In [4]:
# Quick structural overview of the raw frame. A row of asterisks is printed
# after each report so the individual sections are easy to tell apart.
SEPARATOR = '*' * 10

print(data.head())
print(SEPARATOR)
print(data.shape)
print(SEPARATOR)
print(data.columns)
print(SEPARATOR)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in str() printed a stray "None" line.
data.info()
print(SEPARATOR)
print(data.describe())
print(SEPARATOR)
print(data.isnull().sum())
print(SEPARATOR)


                                                 url  \
0  https://www.zomato.com/bangalore/jalsa-banasha...   
1  https://www.zomato.com/bangalore/spice-elephan...   
2  https://www.zomato.com/SanchurroBangalore?cont...   
3  https://www.zomato.com/bangalore/addhuri-udupi...   
4  https://www.zomato.com/bangalore/grand-village...   

                                             address                   name  \
0  942, 21st Main Road, 2nd Stage, Banashankari, ...                  Jalsa   
1  2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...         Spice Elephant   
2  1112, Next to KIMS Medical College, 17th Cross...        San Churro Cafe   
3  1st Floor, Annakuteera, 3rd Stage, Banashankar...  Addhuri Udupi Bhojana   
4  10, 3rd Floor, Lakshmi Associates, Gandhi Baza...          Grand Village   

  online_order book_table   rate  votes                             phone  \
0          Yes        Yes  4.1/5    775    080 42297555\r\n+91 9743772233   
1          Yes         No  4.1/5  

In [5]:
# Inspect the distinct values of each categorical column to judge whether it
# needs cleaning. Columns are reported in the order listed here.
CATEGORICAL_COLUMNS = (
    'name',
    'online_order',
    'book_table',
    'location',
    'listed_in(type)',
    'listed_in(city)',
    'rest_type',
    'dish_liked',
    'cuisines',
    'reviews_list',
    'menu_item',
)

for column in CATEGORICAL_COLUMNS:
    print(data[column].value_counts())
    print("*" * 10)

Cafe Coffee Day             96
Onesta                      85
Just Bake                   73
Empire Restaurant           71
Five Star Chicken           70
                            ..
Ghar Ka Healthy Khana        1
Svadu Sweets & Savouries     1
Mum?s Kitchen                1
The Cakesplorer              1
Foodbook Restaurant          1
Name: name, Length: 8792, dtype: int64
**********
Yes    30444
No     21273
Name: online_order, dtype: int64
**********
No     45268
Yes     6449
Name: book_table, dtype: int64
**********
BTM                      5124
HSR                      2523
Koramangala 5th Block    2504
JP Nagar                 2235
Whitefield               2144
                         ... 
West Bangalore              6
Yelahanka                   6
Jakkur                      3
Rajarajeshwari Nagar        2
Peenya                      1
Name: location, Length: 93, dtype: int64
**********
Delivery              25942
Dine-out              17779
Desserts               3593
Cafes

Basic inferences from data:
-   `online_order` and `book_table` are binary (Yes/No) columns
-   `rate` and `approx_cost(for two people)` are stored as text (object dtype), so their numeric values need to be extracted before they can be analysed

Data Cleaning

In [10]:
# Baseline before cleaning: frame dimensions, then the number of missing values per column.
missing_per_column = data.isnull().sum()
print(data.shape, missing_per_column, sep='\n')

(51717, 17)
url                                0
address                            0
name                               0
online_order                       0
book_table                         0
rate                            7775
votes                              0
phone                           1208
location                          21
rest_type                        227
dish_liked                     28078
cuisines                          45
approx_cost(for two people)      346
reviews_list                       0
menu_item                          0
listed_in(type)                    0
listed_in(city)                    0
dtype: int64


In [55]:
# online_order and book_table are already known to be non-null and binary, so
# the next column to clean is `rate`: check whether every non-null value
# follows a single pattern and pull the numeric rating out of it.
import re
from util import utility
from importlib import reload
reload(utility)  # re-import the helper module so recent edits to it take effect

# Accepts 4.1/5, 4.1 /5, 4.1/ 5 and 4.1 / 5; group(1) captures the rating.
# The optional whitespace is spelled out in the pattern itself so the spaced
# variants are covered here rather than depending on the helpers to
# normalise the input first.
RATE_PATTERN = re.compile(r'([1-5]\.[0-9])\s*/\s*5')


def _rate_value(raw):
    """Return the rating captured by RATE_PATTERN as text, or '0.0' when `raw` does not match."""
    if utility.matches_pattern(RATE_PATTERN, raw):
        return utility.extract_from_pattern(RATE_PATTERN, raw).group(1)
    return '0.0'


# Flag, row by row, whether the raw value follows the rate pattern.
rate_pattern_matches = data['rate'].apply(lambda x: utility.matches_pattern(RATE_PATTERN, x))
# How many rows follow the pattern and how many do not.
print(rate_pattern_matches.value_counts())
# Which other (non-null) values occur in the rows that do not follow it.
print(data[~rate_pattern_matches]['rate'].value_counts())

# New column holding just the numeric rating.
# NOTE(review): every unmatched row (missing values as well as the other
# values listed by the print above) is stored as 0.0, which later aggregates
# cannot tell apart from a genuine rating; consider NaN instead.
data['rate_out_of_5'] = data['rate'].apply(_rate_value).astype(float)

# The total row count must be preserved.
print(data['rate_out_of_5'].count())
# The number of 0.0 rows must equal the number of rows that did not follow the pattern.
print(data[data['rate_out_of_5'] == 0.0]['rate_out_of_5'].count())

True     41665
False    10052
Name: rate, dtype: int64
NEW    2208
-        69
Name: rate, dtype: int64
51717
10052
