In [2]:
!pip install pandas numpy matplotlib seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

print(pd.__version__)
### Suppress warnings so that they do not clutter the cell outputs
from warnings import filterwarnings
filterwarnings('ignore')

1.5.3


In [None]:
# Reading restaurants data
data_path = 'zomato.csv'
from pandas import read_csv

In [None]:
# Load the restaurant listings from the CSV file referenced by data_path.
df = read_csv(data_path)

# Report the frame's dimensions, then preview its first rows.
dataset_shape = df.shape
print(f'Dataset shape: {dataset_shape}')
df.head()

#### An overview of the data

In [None]:
df.info()

In [None]:
df.isnull().sum()

#### Getting all features that contain NaN values

In [None]:
# Names of the columns that hold at least one missing value, in column order.
has_missing = df.isnull().any()
feature_na = has_missing[has_missing].index.tolist()
feature_na

In [None]:
# Percentage of missing values for every column listed in feature_na
import numpy as np

total_rows = len(df)
for feature in feature_na:
    missing_count = df[feature].isnull().sum()
    missing_pct = np.round(missing_count / total_rows * 100, 4)
    print('{} has {} % missing values'.format(feature, missing_pct))
    

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Derive the chart inputs from the data itself instead of hard-coding the
# column names and percentages, so the figure stays correct when the dataset
# (or its cleaning) changes.
missing_pct = (df.isnull().mean() * 100).round(4)
missing_pct = missing_pct[missing_pct > 0]  # keep only columns with gaps
features = missing_pct.index.tolist()
missing_values = missing_pct.tolist()

# Create horizontal bar chart
fig, ax = plt.subplots(figsize=(12, 10))
y_pos = np.arange(len(features))
ax.barh(y_pos, missing_values, align='center')
ax.set_yticks(y_pos)
ax.set_yticklabels(features)
ax.invert_yaxis()  # labels read top-to-bottom
ax.set_xlabel('Percentage of Missing Values')
ax.set_title('Missing Values by Feature')
plt.show()


### Preparing Approx_cost column

In [None]:
df['approx_cost(for two people)'].dtype

In [None]:
df[df['approx_cost(for two people)'].isnull()]

In [None]:
df['approx_cost(for two people)'].unique()

In [None]:
### The column holds NaN (a float) next to the price strings, and floats have no
### .replace, so every value is cast to str first; then the commas are removed.
cost_as_text = df['approx_cost(for two people)'].astype(str)
df['approx_cost(for two people)'] = cost_as_text.str.replace(',', '', regex=False)

In [None]:
df['approx_cost(for two people)']=df['approx_cost(for two people)'].astype(float)

In [None]:
df['approx_cost(for two people)'].dtype

### preparing rate_num col

In [None]:
df['rate'].unique()

In [None]:
df['rate'][0].split('/')[0]

In [None]:
def split(x):
    """Return the part of the string ``x`` that precedes its first '/'.

    When ``x`` contains no '/', the whole string is returned unchanged.
    """
    before_slash, _separator, _remainder = x.partition('/')
    return before_slash

In [None]:
df['rate'].dtype

In [None]:
df['rate'].isnull().sum()

In [None]:
### Missing ratings are NaN (a float) and have no .split, so every value is cast
### to str first; split() then keeps only the text before the '/'.
### Inline alternative: df['rate'].astype(str).apply(lambda x: x.split('/')[0])
rate_as_text = df['rate'].astype(str)
df['rate'] = rate_as_text.map(split)

In [None]:
# 'NEW' and '-' are placeholders rather than numeric ratings; map both to 0.
# The result is assigned back instead of calling replace(..., inplace=True) on
# df['rate']: an in-place call on a column selection is not guaranteed to write
# through to df (it does not under pandas Copy-on-Write).
df['rate'] = df['rate'].replace({'NEW': 0, '-': 0})

In [None]:
df['rate']=df['rate'].astype(str).astype(float)

In [None]:
df['rate'].dtype


#### How many types of restaurants do we have?

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the 20 most frequent restaurant types.
fig, ax = plt.subplots(figsize=(20, 12))
top_rest_types = df['rest_type'].value_counts().nlargest(20)
top_rest_types.plot.bar(color='red', ax=ax)

### restyle the tick labels on the x-axis
fig.autofmt_xdate()

In [None]:
df.columns

In [None]:
df['rest_type'].value_counts()

In [None]:
def mark(x):
    """Group the two restaurant types 'Quick Bites' and 'Casual Dining'.

    Returns 'Quick Bites + Casual Dining' when ``x`` equals either of those
    two values, and 'other' for anything else.
    """
    is_grouped_type = x == 'Quick Bites' or x == 'Casual Dining'
    return 'Quick Bites + Casual Dining' if is_grouped_type else 'other'
    
    

In [None]:
df['Top_types']=df['rest_type'].apply(mark)

In [None]:
df.head()

In [None]:
!pip install plotly

In [None]:
import plotly.express as px

# Number of restaurants per Top_types category; the index supplies the labels.
values = df['Top_types'].value_counts()
labels = values.index

In [None]:
# Pie chart of the Top_types counts held in `values`, labelled by `labels`.
fig = px.pie(
    data_frame=df,
    names=labels,
    values=values,
    title='Restaurants Pie chart',
)

fig.show()


In [None]:
### Almost 60 % of restaurants are of Casual Dining & Quick Bites

In [None]:
df.head()

### Top 5 Most Voted Restaurants

In [None]:
df.columns

In [None]:
df.dtypes

In [None]:
# One row per restaurant name: summed votes, number of listings (count of
# url), mean cost for two people and mean rating.
name_aggregations = {
    'votes': 'sum',
    'url': 'count',
    'approx_cost(for two people)': 'mean',
    'rate': 'mean',
}
rest = df.groupby('name').agg(name_aggregations).reset_index()
rest

In [None]:
rest.columns = ['name', 'total_votes', 'total_unities', 'avg_approx_cost', 'mean_rating']
rest.head()

In [None]:
rest['votes_per_unity'] = rest['total_votes'] / rest['total_unities']
rest.head()

In [None]:
popular=rest.sort_values(by='total_unities', ascending=False)
popular

In [None]:
popular['name'].nunique()

In [None]:
popular.shape

In [None]:
import seaborn as sns

# Three stacked panels: a headline number, the five names with the most votes
# and the five names with the fewest (non-zero) votes.
fig, (ax1,ax2,ax3) = plt.subplots(3,1, figsize=(20,30))
by_votes_desc = popular.sort_values(by='total_votes', ascending=False)

# Panel 1 - average number of votes, shown as text on a blank axis
average_votes = int(popular['total_votes'].mean())
ax1.text(0.50, 0.30, average_votes, fontsize=45, ha='center')
ax1.text(0.50, 0.12, 'is the average of votes', fontsize=12, ha='center')
ax1.text(0.50, 0.00, 'received by restaurants', fontsize=12, ha='center')
ax1.axis('off')

# Panel 2 - first five rows of the descending ranking
most_voted = by_votes_desc.head(5)
sns.barplot(data=most_voted, x='total_votes', y='name', palette='plasma', ax=ax2)
ax2.set_title('Top 5 Most Voted Restaurants', size=12)

# Panel 3 - last five rows of the ranking once zero-vote names are removed
least_voted = by_votes_desc.query('total_votes > 0').tail()
sns.barplot(data=least_voted, x='total_votes', y='name', palette='plasma_r', ax=ax3)
ax3.set_title('Top 5 Less Voted Restaurants\n(with at least 1 vote)', size=12)


In [None]:
popular.columns

In [None]:
popular.head()

In [None]:
import numpy as np

# Three stacked panels: the overall mean cost, the five names with the highest
# average cost and the five names with the lowest (positive) average cost.
fig, (ax1,ax2,ax3) = plt.subplots(3,1, figsize=(20,30))
by_cost_desc = popular.sort_values(by='avg_approx_cost', ascending=False)

# Panel 1 - mean of the per-name average cost, shown as text on a blank axis
mean_cost = np.round(popular['avg_approx_cost'].mean(), 2)
ax1.text(0.50, 0.30, mean_cost, fontsize=45, ha='center')
ax1.text(0.50, 0.12, 'is mean approx cost', fontsize=12, ha='center')
ax1.text(0.50, 0.00, 'for Bengaluru restaurants', fontsize=12, ha='center')
ax1.axis('off')

# Panel 2 - first five rows of the descending ranking
most_expensive = by_cost_desc.head(5)
sns.barplot(data=most_expensive, x='avg_approx_cost', y='name', palette='plasma', ax=ax2)
ax2.set_title('Top 5 Most Expensives Restaurants', size=12)

# Panel 3 - last five rows of the ranking among names with a positive cost
least_expensive = by_cost_desc.query('avg_approx_cost > 0').tail()
sns.barplot(data=least_expensive, x='avg_approx_cost', y='name', palette='plasma_r', ax=ax3)
ax3.set_title('Top 5 Less Expensive Restaurants', size=12)


#### How many restaurants offer Book Table service? And how about Online Order service?

In [None]:
import plotly.graph_objs as go
from plotly.offline import iplot
# Number of restaurants per book_table value (value_counts orders by frequency, descending).
x=df['book_table'].value_counts()
# NOTE(review): these labels are positional, so they assume the "no booking" value is the
# most frequent one — confirm against x.index before trusting the chart.
labels=['not book','book']


In [None]:
# Pie chart of the book_table counts: percentages on the slices, label + percent on hover.
# NOTE(review): `pull` lists five offsets while `labels` has two entries, and the only
# non-zero offset sits at position 3 — confirm which slice was meant to be pulled out.
trace=go.Pie(labels=labels, values=x,
               hoverinfo='label+percent', textinfo='percent', 
               textfont=dict(size=25),
              pull=[0, 0, 0,0.2, 0]
               )
iplot([trace])

In [None]:
import plotly.express as px
# Number of restaurants per online_order value (value_counts orders by frequency, descending).
x=df['online_order'].value_counts()
# NOTE(review): these labels are positional, so they assume the "accepts online orders" value
# is the most frequent one — confirm against x.index before trusting the chart.
labels=['accepted','not accepted']

In [None]:
# Pie chart of the online_order counts held in `x`, labelled by `labels`.
fig = px.pie(
    data_frame=df,
    names=labels,
    values=x,
    title='Pie chart',
)
fig.show()

### Finding the best budget restaurants in any location
    We will pass a location and a restaurant type as parameters; the function will return the names of the matching restaurants.

In [None]:
def return_budget(location, restaurant, max_cost=400, min_rating=4, data=None):
    """Return the names of well-rated, low-cost restaurants of one type in one location.

    Parameters
    ----------
    location : str
        Value matched exactly against the 'location' column.
    restaurant : str
        Value matched exactly against the 'rest_type' column.
    max_cost : float, default 400
        Inclusive upper bound on 'approx_cost(for two people)'.
    min_rating : float, default 4
        Exclusive lower bound on 'rate' (a rating equal to it is rejected).
    data : pandas.DataFrame, optional
        Frame to search. When omitted, the notebook-level ``df`` is used.

    Returns
    -------
    numpy.ndarray
        Unique restaurant names, in order of first appearance.
    """
    # Fall back to the notebook-level frame so existing two-argument calls keep working.
    frame = df if data is None else data
    matches = frame[
        (frame['approx_cost(for two people)'] <= max_cost)
        & (frame['location'] == location)
        & (frame['rate'] > min_rating)
        & (frame['rest_type'] == restaurant)
    ]
    return matches['name'].unique()

In [None]:
return_budget('BTM',"Quick Bites")

#### geographical analysis

#### We need the latitude & longitude of each place for the geographical data analysis, so we use MapQuest to fetch the lat/lon of each place.

In [None]:
locations=pd.DataFrame({"Name":df['location'].unique()})

In [None]:
locations['new_Name']='Bangalore '+locations['Name']

In [None]:
locations.head(100)

In [None]:
# Preview the first rows. The method must be called: a bare `locations.head`
# only displays the bound-method repr.
locations.head()

MapQuest API

In [None]:
import os

import requests

# MAPQUEST API
# The key is read from the environment so that no credential is stored in the
# notebook. NOTE(review): a key was previously committed in this cell - treat
# it as leaked and revoke it.
API_KEY = os.environ.get('MAPQUEST_API_KEY')
if not API_KEY:
    raise RuntimeError('Set the MAPQUEST_API_KEY environment variable before running this cell.')
BASE_URL = 'https://www.mapquestapi.com/geocoding/v1/address'
REQUEST_TIMEOUT = 10  # seconds; keeps one stalled request from hanging the cell

latitudes = []
longitudes = []

# Query with the city-qualified name ('Bangalore ' + Name) prepared in
# locations['new_Name'], so the geocoder receives the city as context.
for location in locations['new_Name']:
    lat = lng = None
    if isinstance(location, str):  # a missing location is NaN (a float): nothing to look up
        try:
            response = requests.get(
                BASE_URL,
                params={'key': API_KEY, 'location': location},
                timeout=REQUEST_TIMEOUT)
            if response.status_code == 200:
                lat_lng = response.json()['results'][0]['locations'][0]['latLng']
                lat, lng = lat_lng['lat'], lat_lng['lng']
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
            # Network failure, non-JSON body or a payload without coordinates:
            # record the place as unresolved instead of aborting the whole loop.
            lat = lng = None
    # Exactly one entry per location keeps both lists aligned with `locations`.
    latitudes.append(lat)
    longitudes.append(lng)

print(latitudes)
print(longitudes)


In [None]:
locations['latitude']=latitudes
locations['longitude']=longitudes

In [None]:
locations.to_csv('zomato_locations.csv',index=True)

In [None]:
Rest_locations=pd.DataFrame(df['location'].value_counts().reset_index())

In [None]:
Rest_locations.columns=['Name','count']
Rest_locations.head()

#### now combine both the dataframes

In [None]:
Restaurant_locations=Rest_locations.merge(locations,on='Name',how="left").dropna()
Restaurant_locations.head()

In [None]:
def generateBaseMap(default_location=(12.97, 77.59), default_zoom_start=12):
    """Create a folium map centred on ``default_location``.

    Parameters
    ----------
    default_location : sequence of two floats, default (12.97, 77.59)
        Latitude and longitude of the map centre. The default is a tuple so
        that the default value cannot be mutated between calls.
    default_zoom_start : int, default 12
        Initial zoom level handed to ``folium.Map``.

    Returns
    -------
    folium.Map
        The newly created map.

    Notes
    -----
    ``folium`` is looked up when the function is called, so it has to be
    imported before the first call.
    """
    # Hand folium a fresh list, exactly as the previous list default did.
    base_map = folium.Map(location=list(default_location), zoom_start=default_zoom_start)
    return base_map

In [None]:
import folium
from folium.plugins import HeatMap
basemap=generateBaseMap()

#### Heatmap of Restaurant

In [None]:
# Add a heat layer to basemap built from [latitude, longitude, count] rows
# (presumably the third value is used as the point weight — confirm in the folium docs).
# NOTE(review): `zoom` does not look like a documented HeatMap argument — verify it has any effect.
HeatMap(Restaurant_locations[['latitude','longitude','count']].values.tolist(),zoom=20,radius=15).add_to(basemap)

In [None]:
basemap

#### It is clear that restaurants tend to concentrate in the central Bangalore area.
    The density of restaurants decreases as we move away from the centre.
    So, potential restaurant entrepreneurs can refer to this map to find good locations for their venture.
    Note: a heatmap works well when we have the latitude and longitude of each place together with an importance value or a count for that place.

####  Predicting the Success of a Restaurant

In [None]:
df.columns

In [None]:
df['rate'].unique()

In [None]:
def assign(x):
    """Flag a positive rating: return 1 when ``x > 0``, otherwise 0."""
    return 1 if x > 0 else 0
df['rated']=df['rate'].apply(assign)

In [None]:
df['rated'].unique()

In [None]:
# Restaurants flagged rated == 0 are set aside; those flagged rated == 1 form
# the training/validation pool.
# Explicit copies are taken because later cells add columns to these frames:
# writing to a filtered selection of df raises SettingWithCopyWarning and is
# not guaranteed to behave as intended.
new_restaurants = df[df['rated'] == 0].copy()
train_val_restaurants = df[df['rated'] == 1].copy()


#### By now we have already split our original data into new_restaurants and train_val_restaurants using pandas DataFrames. Let's keep the first one aside for now and work only with the training and validation set. The next step is to create the target variable to be used in this classification task.

#### The main point here is to define a fair threshold for splitting the restaurants into good and bad ones. This is a rather experimental decision, and we must keep in mind that this approach is not the best one: it will probably leave room for classification errors. Even so, let's try!

In [None]:
train_val_restaurants.head()

In [None]:
train_val_restaurants['rate'].unique()

### Defining a custom threshold for splitting restaurants into good and bad

In [None]:
# Binary target: 1 when the rating reaches the cut-off, 0 otherwise.
threshold = 3.75
meets_threshold = train_val_restaurants['rate'].ge(threshold)
train_val_restaurants['target'] = meets_threshold.astype('int64')


In [None]:
train_val_restaurants.head()

In [None]:
import matplotlib.pyplot as plt

# Class balance of the target column.
x = train_val_restaurants['target'].value_counts()
labels = x.index
print(x)

# Pass the class values as wedge labels: without them the slices cannot be
# told apart. The explode list is sized from the data (last wedge offset)
# rather than assuming exactly two classes.
explode = [0.1 if position == len(x) - 1 else 0.0 for position in range(len(x))]
plt.pie(x, labels=labels, explode=explode, autopct='%1.1f%%')
plt.show()

#### OK, for a first trial this is fair. It means that we marked as good the restaurants with a rating greater than or equal to 3.75. Correct or not, let's continue and see what we can get from this.

#### The next step is to prepare some features for training our classification model.

### Feature Extraction

In [None]:
train_val_restaurants.columns

In [None]:
train_val_restaurants.head()

In [None]:
## train_val_restaurants['total_cuisines'] = train_val_restaurants['cuisines'].astype(str).apply(lambda x: len(x.split(',')))

def count(x):
    """Return how many comma-separated items the string ``x`` contains.

    A string without any comma (the empty string included) counts as one item.
    """
    return x.count(',') + 1

In [None]:
#### Both columns contain NaN values, so each is cast to str before count()
#### tallies its comma-separated entries.
cuisines_text = train_val_restaurants['cuisines'].astype(str)
rest_type_text = train_val_restaurants['rest_type'].astype(str)
train_val_restaurants['total_cuisines'] = cuisines_text.map(count)
train_val_restaurants['multiple_types'] = rest_type_text.map(count)

In [None]:
train_val_restaurants.columns

In [None]:
imp_features=['online_order','book_table','location','rest_type','multiple_types','total_cuisines','listed_in(type)', 'listed_in(city)','approx_cost(for two people)','target']

In [None]:
data = train_val_restaurants[imp_features]

In [None]:
data.isnull().sum()

In [None]:
# Drop every row that has a missing value in any selected feature.
# The result is re-assigned instead of using inplace=True: `data` was selected
# from train_val_restaurants, and in-place edits on such a selection raise
# SettingWithCopyWarning.
data = data.dropna(how='any')

In [None]:
data.isnull().sum()

In [None]:
# Partition the columns by dtype: object ('O') columns go to cat_features,
# every other column goes to num_features.
cat_features = []
num_features = []
for column_name in data.columns:
    bucket = cat_features if data[column_name].dtype == 'O' else num_features
    bucket.append(column_name)

In [None]:
cat_features

In [None]:
num_features

In [None]:
for feature in cat_features:
    print('{} has total {} unique features'.format(feature, data[feature].nunique()))

#### Notice, however, that we have many categories; if we encode them all with one-hot encoding,
#### the result will consume a lot of memory.

In [None]:
data.shape

In [None]:
cols=['location','rest_type','listed_in(city)']
for col in cols:
    print('Total feature in {} are {}'.format(col,data[col].nunique()))
    print(data[col].value_counts()/(len(data))*100)
    print('\n')
    

In [None]:
percent=data['location'].value_counts()/len(data)*100
values=percent.values


In [None]:
len(values[values>0.4])

In [None]:
#### lets set Threshold value 0.4 ,

In [None]:
values=data['location'].value_counts()/len(data)*100
values

In [None]:
threshold=0.4
imp=values[values>threshold]
imp

In [None]:
# Keep the locations listed in imp.index and relabel every other one as 'other'.
is_frequent_location = data['location'].isin(imp.index)
data['location'] = data['location'].where(is_frequent_location, 'other')

##X_train['location']=X_train['location'].apply(lambda x:'other' if x not in imp.index else x)

In [None]:
data['location'].nunique()

In [None]:
values2=data['rest_type'].value_counts()/len(data)*100
values2

In [None]:
data['rest_type'].head(20)

In [None]:
len(values2[values2>0.3])

In [None]:
# Keep only the restaurant types whose share of rows exceeds `threshold` percent.
# The filter reads the variable rather than repeating the literal, so changing
# the cut-off in one place is enough.
threshold=1.5
imp2=values2[values2>threshold]
imp2

In [None]:
imp2.index

In [None]:
data['rest_type'].isin(imp2.index)

In [None]:
# Keep the restaurant types listed in imp2.index and relabel every other one as 'other'.
is_frequent_type = data['rest_type'].isin(imp2.index)
data['rest_type'] = data['rest_type'].where(is_frequent_type, 'other')
##data['rest_type'].apply(lambda x: 'other' if x not in imp2.index else x)


In [None]:
data['rest_type']

#### After applying this category reduction, we observe far fewer unique values per feature

In [None]:
for feature in cat_features:
    print('{} has total {} unique features'.format(feature, data[feature].nunique()))

In [None]:
cat_features

In [None]:
import pandas as pd

# One-hot encode all categorical columns in a single call. Each indicator
# column is named '<column>_<value>', and drop_first=True omits the first
# level of every column so no redundant indicator is produced.
data_cat = pd.get_dummies(data[cat_features], columns=cat_features, drop_first=True)

In [None]:
data_cat.shape

In [None]:
data_cat.head(10)

In [None]:
data_cat.shape

In [None]:
data.head()

In [None]:
data_final=pd.concat([data.loc[:,['multiple_types','total_cuisines','approx_cost(for two people)','target']],data_cat],axis=1)

In [None]:
data_final.shape

In [None]:
# Splitting the data
X = data_final.drop('target', axis=1)
y = data_final['target'].values

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=42)

In [None]:
X_train.shape

In [None]:
# Import the random forest model.
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Initialize the model with some parameters.
model = RandomForestClassifier(n_estimators=100, min_samples_leaf=10, random_state=1)


In [None]:
# Fit the model to the data.
model.fit(X_train, y_train)

In [None]:
# Make predictions.
predictions = model.predict(X_test)

# Compute the error.
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true classes and columns the predicted ones. Passing the predictions first
# would transpose the matrix.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

In [None]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted class equals the true class.
accuracy_score(y_true=y_test, y_pred=predictions)


In [None]:
#fit naive bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier


In [None]:
### classifier models
# (name, estimator) pairs evaluated by the comparison loop. The randomised
# tree-based estimators are seeded so that repeated runs give the same scores.
models = [
    ('LogisticRegression', LogisticRegression()),
    ('Naive Bayes', GaussianNB()),
    ('RandomForest', RandomForestClassifier(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
]

In [None]:
# Show each registered classifier next to its name. Print the current
# estimator, not the whole `models` list, on each iteration.
for name,model in models:
    print(name)
    print(model)

In [None]:
# Make predictions on validation dataset
# Imported once, before the loop, instead of on every iteration.
from sklearn.metrics import accuracy_score, confusion_matrix

for name, model in models:
    print(name)
    model.fit(X_train, y_train)

    # Make predictions.
    predictions = model.predict(X_test)

    # Compute the error. Both metrics take (y_true, y_pred): with the true
    # labels first, the confusion matrix has true classes as rows and
    # predicted classes as columns.
    print(confusion_matrix(y_test, predictions))
    print(accuracy_score(y_test, predictions))
    print('\n')



#Plot the confusion matrix
#     # Make predictions on validation dataset

# for name, model in models:
#     print(name)
#     model.fit(X_train, y_train)
    
#     # Make predictions.
#     predictions = model.predict(X_test)

#     # Compute the error.
#     from sklearn.metrics import confusion_matrix
#     cm = confusion_matrix(predictions, y_test)
#     print(confusion_matrix(predictions, y_test))
    
#     # Plot the confusion matrix.
#     fig, ax = plt.subplots(figsize=(6, 4))
#     sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", ax=ax)

#     # Add labels and title to the plot.
#     ax.set_xlabel("Predicted labels")
#     ax.set_ylabel("True labels")
#     ax.set_title(name)

#     # Show the plot.
#     plt.show()
#     from sklearn.metrics import accuracy_score
#     print(accuracy_score(predictions,y_test))
#     print('\n')