# Capstone Project - The Battle of Neighborhoods

## Analysis

We will import the required Python libraries.

pandas and numpy for handling data  
requests module for using the Foursquare API  
geopy to get the coordinates of New York City  
folium to visualize the results on a map

In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import requests
from bs4 import BeautifulSoup
import geocoder
import os
import folium # map rendering library
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
# Matplotlib and associated plotting modules
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
%matplotlib inline


print('Libraries imported.')

Now we define a function to get the geocode, i.e. the latitude and longitude, of a given location using geopy.

In [None]:
def geo_location(address):
    """Return the ``(latitude, longitude)`` of *address* via geopy's Nominatim.

    Args:
        address: free-form address or place name to geocode.

    Returns:
        A ``(latitude, longitude)`` tuple taken from the geocoder's result.

    Raises:
        ValueError: if the geocoder returns no match for *address*.
    """
    geolocator = Nominatim(user_agent="foursquare_agent")
    location = geolocator.geocode(address)
    # geocode() yields None when nothing matches; fail with a clear message
    # instead of an AttributeError on `None.latitude`.
    if location is None:
        raise ValueError('Could not geocode address: {!r}'.format(address))
    return location.latitude, location.longitude

We define a function to interact with the Foursquare API and get the top 100 venues within the configured radius (400 metres, set by `radius` below) of a given latitude and longitude. The function returns the venue id, venue name and category of each venue.

In [None]:
# NOTE(security): API credentials should not be committed to source control.
# Prefer supplying them through the FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET environment variables. The literals are kept only
# as a fallback so the notebook still runs unchanged; they should be rotated
# and removed.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'CKUC2TTYGS44UTC0IMOWNUBR1NI2QIT3IT2DC1OZJKVLNYU0')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'DIK5SIHSOLKCJ1KX0DTXIQQM3JNP04F0KLDO1JZ3FMEFD0JD')
VERSION = '20200608'  # sent as the `v` query parameter of every request
radius=400  # sent as the `radius` query parameter of the explore request
LIMIT=100  # sent as the `limit` query parameter of the explore request

def get_venues(lat,lng):
    """Fetch venues around a coordinate from the Foursquare explore endpoint.

    The request uses the module-level ``CLIENT_ID``, ``CLIENT_SECRET``,
    ``VERSION``, ``radius`` and ``LIMIT`` settings.

    Args:
        lat: latitude of the search centre.
        lng: longitude of the search centre.

    Returns:
        A DataFrame with columns ``ID``, ``Name`` and ``Category``, one row per
        venue that carries all three fields. It is empty when the response
        holds no venue items.
    """
    #url to fetch data from foursquare api
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
    # get all the data
    results = requests.get(url).json()
    # The recommended venues live under response -> groups[0] -> items.
    # Iterating `response` itself would only walk over its keys.
    try:
        items = results['response']['groups'][0]['items']
    except (KeyError, IndexError):
        # error payloads / empty results carry no groups: return an empty frame
        items = []
    venue_details=[]
    for item in items:
        try:
            venue = item['venue']
            venue_id = venue['id']
            venue_name = venue['name']
            # `categories` is a list; use the name of its first entry
            venue_category = venue['categories'][0]['name']
        except (KeyError, IndexError):
            # skip venues that lack an id, a name or a category
            continue
        venue_details.append([venue_id,venue_name,venue_category])
    column_names=['ID','Name','Category']
    df = pd.DataFrame(venue_details,columns=column_names)
    return df

Now we will define a function to get venue details such as the like count, rating and tip count for a given venue id. These will be used for ranking.

In [None]:
def get_venue_details(venue_id):
    """Fetch like count, rating and tip count for one Foursquare venue.

    The request uses the module-level ``CLIENT_ID``, ``CLIENT_SECRET`` and
    ``VERSION`` settings. The raw JSON response is printed.

    Args:
        venue_id: Foursquare id of the venue to look up.

    Returns:
        A DataFrame with columns ``ID``, ``Name``, ``Likes``, ``Rating`` and
        ``Tips``. It holds a single row when every field is present in the
        response and is empty otherwise.
    """
    #url to fetch data from foursquare api
    url = 'https://api.foursquare.com/v2/venues/{}?&client_id={}&client_secret={}&v={}'.format(
            venue_id,
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION)
    # get all the data
    results = requests.get(url).json()
    print(results)
    venue_details=[]
    try:
        # The details endpoint returns a single object under response -> venue.
        # The lookup sits inside the try so an error payload produces an empty
        # frame instead of an uncaught KeyError.
        venue_data=results['response']['venue']
        venue_id=venue_data['id']
        venue_name=venue_data['name']
        venue_likes=venue_data['likes']['count']
        venue_rating=venue_data['rating']
        venue_tips=venue_data['tips']['count']
        venue_details.append([venue_id,venue_name,venue_likes,venue_rating,venue_tips])
    except KeyError:
        # best effort: a venue missing any field yields no row
        pass
    column_names=['ID','Name','Likes','Rating','Tips']
    df = pd.DataFrame(venue_details,columns=column_names)
    return df

Now we define a function to get the New York City data, such as boroughs and neighborhoods along with their latitude and longitude.

In [None]:
def get_new_york_data():
    """Download the New York neighborhood dataset and return it as a table.

    Returns:
        A DataFrame with columns ``Borough``, ``Neighborhood``, ``Latitude``
        and ``Longitude``, one row per entry of the dataset's ``features``
        list.
    """
    url='https://cocl.us/new_york_dataset'
    resp=requests.get(url).json()
    # all data is present in features label
    features=resp['features']
    # define the dataframe columns
    column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']
    # Collect plain rows and build the frame once. DataFrame.append copied the
    # whole frame on every call and no longer exists in pandas 2.x.
    rows = []
    for data in features:
        properties = data['properties']
        neighborhood_latlon = data['geometry']['coordinates']
        rows.append({'Borough': properties['borough'],
                     'Neighborhood': properties['name'],
                     # coordinates are read as [longitude, latitude]
                     'Latitude': neighborhood_latlon[1],
                     'Longitude': neighborhood_latlon[0]})
    new_york_data = pd.DataFrame(rows, columns=column_names)
    return new_york_data

We will call the above function to get the New York City data.

In [None]:
# Download the neighborhood table once and preview its first rows.
ny_data = get_new_york_data()
ny_data.head()

In [None]:
# (rows, columns) of the neighborhood table
ny_data.shape

So there is a total of 306 different neighborhoods in New York City.

In [None]:
# Bar chart: how many neighborhoods each borough contains.
fig, ax = plt.subplots(figsize=(9, 5), dpi=100)
ax.set_title('Number of Neighborhoods for each Borough in New York City')
ax.set_xlabel('Borough', fontsize=15)
ax.set_ylabel('No. of Neighborhoods', fontsize=15)
# count the neighborhood rows per borough and draw them on the prepared axes
neighborhoods_per_borough = ny_data.groupby('Borough')['Neighborhood'].count()
neighborhoods_per_borough.plot(kind='bar', ax=ax)
ax.legend()
plt.show()

We see that Queens has the highest number of neighborhoods.

Now we will collect Vietnamese restaurants for each Neighborhood.

In [None]:
# Query the venues around every neighborhood and keep the ones categorised as
# 'Vietnamese Restaurant'.
column_names=['Borough', 'Neighborhood', 'ID','Name']
# Rows are collected in a list and turned into a frame once at the end.
# DataFrame.append copied the frame on every call and no longer exists in
# pandas 2.x.
vietnamese_rows = []
total_neighborhoods = len(ny_data)
for count, (borough, neighborhood, latitude, longitude) in enumerate(ny_data.values.tolist(), start=1):
    venues = get_venues(latitude, longitude)
    vietnamese_restaurants = venues[venues['Category']=='Vietnamese Restaurant']
    print('(',count,'/',total_neighborhoods,')','Vietnamese Restaurants in '+neighborhood+', '+borough+':'+str(len(vietnamese_restaurants)))
    # the category is already known from the filter, so only id and name are kept
    for venue_id, venue_name, _category in vietnamese_restaurants.values.tolist():
        vietnamese_rows.append({'Borough': borough,
                                'Neighborhood': neighborhood,
                                'ID': venue_id,
                                'Name': venue_name})
vietnamese_rest_ny = pd.DataFrame(vietnamese_rows, columns=column_names)

In [None]:
# preview the first collected restaurants
vietnamese_rest_ny.head()

In [None]:
# (rows, columns): the row count is the number of restaurants collected
vietnamese_rest_ny.shape

We got 23 Vietnamese restaurants in New York City.

In [None]:
# Bar chart: number of collected Vietnamese restaurants per borough.
fig, ax = plt.subplots(figsize=(9, 5), dpi=100)
ax.set_title('Number of Vietnamese Restaurants for each Borough in New York City')
ax.set_xlabel('Borough', fontsize=15)
ax.set_ylabel('No. of Vietnamese Restaurants', fontsize=15)
# count the restaurant ids per borough and draw them on the prepared axes
restaurants_per_borough = vietnamese_rest_ny.groupby('Borough')['ID'].count()
restaurants_per_borough.plot(kind='bar', ax=ax)
ax.legend()
plt.show()

We see that Manhattan has the largest number of Vietnamese restaurants.

In [None]:
# Bar chart: the five neighborhoods with the most Vietnamese restaurants.
fig, ax = plt.subplots(figsize=(9, 5), dpi=100)
ax.set_title('Number of Vietnamese Restaurants for each Neighborhood in New York City')
ax.set_xlabel('Neighborhood', fontsize=15)
ax.set_ylabel('No.of Vietnamese Restaurants', fontsize=15)
# count restaurant ids per neighborhood, keep the five largest counts
restaurants_per_neighborhood = vietnamese_rest_ny.groupby('Neighborhood')['ID'].count()
top_neighborhoods = restaurants_per_neighborhood.nlargest(5)
top_neighborhoods.plot(kind='bar', ax=ax)
ax.legend()
plt.show()

We see that Chinatown has the largest number of Vietnamese restaurants.

In [None]:
# list the restaurants collected for the 'Chinatown' neighborhood
vietnamese_rest_ny[vietnamese_rest_ny['Neighborhood']=='Chinatown']

So Chinatown in Manhattan has the highest number of Vietnamese Restaurants with a total count of 3.

Now we will get the ranking of each restaurant for further analysis.

In [None]:
# prepare neighborhood list that contains Vietnamese restaurants
column_names=['Borough', 'Neighborhood', 'ID','Name','Likes','Rating','Tips']
vietnamese_rest_stats_ny=pd.DataFrame(columns=column_names)
count=1


for row in vietnamese_rest_ny.values.tolist():
    Borough,Neighborhood,ID,Name=row
    try:
        venue_details=get_venue_details(ID)
        print(venue_details)
        id,name,likes,rating,tips=venue_details.values.tolist()[0]
    except IndexError:
        print('No data available for id=',ID)
        # we will assign 0 value for these restaurants as they may have been 
        #recently opened or details does not exist in Foursquare Database
        id,name,likes,rating,tips=[0]*5
    print('(',count,'/',len(vietnamese_rest_ny),')','processed')
    vietnamese_rest_stats_ny = vietnamese_rest_stats_ny.append({'Borough': Borough,
                                                'Neighborhood': Neighborhood, 
                                                'ID': id,
                                                'Name' : name,
                                                'Likes' : likes,
                                                'Rating' : rating,
                                                'Tips' : tips
                                               }, ignore_index=True)
    count+=1

In [None]:
# preview the first rows of the per-restaurant statistics
vietnamese_rest_stats_ny.head()

In [None]:
# (rows, columns) of the per-restaurant statistics
vietnamese_rest_stats_ny.shape

So we got data for all Vietnamese restaurants. Now let's save this data to a CSV file in case we modify it by mistake. Because calls to get venue details are premium calls and are limited to 500 per day, we will refer to the saved CSV file if required.

In [None]:
# write the statistics to a CSV file, leaving out the index column
vietnamese_rest_stats_ny.to_csv('vietnamese_rest_stats_ny.csv', index=False)

In [None]:
# read the saved CSV back into a separate frame
vietnamese_rest_stats_ny_csv=pd.read_csv('vietnamese_rest_stats_ny.csv')

In [None]:
# (rows, columns) of the frame read back from the CSV file
vietnamese_rest_stats_ny_csv.shape

In [None]:
# preview the first rows read back from the CSV file
vietnamese_rest_stats_ny_csv.head()

In [None]:
# show column dtypes and non-null counts before the casts below
vietnamese_rest_stats_ny.info()

We see that values like Likes and Tips are string values. We need to convert them to float for further analysis.

In [None]:
# replace the Likes column with a float64 copy of itself
vietnamese_rest_stats_ny['Likes']=vietnamese_rest_stats_ny['Likes'].astype('float64')

In [None]:
# replace the Tips column with a float64 copy of itself
vietnamese_rest_stats_ny['Tips']=vietnamese_rest_stats_ny['Tips'].astype('float64')

In [None]:
# show the dtypes again to confirm the casts
vietnamese_rest_stats_ny.info()

In [None]:
#Restaurant with maximum Likes
vietnamese_rest_stats_ny.iloc[vietnamese_rest_stats_ny['Likes'].idxmax()]

In [None]:
# Restaurant with maximum Tips
vietnamese_rest_stats_ny.iloc[vietnamese_rest_stats_ny['Tips'].idxmax()]

Now let's visualize neighborhood with maximum average rating of restaurants.

In [None]:
# Average restaurant rating per neighborhood.
# 'Rating' is selected before aggregating so mean() never touches the text
# columns (Borough, ID, Name). Averaging the whole frame raises a TypeError on
# pandas 2.x.
ny_neighborhood_stats = (
    vietnamese_rest_stats_ny
    .groupby('Neighborhood', as_index=False)['Rating']
    .mean()
)
ny_neighborhood_stats.columns=['Neighborhood','Average Rating']

In [None]:
#Top neighborhoods with top average rating for Vietnamese restaurants
ny_neighborhood_stats.sort_values(['Average Rating'],ascending=False).head(10)

In [None]:
# Average restaurant rating per borough.
# 'Rating' is selected before aggregating so mean() never touches the text
# columns (Neighborhood, ID, Name). Averaging the whole frame raises a
# TypeError on pandas 2.x.
ny_borough_stats = (
    vietnamese_rest_stats_ny
    .groupby('Borough', as_index=False)['Rating']
    .mean()
)
ny_borough_stats.columns=['Borough','Average Rating']

In [None]:
#Top boroughs with top average rating for Vietnamese restaurants
ny_borough_stats.sort_values(['Average Rating'],ascending=False).head(10)

In [None]:
# Bar chart: average rating of Vietnamese restaurants per borough.
plt.figure(figsize=(9,5), dpi = 100)
# title
plt.title('Average rating of Vietnamese Restaurants for each Borough')
#On x-axis
plt.xlabel('Borough', fontsize = 15)
#On y-axis
plt.ylabel('Average Rating', fontsize=15)
#giving a bar plot
# 'Rating' is selected before mean() so the text columns are never averaged
# (that raises a TypeError on pandas 2.x)
vietnamese_rest_stats_ny.groupby('Borough')['Rating'].mean().plot(kind='bar')
#legend
plt.legend()
#displays the plot
plt.show()

We will consider all the neighborhoods with an average rating greater than or equal to 8.0 to visualize on the map.

In [None]:
# keep only the neighborhoods whose average rating is at least 8.0
ny_neighborhood_stats=ny_neighborhood_stats[ny_neighborhood_stats['Average Rating']>=8.0]

In [None]:
# display the filtered neighborhoods
ny_neighborhood_stats

We will join this dataset to the original New York City data to get the longitude and latitude.

In [None]:
# Attach Borough, Latitude and Longitude from ny_data by matching on the
# neighborhood name.
# NOTE(review): the join key is the name alone, so a neighborhood name that
# occurs in more than one borough of ny_data would produce one row per
# borough here — confirm whether Borough should be part of the key.
ny_neighborhood_stats=pd.merge(ny_neighborhood_stats,ny_data, on='Neighborhood')

In [None]:
# keep the columns needed for the map, in display order
ny_neighborhood_stats=ny_neighborhood_stats[['Borough','Neighborhood','Latitude','Longitude','Average Rating']]

In [None]:
# display the neighborhoods together with their coordinates
ny_neighborhood_stats

In [None]:
# create map and display it
ny_map = folium.Map(location=geo_location('New York'), zoom_start=12)

In [None]:
# instantiate a feature group for the incidents in the dataframe
incidents = folium.map.FeatureGroup()

# loop through the 100 crimes and add each to the incidents feature group
for lat, lng, in ny_neighborhood_stats[['Latitude','Longitude']].values:
    incidents.add_child(
        folium.CircleMarker(
            [lat, lng],
            radius=10, # define how big you want the circle markers to be
            color='yellow',
            fill=True,
            fill_color='blue',
            fill_opacity=0.6
        )
    )

In [None]:
# add a new field to dataframe for labelling purpose
ny_neighborhood_stats['Label']=ny_neighborhood_stats['Neighborhood']+', '+ny_neighborhood_stats['Borough']+'('+ny_neighborhood_stats['Average Rating'].map(str)+')'

In [None]:
# add pop-up text to each marker on the map
for lat, lng, label in ny_neighborhood_stats[['Latitude','Longitude','Label']].values:
    folium.Marker([lat, lng], popup=label).add_to(ny_map)        
# add incidents to map
ny_map.add_child(incidents)