# Segmenting and Clustering Neighborhoods in London City

In [1]:
import requests 
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [2]:
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# install and import folium library 
!pip -q install folium
import folium 

# import k-means from clustering stage
from sklearn.cluster import KMeans

In [3]:
URL = "https://en.wikipedia.org/wiki/List_of_London_boroughs"
res = requests.get(URL).text
soup = BeautifulSoup(res,'lxml')

df_list = []
# print(soup)
for items in soup.find('table', class_= 'wikitable sortable').find_all('tr')[1::]:
    data = items.find_all(['td'])
    data1 = data[0]
    data8 = data[8]    
    try:
        borough_name = data1.get_text()
        borough_name = borough_name[0]
        ll = data8.get_text()
        lat_long = ll[2]
        latitude = lat_long[0]
        longitude = lat_long[1]
#       Append the borough name, latitude and logitude in a list
        df_list.append((borough_name, latitude, longitude))
    except IndexError:pass

# 1) Download and Explore the Dataset

Read the latitude and longitude coordinates of all Boroughs in London from a Wikipedia link

In [4]:
URL = "https://en.wikipedia.org/wiki/List_of_London_boroughs"
res = requests.get(URL).text
soup = BeautifulSoup(res,'lxml')

df_list = []
# print(soup)
for items in soup.find('table', class_= 'wikitable sortable').find_all('tr')[1::]:
    data = items.find_all(['td'])
    data1 = data[0]
    data8 = data[8]    
    try:
        borough_name = data1.get_text()
        borough_name = borough_name.split('[')
        borough_name = borough_name[0]
        borough_name = borough_name.strip()
        
        ll = data8.get_text()
        ll = ll.split('/')
        lat_long = ll[2]
        lat_long = lat_long.split('(')
        lat_long = lat_long[0]
        lat_long = lat_long.split(';')
        latitude = lat_long[0]
        latitude = latitude.strip()
        longitude = lat_long[1]
        longitude = longitude.strip()
        longitude = longitude.replace(u'\ufeff', '')
        latitude = float(latitude)
        longitude = float(longitude)

#       Append the borough name, latitude and logitude in a list
        df_list.append((borough_name, latitude, longitude))
    except IndexError:pass

In [5]:
df_boroughs = pd.DataFrame(df_list, columns=['Borough', 'Latitude' , 'Longitude'])

df_boroughs.shape

(32, 3)

In [6]:
df_boroughs.head()

Unnamed: 0,Borough,Latitude,Longitude
0,Barking and Dagenham,51.5607,0.1557
1,Barnet,51.6252,-0.1517
2,Bexley,51.4549,0.1505
3,Brent,51.5588,-0.2817
4,Bromley,51.4039,0.0198


In [7]:
df_boroughs.dtypes

Borough       object
Latitude     float64
Longitude    float64
dtype: object

In [8]:
df_boroughs.loc[df_boroughs['Borough'] == 'Newham']

Unnamed: 0,Borough,Latitude,Longitude
23,Newham,51.5077,0.0469


### Get the Latitude and Longitude of London City using geopy library

In [9]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
address = 'London, UK'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of London City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of London City are 51.5073219, -0.1276474.


### Create a map of London with Boroughs superimposed on top.

In [10]:
import folium 

# create map of London using latitude and longitude values
map_london = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough in zip(df_boroughs['Latitude'], df_boroughs['Longitude'], df_boroughs['Borough']):
    label = '{}'.format(borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
fill_opacity=0.7).add_to(map_london)  
map_london

# Preferred location for Asian restuarant - Newham Borough
As the borough Newham is having high asian population, we will consider to Segment and Cluster only the neighbourhoods of this borough. For that we have to get the latitude and longitude details of all the areas(neighbourhoods) of Newham borough.

Read the latitude and longitude coordinates of all the neighborhoods(areas) in Newham Borough

In [11]:
from urllib.request import urlopen
import re
URL = "https://en.wikipedia.org/wiki/List_of_areas_of_London"
res = requests.get(URL).text
soup = BeautifulSoup(res,'lxml')

codes = []
areas_list = []
href_links_list = []
for items in soup.find('table', class_= 'wikitable sortable').find_all('tr')[1::]:
    data = items.find_all(['td'])
    data0 = data[0]
    area_name = data0.text

    data1 = data[1]
    data1 = data1.text
    borough = data1.split('[')
    borough_name = borough[0]
    data5 = data[5]
    code = data5.text
    code = code.strip()
    
    if borough_name == 'Newham':
        codes.append(code)
        areas_list.append((borough_name,area_name,code))

                
for link in soup.findAll('a', attrs={'href': re.compile("^https://tools.wmflabs.org")}):
            htext = link.text
            if htext in codes:
                hlink = link.get('href')
                href_links_list.append((htext, hlink))

### Create a DataFrame from the Areas list

In [12]:
df_areas = pd.DataFrame(areas_list, columns=['Borough', 'Area', 'Code'])
df_areas.head()

Unnamed: 0,Borough,Area,Code
0,Newham,Beckton,TQ435815
1,Newham,Canning Town,TQ405815
2,Newham,Custom House,TQ408807
3,Newham,East Ham,TQ425835
4,Newham,Forest Gate,TQ405855


In [13]:
df_areas.columns

Index(['Borough', 'Area', 'Code'], dtype='object')

In [14]:
df_areas.shape

(14, 3)


Create a DataFrame from the list of href links

In [16]:
df_links = pd.DataFrame(href_links_list, columns=['Code','href'])
df_links.columns

Index(['Code', 'href'], dtype='object')

In [17]:
df_links.shape

(15, 2)

In [18]:
df_links

Unnamed: 0,Code,href
0,TQ435815,https://tools.wmflabs.org/os/coor_g/?pagename=...
1,TQ405815,https://tools.wmflabs.org/os/coor_g/?pagename=...
2,TQ408807,https://tools.wmflabs.org/os/coor_g/?pagename=...
3,TQ425835,https://tools.wmflabs.org/os/coor_g/?pagename=...
4,TQ405855,https://tools.wmflabs.org/os/coor_g/?pagename=...
5,TQ435855,https://tools.wmflabs.org/os/coor_g/?pagename=...
6,TQ425855,https://tools.wmflabs.org/os/coor_g/?pagename=...
7,TQ391849,https://tools.wmflabs.org/os/coor_g/?pagename=...
8,TQ435795,https://tools.wmflabs.org/os/coor_g/?pagename=...
9,TQ405825,https://tools.wmflabs.org/os/coor_g/?pagename=...


### Merge the Areas and href Links DataFrames

In [19]:
cols = df_links.columns.difference(df_areas.columns)

cols

Index(['href'], dtype='object')

In [20]:
df_areas_links = pd.concat([df_areas, df_links[cols]], axis=1)

df_areas_links.shape

(15, 4)

In [21]:
df_areas_links

Unnamed: 0,Borough,Area,Code,href
0,Newham,Beckton,TQ435815,https://tools.wmflabs.org/os/coor_g/?pagename=...
1,Newham,Canning Town,TQ405815,https://tools.wmflabs.org/os/coor_g/?pagename=...
2,Newham,Custom House,TQ408807,https://tools.wmflabs.org/os/coor_g/?pagename=...
3,Newham,East Ham,TQ425835,https://tools.wmflabs.org/os/coor_g/?pagename=...
4,Newham,Forest Gate,TQ405855,https://tools.wmflabs.org/os/coor_g/?pagename=...
5,Newham,Little Ilford,TQ435855,https://tools.wmflabs.org/os/coor_g/?pagename=...
6,Newham,Manor Park,TQ425855,https://tools.wmflabs.org/os/coor_g/?pagename=...
7,Newham,Maryland,TQ391849,https://tools.wmflabs.org/os/coor_g/?pagename=...
8,Newham,North Woolwich,TQ435795,https://tools.wmflabs.org/os/coor_g/?pagename=...
9,Newham,Plaistow,TQ405825,https://tools.wmflabs.org/os/coor_g/?pagename=...


### Remove the row where there is no data

In [22]:
df_areas_links = df_areas_links.dropna(how='any')

df_areas_links

Unnamed: 0,Borough,Area,Code,href
0,Newham,Beckton,TQ435815,https://tools.wmflabs.org/os/coor_g/?pagename=...
1,Newham,Canning Town,TQ405815,https://tools.wmflabs.org/os/coor_g/?pagename=...
2,Newham,Custom House,TQ408807,https://tools.wmflabs.org/os/coor_g/?pagename=...
3,Newham,East Ham,TQ425835,https://tools.wmflabs.org/os/coor_g/?pagename=...
4,Newham,Forest Gate,TQ405855,https://tools.wmflabs.org/os/coor_g/?pagename=...
5,Newham,Little Ilford,TQ435855,https://tools.wmflabs.org/os/coor_g/?pagename=...
6,Newham,Manor Park,TQ425855,https://tools.wmflabs.org/os/coor_g/?pagename=...
7,Newham,Maryland,TQ391849,https://tools.wmflabs.org/os/coor_g/?pagename=...
8,Newham,North Woolwich,TQ435795,https://tools.wmflabs.org/os/coor_g/?pagename=...
9,Newham,Plaistow,TQ405825,https://tools.wmflabs.org/os/coor_g/?pagename=...


### Get the geo co-ordinates for all the areas in the Newham borough

In [23]:
geo_codes = []
for row in df_areas_links.itertuples():
    url = row.href
    code = row.Code
    res = requests.get(url).text
    soup1 = BeautifulSoup(res,'lxml')
    
    for lat in soup1.find('span',{'class':'latitude'}):
        latitude = lat
        latitude = float(latitude)
            
    for long in soup1.find('span',{'class':'longitude'}):    
        longitude = long
        longitude = float(longitude)
        
    geo_codes.append((code, latitude, longitude))

print(geo_codes)

[('TQ435815', 51.514642, 0.067375), ('TQ405815', 51.515396, 0.024169), ('TQ408807', 51.508133, 0.028171), ('TQ425835', 51.532867, 0.053782), ('TQ405855', 51.551339, 0.025765), ('TQ435855', 51.550584, 0.069004), ('TQ425855', 51.550838, 0.054591), ('TQ391849', 51.546294, 0.005349), ('TQ435795', 51.496671, 0.066561), ('TQ405825', 51.524382, 0.024568), ('TQ415795', 51.497175, 0.037769), ('TQ385845', 51.542847, -0.003456), ('TQ405837', 51.535165, 0.025046), ('TQ405837', 51.535165, 0.025046)]


### Create a DataFrame from the above list

In [24]:
df_geo_codes = pd.DataFrame(geo_codes, columns=['Code','Latitude','Longitude'])

df_geo_codes

Unnamed: 0,Code,Latitude,Longitude
0,TQ435815,51.514642,0.067375
1,TQ405815,51.515396,0.024169
2,TQ408807,51.508133,0.028171
3,TQ425835,51.532867,0.053782
4,TQ405855,51.551339,0.025765
5,TQ435855,51.550584,0.069004
6,TQ425855,51.550838,0.054591
7,TQ391849,51.546294,0.005349
8,TQ435795,51.496671,0.066561
9,TQ405825,51.524382,0.024568


### Now merge the Neighborhoods and Geocodes DataFrames

In [25]:
df_areas.columns

Index(['Borough', 'Area', 'Code'], dtype='object')

In [26]:
df_areas.shape

(14, 3)

In [27]:
df_geo_codes.columns

Index(['Code', 'Latitude', 'Longitude'], dtype='object')

In [28]:
df_geo_codes.shape

(14, 3)

In [30]:
cols = df_geo_codes.columns.difference(df_areas.columns)
cols

Index(['Latitude', 'Longitude'], dtype='object')

In [31]:
Newham_borough = pd.concat([df_areas, df_geo_codes[cols]], axis=1)
Newham_borough

Unnamed: 0,Borough,Area,Code,Latitude,Longitude
0,Newham,Beckton,TQ435815,51.514642,0.067375
1,Newham,Canning Town,TQ405815,51.515396,0.024169
2,Newham,Custom House,TQ408807,51.508133,0.028171
3,Newham,East Ham,TQ425835,51.532867,0.053782
4,Newham,Forest Gate,TQ405855,51.551339,0.025765
5,Newham,Little Ilford,TQ435855,51.550584,0.069004
6,Newham,Manor Park,TQ425855,51.550838,0.054591
7,Newham,Maryland,TQ391849,51.546294,0.005349
8,Newham,North Woolwich,TQ435795,51.496671,0.066561
9,Newham,Plaistow,TQ405825,51.524382,0.024568


### Change the name of the column 'Area' to 'Neighborhood '

In [32]:
Newham_borough = Newham_borough.rename(columns={'Area' :'Neighborhood'})

### We do not need the column Code for our further analysis, so we will drop it

In [33]:
Newham_borough.drop(['Code'], axis=1, inplace=True)

Newham_borough.columns

Index(['Borough', 'Neighborhood', 'Latitude', 'Longitude'], dtype='object')

In [34]:
Newham_borough

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Newham,Beckton,51.514642,0.067375
1,Newham,Canning Town,51.515396,0.024169
2,Newham,Custom House,51.508133,0.028171
3,Newham,East Ham,51.532867,0.053782
4,Newham,Forest Gate,51.551339,0.025765
5,Newham,Little Ilford,51.550584,0.069004
6,Newham,Manor Park,51.550838,0.054591
7,Newham,Maryland,51.546294,0.005349
8,Newham,North Woolwich,51.496671,0.066561
9,Newham,Plaistow,51.524382,0.024568


In [35]:
Newham_borough.dtypes

Borough          object
Neighborhood     object
Latitude        float64
Longitude       float64
dtype: object

### Now lets get the Coordinates of Newham Borough

In [37]:
address = 'Newham, London'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Newham are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Newham are 51.52999955, 0.0293179602938221.


# Let's Visualize the Areas(Neighborhoods) of Newham Borough

In [38]:
# create map of Newham using latitude and longitude values
map_Newham = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(Newham_borough['Latitude'], Newham_borough['Longitude'], Newham_borough['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_Newham)  
    
map_Newham

### Define FourSquare Credentials

In [39]:
CLIENT_ID = '4S2SWSJCB4RJUNTMOXCBRS1DJ5TBD23RSSJHX1NO2ZAEA4U1' 
CLIENT_SECRET = 'L2K1DJXYPKRXTC0LTUFLHMOQLN2MSZVQKYEI43CQASFNOFTI' 
VERSION = '20191102'

### Let's explore the first Neighborhood(Area) in Newham Borough

In [40]:
Newham_borough.shape

(14, 4)

In [41]:
Newham_borough.columns

Index(['Borough', 'Neighborhood', 'Latitude', 'Longitude'], dtype='object')

In [42]:
Newham_borough.loc[0, 'Neighborhood']

'Beckton'

In [43]:
Newham_borough.loc[0]

Borough           Newham
Neighborhood     Beckton
Latitude         51.5146
Longitude       0.067375
Name: 0, dtype: object

### Get the Neighborhood's latitude and longitude value

In [44]:
neighborhood_latitude = Newham_borough.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = Newham_borough.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = Newham_borough.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Beckton are 51.514642, 0.067375.


### Now get the top 100 places of Newham Neighborhood within 500KM radius

### Create the GET request URL. Name your URL url.

In [45]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret=\
       {}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=4S2SWSJCB4RJUNTMOXCBRS1DJ5TBD23RSSJHX1NO2ZAEA4U1&client_secret=       L2K1DJXYPKRXTC0LTUFLHMOQLN2MSZVQKYEI43CQASFNOFTI&v=20191102&ll=51.514642,0.067375&radius=500&limit=100'

In [46]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5c7950174434b95787cf0813'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-4cd462e4886cb60c1bcb7d89-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/shops/food_grocery_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d118951735',
         'name': 'Grocery Store',
         'pluralName': 'Grocery Stores',
         'primary': True,
         'shortName': 'Grocery Store'}],
       'id': '4cd462e4886cb60c1bcb7d89',
       'location': {'address': 'Alpine Way (Unit 4C, Beckton Retail Park)',
        'cc': 'GB',
        'city': 'London',
        'country': 'United Kingdom',
        'distance': 367,
        'formattedAddress': ['Alpine Way (Unit 4C, Beckton Retail Park)',
         'London',
         'Greater London',
         'E6 6

### Extract the Categories of the Venues

In [47]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

### We are ready to clean the json and structure it into a pandas dataframe.¶

In [48]:
results = requests.get(url).json()

venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Lituanica,Grocery Store,51.516442,0.062927
1,Home Bargains,Discount Store,51.51719,0.062754
2,Premier Inn London Beckton,Hotel,51.515017,0.060978
3,Dreams Beckton,Furniture / Home Store,51.516101,0.063028
4,Beckton DLR Station,Light Rail Station,51.514365,0.06146


In [49]:
nearby_venues.shape

(8, 4)

In [50]:
nearby_venues['name'].value_counts()

Home Bargains                 1
Matalan                       1
Lituanica                     1
Premier Inn London Beckton    1
Beckton Retail Park           1
Beckton DLR Station           1
Dreams Beckton                1
Brewers Fayre                 1
Name: name, dtype: int64

### And how many venues were returned by Foursquare?

In [52]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

8 venues were returned by Foursquare.


# 2) Explore Neighborhoods in Newham Borough

In [53]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, 
            VERSION, lat, lng, radius, LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, lat, lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 
                             'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
    
    return(nearby_venues)

### Now write the code to run the above function on each neighborhood and create a new dataframe called Newham venues

In [54]:
Newham_venues = getNearbyVenues(names=Newham_borough['Neighborhood'],
                                   latitudes=Newham_borough['Latitude'],
                                   longitudes=Newham_borough['Longitude']
                                  )

In [55]:
Newham_venues.shape

(206, 7)

In [56]:
Newham_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Beckton,51.514642,0.067375,Lituanica,51.516442,0.062927,Grocery Store
1,Beckton,51.514642,0.067375,Home Bargains,51.51719,0.062754,Discount Store
2,Beckton,51.514642,0.067375,Premier Inn London Beckton,51.515017,0.060978,Hotel
3,Beckton,51.514642,0.067375,Dreams Beckton,51.516101,0.063028,Furniture / Home Store
4,Beckton,51.514642,0.067375,Beckton DLR Station,51.514365,0.06146,Light Rail Station


### Let's check how many venues were returned for each neighborhood

In [57]:
Newham_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Beckton,8,8,8,8,8,8
Canning Town,4,4,4,4,4,4
Custom House,23,23,23,23,23,23
East Ham,19,19,19,19,19,19
Forest Gate,11,11,11,11,11,11
Little Ilford,4,4,4,4,4,4
Manor Park,5,5,5,5,5,5
Maryland,22,22,22,22,22,22
North Woolwich,18,18,18,18,18,18
Plaistow,8,8,8,8,8,8


### Let's find out how many unique categories can be curated from all the returned venues

In [58]:
print('There are {} uniques categories.'.format(len(Newham_venues['Venue Category'].unique())))

There are 94 uniques categories.


# 3) Analyze Each Neighborhood in Newham borough

In [59]:
# one hot encoding
Newham_onehot = pd.get_dummies(Newham_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Newham_onehot['Neighborhood'] = Newham_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Newham_onehot.columns[-1]] + list(Newham_onehot.columns[:-1])
Newham_onehot = Newham_onehot[fixed_columns]

Newham_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,American Restaurant,Art Gallery,Asian Restaurant,Bakery,Bar,Boat or Ferry,Bookstore,Boutique,...,Tapas Restaurant,Tennis Court,Thai Restaurant,Theater,Toy / Game Store,Train Station,Tunnel,Turkish Restaurant,Warehouse Store,Wine Bar
0,Beckton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Beckton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Beckton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Beckton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Beckton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
Newham_onehot.shape

(206, 95)

### Let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [61]:
Newham_grouped = Newham_onehot.groupby('Neighborhood').mean().reset_index()
Newham_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,American Restaurant,Art Gallery,Asian Restaurant,Bakery,Bar,Boat or Ferry,Bookstore,Boutique,...,Tapas Restaurant,Tennis Court,Thai Restaurant,Theater,Toy / Game Store,Train Station,Tunnel,Turkish Restaurant,Warehouse Store,Wine Bar
0,Beckton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Canning Town,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Custom House,0.0,0.043478,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,...,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478
3,East Ham,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.052632,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.052632,0.0
4,Forest Gate,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0


In [62]:
Newham_grouped.columns

Index(['Neighborhood', 'Accessories Store', 'American Restaurant',
       'Art Gallery', 'Asian Restaurant', 'Bakery', 'Bar', 'Boat or Ferry',
       'Bookstore', 'Boutique', 'Brewery', 'Bridal Shop', 'Bridge',
       'Bubble Tea Shop', 'Burger Joint', 'Bus Stop', 'Café',
       'Chinese Restaurant', 'Clothing Store', 'Coffee Shop',
       'Convenience Store', 'Cosmetics Shop', 'Creperie', 'Dance Studio',
       'Department Store', 'Dessert Shop', 'Discount Store', 'Doctor's Office',
       'Doner Restaurant', 'Eastern European Restaurant', 'Electronics Store',
       'English Restaurant', 'Fast Food Restaurant', 'Fish & Chips Shop',
       'Food Court', 'Fried Chicken Joint', 'Furniture / Home Store',
       'Gas Station', 'General Entertainment', 'Gift Shop', 'Go Kart Track',
       'Greek Restaurant', 'Grocery Store', 'Gym / Fitness Center', 'Gym Pool',
       'Health Food Store', 'History Museum', 'Home Service', 'Hotel',
       'Hotel Bar', 'Ice Cream Shop', 'Indian Restaurant',
 

In [63]:
Newham_grouped.shape

(14, 95)

### Let's print each neighborhood along with the top 5 most common venues

In [64]:
num_top_venues = 5
for hood in Newham_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = Newham_grouped[Newham_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Beckton----
                    venue  freq
0                   Hotel  0.12
1           Grocery Store  0.12
2                     Pub  0.12
3          Clothing Store  0.12
4  Furniture / Home Store  0.12


----Canning Town----
               venue  freq
0               Park  0.25
1        Gas Station  0.25
2  Convenience Store  0.25
3       Tennis Court  0.25
4    Paintball Field  0.00


----Custom House----
                venue  freq
0               Hotel  0.17
1  Chinese Restaurant  0.09
2                 Pub  0.09
3    Tapas Restaurant  0.04
4      Scenic Lookout  0.04


----East Ham----
                  venue  freq
0  Fast Food Restaurant  0.11
1        Clothing Store  0.11
2              Gym Pool  0.05
3     Electronics Store  0.05
4       Doctor's Office  0.05


----Forest Gate----
           venue  freq
0  Grocery Store  0.18
1         Market  0.18
2           Café  0.09
3  Moving Target  0.09
4         Bakery  0.09


----Little Ilford----
                 venue  freq
0   

### Let's put that into a pandas dataframe
### First, let's write a function to sort the venues in descending order.

In [65]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [66]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
        
# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Newham_grouped['Neighborhood']

for ind in np.arange(Newham_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Newham_grouped.iloc[ind, :], num_top_venues)
    
neighborhoods_venues_sorted.head()    

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Beckton,Light Rail Station,Discount Store,Hotel,Grocery Store,Clothing Store,Furniture / Home Store,Pub,Shopping Plaza,Wine Bar,Doctor's Office
1,Canning Town,Park,Convenience Store,Tennis Court,Gas Station,Fish & Chips Shop,Dessert Shop,Discount Store,Doctor's Office,Doner Restaurant,Eastern European Restaurant
2,Custom House,Hotel,Chinese Restaurant,Pub,Wine Bar,Scenic Lookout,Hotel Bar,Italian Restaurant,Japanese Restaurant,English Restaurant,Light Rail Station
3,East Ham,Fast Food Restaurant,Clothing Store,Coffee Shop,Shopping Mall,Warehouse Store,Jewelry Store,Electronics Store,Doctor's Office,Grocery Store,Park
4,Forest Gate,Grocery Store,Market,Train Station,Bakery,Italian Restaurant,Pub,Moving Target,Café,Fast Food Restaurant,Wine Bar


# 4) Cluster the Neighborhoods¶

# Run K-means algorithm to cluster the neighborhood into 5 clusters.

In [67]:
Newham_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,American Restaurant,Art Gallery,Asian Restaurant,Bakery,Bar,Boat or Ferry,Bookstore,Boutique,...,Tapas Restaurant,Tennis Court,Thai Restaurant,Theater,Toy / Game Store,Train Station,Tunnel,Turkish Restaurant,Warehouse Store,Wine Bar
0,Beckton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Canning Town,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Custom House,0.0,0.043478,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,...,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478
3,East Ham,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.052632,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.052632,0.0
4,Forest Gate,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0


In [68]:
# set number of clusters
kclusters = 5
Newham_grouped_clustering = Newham_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Newham_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 2, 1, 1, 1, 3, 1, 1, 1, 0], dtype=int32)

In [69]:
kmeans

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=0, tol=0.0001, verbose=0)

### Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [70]:
Newham_merged = Newham_borough
# add clustering labels
Newham_merged['Cluster Labels'] = kmeans.labels_

# merge Neighborhoods dataframe with Newham borough dataframe to add latitude/longitude for each neighborhood
Newham_merged = Newham_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Newham_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Newham,Beckton,51.514642,0.067375,1,Light Rail Station,Discount Store,Hotel,Grocery Store,Clothing Store,Furniture / Home Store,Pub,Shopping Plaza,Wine Bar,Doctor's Office
1,Newham,Canning Town,51.515396,0.024169,2,Park,Convenience Store,Tennis Court,Gas Station,Fish & Chips Shop,Dessert Shop,Discount Store,Doctor's Office,Doner Restaurant,Eastern European Restaurant
2,Newham,Custom House,51.508133,0.028171,1,Hotel,Chinese Restaurant,Pub,Wine Bar,Scenic Lookout,Hotel Bar,Italian Restaurant,Japanese Restaurant,English Restaurant,Light Rail Station
3,Newham,East Ham,51.532867,0.053782,1,Fast Food Restaurant,Clothing Store,Coffee Shop,Shopping Mall,Warehouse Store,Jewelry Store,Electronics Store,Doctor's Office,Grocery Store,Park
4,Newham,Forest Gate,51.551339,0.025765,1,Grocery Store,Market,Train Station,Bakery,Italian Restaurant,Pub,Moving Target,Café,Fast Food Restaurant,Wine Bar


### Let's Visualize the Cluster

In [71]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
y = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(y)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Newham_merged['Latitude'], Newham_merged['Longitude'], Newham_merged['Neighborhood'], Newham_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# 5) Examine the Clusters

In [72]:
Newham_merged.loc[Newham_merged['Cluster Labels'] == 0, Newham_merged.columns[[1] + list(range(5, Newham_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,Plaistow,Park,Café,Gym / Fitness Center,Grocery Store,Indian Restaurant,Bus Stop,Fish & Chips Shop,Discount Store,Doctor's Office,Doner Restaurant
10,Silvertown,Park,Gym / Fitness Center,Go Kart Track,Theater,Café,Paintball Field,English Restaurant,Department Store,Dessert Shop,Discount Store


In [73]:
Newham_merged.loc[Newham_merged['Cluster Labels'] == 1, Newham_merged.columns[[1] + list(range(5, Newham_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Beckton,Light Rail Station,Discount Store,Hotel,Grocery Store,Clothing Store,Furniture / Home Store,Pub,Shopping Plaza,Wine Bar,Doctor's Office
2,Custom House,Hotel,Chinese Restaurant,Pub,Wine Bar,Scenic Lookout,Hotel Bar,Italian Restaurant,Japanese Restaurant,English Restaurant,Light Rail Station
3,East Ham,Fast Food Restaurant,Clothing Store,Coffee Shop,Shopping Mall,Warehouse Store,Jewelry Store,Electronics Store,Doctor's Office,Grocery Store,Park
4,Forest Gate,Grocery Store,Market,Train Station,Bakery,Italian Restaurant,Pub,Moving Target,Café,Fast Food Restaurant,Wine Bar
6,Manor Park,Home Service,Turkish Restaurant,Gym / Fitness Center,Indian Restaurant,Restaurant,Fish & Chips Shop,Dessert Shop,Discount Store,Doctor's Office,Doner Restaurant
7,Maryland,Pub,Bus Stop,Grocery Store,Hotel,Supermarket,Dance Studio,Liquor Store,Eastern European Restaurant,Café,Portuguese Restaurant
8,North Woolwich,Pier,History Museum,Outdoor Sculpture,Scenic Lookout,Gym / Fitness Center,Hotel,Italian Restaurant,Clothing Store,Pharmacy,Plaza
11,Stratford,Pub,Sandwich Place,Fast Food Restaurant,Bookstore,Clothing Store,Cosmetics Shop,Café,Bus Stop,Burger Joint,Pizza Place


In [74]:
Newham_merged.loc[Newham_merged['Cluster Labels'] == 2, Newham_merged.columns[[1] + list(range(5, Newham_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Canning Town,Park,Convenience Store,Tennis Court,Gas Station,Fish & Chips Shop,Dessert Shop,Discount Store,Doctor's Office,Doner Restaurant,Eastern European Restaurant


In [75]:
Newham_merged.loc[Newham_merged['Cluster Labels'] == 3, Newham_merged.columns[[1] + list(range(5, Newham_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Little Ilford,Ice Cream Shop,Indian Restaurant,Grocery Store,Fried Chicken Joint,Wine Bar,Food Court,Discount Store,Doctor's Office,Doner Restaurant,Eastern European Restaurant


In [76]:
Newham_merged.loc[Newham_merged['Cluster Labels'] == 4, Newham_merged.columns[[1] + list(range(5, Newham_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,Upton Park,Convenience Store,Bookstore,Boutique,Bus Stop,Wine Bar,Fried Chicken Joint,Doner Restaurant,Eastern European Restaurant,Electronics Store,English Restaurant
13,West Ham,Convenience Store,Bookstore,Boutique,Bus Stop,Wine Bar,Fried Chicken Joint,Doner Restaurant,Eastern European Restaurant,Electronics Store,English Restaurant


# Conclusion:
### After examining the above 5 clusters, we can recommend our stakeholders that Beckton,Custom House, Maryland, Eastham and Manor Park are the best neighborhoods in Newham borough, to open their asian restuarant. This is because in these areas, the most common venue visited by the public is the restuarants and as these areas are highly populated with asians, opening an asian restuarant would definitley be a good idea.