# Capstone Project - Clustering the coffee shops of London - Code

In [2]:
#Importing Libraries

import os
from io import StringIO

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

In [2]:
#Scraping the neighborhood data for London from wikipedia

url_london = "https://en.wikipedia.org/wiki/List_of_areas_of_London"
# A timeout keeps the cell from hanging forever on a stalled connection.
wiki_london = requests.get(url_london, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
wiki_london.raise_for_status()

# read_html returns one DataFrame per HTML table on the page. The text is
# wrapped in StringIO because newer pandas versions deprecate passing literal
# HTML strings directly.
london_tables = pd.read_html(StringIO(wiki_london.text))

# The second table is the one with the Location / London borough / Post town /
# Postcode district / Dial code / OS grid ref columns.
# NOTE(review): this position depends on the current page layout - confirm it
# still holds if the article changes.
london_data = london_tables[1]
london_data

Unnamed: 0,Location,London borough,Post town,Postcode district,Dial code,OS grid ref
0,Abbey Wood,"Bexley, Greenwich [7]",LONDON,SE2,020,TQ465785
1,Acton,"Ealing, Hammersmith and Fulham[8]",LONDON,"W3, W4",020,TQ205805
2,Addington,Croydon[8],CROYDON,CR0,020,TQ375645
3,Addiscombe,Croydon[8],CROYDON,CR0,020,TQ345665
4,Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14",020,TQ478728
...,...,...,...,...,...,...
526,Woolwich,Greenwich,LONDON,SE18,020,TQ435795
527,Worcester Park,"Sutton, Kingston upon Thames",WORCESTER PARK,KT4,020,TQ225655
528,Wormwood Scrubs,Hammersmith and Fulham,LONDON,W12,020,TQ225815
529,Yeading,Hillingdon,HAYES,UB4,020,TQ115825


In [3]:
#Data preprocessing and cleaning the dataframe

#replacing spaces with underscores
# (rebinding instead of inplace=True, so the step is a plain expression)
london_data = london_data.rename(columns=lambda x: x.strip().replace(" ", "_"))

#dropping unnecessary columns, selected by position (first, fifth and sixth)
london_df = london_data.drop(columns=london_data.columns[[0, 4, 5]])

#renaming columns
london_df.columns = ['borough','town','post_code']

#Stripping Wikipedia citation markers such as "[8]" from the borough column.
# The pattern removes one or more trailing "[digits]" groups together with the
# whitespace around them, so "Bexley, Greenwich [7]" becomes "Bexley, Greenwich"
# with no trailing space left behind.
london_df['borough'] = (
    london_df['borough']
    .str.replace(r'(\s*\[\d+\])+\s*$', '', regex=True)
    .str.strip()
)

#Refining data to only locations in London
# na=False treats a missing post town as "not London" instead of producing a
# NaN entry in the boolean mask.
london_df = london_df[london_df['town'].str.contains('LONDON', na=False)]

In [4]:
#Checking out the refined df: its dimensions first, then the leading rows

print(london_df.shape)
london_df.head()

(308, 3)


Unnamed: 0,borough,town,post_code
0,"Bexley, Greenwich",LONDON,SE2
1,"Ealing, Hammersmith and Fulham",LONDON,"W3, W4"
6,City,LONDON,EC3
7,Westminster,LONDON,WC2
9,Bromley,LONDON,SE20


In [5]:
#Importing the ArcGIS geocoding tools (this cell imports them; it does not download anything)

from arcgis.geocoding import geocode
from arcgis.gis import GIS
# GIS() is called without credentials - presumably this opens an anonymous
# session that later geocode() calls rely on (TODO confirm against the arcgis docs).
# The `gis` name itself is not referenced again in the cells shown here.
gis = GIS()

In [6]:
#Create function to get coordinates

def get_x_y_uk(address1):
    """Geocode ``address1`` inside London and return it as a ``"lat,lng"`` string.

    The address is looked up as ``"<address1>, London, England, GBR"`` and the
    first candidate returned by ``geocode`` is used.

    Parameters
    ----------
    address1 : str
        Free-text address fragment; the notebook passes postcode districts.

    Returns
    -------
    str
        Latitude and longitude joined by a comma, latitude first.

    Raises
    ------
    IndexError
        If the geocoder returns no candidates for the address.

    NOTE(review): in the recorded run, "IG8" came back with exactly the same
    coordinates as the plain "London, England, GBR" lookup, so the geocoder
    appears to fall back to the city when it cannot match the postcode -
    consider checking the candidate's match score before trusting it.
    """
    matches = geocode(address='{}, London, England, GBR'.format(address1))
    if not matches:
        # Same exception type as indexing an empty list, but naming the input.
        raise IndexError('no geocoding result for {!r}'.format(address1))
    location = matches[0]['location']
    # The geocoder reports x = longitude and y = latitude; emit latitude first.
    return str(location['y']) + "," + str(location['x'])


In [7]:
#Post code districts of the London rows, taken from the df

geo_coordinates_uk = london_df.loc[:, 'post_code']
geo_coordinates_uk

0           SE2
1        W3, W4
6           EC3
7           WC2
9          SE20
         ...   
521    IG8, E18
522         IG8
525         N12
526        SE18
528         W12
Name: post_code, Length: 308, dtype: object

In [8]:
#Finding coordinates for each post code
# (one geocoder call per row; the result is a Series of "lat,lng" strings)

coordinates = geo_coordinates_uk.apply(get_x_y_uk)
coordinates

0       51.492450000000076,0.12127000000003818
1        51.51324000000005,-0.2674599999999714
6       51.51200000000006,-0.08057999999994081
7       51.51651000000004,-0.11967999999995982
9       51.41009000000008,-0.05682999999993399
                        ...                   
521    51.589770000000044,0.030520000000024083
522      51.50642000000005,-0.1272099999999341
525     51.615920000000074,-0.1767399999999384
526      51.48207000000008,0.07143000000002075
528      51.50645000000003,-0.2369099999999662
Name: post_code, Length: 308, dtype: object

In [9]:
#Splitting previous result into latitude
# (the part of each "lat,lng" string before the comma, still as text)

lat_uk = coordinates.str.split(',').str[0]
lat_uk

0      51.492450000000076
1       51.51324000000005
6       51.51200000000006
7       51.51651000000004
9       51.41009000000008
              ...        
521    51.589770000000044
522     51.50642000000005
525    51.615920000000074
526     51.48207000000008
528     51.50645000000003
Name: post_code, Length: 308, dtype: object

In [10]:
#Splitting previous result into longitude
# (the part of each "lat,lng" string after the comma, still as text)

lng_uk = coordinates.str.split(',').str[1]
lng_uk

0       0.12127000000003818
1       -0.2674599999999714
6      -0.08057999999994081
7      -0.11967999999995982
9      -0.05682999999993399
               ...         
521    0.030520000000024083
522     -0.1272099999999341
525     -0.1767399999999384
526     0.07143000000002075
528     -0.2369099999999662
Name: post_code, Length: 308, dtype: object

In [11]:
#Merging the london df with the coordinates
# The coordinate Series share london_df's index, so assign() lines them up
# row by row; they are converted from text to float on the way in.

london_merged = london_df.assign(
    latitude=lat_uk.astype(float),
    longitude=lng_uk.astype(float),
)
london_merged

Unnamed: 0,borough,town,post_code,latitude,longitude
0,"Bexley, Greenwich",LONDON,SE2,51.49245,0.12127
1,"Ealing, Hammersmith and Fulham",LONDON,"W3, W4",51.51324,-0.26746
6,City,LONDON,EC3,51.51200,-0.08058
7,Westminster,LONDON,WC2,51.51651,-0.11968
9,Bromley,LONDON,SE20,51.41009,-0.05683
...,...,...,...,...,...
521,Redbridge,LONDON,"IG8, E18",51.58977,0.03052
522,"Redbridge, Waltham Forest","LONDON, WOODFORD GREEN",IG8,51.50642,-0.12721
525,Barnet,LONDON,N12,51.61592,-0.17674
526,Greenwich,LONDON,SE18,51.48207,0.07143


In [12]:
#Getting the geocode of london
# First geocoder candidate for the city; x is the longitude, y the latitude.

london = geocode(address='London, England, GBR')[0]
london_location = london['location']
long, lat = london_location['x'], london_location['y']
print(long, lat)

-0.1272099999999341 51.50642000000005


In [13]:
# Creating the map of London, centred on the geocoded city coordinates
map_London = folium.Map(location=[lat, long], zoom_start=12)

# adding markers to map: one red circle per row of london_merged,
# with a "town, borough" popup
marker_rows = zip(
    london_merged['latitude'],
    london_merged['longitude'],
    london_merged['borough'],
    london_merged['town'],
)
for latitude, longitude, borough, town in marker_rows:
    label = folium.Popup('{}, {}'.format(town, borough), parse_html=True)
    marker = folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='red',
        fill=True,
    )
    marker.add_to(map_London)

map_London


In [14]:
#Foursquare API details

# Credentials are read from the environment so they are never stored in the
# notebook or shown in its output. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that was previously hard-coded in this cell should
# be treated as compromised and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20180605' # Foursquare API version

# Report only whether each credential is present, never its value.
print('Your credentials:')
print('Foursquare_ID: ' + ('set' if CLIENT_ID else 'MISSING (export FOURSQUARE_CLIENT_ID)'))
print('Foursquare_Secret:' + ('set' if CLIENT_SECRET else 'MISSING (export FOURSQUARE_CLIENT_SECRET)'))

Your credentials:
Foursquare_ID: <redacted>
Foursquare_Secret:<redacted>


In [15]:
#Creating a function to pull nearby venues from the Foursquare API

LIMIT=100

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query Foursquare's venue "explore" endpoint around each given point.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        One label and one coordinate pair per search centre; they are walked
        in parallel with ``zip``.
    radius : int, default 500
        Search radius sent to the API as the ``radius`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue' and
        'Venue Category'. The coordinates are those of the search centre, not
        of the venue. The frame is empty, but still has these columns, when
        nothing is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (e.g. rejected credentials).

    Notes
    -----
    Reads the module-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and
    ``LIMIT`` and prints each name as a progress indicator.
    """
    column_names = ['Neighbourhood',
                    'Neighbourhood Latitude',
                    'Neighbourhood Longitude',
                    'Venue',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of formatting
        # the URL by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # Surface quota/credential problems as an HTTP error rather than as a
        # KeyError while digging through the payload below.
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            categories = v['venue'].get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                v['venue']['name'],
                # A venue without any category gets None instead of crashing.
                categories[0]['name'] if categories else None))

    # Passing columns= here (rather than assigning .columns afterwards) keeps
    # the call working when no venues were collected at all.
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

In [16]:
#Extracting nearby venues around every neighbourhood coordinate
# (rows are labelled with the borough name)

ldn_venues = getNearbyVenues(
    names=london_merged['borough'],
    latitudes=london_merged['latitude'],
    longitudes=london_merged['longitude'],
)

Bexley, Greenwich 
Ealing, Hammersmith and Fulham
City
Westminster
Bromley
Islington
Islington
Barnet
Enfield
Wandsworth
Southwark
City
Richmond upon Thames
Barnet
Islington
Wandsworth
Westminster
Bromley
Newham
Ealing
Westminster
Lewisham
Camden
Southwark
Tower Hamlets
Bexley
City
Lewisham
Greenwich
Tower Hamlets
Camden
Haringey
Tower Hamlets
Haringey
Barnet
Brent
Lambeth
Lewisham
Tower Hamlets
Kensington and Chelsea, Hammersmith and Fulham
Brent
Barnet
Barnet
Southwark
Tower Hamlets
Camden
Tower Hamlets
Waltham Forest
Newham
Islington
Richmond upon Thames
Lewisham
Camden
Westminster
Greenwich
Kensington and Chelsea
Barnet
Westminster
Lewisham
Waltham Forest
Hounslow, Ealing, Hammersmith and Fulham
Brent
Barnet
Lambeth, Wandsworth
Islington
Barnet
Merton
Barnet
Westminster
Barnet, Brent, Camden
Lewisham
Bexley
Haringey
Bromley
Tower Hamlets
Newham
Hackney
Islington
Southwark
Lewisham
Brent
Southwark
Ealing
Kensington and Chelsea
Wandsworth
Southwark
Barnet
Newham
Richmond upon Thames


In [17]:
#First rows of the venue table: one row per venue, labelled with the
#neighbourhood whose coordinates were searched

ldn_venues.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Category
0,"Bexley, Greenwich",51.49245,0.12127,Lesnes Abbey,Historic Site
1,"Bexley, Greenwich",51.49245,0.12127,Sainsbury's,Supermarket
2,"Bexley, Greenwich",51.49245,0.12127,Lidl,Supermarket
3,"Bexley, Greenwich",51.49245,0.12127,Abbey Wood Railway Station (ABW),Train Station
4,"Bexley, Greenwich",51.49245,0.12127,Bean @ Work,Coffee Shop


In [18]:
#looking at the most popular venue categories
# (number of venues per category, most frequent first; top 20 shown)

venue_categories = ldn_venues['Venue Category']
ldn_venues2 = venue_categories.value_counts()
ldn_venues2.head(20)

Pub                     776
Coffee Shop             699
Café                    594
Hotel                   326
Italian Restaurant      314
Grocery Store           304
Bakery                  224
Park                    219
Sandwich Place          210
Pizza Place             201
Gym / Fitness Center    184
Indian Restaurant       172
Supermarket             169
Bar                     154
Fast Food Restaurant    137
Restaurant              134
Burger Joint            129
Bus Stop                126
Pharmacy                125
Clothing Store          117
Name: Venue Category, dtype: int64

In [40]:
#Narrowing down the venues to just coffee shops and cafes

coffee_categories = ["Café", "Coffee Shop"]
is_coffee_venue = ldn_venues['Venue Category'].isin(coffee_categories)
all_ldn_coffee = ldn_venues[is_coffee_venue]
# non-null count per column, i.e. how many coffee venues were kept
all_ldn_coffee.count()

Neighbourhood              1293
Neighbourhood Latitude     1293
Neighbourhood Longitude    1293
Venue                      1293
Venue Category             1293
dtype: int64

In [41]:
#Counting coffee shops/cafes by neighbourhood label, largest count first

all_ldn_coffee['Neighbourhood'].value_counts()

Westminster                                       187
Camden                                            113
Islington                                          97
Barnet                                             94
Hackney                                            82
Haringey                                           64
Hammersmith and Fulham                             58
Southwark                                          57
Lewisham                                           54
Tower Hamlets                                      52
Kensington and Chelsea                             47
Wandsworth                                         41
Newham                                             40
Waltham Forest                                     38
Lambeth                                            37
Merton                                             26
City                                               17
Croydon                                            16
Brent                       

In [42]:
#First rows of the coffee-only dataframe (row labels are carried over from ldn_venues)

all_ldn_coffee.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Category
4,"Bexley, Greenwich",51.49245,0.12127,Bean @ Work,Coffee Shop
12,City,51.512,-0.08058,The Association,Coffee Shop
13,City,51.512,-0.08058,Curators Coffee Studio,Coffee Shop
28,City,51.512,-0.08058,Black Sheep Coffee,Coffee Shop
50,City,51.512,-0.08058,canteenM,Café


In [43]:
#Mapping coffee shops in London

# Centre the map on the geocoded London location. Reading it from `london`
# avoids depending on loop variables left behind by earlier cells.
coffee_map = folium.Map(
    location=[london['location']['y'], london['location']['x']],
    zoom_start=11,
)

# Loop variables carry a shop_ prefix so they do not overwrite the notebook-level
# `lat` that holds London's latitude.
# NOTE: markers are drawn at the neighbourhood search coordinates (the only
# coordinates stored per venue), so shops found by the same search sit on top
# of each other.
coffee_rows = zip(
    all_ldn_coffee['Neighbourhood Latitude'],
    all_ldn_coffee['Neighbourhood Longitude'],
    all_ldn_coffee['Venue'],
    all_ldn_coffee['Neighbourhood'],
)
for shop_lat, shop_lng, shop_name, shop_area in coffee_rows:
    label = folium.Popup('{}, {}'.format(shop_name, shop_area), parse_html=True)
    # parse_html belongs to Popup, not CircleMarker, so it is only passed above.
    folium.CircleMarker(
        [shop_lat, shop_lng],
        radius=3,
        popup=label,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.7,
    ).add_to(coffee_map)

coffee_map

In [44]:
# set number of clusters for K-means clustering
k_num_clusters = 5

# Cluster on the two coordinate columns only. Selecting them by name (instead of
# dropping the other columns with a positional axis argument, which newer pandas
# rejects) also keeps a previously inserted 'Cluster Labels' column out of the
# features when the cell is run again.
# NOTE: these are the neighbourhood search coordinates repeated once per coffee
# venue, so each neighbourhood is effectively weighted by its venue count.
London_grouped_clustering = all_ldn_coffee[['Neighbourhood Latitude', 'Neighbourhood Longitude']]

# run k-means clustering
# n_init is pinned so the result does not depend on the installed scikit-learn
# version's default.
kmeans_london = KMeans(n_clusters=k_num_clusters, n_init=10, random_state=0).fit(London_grouped_clustering)
kmeans_london

KMeans(n_clusters=5, random_state=0)

In [45]:
#Cluster index assigned by the fitted model to each row it was trained on
kmeans_london.labels_

array([2, 4, 4, ..., 1, 0, 0])

In [46]:
#Inserting a cluster label column into df
# DataFrame.insert refuses to add a column that already exists, so any column
# left over from a previous run is dropped first; the cell can then be re-run.
all_ldn_coffee = all_ldn_coffee.drop(columns='Cluster Labels', errors='ignore')
# Labels are shifted by one so clusters are numbered from 1; the column goes
# first because later cells select columns by position.
all_ldn_coffee.insert(0, 'Cluster Labels', kmeans_london.labels_ + 1)

In [47]:
#First rows of the df, now with 'Cluster Labels' as the leading column
all_ldn_coffee.head()

Unnamed: 0,Cluster Labels,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Category
4,3,"Bexley, Greenwich",51.49245,0.12127,Bean @ Work,Coffee Shop
12,5,City,51.512,-0.08058,The Association,Coffee Shop
13,5,City,51.512,-0.08058,Curators Coffee Studio,Coffee Shop
28,5,City,51.512,-0.08058,Black Sheep Coffee,Coffee Shop
50,5,City,51.512,-0.08058,canteenM,Café


In [48]:
#Keeping only the rows that actually carry a cluster label

has_cluster_label = all_ldn_coffee['Cluster Labels'].notna()
ldn_data_nonan = all_ldn_coffee[has_cluster_label]

In [49]:
#First rows of the labelled frame after the missing-label filter
ldn_data_nonan.head()

Unnamed: 0,Cluster Labels,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Category
4,3,"Bexley, Greenwich",51.49245,0.12127,Bean @ Work,Coffee Shop
12,5,City,51.512,-0.08058,The Association,Coffee Shop
13,5,City,51.512,-0.08058,Curators Coffee Studio,Coffee Shop
28,5,City,51.512,-0.08058,Black Sheep Coffee,Coffee Shop
50,5,City,51.512,-0.08058,canteenM,Café


In [50]:
#Coffee venues per neighbourhood label in the filtered frame, largest count first
ldn_data_nonan['Neighbourhood'].value_counts()

Westminster                                       187
Camden                                            113
Islington                                          97
Barnet                                             94
Hackney                                            82
Haringey                                           64
Hammersmith and Fulham                             58
Southwark                                          57
Lewisham                                           54
Tower Hamlets                                      52
Kensington and Chelsea                             47
Wandsworth                                         41
Newham                                             40
Waltham Forest                                     38
Lambeth                                            37
Merton                                             26
City                                               17
Croydon                                            16
Brent                       

In [61]:
#mapping clusters

# Centre on the geocoded London location. It is read from `london` directly
# because the notebook-level `lat` is reused as a loop variable in other cells.
ldn_map = folium.Map(
    location=[london['location']['y'], london['location']['x']],
    zoom_start=12,
)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, k_num_clusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
cluster_rows = zip(
    ldn_data_nonan['Neighbourhood Latitude'],
    ldn_data_nonan['Neighbourhood Longitude'],
    ldn_data_nonan['Neighbourhood'],
    ldn_data_nonan['Cluster Labels'],
)
for point_lat, point_lng, poi, cluster in cluster_rows:
    # 'Cluster Labels' already counts from 1, so it is shown as stored; only
    # the colour lookup needs the 0-based index.
    cluster_id = int(cluster)
    label = folium.Popup('Cluster ' + str(cluster_id) + '\n' + str(poi), parse_html=True)
    folium.CircleMarker(
        [point_lat, point_lng],
        radius=5,
        popup=label,
        color=rainbow[cluster_id - 1],
        fill=True,
        fill_color=rainbow[cluster_id - 1]
        ).add_to(ldn_map)

ldn_map

In [59]:
#Cluster 1
# Rows labelled 1, showing the column at position 1 plus every column from
# position 5 onwards.
cluster_rows = ldn_data_nonan['Cluster Labels'] == 1
summary_positions = [1, *range(5, ldn_data_nonan.shape[1])]
ldn_data_nonan.loc[cluster_rows, ldn_data_nonan.columns[summary_positions]]

Unnamed: 0,Neighbourhood,Venue Category
348,Wandsworth,Coffee Shop
350,Wandsworth,Coffee Shop
358,Wandsworth,Coffee Shop
372,Wandsworth,Café
373,Wandsworth,Coffee Shop
...,...,...
10233,Merton,Café
10238,Merton,Coffee Shop
10242,Merton,Coffee Shop
10386,Hammersmith and Fulham,Café


In [33]:
#Cluster 2
# Rows labelled 2, showing the column at position 1 plus every column from
# position 5 onwards.
cluster_rows = ldn_data_nonan['Cluster Labels'] == 2
summary_positions = [1, *range(5, ldn_data_nonan.shape[1])]
ldn_data_nonan.loc[cluster_rows, ldn_data_nonan.columns[summary_positions]]

Unnamed: 0,Neighbourhood,Venue Category
273,Islington,Café
277,Islington,Coffee Shop
281,Islington,Coffee Shop
284,Islington,Coffee Shop
289,Islington,Coffee Shop
...,...,...
10356,Barnet,Coffee Shop
10358,Barnet,Coffee Shop
10360,Barnet,Coffee Shop
10372,Barnet,Coffee Shop


In [34]:
#Cluster 3
# Rows labelled 3, showing the column at position 1 plus every column from
# position 5 onwards.
cluster_rows = ldn_data_nonan['Cluster Labels'] == 3
summary_positions = [1, *range(5, ldn_data_nonan.shape[1])]
ldn_data_nonan.loc[cluster_rows, ldn_data_nonan.columns[summary_positions]]

Unnamed: 0,Neighbourhood,Venue Category
4,"Bexley, Greenwich",Coffee Shop
719,Newham,Café
726,Newham,Café
727,Newham,Café
732,Newham,Café
...,...,...
9465,Croydon,Coffee Shop
9466,Croydon,Café
9521,Newham,Café
9526,Newham,Café


In [35]:
#Cluster 4
# Rows labelled 4, showing the column at position 1 plus every column from
# position 5 onwards.
cluster_rows = ldn_data_nonan['Cluster Labels'] == 4
summary_positions = [1, *range(5, ldn_data_nonan.shape[1])]
ldn_data_nonan.loc[cluster_rows, ldn_data_nonan.columns[summary_positions]]

Unnamed: 0,Neighbourhood,Venue Category
978,Tower Hamlets,Café
980,Tower Hamlets,Café
981,Tower Hamlets,Coffee Shop
982,Tower Hamlets,Café
983,Tower Hamlets,Coffee Shop
...,...,...
9900,Newham,Coffee Shop
9913,Newham,Coffee Shop
10197,Tower Hamlets,Coffee Shop
10276,Redbridge,Coffee Shop


In [36]:
#Cluster 5
# Rows labelled 5, showing the column at position 1 plus every column from
# position 5 onwards.
cluster_rows = ldn_data_nonan['Cluster Labels'] == 5
summary_positions = [1, *range(5, ldn_data_nonan.shape[1])]
ldn_data_nonan.loc[cluster_rows, ldn_data_nonan.columns[summary_positions]]

Unnamed: 0,Neighbourhood,Venue Category
12,City,Coffee Shop
13,City,Coffee Shop
28,City,Coffee Shop
50,City,Café
82,City,Coffee Shop
...,...,...
10102,Westminster,Coffee Shop
10113,Westminster,Café
10123,Westminster,Coffee Shop
10127,Westminster,Café
