## Segmenting and Clustering Neighborhoods of Toronto 

### Import libraries
Prepare and get all needed dependencies

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import requests # HTTP library

!pip install beautifulsoup4
from bs4 import BeautifulSoup # scraping library

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')


Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/3b/c8/a55eb6ea11cd7e5ac4bacdf92bac4693b90d3ba79268be16527555e186f0/beautifulsoup4-4.8.1-py3-none-any.whl (101kB)
[K     |████████████████████████████████| 102kB 3.5MB/s eta 0:00:01
[?25hCollecting soupsieve>=1.2 (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/81/94/03c0f04471fc245d08d0a99f7946ac228ca98da4fa75796c507f61e688c2/soupsieve-1.9.5-py2.py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.8.1 soupsieve-1.9.5
Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    -------------------------

## Data Scraping and Cleaning
Scrape the Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe

In [2]:
# Download the article's HTML; fail loudly on an HTTP error and never hang forever
wiki_response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
wiki_response.raise_for_status()
wiki_text = wiki_response.text

# Three parallel lists, one per column of the postal-code table
postalCodeList = []
boroughList = []
neighborhoodList = []

# Parse the HTML with BeautifulSoup
parsed_result = BeautifulSoup(wiki_text, 'html.parser')

# Locate the postal-code table (the first table carrying the 'wikitable' class)
neighborhood_info_table = parsed_result.find('table', class_ = 'wikitable')
if neighborhood_info_table is None:
    # NOTE(review): the page layout may have changed since this cell was written
    raise ValueError("No table with class 'wikitable' was found on the page")

# Every <tr> of that table; header rows contain <th> cells only and are skipped below
neighborhood_rows = neighborhood_info_table.find_all('tr')

for neighborhood_row in neighborhood_rows:
    neighborhood_cells = neighborhood_row.find_all('td')
    # Only rows with all three data cells can be unpacked safely
    if len(neighborhood_cells) >= 3:
        # strip() removes the newline/whitespace padding around the cell text so
        # that later comparisons such as == "Not assigned" match exactly
        postalCodeList.append(neighborhood_cells[0].text.strip())
        boroughList.append(neighborhood_cells[1].text.strip())
        neighborhoodList.append(neighborhood_cells[2].text.strip())

# Assemble the dataframe from the three lists
toronto_df = pd.DataFrame({"PostalCode": postalCodeList,
                           "Borough": boroughList,
                           "Neighborhood": neighborhoodList})

toronto_df.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Drop rows with a borough that is not assigned

In [3]:
# keep only the rows whose Borough is something other than "Not assigned",
# then renumber the index from 0
toronto_df_dropna = (
    toronto_df
    .loc[toronto_df["Borough"].ne("Not assigned")]
    .reset_index(drop=True)
)
toronto_df_dropna.head(10)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Queen's Park,Not assigned
6,M9A,Queen's Park,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


## Group neighborhoods in the same postal code

In [4]:
# one row per (PostalCode, Borough) pair; the neighborhoods that share the
# pair are concatenated into a single ", "-separated string
toronto_df_grouped = (
    toronto_df_dropna
    .groupby(["PostalCode", "Borough"], as_index=False)
    .agg(", ".join)
)
toronto_df_grouped.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## For Neighborhood=Not assigned, set Neighborhood = Borough

In [5]:
# For rows whose Neighborhood is "Not assigned", fall back to the Borough name.
# The update goes through a boolean mask and .loc so that it is written into the
# dataframe itself; assigning to the row objects yielded by iterrows() is not
# guaranteed to change the dataframe being iterated.
neighborhood_missing = toronto_df_grouped["Neighborhood"] == "Not assigned"
toronto_df_grouped.loc[neighborhood_missing, "Neighborhood"] = toronto_df_grouped.loc[neighborhood_missing, "Borough"]

toronto_df_grouped.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Shape of data thus far

In [6]:
# display the (rows, columns) shape of the grouped dataframe,
# which holds one row per (PostalCode, Borough) group
toronto_df_grouped.shape

(103, 3)

## Add geographical coordinates to dataframe

In [7]:
# Load the postal-code coordinate table from the remote CSV.
geo_df = pd.read_csv('https://cocl.us/Geospatial_data') # read the csv file into a dataframe

# Report the table's dimensions, then preview its first rows.
print('The geographical coordinates dataframe shape is', geo_df.shape)
geo_df.head()



The geographical coordinates dataframe shape is (103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [8]:
# attach Latitude/Longitude to each grouped postal code by looking the code up
# in the coordinates table, which is re-indexed by its 'Postal Code' column
geo_by_postal_code = geo_df.set_index('Postal Code')
postcodes_with_geo_df = toronto_df_grouped.join(geo_by_postal_code, on='PostalCode')
postcodes_with_geo_df.head(10)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## Explore and cluster the neighborhoods in Toronto

Restricting the analysis to areas with 'Toronto' in the name was optional, so no such restriction is applied here.
All large cities have neighborhoods with varied names.

In [10]:
# Foursquare setup. The credentials are read from environment variables so the
# notebook can be saved and shared without the secret in it; when a variable is
# not set the value falls back to an empty string.
import os  # imported here so this cell is self-contained

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version



### Explore

In [11]:
# Look up the coordinates of Toronto; they are used to centre the maps below.
address = 'Toronto, Ontario, Canada'
geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)

# geocode() gives back None when the geocoder has no match; stop with a clear
# message instead of an AttributeError on the attribute access below
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [12]:
# Shorter alias for the merged dataframe. This binds a second name to the same
# object (no copy is made), so changes through either name affect both.
toronto_data = postcodes_with_geo_df #prettier name


## Create a map of Toronto using latitude and longitude values

### Create a map of Toronto with Postal Code centers superimposed on top.

In [13]:
# base map centred on the geocoded Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# appearance shared by every postal-code dot
dot_style = dict(
    radius=3,
    color='blue',
    fill=True,
    fill_color='blue',
    fill_opacity=0.6,
    parse_html=False,
)

# one dot per postal code; its popup shows the postal code
postal_points = zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['PostalCode'])
for lat, lng, postalcode in postal_points:
    label = folium.Popup('{}'.format(postalcode), parse_html=True)
    dot = folium.CircleMarker([lat, lng], popup=label, **dot_style)
    dot.add_to(map_toronto)

map_toronto

In [16]:
#Explore venues within each borough

### Go to one postal area and explore to test concept

In [14]:
# Use the first row of toronto_data as the postal code for the trial query
postalcode_name = toronto_data.loc[0, 'PostalCode'] # PostalCode name

postalcode_latitude = toronto_data.loc[0, 'Latitude'] # PostalCode latitude value
postalcode_longitude = toronto_data.loc[0, 'Longitude'] # PostalCode longitude value

print('Latitude and longitude values of postal code {} are {}, {}.'.format(postalcode_name, 
                                                               postalcode_latitude, 
                                                               postalcode_longitude))

Latitude and longitude values of postal code M1B are 43.806686299999996, -79.19435340000001.


### Determine if there is a grocery venue in the postal code within a radius of 1600 meters (1.6 km).
We'll limit the number of venues returned to 2 (could do 1) as we just want to determine IF there are any grocery stores within a given radius.

In [15]:
LIMIT = 2 # limit of number of venues returned by Foursquare API
radius = 1600 # define radius of 1.6 km or ~ 1 mile

# Build the Foursquare venue-search URL for the trial postal code: a "browse"
# search around its coordinates, restricted to one category id and capped at
# LIMIT results within `radius`.
url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&intent=browse&radius={}&limit={}&categoryId=4bf58dd8d48988d118951735'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    postalcode_latitude, 
    postalcode_longitude, 
    radius, 
    LIMIT)

# A timeout keeps the cell from hanging indefinitely if the API never answers.
results = requests.get(url, timeout=10).json()
results


{'meta': {'code': 200, 'requestId': '5dd709800de0d9001bfc822d'},
 'response': {'venues': [{'id': '53976a6f498ecd1162705c9a',
    'name': 'Fusion supermart',
    'location': {'crossStreet': 'Milner ave',
     'lat': 43.800433740639924,
     'lng': -79.200439453125,
     'labeledLatLngs': [{'label': 'display',
       'lat': 43.800433740639924,
       'lng': -79.200439453125}],
     'distance': 850,
     'cc': 'CA',
     'city': 'Toronto',
     'state': 'ON',
     'country': 'Canada',
     'formattedAddress': ['Milner ave', 'Toronto ON', 'Canada']},
    'categories': [{'id': '4bf58dd8d48988d118951735',
      'name': 'Grocery Store',
      'pluralName': 'Grocery Stores',
      'shortName': 'Grocery Store',
      'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/shops/food_grocery_',
       'suffix': '.png'},
      'primary': True}],
    'referralId': 'v-1574373762',
    'hasPerk': False}]}}

In [16]:

# Turn the raw search response into a small dataframe: flatten the nested
# venue records, then keep only each store's name, coordinates and distance.
filtered_columns = ['name', 'location.lat', 'location.lng', 'location.distance']

stores = results['response']['venues']
nearby_stores = json_normalize(stores).loc[:, filtered_columns]

nearby_stores.head()


Unnamed: 0,name,location.lat,location.lng,location.distance
0,Fusion supermart,43.800434,-79.200439,850


### That was a positive result. Now let's check all Toronto postal code areas
Note that we are doing a 'search' in Foursquare rather than an 'explore'. This will return a list of venues near a given location.

Also, we are setting intent to "browse" as this searches an entire region instead of only finding venues closest to a point. We can specify the region with radius. The categoryId is the one for "grocery store."


In [17]:

LIMIT = 1 #only need to see if at least one grocery nearby. If not, food desert

def getNearbyGrocery(pcodes, latitudes, longitudes, radius=1600, limit=None):
    """Search Foursquare for grocery stores around each postal code.

    One venue-search request (intent "browse", grocery-store category id) is
    sent per (postal code, latitude, longitude) triple.

    Parameters
    ----------
    pcodes, latitudes, longitudes : iterables of equal length
        Postal code labels and the coordinates to search around.
    radius : int, default 1600
        Value passed as the `radius` query parameter of each request.
    limit : int, optional
        Maximum number of venues requested per postal code. When omitted,
        the module-level LIMIT is used.

    Returns
    -------
    pandas.DataFrame
        One row per venue returned, with the columns 'Postal Code', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Distance'. Postal
        codes for which nothing came back contribute no rows; if nothing is
        found anywhere the frame is empty but still has these columns.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    if limit is None:
        limit = LIMIT

    columns = ['Postal Code',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Distance']

    grocery_rows = []
    for name, lat, lng in zip(pcodes, latitudes, longitudes):

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&intent=browse&radius={}&limit={}&categoryId=4bf58dd8d48988d118951735'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            limit)

        # make the GET request; the timeout prevents an unanswered request
        # from blocking the loop, and an error status is raised immediately
        # rather than surfacing later as a missing key
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        venues = response.json()["response"]['venues']

        # keep only the relevant information for each nearby venue
        grocery_rows.extend(
            (name,
             v['name'],  # name of store
             v['location']['lat'],
             v['location']['lng'],
             v['location']['distance'])
            for v in venues)

    # Passing the column names to the constructor (instead of assigning them
    # afterwards) also works when no rows were collected.
    nearby_grocery = pd.DataFrame(grocery_rows, columns=columns)

    return nearby_grocery


In [18]:
# run the grocery search once per postal code, around each code's coordinates
toronto_grocery = getNearbyGrocery(
    toronto_data['PostalCode'],
    toronto_data['Latitude'],
    toronto_data['Longitude'],
)
toronto_grocery.head()

Unnamed: 0,Postal Code,Venue,Venue Latitude,Venue Longitude,Venue Distance
0,M1B,Fusion supermart,43.800434,-79.200439,850
1,M1C,Coppa's Fresh Market,43.772094,-79.16634,1462
2,M1E,Joseph's No Frills,43.769591,-79.187669,675
3,M1G,Panchvati Supermarket,43.77605,-79.23047,1226
4,M1H,FreshCo.,43.773475,-79.251392,958


In [19]:
# (rows, columns) of the search results; each row is one venue returned
toronto_grocery.shape

(98, 5)

## Grocery Stores in postal code areas
Note that the toronto_grocery dataframe has fewer rows than the postal-code dataframe.
This means that no grocery store was found for some postal code areas.

### Join dataframes to determine where food deserts exist

In [20]:
# left-join the search results onto the full list of postal codes, so codes
# with no matching venue stay in the table with empty venue columns
food_desert = geo_df.merge(toronto_grocery, how='left', on='Postal Code')
food_desert

Unnamed: 0,Postal Code,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Distance
0,M1B,43.806686,-79.194353,Fusion supermart,43.800434,-79.200439,850.0
1,M1C,43.784535,-79.160497,Coppa's Fresh Market,43.772094,-79.16634,1462.0
2,M1E,43.763573,-79.188711,Joseph's No Frills,43.769591,-79.187669,675.0
3,M1G,43.770992,-79.216917,Panchvati Supermarket,43.77605,-79.23047,1226.0
4,M1H,43.773136,-79.239476,FreshCo.,43.773475,-79.251392,958.0
5,M1J,43.744734,-79.239476,Stephen's No Frills,43.737627,-79.246737,983.0
6,M1K,43.727929,-79.262029,Rob's No Frills,43.732606,-79.266706,642.0
7,M1L,43.711112,-79.284577,Tom's No Frills,43.708313,-79.295954,967.0
8,M1M,43.716316,-79.239476,No Frills,43.709391,-79.24814,1039.0
9,M1N,43.692657,-79.264848,,,,


## Find the postal codes (rows) where no grocery stores were found 

In [21]:
# keep only the rows that contain at least one missing value
food_desert = food_desert.loc[food_desert.isna().any(axis='columns')]

In [22]:
# display the remaining rows: postal codes with missing venue values
food_desert

Unnamed: 0,Postal Code,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Distance
9,M1N,43.692657,-79.264848,,,,
16,M1X,43.836125,-79.205636,,,,
86,M7R,43.636966,-79.615819,,,,
96,M9L,43.756303,-79.565963,,,,
102,M9W,43.706748,-79.594054,,,,


## Create a map that shows the buffer areas searched

In [23]:
# base map centred on the geocoded Toronto coordinates
grocery_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# appearance of the postal-code dot and of the surrounding search-area circle
# (radius 1600 is the default search radius of getNearbyGrocery)
dot_style = dict(
    radius=3,
    color='blue',
    fill=True,
    fill_color='blue',
    fill_opacity=0.6,
    parse_html=False,
)
buffer_style = dict(
    radius=1600,
    color='red',
    fill=True,
    fill_color='red',
    fill_opacity=0.2,
)

# for every postal code: a blue dot with a popup, plus the red buffer circle
postal_points = zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['PostalCode'])
for lat, lng, postalcode in postal_points:
    label = folium.Popup('{}'.format(postalcode), parse_html=True)
    folium.CircleMarker([lat, lng], popup=label, **dot_style).add_to(grocery_map)
    folium.Circle([lat, lng], **buffer_style).add_to(grocery_map)

grocery_map

## Map of area where urban food deserts are found in Toronto, ON

In [24]:
# Food desert map: only the postal codes kept in food_desert are drawn

desert_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# appearance of the postal-code dot and of the highlighted area around it
dot_style = dict(
    radius=3,
    color='blue',
    fill=True,
    fill_color='blue',
    fill_opacity=0.6,
    parse_html=False,
)
desert_style = dict(
    radius=1600,
    color='yellow',
    fill=True,
    fill_color='yellow',
    fill_opacity=0.2,
)

# for every remaining postal code: a blue dot with a popup, plus a yellow circle
desert_points = zip(food_desert['Latitude'], food_desert['Longitude'], food_desert['Postal Code'])
for lat, lng, postalcode in desert_points:
    label = folium.Popup('{}'.format(postalcode), parse_html=True)
    folium.CircleMarker([lat, lng], popup=label, **dot_style).add_to(desert_map)
    folium.Circle([lat, lng], **desert_style).add_to(desert_map)


desert_map