In [5]:
import pandas as pd
import numpy as np

In [6]:
 !conda install -c anaconda beautifulsoup4 

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    beautifulsoup4-4.8.1       |           py36_0         153 KB  anaconda

The following packages will be UPDATED:

    beautifulsoup4: 4.7.1-py36_1 --> 4.8.1-py36_0 anaconda


Downloading and Extracting Packages
beautifulsoup4-4.8.1 | 153 KB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done


In [7]:
import requests
from bs4 import BeautifulSoup

## get postal codes html

In [16]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'html5lib')
#print(soup.prettify())

## get headings

In [28]:
my_table = soup.find('table',{'class':'wikitable sortable'})
trs=my_table.find_all('tr')
ths = my_table.find_all('th')
headings = [th.text.strip() for th in ths]
headings

['Postcode', 'Borough', 'Neighbourhood']

## get df

In [44]:
postal_codes_dict = {} # initialize an empty dictionary to save the data in
for tr in my_table.find_all('tr'):
    try:
        tds = tr.find_all('td')
        if not tds:
            continue
        postal_code, borough, neighborhood = [td.text.strip() for td in tds[:3]]
        #print('; '.join([postal_code, borough, neighborhood]))
        if borough == 'Not assigned':
            neighborhoods = []
        else:
            postal_codes_dict[postal_code] = {}
            postal_codes_dict[postal_code]['borough'] = borough
            if neighborhood == 'Not assigned':
                neighborhood = borough
            postal_codes_dict[postal_code]['neighborhoods'] = neighborhood
    except:
        pass
postal_codes_dict
# create an empty dataframe
columns = ['PostalCode', 'Borough', 'Neighborhood']
toronto_data = pd.DataFrame(columns=columns)
toronto_data

# populate dataframe with data from dictionary
for ind, postal_code in enumerate(postal_codes_dict):
    borough = postal_codes_dict[postal_code]['borough']
    neighborhood = postal_codes_dict[postal_code]['neighborhoods']
    toronto_data = toronto_data.append({"PostalCode": postal_code, 
                                        "Borough": borough, 
                                        "Neighborhood": neighborhood},
                                        ignore_index=True)

df2=toronto_data.groupby(['PostalCode','Borough'])['Neighborhood'].apply(lambda x: ", ".join(x.astype(str))).reset_index()
#df2
df2.shape

(103, 3)

In [46]:
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)
from pandas.io.json import json_normalize
import json

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

!pip install geopy
from geopy.geocoders import Nominatim 

!pip install folium
import folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/72/ff/004bfe344150a064e558cb2aedeaa02ecbf75e60e148a55a9198f0c41765/folium-0.10.0-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 15.1MB/s eta 0:00:01
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.3.1 folium-0.10.0


## get geo df

In [48]:
df_geo = pd.read_csv('http://cocl.us/Geospatial_data')
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [49]:
df_geo.shape

(103, 3)

In [50]:
df_new = df2

## add lang and lat

In [51]:
df_new['Latitude'] = df_geo['Latitude']
df_new['Longitude'] = df_geo['Longitude']
df_new

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern,43.806686,-79.194353
1,M1C,Scarborough,Port Union,43.784535,-79.160497
2,M1E,Scarborough,West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,Kennedy Park,43.727929,-79.262029
7,M1L,Scarborough,Oakridge,43.711112,-79.284577
8,M1M,Scarborough,Scarborough Village West,43.716316,-79.239476
9,M1N,Scarborough,Cliffside West,43.692657,-79.264848


## describe

In [52]:
df_new['Borough'].describe()

count            103
unique            11
top       North York
freq              24
Name: Borough, dtype: object

In [55]:
df_new['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       "Queen's Park", 'Mississauga', 'Etobicoke'], dtype=object)

## focus on toronto

In [57]:
df_tor = df_new[df_new['Neighborhood'].str.contains('Toronto')] #Use a mask to filter out only neighborhoods that include Toronto
df_tor

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
40,M4J,East York,East Toronto,43.685347,-79.338106
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
60,M5K,Downtown Toronto,Toronto Dominion Centre,43.647177,-79.381576
66,M5S,Downtown Toronto,University of Toronto,43.662696,-79.400049
88,M8V,Etobicoke,New Toronto,43.605647,-79.501321


In [58]:
latitude = df_tor.Latitude.mean()
longitude = df_tor.Longitude.mean()
print(latitude)
print(longitude)

43.66324988
-79.40534626


## localising Toronto on the map

In [61]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_tor.Latitude, df_tor.Longitude, df_tor.Borough, df_tor.Neighborhood):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## foursquare api

In [62]:
CLIENT_ID = 'HABVHWCBNSBIHAEM0D5V4ZSGUGWDT0BZ2ILL0PVP2X01O4BR' # your Foursquare ID
CLIENT_SECRET = 'J2GTX1AKNOHAL0YDVBFBYHXQXZ5WVRKQ2ZKMILXKIG4EPWZL' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: HABVHWCBNSBIHAEM0D5V4ZSGUGWDT0BZ2ILL0PVP2X01O4BR
CLIENT_SECRET:J2GTX1AKNOHAL0YDVBFBYHXQXZ5WVRKQ2ZKMILXKIG4EPWZL


## nearby venue function

In [68]:
def getNearbyVenues(names, latitudes, longitudes, radius=500): #This function was built by the Coursera lab
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

## using the function to pull venues for each neighbourhood

In [65]:
LIMIT = 500
toronto_venues = getNearbyVenues(names=df_tor['Neighborhood'],
                                   latitudes=df_tor['Latitude'],
                                   longitudes=df_tor['Longitude']
                                  )

East Toronto
North Toronto West
Toronto Dominion Centre
University of Toronto
New Toronto


In [66]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 76 uniques categories.
