Scrape data from Wikipedia and convert into dataframe

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0] 
df = pd.read_html(str(table))[0]

Only process cells that have an assigned borough

In [2]:
df2 = df[df.Borough != 'Not assigned']

If Neighbourhood has not been assigned, then it's the same as the Borough

In [3]:
df2.Neighbourhood.replace('Not assigned',df2.Borough,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [4]:
df2

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


Rename column from Postcode to Postal Code

In [5]:
df2.rename(columns={"Postcode":"Postal Code"})

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


Group by Postal Code

In [6]:
df3 = df2.groupby('Postcode')['Borough','Neighbourhood'].agg(lambda x: ', '.join(set(x))).reset_index()

In [7]:
df3.shape

(103, 3)

Merge with geospatial data

In [8]:
df4 = pd.read_csv('http://cocl.us/Geospatial_data')

In [9]:
df5 = pd.merge(df4, df3, how='inner', left_on = 'Postal Code', right_on = 'Postcode')

In [10]:
df5.shape

(103, 6)

In [11]:
df5

Unnamed: 0,Postal Code,Latitude,Longitude,Postcode,Borough,Neighbourhood
0,M1B,43.806686,-79.194353,M1B,Scarborough,"Rouge, Malvern"
1,M1C,43.784535,-79.160497,M1C,Scarborough,"Rouge Hill, Highland Creek, Port Union"
2,M1E,43.763573,-79.188711,M1E,Scarborough,"Morningside, Guildwood, West Hill"
3,M1G,43.770992,-79.216917,M1G,Scarborough,Woburn
4,M1H,43.773136,-79.239476,M1H,Scarborough,Cedarbrae
5,M1J,43.744734,-79.239476,M1J,Scarborough,Scarborough Village
6,M1K,43.727929,-79.262029,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,43.711112,-79.284577,M1L,Scarborough,"Oakridge, Clairlea, Golden Mile"
8,M1M,43.716316,-79.239476,M1M,Scarborough,"Scarborough Village West, Cliffside, Cliffcrest"
9,M1N,43.692657,-79.264848,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [36]:
!pip -q install folium

In [37]:
import folium

In [43]:
latitude = 43.6532
longitude = -79.3832

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

In [44]:
map_toronto

In [47]:
for lat, lng, borough, neighborhood in zip(df5['Latitude'], df5['Longitude'], df5['Borough'], df5['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [55]:
CLIENT_ID = 'XXX' ##Removed
CLIENT_SECRET = 'XXX'  ##Removed 
VERSION = '20190801' 

In [56]:
def getNearbyVenues(names, latitudes, longitudes, radius=500): 
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [57]:
LIMIT = 500
toronto_venues = getNearbyVenues(names=df5['Neighbourhood'],
                                   latitudes=df5['Latitude'],
                                   longitudes=df5['Longitude']
                                  )

Rouge, Malvern
Rouge Hill, Highland Creek, Port Union
Morningside, Guildwood, West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Oakridge, Clairlea, Golden Mile
Scarborough Village West, Cliffside, Cliffcrest
Birch Cliff, Cliffside West
Scarborough Town Centre, Wexford Heights, Dorset Park
Wexford, Maryvale
Agincourt
Clarks Corners, Sullivan, Tam O'Shanter
Agincourt North, L'Amoreaux East, Milliken, Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Henry Farm, Fairview, Oriole
Bayview Village
York Mills, Silver Hills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Wilson Heights, Bathurst Manor, Downsview North
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Parkview Hill, Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
Riverdale, The Danf

In [58]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
 
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.shape

(2244, 280)

In [59]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,Agincourt,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.0
1,"Agincourt North, L'Amoreaux East, Milliken, St...",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.0
2,Bayview Village,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.0
3,"Bedford Park, Lawrence Manor East",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.0
4,Berczy Park,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.00,0.017544,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.0
5,"Birch Cliff, Cliffside West",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.0
6,"Brockton, Parkdale Village, Exhibition Place",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.0
7,Business Reply Mail Processing Centre 969 Eastern,0.052632,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.0
8,"CFB Toronto, Downsview East",0.000000,0.0,0.000000,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.0
9,"CN Tower, South Niagara, King and Spadina, Rai...",0.000000,0.0,0.000000,0.071429,0.071429,0.071429,0.142857,0.142857,0.142857,...,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.0


In [60]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                venue  freq
0      Breakfast Spot   0.2
1  Chinese Restaurant   0.2
2              Lounge   0.2
3        Skating Rink   0.2
4      Sandwich Place   0.2


----Agincourt North, L'Amoreaux East, Milliken, Steeles East----
                             venue  freq
0                             Park   0.5
1                       Playground   0.5
2                      Yoga Studio   0.0
3                      Men's Store   0.0
4  Molecular Gastronomy Restaurant   0.0


----Bayview Village----
                 venue  freq
0  Japanese Restaurant  0.25
1   Chinese Restaurant  0.25
2                 Bank  0.25
3                 Café  0.25
4          Yoga Studio  0.00


----Bedford Park, Lawrence Manor East----
                 venue  freq
0   Italian Restaurant  0.09
1          Coffee Shop  0.09
2  Japanese Restaurant  0.05
3    Indian Restaurant  0.05
4              Butcher  0.05


----Berczy Park----
                venue  freq
0         Coffee Shop  0.09
1    