In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
r1 = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(r1.content,'html.parser')
table = soup.find(lambda tag: tag.name =='table' and ("wikitable" in tag['class']))

In [3]:
df = pd.read_html(str(table), flavor='bs4')[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Cleanup table data

In [5]:
df_2 = (df['Borough'] == 'Not assigned')|(df['Neighbourhood'] == 'Not assigned')
df = df[~df_2]

In [6]:
(df.Borough == 'Not assigned').sum()

0

In [7]:
(df.Neighbourhood == 'Not Assigned').sum()

0

In [8]:
post=df.Postcode.unique()

## Group by postcode

In [9]:
Toronto = pd.DataFrame(columns=['Postcode','Borough','Neighbourhood'])
for code in post:
    temp_df = df[['Borough','Neighbourhood']][df['Postcode'] == code]
    boro = temp_df.Borough.unique()
    hood = temp_df.Neighbourhood.unique()
    Toronto = Toronto.append({
        'Postcode':code,
        'Borough':",".join(boro),
        'Neighbourhood':",".join(hood)},ignore_index=True)

In [10]:
Toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [11]:
Toronto.shape

(103, 3)

## Include lat/long data from geocode

In [13]:
postalcodes_from_csv = pd.read_csv('http://cocl.us/Geospatial_data')
postalcodes_from_csv.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
Toronto = Toronto.sort_values(by=['Postcode'])
Toronto.reset_index(inplace=True, drop=True)
pcode = postalcodes_from_csv.sort_values(by=['Postal Code'])
pcode.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [16]:
Toronto=pd.concat([Toronto, pcode[['Latitude','Longitude']]], axis = 1)
Toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Filter only Boroughs in Toronto

In [17]:
Toronto = Toronto[Toronto.Borough.str.contains('Toronto', na=False)]
Toronto.reset_index( inplace = True)
Toronto

Unnamed: 0,index,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,37,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,43,M4M,East Toronto,Studio District,43.659526,-79.340923
4,44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,47,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,48,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316
9,49,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049


In [18]:
Toronto.drop(['index'], axis = 1, inplace = True)
Toronto

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049


In [20]:
!pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 9.2MB/s eta 0:00:011
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1


## Create folium map

In [21]:
import folium

map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(Toronto['Latitude'], Toronto['Longitude'], Toronto['Borough'], Toronto['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
map_toronto

## Extract information from FourSquare

In [22]:
# The code was removed by Watson Studio for sharing.

In [23]:
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import json

def getNearbyVenues(postcodes, latitudes, longitudes, radius=500, limit=100):
    
    venues_list=[]
    postcodes_done = ""
    for code, lat, lng in zip(postcodes, latitudes, longitudes):
        
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        res = requests.get(url).json()["response"]
        if 'groups' not in res:
            continue;
        
        postcodes_done += code+","
        results = res['groups'][0]['items']
            
        # return only relevant information for each nearby venue
        venues_list.append([(
            code, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postcode', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    print(postcodes_done[:-1])
    return(nearby_venues)

## Include venues into dataframe

In [24]:
toronto_venues = getNearbyVenues(Toronto['Postcode'], Toronto['Latitude'], Toronto['Longitude'],radius=500, limit=100)

M4E,M4K,M4L,M4M,M4N,M4P,M4R,M4S,M4T,M4V,M4W,M4X,M4Y,M5A,M5B,M5C,M5E,M5G,M5H,M5J,M5K,M5L,M5N,M5P,M5R,M5S,M5T,M5V,M5W,M5X,M6G,M6H,M6J,M6K,M6P,M6R,M6S,M7A,M7Y


In [25]:
toronto_venues.head()

Unnamed: 0,Postcode,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4E,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,M4E,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,M4E,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,M4K,43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


## One-hot encoding

In [26]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Postcode'] = toronto_venues['Postcode'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.shape

(1715, 234)

In [28]:
toronto_grouped = toronto_onehot.groupby('Postcode').sum()
toronto_grouped.reset_index(inplace=True)
toronto_grouped.head()

Unnamed: 0,Postcode,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M4E,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,M4K,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
2,M4L,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4M,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,1,0,0,0,1
4,M4N,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Find Restaurant of choice

In [31]:
df2 = toronto_grouped[['Postcode','Afghan Restaurant']]
df2

Unnamed: 0,Postcode,Afghan Restaurant
0,M4E,0
1,M4K,0
2,M4L,0
3,M4M,0
4,M4N,0
5,M4P,0
6,M4R,0
7,M4S,0
8,M4T,0
9,M4V,0


## Load csv file

In [36]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Topic,Characteristic,Agincourt North,Agincourt South-Malvern West,Alderwood,Annex,Banbury-Don Mills,Bathurst Manor,Bay Street Corridor,Bayview Village,...,Willowridge-Martingrove-Richview,Woburn,Woodbine Corridor,Woodbine-Lumsden,Wychwood,Yonge-Eglinton,Yonge-St.Clair,York University Heights,Yorkdale-Glen Park,Unnamed: 142
0,Income of economic families in 2015,Total - Economic family income decile group fo...,28825,23475,12030,28650,27005,15580,25600,21135,...,22150,53005,12430,7850,13200,11810,12370,27570,14020,
1,Income of economic families in 2015,In the bottom half of the distribution,18535,15205,4735,12080,10760,8150,15230,10745,...,10895,36245,5025,3490,6355,3840,4165,19105,7735,
2,Income of economic families in 2015,In the bottom decile,4195,3905,725,4810,2600,1825,8820,4005,...,2005,9965,1490,890,1685,1185,1340,5410,1630,
3,Income of economic families in 2015,In the second decile,5030,3565,815,2225,2195,1865,2020,1995,...,2450,8715,1040,715,1480,660,675,4135,1665,
4,Income of economic families in 2015,In the third decile,3700,2940,1005,1710,2050,1595,1660,1585,...,2295,6815,850,625,1175,630,715,3625,1775,


In [45]:
neighborhoods_id = df_data_0
neighborhoods_id.head()

Unnamed: 0,Topic,Characteristic,Agincourt North,Agincourt South-Malvern West,Alderwood,Annex,Banbury-Don Mills,Bathurst Manor,Bay Street Corridor,Bayview Village,...,Willowridge-Martingrove-Richview,Woburn,Woodbine Corridor,Woodbine-Lumsden,Wychwood,Yonge-Eglinton,Yonge-St.Clair,York University Heights,Yorkdale-Glen Park,Unnamed: 142
2,Income of economic families in 2015,In the bottom decile,4195,3905,725,4810,2600,1825,8820,4005,...,2005,9965,1490,890,1685,1185,1340,5410,1630,
3,Income of economic families in 2015,In the second decile,5030,3565,815,2225,2195,1865,2020,1995,...,2450,8715,1040,715,1480,660,675,4135,1665,
4,Income of economic families in 2015,In the third decile,3700,2940,1005,1710,2050,1595,1660,1585,...,2295,6815,850,625,1175,630,715,3625,1775,
5,Income of economic families in 2015,In the fourth decile,2990,2590,1060,1630,1875,1455,1380,1550,...,2120,5845,780,610,965,665,715,3235,1380,
6,Income of economic families in 2015,In the fifth decile,2610,2210,1140,1735,2045,1410,1350,1620,...,2030,4955,870,645,1060,700,730,2710,1290,


In [46]:
neighborhoods_id = neighborhoods_id.transpose()
neighborhoods_id.head()

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12,13
Topic,Income of economic families in 2015,Income of economic families in 2015,Income of economic families in 2015,Income of economic families in 2015,Income of economic families in 2015,Income of economic families in 2015,Income of economic families in 2015,Income of economic families in 2015,Income of economic families in 2015,Income of economic families in 2015,Income of economic families in 2015,Ethnic origin population
Characteristic,In the bottom decile,In the second decile,In the third decile,In the fourth decile,In the fifth decile,In the top half of the distribution,In the sixth decile,In the seventh decile,In the eighth decile,In the ninth decile,In the top decile,Afghan
Agincourt North,4195,5030,3700,2990,2610,10280,2500,2340,2260,1900,1305,85
Agincourt South-Malvern West,3905,3565,2940,2590,2210,8265,2015,1840,1800,1505,1125,255
Alderwood,725,815,1005,1060,1140,7290,1265,1420,1500,1750,1345,10


In [47]:
neighborhoods_id.drop(neighborhoods_id.index[0], inplace = True)
neighborhoods_id.head()

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12,13
Characteristic,In the bottom decile,In the second decile,In the third decile,In the fourth decile,In the fifth decile,In the top half of the distribution,In the sixth decile,In the seventh decile,In the eighth decile,In the ninth decile,In the top decile,Afghan
Agincourt North,4195,5030,3700,2990,2610,10280,2500,2340,2260,1900,1305,85
Agincourt South-Malvern West,3905,3565,2940,2590,2210,8265,2015,1840,1800,1505,1125,255
Alderwood,725,815,1005,1060,1140,7290,1265,1420,1500,1750,1345,10
Annex,4810,2225,1710,1630,1735,16570,1720,1820,2210,2830,7995,35


In [48]:
neighborhoods_id.sort_values(by=[13])

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12,13
Lawrence Park South,930,500,460,435,565,12200,650,685,1005,1475,8385,0
North Riverdale,1000,880,570,710,770,7745,755,820,1050,1555,3555,0
Palmerston-Little Italy,2055,1340,1150,1060,935,7205,1145,1090,1330,1395,2250,0
Playter Estates-Danforth,790,635,480,410,505,4825,535,620,620,865,2185,0
Roncesvalles,2405,1635,1220,1085,1090,7440,1100,1165,1280,1575,2325,0
Corso Italia-Davenport,1620,1370,1350,1390,1390,6965,1410,1405,1510,1440,1185,0
Markland Wood,575,610,840,720,750,6960,925,1070,1250,1575,2145,0
Rosedale-Moore Park,1750,950,740,750,870,15555,1025,1170,1460,2120,9760,0
Runnymede-Bloor West Village,515,495,485,560,595,7410,725,945,990,1735,3010,0
Casa Loma,1085,690,545,515,560,7395,530,725,755,1080,4305,0
