This is Data Science Capstone assignment - "Segmenting and Clustering Neighborhoods in Toronto"

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
 
WIKI_URL ='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M' 
res = requests.get(WIKI_URL)
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0] 
df = pd.read_html(str(table))[0]
df_w=pd.DataFrame(df)

df_w.head()

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [2]:
#rename columns
df_w.columns=['Postcode','Borough','Neighborhood']
df_w.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [3]:
#drop the first row...here is the cleaned-up dataframe
df_w.drop(0,inplace=True)
df_w.head()

Unnamed: 0,Postcode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [4]:
# removed boroughs with 'not assigned' 
df_w1=df_w[df_w.Borough.str.contains("Not assigned") == False]
df_w1.head()

Unnamed: 0,Postcode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [5]:
#reset index
df_w2=df_w1.reset_index()
df_w3=df_w2.drop(df_w2.columns[0],axis=1)
df_w3.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [6]:
# combine rows with the same Postcode & Borough
df_w4=df_w3.groupby(['Postcode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()
df_w4.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
# if any neighborhood has the value of "not assigned", use the name in borough instead
row=0
for row in range(103):
    if df_w4.Neighborhood[row]=='Not assigned':
        df_w4.Neighborhood[row] = df_w4.Borough[row]
        row=row+1
    
df_w4.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
 df_w4.shape

(103, 3)

In [9]:
#https://geocoder.readthedocs.io/index.html
!pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K    100% |████████████████████████████████| 102kB 16.7MB/s 
[?25hCollecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [10]:
import geocoder
import time

In [11]:
# test geocoder on a simple one to make sure it's working (since it can be very slow and unpredictable)
start_time=time.time()
l=None
check1=0
while l is None:
    if check1 < 100:
        g = geocoder.google('Mountain View, CA')
        l=g.latlng
        check1=check1+1
        print(check1)
    else:
        l='done'

print(l)    
print("--- %s seconds ---" % round((time.time() - start_time), 2))

1
2
3
4
5
6
7
8
9
[37.3860517, -122.0838511]
--- 1.32 seconds ---


In [12]:
# test geocoder on a simple one to make sure it's working (since it can be very slow and unpredictable)
start_time=time.time()
l=None
while l is None:
    g = geocoder.google('{}, Toronto, Ontario'.format('M5G'))
    l=g.latlng
    print('try again')

print(l)    
print("--- %s seconds ---" % round((time.time() - start_time), 2))

try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
try again
[43.6579524, -79.3873826]
--- 4.32 seconds ---


In [13]:
df_w4['Latitude']=0.0
df_w4['Longitude']=0.0
df_w4.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",0.0,0.0
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",0.0,0.0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",0.0,0.0
3,M1G,Scarborough,Woburn,0.0,0.0
4,M1H,Scarborough,Cedarbrae,0.0,0.0


In [14]:
start_time=time.time()
row=0
for row in range(103):
    print(row)
    lat_lng_coords = None
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(df_w4.Postcode[row]))
        lat_lng_coords = g.latlng
        
    print("--- %s seconds ---" % round((time.time() - start_time), 2))
    df_w4.Latitude[row] = lat_lng_coords[0]
    df_w4.Longitude[row] = lat_lng_coords[1]
    row=row+1
    
df_w4

0
--- 2.65 seconds ---
1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


--- 4.28 seconds ---
2
--- 4.83 seconds ---
3
--- 6.18 seconds ---
4
--- 7.26 seconds ---
5
--- 8.17 seconds ---
6
--- 9.09 seconds ---
7
--- 14.69 seconds ---
8
--- 23.33 seconds ---
9
--- 25.04 seconds ---
10
--- 25.9 seconds ---
11
--- 26.47 seconds ---
12
--- 27.38 seconds ---
13
--- 29.26 seconds ---
14
--- 32.28 seconds ---
15
--- 32.89 seconds ---
16
--- 34.65 seconds ---
17
--- 36.77 seconds ---
18
--- 38.32 seconds ---
19
--- 39.38 seconds ---
20
--- 40.65 seconds ---
21
--- 41.37 seconds ---
22
--- 45.03 seconds ---
23
--- 48.57 seconds ---
24
--- 53.08 seconds ---
25
--- 53.75 seconds ---
26
--- 56.45 seconds ---
27
--- 57.25 seconds ---
28
--- 58.15 seconds ---
29
--- 59.13 seconds ---
30
--- 61.42 seconds ---
31
--- 63.34 seconds ---
32
--- 67.45 seconds ---
33
--- 68.25 seconds ---
34
--- 73.11 seconds ---
35
--- 74.62 seconds ---
36
--- 76.74 seconds ---
37
--- 77.45 seconds ---
38
--- 78.77 seconds ---
39
--- 81.16 seconds ---
40
--- 82.52 seconds ---
41
--- 83.69 secon

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
