# Data Science Capstone Week 3: Toronto Neighbourhood Data

In [2]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as soup

In [3]:
# Wikipedia page listing the Canadian postal codes that start with "M"; fetched and parsed by the cells that follow.
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

**Using requests to call the url and get the data**

In [4]:
# Download the page. A timeout keeps the notebook from hanging on a stalled
# connection, and raise_for_status() stops early on an HTTP error instead of
# letting a later cell try to parse an error page.
wiki_url = requests.get(url, timeout=30)
wiki_url.raise_for_status()

## Part 1: Using Pandas to create the Dataframe

In [5]:
# read_html returns a list with one DataFrame per <table> found in the page.
# The HTML text is wrapped in StringIO because passing a literal HTML string
# directly is deprecated in recent pandas releases.
df_wiki = pd.read_html(StringIO(wiki_url.text))
len(df_wiki), type(df_wiki)

(3, list)

In [6]:
# Work on a copy of the first parsed table so the cleaning steps further down
# never mutate the raw read_html result; re-running this cell then always
# restores the untouched data.
torono_data = df_wiki[0].copy()
torono_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [7]:
# Dimensions of the table as (rows, columns)
torono_data.shape

(180, 3)

**Remove the rows with no assigned borough**

In [8]:
# Turn the "Not assigned" placeholder in Borough into NaN, drop every row that
# contains a NaN, and renumber the index from 0.
# This builds a new frame instead of calling replace(..., inplace=True) on a
# column selection: that chained in-place form is deprecated in pandas 2.x and
# does not modify the frame once Copy-on-Write is active.
torono_data = (
    torono_data
    .assign(Borough=torono_data["Borough"].replace("Not assigned", np.nan))
    .dropna()
    .reset_index(drop=True)
)
torono_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


**Replace "Not assigned" neighbourhoods with the corresponding borough name**

In [9]:
# Index labels of the rows whose Neighbourhood is still the "Not assigned" placeholder
unassigned_mask = torono_data["Neighbourhood"].eq("Not assigned")
torono_data.loc[unassigned_mask].index

Int64Index([], dtype='int64')

There are no rows with the Neighbourhood column containing "Not assigned".

In [10]:
# Columns are addressed by position, exactly as before:
# position 1 supplies the replacement value, position 2 is the column checked.
source_col = torono_data.columns[1]
target_col = torono_data.columns[2]

# Rows whose target column still holds the "Not assigned" placeholder
needs_fill = torono_data[target_col] == "Not assigned"

# Copy the value from the source column into the flagged rows in one step
torono_data.loc[needs_fill, target_col] = torono_data.loc[needs_fill, source_col]

count = int(needs_fill.sum())
print("Replaced {} rows".format(count))

Replaced 0 rows


**Combine the Neighbourhoods with the same Postal Code**

In [11]:
# Index labels of rows whose Postal Code already appeared in an earlier row
repeated_code = torono_data.duplicated(subset=["Postal Code"])
torono_data.loc[repeated_code].index

Int64Index([], dtype='int64')

In [12]:
# Merge rows that share a postal code into a single row.
#
# The previous check evaluated len(<comparison result>), which is truthy for any
# non-empty table, so the merge branch could never run. Uniqueness is now tested
# directly on the column.
if torono_data["Postal Code"].is_unique:  # check if there are any rows to modify
    print("No rows with repeating postal codes")
else:
    # One row per postal code: keep the first Borough seen for the code and join
    # all of its Neighbourhood values with ", ". Building a new frame replaces the
    # old chained-indexing assignment, whose result was written to a temporary
    # copy and lost. sort=False keeps the codes in their original order.
    torono_data = (
        torono_data
        .groupby("Postal Code", sort=False, as_index=False)
        .agg({
            "Borough": "first",
            "Neighbourhood": lambda names: ", ".join(names.astype(str)),
        })
    )

No rows with repeating postal codes


**Final Shape of the DataFrame**

In [13]:
# Dimensions of the cleaned table as (rows, columns)
torono_data.shape

(103, 3)

## Part 2: Get the Latitude and Longitude of the Postal Codes ##

Could not make "geocoder" work, so "pgeocode" is used instead to get the latitudes and longitudes from the postal codes.

Documentation for pgeocode: https://pypi.org/project/pgeocode/

In [14]:
import pgeocode

# pgeocode lookup object created for country code 'ca'; the loop below calls
# its query_postal_code method once per postal code.
nomi = pgeocode.Nominatim('ca')

In [15]:
# Coordinates collected in the same order as the rows of torono_data
latitude = []
longitude = []

# Look up each postal code exactly once and reuse the result. The previous
# version ran the same query four times per code, one of which was discarded.
for postal_code in torono_data["Postal Code"]:
    geo_record = nomi.query_postal_code(postal_code)

    latitude.append(geo_record.latitude)
    longitude.append(geo_record.longitude)

    # Print the codes pgeocode could not resolve (latitude is NaN) so they can
    # be fixed by hand afterwards.
    if np.isnan(geo_record.latitude):
        print(postal_code)
    


M7R


In [16]:
# Attach the collected coordinate lists as new columns (list order matches row order)
for column_name, values in (("Latitude", latitude), ("Longitude", longitude)):
    torono_data[column_name] = values

torono_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889


In [17]:
# Number of rows that still lack a latitude
torono_data["Latitude"].isna().sum()

1

In [18]:
# Fill the latitude that the lookup left empty for postal code M7R.
# The value is written through .loc for that row only, because:
#   * replace(..., inplace=True) on a column selection is chained in-place
#     mutation, which is deprecated and has no effect under Copy-on-Write;
#   * replacing every NaN would give this one location's coordinate to any
#     other postal code that failed to resolve.
m7r_missing_lat = (torono_data["Postal Code"] == "M7R") & torono_data["Latitude"].isna()
torono_data.loc[m7r_missing_lat, "Latitude"] = 43.636966
torono_data[torono_data["Postal Code"]=="M7R"]

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
76,M7R,Mississauga,Canada Post Gateway Processing Centre,43.636966,


In [19]:
# Fill the longitude that the lookup left empty for postal code M7R.
# The value is written through .loc for that row only: chained
# replace(..., inplace=True) is deprecated and ineffective under Copy-on-Write,
# and a blanket NaN replacement would mislabel any other unresolved code.
m7r_missing_lng = (torono_data["Postal Code"] == "M7R") & torono_data["Longitude"].isna()
torono_data.loc[m7r_missing_lng, "Longitude"] = -79.615819
torono_data[torono_data["Postal Code"]=="M7R"]

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
76,M7R,Mississauga,Canada Post Gateway Processing Centre,43.636966,-79.615819


Completed Part 2 of the exercise. Need to start with the third question

## Part 3: Neighbourhood analysis and Segmentation

In [39]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Show every column and every row when a DataFrame is displayed
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# json_normalize is exposed as pandas.json_normalize since pandas 1.0, and the
# old pandas.io.json import path no longer works on recent releases. Try the
# current location first and fall back to the old one for older installations.
try:
    from pandas import json_normalize # transform JSON file into a pandas dataframe
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [26]:
# Keep only the boroughs whose name contains "toronto" (case-insensitive),
# then renumber the rows from 0.
is_toronto_borough = torono_data['Borough'].str.contains(pat = 'toronto',case=False)
toronto_only = torono_data.loc[is_toronto_borough].reset_index(drop=True)
toronto_only.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783
3,M5C,Downtown Toronto,St. James Town,43.6513,-79.3756
4,M4E,East Toronto,The Beaches,43.6784,-79.2941


In [27]:
# Distinct borough names that survived the filter
set(toronto_only["Borough"].unique())

{'Central Toronto', 'Downtown Toronto', 'East Toronto', 'West Toronto'}

In [28]:
# Dimensions of the filtered table as (rows, columns)
toronto_only.shape

(39, 5)

Let's get the geographical coordinates of Downtown Toronto.

In [22]:
# Resolve the map centre from a free-text address with geopy's Nominatim geocoder.
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError("Could not geocode address {!r}".format(address))

latitude = location.latitude
longitude = location.longitude

# Report the address that was actually looked up (the message used to say "Manhattan").
print('The geographical coordinates of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of Manhattan are 40.7896239, -73.9598939.


In [29]:
# Hard-coded map centre. These assignments replace whatever latitude/longitude
# values were set earlier in the notebook.
# NOTE(review): the values presumably correspond to downtown Toronto — confirm.
latitude = 43.6532
longitude = -79.3832

Visualizing the neighbourhoods in Toronto

In [38]:
# Base map of Toronto centred on the current latitude/longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# One circle marker per row of toronto_only, with the neighbourhood name as its popup
marker_rows = zip(toronto_only['Latitude'], toronto_only['Longitude'], toronto_only['Neighbourhood'])
for lat, lng, neighbourhood_name in marker_rows:
    popup = folium.Popup(neighbourhood_name, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Last expression of the cell: renders the map inline
map_toronto