# Capstone Project - Week 3 - Phase 2 - Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

### Install and Import libraries

In [1]:
!pip install html5lib
!pip install requests
!pip install beautifulsoup4



In [2]:
import pandas as pd
import numpy as np
import requests

### Wikipedia data location

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

### Read Wikipedia data into pandas dataframe using read_html function

In [4]:
wiki_data = pd.read_html(url,flavor='html5lib', attrs={'class':'wikitable sortable'},skiprows=0)[0]
wiki_data.head(100)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
95,M6N,York,"Runnymede, The Junction North"
96,M7N,Not assigned,Not assigned
97,M8N,Not assigned,Not assigned
98,M9N,York,Weston


#### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [5]:
my_df = wiki_data[wiki_data['Borough']!='Not assigned']
my_df.head(100)

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
153,M1X,Scarborough,Upper Rouge
156,M4X,Downtown Toronto,"St. James Town, Cabbagetown"
157,M5X,Downtown Toronto,"First Canadian Place, Underground city"
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"


#### If a cell has a borough but a Not assigned  neighborhood, then the neighborhood will be the same as the borough.

In [6]:
my_df.Neighbourhood = np.where(
    (my_df['Neighbourhood'] =='Not assigned') & (my_df['Borough'] != 'Not assigned'),
        my_df['Borough'], 
        my_df['Neighbourhood']
        )
my_df.head(100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
153,M1X,Scarborough,Upper Rouge
156,M4X,Downtown Toronto,"St. James Town, Cabbagetown"
157,M5X,Downtown Toronto,"First Canadian Place, Underground city"
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"


### Shape

In [7]:
my_df.shape

(103, 3)

## Get Geo Data

In [8]:
!pip install geocoder



In [9]:
import geocoder # import geocoder

def get_lat_log(postal_code):
    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
      g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
      lat_lng_coords = g.latlng

    #latitude = lat_lng_coords[0]
    #longitude = lat_lng_coords[1]
    return lat_lng_coords

In [10]:
print(get_lat_log('M6A'))

KeyboardInterrupt: 

### GeoCoder not working - not returning any value so will use csv

In [11]:
pd_geo = pd.read_csv('https://cocl.us/Geospatial_data')
pd_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### merge 2 dataframes - one with neighborhood and borough info and another with geo location

In [12]:
dfinal = my_df.merge(pd_geo, how='inner', left_on='Postal Code', right_on='Postal Code')
dfinal.head(100)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
95,M1X,Scarborough,Upper Rouge,43.836125,-79.205636
96,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675
97,M5X,Downtown Toronto,"First Canadian Place, Underground city",43.648429,-79.382280
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944


In [14]:
dfinal.rename(columns = {'Postal Code':'PostalCode', 'Neighbourhood':'Neighborhood'}, inplace = True) 
dfinal.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [15]:
dfinal.shape

(103, 5)

### APPENDIX: Alternate to read data from wikipedia using BeautifulSoup

In [None]:
from bs4 import BeautifulSoup

In [None]:
page = requests.get(url)
print(page.status_code)
text = page.text
soup = BeautifulSoup(text, "html5lib")
neighborhood_html_table = soup.find_all(class_='wikitable sortable')[0]
len(list(neighborhood_html_table.children))
len(list(neighborhood_html_table.descendants))