# Segmenting and Clustering Neighborhoods in Toronto

## Problem 1: Scrape Wikipedia Data

In [60]:
# The fake_useragent package had to be installed before the imports below would work
#!pip install fake_useragent

### Import necessary modules

In [61]:
# Import necessary modules
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
import pandas as pd

### Enter the selected URL

In [62]:
# Wikipedia page whose first table is scraped for the postal code data
scrapeLink = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

### Pull HTML code from URL and find all tables

In [63]:
# Request data from URL with a randomised User-Agent and identify all tables
ua1 = UserAgent()
randomHeader = {'User-Agent': str(ua1.random)}
# The header dict must be passed by keyword: the second positional argument of
# requests.get is `params`, which would append the User-Agent to the query
# string instead of sending it as a request header.
page = requests.get(scrapeLink, headers=randomHeader, timeout=30)
# Stop on an HTTP error status rather than parsing an error page
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
# Keep only the first table found on the page
table = soup.find_all('table')[0]

### Convert HTML code to Pandas DataFrame

In [64]:
# Parse table and convert HTML to DataFrame in a single pass over its rows
column_names = []
table_rows = []

for row in table.find_all('tr'):
    # Handle column names if we find them: the first row that contains
    # <th> cells supplies the column titles
    th_tags = row.find_all('th')
    if len(th_tags) > 0 and len(column_names) == 0:
        column_names = [th.get_text() for th in th_tags]

    # Every row that contains <td> cells is a data row; keep its cell text
    td_tags = row.find_all('td')
    if len(td_tags) > 0:
        table_rows.append([td.get_text() for td in td_tags])

# Number of data rows, and number of columns as set by the first data row
n_rows = len(table_rows)
n_columns = len(table_rows[0]) if n_rows > 0 else 0

# Safeguard on Column Titles
if len(column_names) > 0 and len(column_names) != n_columns:
    raise Exception("Column titles do not match the number of columns")

columns = column_names if len(column_names) > 0 else range(0, n_columns)
# Build the frame once from the collected rows instead of pre-allocating it
# and writing every cell individually
df = pd.DataFrame(table_rows, columns=columns)

df

Unnamed: 0,Postal Code\n,Borough\n,Neighbourhood\n
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
...,...,...,...
175,M5Z\n,Not assigned\n,Not assigned\n
176,M6Z\n,Not assigned\n,Not assigned\n
177,M7Z\n,Not assigned\n,Not assigned\n
178,M8Z\n,Etobicoke\n,"Mimico NW, The Queensway West, South of Bloor,..."


### Clean up the data

In [65]:
# Delete all newline characters ('\n') from the column labels and the cell text.
# The label replacement is an explicit literal match: Series.str.replace only
# treated the old pattern "[\n]" as a regex by default before pandas 2.0, so on
# newer versions the labels kept their newline and `df.Borough` did not exist.
df.columns = df.columns.str.replace('\n', '', regex=False)
df = df.replace('\n', '', regex=True)
# Drop rows where Borough = "Not assigned"
df = df[df['Borough'] != 'Not assigned']
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


*Note: The next two lines seem unnecessary, because the data is already grouped by postal code in the HTML table and there do not appear to be any rows where the Neighbourhood is unassigned but the Borough is assigned. The instructions call for the data to be cleaned this way, though, so I added the lines for completeness.*

In [66]:
# Collapse rows that share a postal code and borough, joining their
# neighbourhood names into one comma-separated string
df = (
    df.groupby(['Postal Code', 'Borough'], as_index=False)
      .agg(Neighbourhood=('Neighbourhood', ', '.join))
)
# Wherever the Neighbourhood is "Not assigned", use the Borough name instead
df['Neighbourhood'] = df['Neighbourhood'].mask(
    df['Neighbourhood'].eq("Not assigned"), df['Borough']
)

In [67]:
# View final dataframe (the cell's last expression is rendered as its output)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


### View the shape of the DataFrame

In [68]:
# View shape of DataFrame as a (rows, columns) tuple
df.shape

(103, 3)

## Problem 2: Add Longitude/Latitude to previous DataFrame

In [69]:
# Needed to install geocoder
! pip install geocoder



In [70]:
# import geocoder
import geocoder # import geocoder

In [73]:
# Function to get Long/Lat for Toronto postal codes from Geocoder
def get_geocoder(postal_code, max_attempts=10):
    """Look up the latitude and longitude of a Toronto postal code.

    Queries ``geocoder.google`` with "<postal_code>, Toronto, Ontario" and
    repeats the lookup while the result's ``latlng`` is ``None``.

    Args:
        postal_code: Postal code to look up; it is formatted into the query.
        max_attempts: Maximum number of lookups before giving up. The bound
            keeps a service that never returns coordinates from hanging the
            notebook in an endless loop.

    Returns:
        A ``(latitude, longitude)`` tuple taken from the first two entries of
        the ``latlng`` result.

    Raises:
        RuntimeError: If no coordinates were returned within ``max_attempts``
            lookups.
    """
    query = '{}, Toronto, Ontario'.format(postal_code)
    for _ in range(max_attempts):
        lat_lng_coords = geocoder.google(query).latlng
        if lat_lng_coords is not None:
            latitude = lat_lng_coords[0]
            longitude = lat_lng_coords[1]
            return latitude, longitude
    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(query, max_attempts)
    )

### Read Longitude/Latitude for each postal code from CSV URL into new DataFrame

In [74]:
# Looking coordinates up through geocoder was taking too long, so the
# latitude/longitude values are read from this hosted CSV file instead
geo_csv_url = "http://cocl.us/Geospatial_data"
geo_df = pd.read_csv(geo_csv_url)
geo_df

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


### Merge Part 1 DataFrame with Longitude/Latitude DataFrame

In [75]:
# Merge Borough/Neighbourhood DataFrame with Long/Lat DataFrame
geo_data = pd.merge(df, geo_df, how='inner', on=['Postal Code'])
geo_data

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
