# Segmenting and Clustering Neighborhoods in Toronto

## New project notebook created with the project name: Segmenting and Clustering Neighborhoods in Toronto

### Submission 1 - Download & Prepare Dataset

In [16]:
# Core data libraries.
import pandas as pd
import numpy as np
# Install the HTML/XML parsing dependencies from inside the notebook.
# NOTE(review): none of these installs pins a version, and lxml is installed twice
# (once via conda, once via pip) — confirm both are needed.
!conda install -c conda lxml
!pip install et_xmlfile
!pip install bs4
!pip install html5lib
!pip install lxml
# NOTE(review): the visible cells parse with 'html.parser', so the two lxml-related
# imports below may be unnecessary — confirm before removing.
import bs4.builder._lxml 
from lxml import etree
# HTTP client and HTML parser used by the scraping cells.
import requests 
from bs4 import BeautifulSoup

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - lxml


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    lxml-4.5.0                 |   py36hefd8a0e_0         1.6 MB

The following packages will be UPDATED:

    lxml: 4.3.1-py36hefd8a0e_0 --> 4.5.0-py36hefd8a0e_0


Downloading and Extracting Packages
lxml-4.5.0           | 1.6 MB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Collecting bs4
  Downloading https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/dsxuser/.cache/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467

In [36]:
# Download the Wikipedia page listing the "M" postal codes and parse its HTML with bs4
headers = requests.utils.default_headers()
# NOTE(review): the User-Agent value below contains a misspelling ("Sfari"); it is kept
# exactly as written — confirm the intended value.
headers.update({ 'User-Agent': 'Chrome/6.0.472.63 Sfari/534.3a'})

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# The headers must be passed by keyword: the second positional parameter of
# requests.get is `params` (query string), not `headers`.
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
req = requests.get(url, headers=headers, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page.
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')

In [37]:
# Identify the table to scrape data from
tables = soup.find('table', {'class':'wikitable sortable'})
# soup.find returns None when nothing matches; fail with a clear message here
# rather than with an AttributeError on the next line.
if tables is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")
table_rows = tables.find_all('tr')

# One list of stripped cell texts per table row. A row that has no <td> cells
# yields an empty list.
data = [[t.text.strip() for t in row.find_all('td')] for row in table_rows]

# Convert the list to a dataframe and drop the rows containing nulls.
# dropna() returns a new frame (no in-place mutation), so the cell can be re-run safely.
df_tdot = pd.DataFrame(data, columns = ['Postal Code', 'Borough', 'Neighborhood']).dropna()
df_tdot.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [38]:
# Keep only the rows whose Borough is assigned. .copy() makes an independent frame so the
# column assignments below do not operate on a slice of the scraped frame.
df_tdot = df_tdot[df_tdot['Borough'] != 'Not assigned'].copy()

# Normalise '/'-separated neighborhood lists (with or without surrounding spaces)
# to ', '-separated ones.
df_tdot['Neighborhood'] = df_tdot['Neighborhood'].str.replace(r'\s*/\s*', ', ', regex=True)

# If a row has a borough but no neighborhood ('Not assigned' or blank), the neighborhood
# becomes the same as the borough.
mask = df_tdot['Neighborhood'].isin(['Not assigned', ''])
df_tdot.loc[mask, 'Neighborhood'] = df_tdot.loc[mask, 'Borough']

# Combine rows that share a postal code into a single row whose neighborhoods are joined
# with ', '. A groupby only takes effect once it is aggregated and assigned.
# sort=False keeps the rows in table order; as_index=False returns the keys as columns
# with a fresh 0..n-1 index, so no separate reset_index is needed.
df_tdot = (
    df_tdot
    .groupby(['Postal Code', 'Borough'], as_index=False, sort=False)['Neighborhood']
    .agg(', '.join)
)

df_tdot.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [39]:
# Sanity check: (rows, columns) of the pre-processed dataframe; the recorded run shows (103, 3)
df_tdot.shape

(103, 3)

In [40]:
# Read the postal-code coordinates table (CSV) directly from its URL and preview it
url = 'https://cocl.us/Geospatial_data'
df_coords = pd.read_csv(filepath_or_buffer=url)
df_coords.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [42]:
# Attach the coordinate columns to each postal-code row. The inner join on 'Postal Code'
# keeps only the postal codes present in both dataframes.
df_tdot_coords = df_tdot.merge(df_coords, how = 'inner', on = 'Postal Code')
df_tdot_coords.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
