# My Assignment - segmenting and clustering neighborhoods in Toronto

#### Install BeautifulSoup4 python module

In [1]:
#!pip install beautifulsoup4

#### Import required modules

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import folium

import requests
from bs4 import BeautifulSoup

#### Retrieve the Wikipedia page and create a parser using BeautifulSoup

In [3]:
# Download the Wikipedia page that lists the postal codes starting with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
raw_page = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page and
# failing later when the expected table cannot be found.
raw_page.raise_for_status()
page = raw_page.text
# Parse the HTML with the parser bundled with the Python standard library.
soup = BeautifulSoup(page, 'html.parser')

#### Define a get_text() function to collect the text of each HTML tag in a list

In [4]:
def get_text(tlist):
    """Return the stripped text of every element in ``tlist``.

    Parameters
    ----------
    tlist : iterable
        Objects exposing a ``text`` string attribute (in this notebook,
        the tags returned by BeautifulSoup's ``find_all``).

    Returns
    -------
    list of str
        One entry per element, in the same order, with leading and
        trailing whitespace removed.
    """
    return [tag.text.strip() for tag in tlist]

#### Make the DataFrame
- Find table header and gather names of the columns

In [5]:
# Locate the postal-code table by its CSS classes.
toronto_fsa = soup.find("table", {"class": "wikitable sortable"})
# `find` returns None when nothing matches; fail with a clear message instead
# of an AttributeError on the next line if the page layout has changed.
if toronto_fsa is None:
    raise ValueError('No <table class="wikitable sortable"> found on the page')
# The <th> cells of the table hold the column names.
col_headers = toronto_fsa.find_all("th")
columns = get_text(col_headers)
columns

['Postcode', 'Borough', 'Neighbourhood']

- Find all table cells and gather their values
- Then build a DataFrame, 'df', from the column names and values

In [6]:
# Collect the stripped text of every <td> cell in the table, in document order.
val_list = toronto_fsa.find_all("td")
vlist = np.asarray(get_text(val_list))

# The cells are listed row by row, so column number `offset` is every
# n_cols-th entry starting at `offset`. The stride comes from the header
# rather than a hard-coded 3, so it follows the table's column count.
n_cols = len(columns)
data_dict = {name: vlist[offset::n_cols] for offset, name in enumerate(columns)}

df = pd.DataFrame(data_dict)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Refine the dataframe
- Remove the rows whose "Borough" column has the value "Not assigned"
- Merge rows with the same "Postcode"

In [7]:
# Drop postal codes without a borough, then collapse the remaining rows to one
# per postal code: keep the first borough and comma-join the neighbourhoods.
df = (
    df.loc[df["Borough"] != "Not assigned"]
    .groupby("Postcode")
    .agg({"Borough": "first", "Neighbourhood": lambda names: ",".join(names)})
    .reset_index()
)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


- Replace "Not assigned" values in the "Neighbourhood" column with the value of the "Borough" column from the same row

In [8]:
# Rows whose neighbourhood is still "Not assigned" take their borough name.
_tmpdf = df[df.Neighbourhood == "Not assigned"]
# Write through a single .loc call on `df` itself. Assigning to an attribute of
# `df.iloc[i]` modifies a temporary row object and is not guaranteed to update
# `df`. Selecting by index label also avoids passing labels to the positional
# `iloc`.
df.loc[_tmpdf.index, "Neighbourhood"] = _tmpdf["Borough"]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
# (rows, columns) of the cleaned postal-code table.
df.shape

(103, 3)

#### Get latitude and longitude using Geocoder
- Parse the CSV file of geographic coordinates
 - I tried to use "geocoder", but it did not respond for a long time

In [10]:
#!wget -r http://cocl.us/Geospatial_data

In [11]:
# Load the postal-code coordinates from a local copy of the CSV. The relative
# path mirrors the host and path of the URL in the commented-out `wget -r` cell
# above.
geodata = pd.read_csv("cocl.us/Geospatial_data")
geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# (rows, columns) of the coordinate table.
geodata.shape

(103, 3)

- Add the latitude and longitude from geodata to df
- df and geodata are both sorted by postal code, so their rows line up one-to-one

In [15]:
# Look the coordinates up by postal code instead of relying on both frames
# having the same row order and row count. Plain column assignment aligns on
# the integer index, so any mismatch would attach coordinates to the wrong
# postal code without raising an error.
# NOTE(review): assumes "Postal Code" values in geodata are unique -- confirm.
coords = geodata.set_index("Postal Code")
# Postal codes missing from geodata become NaN rather than shifting other rows.
df["Latitude"] = df["Postcode"].map(coords["Latitude"])
df["Longitude"] = df["Postcode"].map(coords["Longitude"])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
