# Assignment: Segmenting and Clustering Neighborhoods in Toronto #


## Scraping the page ##

In [1]:
# import the library we use to open URLs
import urllib.request

In [2]:
# Specify which URL/web page we are going to be scraping: the Wikipedia page
# "List of postal codes of Canada: M". The download cell below reads this `url` variable.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
# Open the URL with urllib.request and read the HTML into the `page` variable.
# The `with` block closes the HTTP response as soon as the body has been read,
# instead of leaving the connection open until it is garbage-collected.
with urllib.request.urlopen(url) as response:
    # Decode with the charset the server declares, falling back to UTF-8 when
    # the Content-Type header does not name one.
    charset = response.headers.get_content_charset() or "utf-8"
    page = response.read().decode(charset)

# Confirm the page is now held as a text string (not bytes)
type(page)

str

## Parsing the page content ##
I have not used an HTML-parsing library here, because the table is simple enough to extract with regular expressions.

In [4]:
import re                                     # import regex

# Remove every line feed and carriage return so the non-greedy patterns below
# can span what were originally separate lines ('.' does not match '\n').
page = re.sub(r'[\n\r]', "", page)

# The content lies between the first occurrence of <table> ... </table>
tablecontent = re.search('<tabl.*?table>', page)
if tablecontent is None:
    # Without this check the indexing below fails with an unhelpful
    # "'NoneType' object is not subscriptable" when the page has no table.
    raise ValueError("No <table> element found in the downloaded page")

# Break the table into its rows
tablerows = re.findall(r'<tr.*?/tr>', tablecontent[0])
if len(tablerows) < 2:
    # A header row plus at least one data row is required for the steps below.
    raise ValueError("The first table on the page has no data rows")

del tablerows[0]   # remove index 0 as it contains headers
tablerows[0]       # This is how each row would look like

'<tr><td>M1A</td><td>Not assigned</td><td></td></tr>'

## Create Dataframe from page content ##

In [5]:
import pandas as pd                                       # Import Pandas

# define the dataframe columns
column_names = ['PostalCode', 'Borough', 'Neighborhood']

# Collect one dict per assigned postal code and build the dataframe once at the
# end. DataFrame.append was removed in pandas 2.0, and growing a frame row by
# row copies the whole frame on every iteration.
records = []
for row_html in tablerows:
    cells = re.findall(r'<td>(.*?)\s*</td>', row_html)    # Data is contained within <td> </td>
    if len(cells) < len(column_names):
        continue                                          # skip rows that lack one of the three cells
    # Keep the row only if the postal code is assigned to a Borough
    if cells[1] != 'Not assigned':
        # zip pairs the first three cells with the column names, in order
        records.append(dict(zip(column_names, cells)))

# Passing columns= keeps the expected (empty) columns even when no row qualified
neighborhoods = pd.DataFrame(records, columns=column_names)

# Preview the dataframe prepared from scraped page
neighborhoods.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Getting the Latitude & Longitude data from CSV ##
Instead of issuing 103 API calls for this static data on every run, I load the coordinates from a CSV file that was gathered beforehand.

In [6]:
# Location of the CSV holding a latitude/longitude pair per postal code
geospatial_csv_url = 'https://cocl.us/Geospatial_data'

# Load the CSV straight from that address into a dataframe
geospatial = pd.read_csv(geospatial_csv_url)

# Show the first rows as a quick check of the loaded columns
geospatial.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Join the two pandas DataFrames into one ##

In [7]:
# Coordinates keyed by postal code, so they line up with neighborhoods['PostalCode']
coordinates = geospatial.set_index('Postal Code')

# Drop coordinate columns left over from an earlier run of this cell first:
# without this, re-running the cell fails because join() refuses overlapping
# column names. On a first run there is nothing to drop (errors='ignore').
neighborhoods = (
    neighborhoods
    .drop(columns=coordinates.columns, errors='ignore')
    .join(coordinates, on='PostalCode')
)


# The final Dataframe
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business reply mail Processing Centre,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
