# Assignment : Segmenting and Clustering Neighborhoods in Toronto Part 2 #

## Details on how to scrape and prepare the data are in a separate Jupyter notebook ##
I am skipping the markdown cells from the previous notebook and putting the entire code in fewer cells to move quickly to Part 2.

In [1]:
## Scraping the page
# import the library we use to open URLs
import urllib.request

# specify which URL/web page we are going to be scraping
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Open the URL inside a `with` block so the HTTP response is always closed,
# even if reading or decoding fails. The raw bytes are decoded into a
# UTF-8 string and stored in the `page` variable.
with urllib.request.urlopen(url) as response:
    page = response.read().decode("utf-8")

# Sanity check: the downloaded page should be a plain string
type(page)

str

In [2]:
## Parsing the page and preparing Dataframe
import re                                     # import regex
import pandas as pd                           # Import Pandas

page = re.sub(r'[\n\r]', "", page)            # remove all line feeds and carriage returns
# The content lies between the first occurrence of <table> ... </table>
tablecontent = re.search('<tabl.*?table>', page)
if tablecontent is None:
    # Fail with a clear message instead of "'NoneType' object is not subscriptable"
    raise ValueError("No <table> element found in the downloaded page")
# Break the table into rows and skip index 0, which contains the headers
tablerows = re.findall(r'<tr.*?/tr>', tablecontent[0])[1:]
# Each entry of tablerows is the raw HTML of one <tr> ... </tr> row

# define the dataframe columns
column_names = ['PostalCode', 'Borough', 'Neighborhood']
# Collect the rows in a plain list and build the dataframe once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
records = []
# Iterate through each record
for record in tablerows:
    pry = re.findall(r'<td>(.*?)\s*</td>', record)   # Data is contained within <td> </td>
    # Skip rows that do not carry all three expected cells (avoids IndexError)
    if len(pry) < len(column_names):
        continue
    # Check if Postal code is assigned to a Borough
    if pry[1] != 'Not assigned':
        records.append(dict(zip(column_names, pry)))
# instantiate the dataframe; `columns` keeps the schema even when no row matched
neighborhoods = pd.DataFrame(records, columns=column_names)
# Preview the dataframe prepared from scraped page
neighborhoods.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [3]:
# Read the CSV file holding the coordinates for each postal code
geospatial = pd.read_csv('https://cocl.us/Geospatial_data')

# Index the coordinates by postal code so they can be joined on 'PostalCode'
geo_by_code = geospatial.set_index('Postal Code')

# Drop any coordinate columns left over from a previous run of this cell before
# joining; otherwise re-running the cell fails with "columns overlap but no
# suffix specified". On a first run nothing is dropped.
neighborhoods = (
    neighborhoods
    .drop(columns=geo_by_code.columns, errors='ignore')
    .join(geo_by_code, on='PostalCode')
)

# The final Dataframe
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business reply mail Processing Centre,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


## Getting all rows whose Borough contains the word "Toronto"


In [4]:
# Keep only the rows whose Borough field contains "Toronto"
neighborhoods = neighborhoods.loc[
    lambda frame: frame["Borough"].str.contains("Toronto")
]
# Show the filtered dataframe
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


## Generating a map and adding all neighborhoods to it

In [5]:
import folium # plotting library

# Base map centred on fixed coordinates, starting at zoom level 11
venues_map = folium.Map(zoom_start=11, location=[43.654260,-79.360636])

# Add one blue circle marker to the map for every row of `neighborhoods`
for row_label, place in neighborhoods.iterrows():
    coordinates = [place['Latitude'], place['Longitude']]
    marker = folium.CircleMarker(
        coordinates,
        radius=5,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(venues_map)

# Display the map as the cell output
venues_map