# Capstone Project Segmenting and Clustering Neighborhoods in Toronto

## Question 2

In [1]:
import numpy as np
import pandas as pd
import requests 
from bs4 import BeautifulSoup 

In [2]:
#Scrape the List of postal codes of Canada
URL ="https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=890001695"
req = requests.get(URL) 
  
soup = BeautifulSoup(req.content, 'html5lib') 
table = soup.find('div', attrs = {'id':'container'}) 


In [3]:
#Collecting Data
postalCodes = [];
boroughs= [];
neighborhoods = [];
columnNum = 1;
Passvalue = False

for row in soup.find_all('td'):
    for cell in row:
        if cell.string and cell.string[0].isalpha() and len(cell.string) > 2:
            Passvalue = False
            if columnNum == 1:
                if Passvalue == False and cell.string[1].isdigit():
                    postalCodes.append(cell.string);   
                    columnNum = 2
                else:
                    continue
            elif columnNum == 2 :
                if cell.string == 'Not assigned':
                    Passvalue = True
                    del postalCodes[-1]
                    columnNum = 1
                    continue
                else:
                    boroughs.append(cell.string);      
                    columnNum = 3
            elif columnNum == 3 :
                if cell.string == 'Not assigned\n':
                    neighborhoods.append(boroughs[-1])
                else:
                    neighborhoods.append(cell.string); 
                columnNum = 1
                

In [4]:
# define the dataframe columns
column_names = ['PostalCode', 'Borough', 'Neighborhood'] 

# instantiate the dataframe
neighbors_dataframe = pd.DataFrame(columns=column_names)


In [5]:
for data in range(len(neighborhoods)):
    code = postalCodes[data]
    borough = boroughs[data]
    neighborhood_name = neighborhoods[data]

    neighbors_dataframe = neighbors_dataframe.append({ 'PostalCode': code,
                                   'Borough': borough,
                                   'Neighborhood': neighborhood_name}, ignore_index=True)

neighbors_dataframe

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
...,...,...,...
206,M8Z,Etobicoke,Kingsway Park South West\n
207,M8Z,Etobicoke,Mimico NW
208,M8Z,Etobicoke,The Queensway West
209,M8Z,Etobicoke,Royal York South West\n


In [6]:
neighbors_dataframe.shape

(211, 3)

In [13]:
temp_neighbors=neighbors_dataframe.groupby('PostalCode')['Neighborhood'].apply(lambda x: "%s" % ', '.join(x))
temp_neighbors=temp_neighbors.reset_index(drop=False)
temp_neighbors.rename(columns={'Neighborhood':'Neighborhood_joined'},inplace=True)

In [14]:
neighbors_merge = pd.merge(neighbors_dataframe, temp_neighbors, on='PostalCode')

In [15]:
neighbors_merge.drop(['Neighborhood'],axis=1,inplace=True)
neighbors_merge.drop_duplicates(inplace=True)
neighbors_merge.rename(columns={'Neighborhood_joined':'Neighborhood'},inplace=True)

In [16]:
neighbors_merge.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
4,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,M7A,Queen's Park,Queen's Park


In [17]:
neighbors_merge.shape

(103, 3)

In [18]:
def get_geocode(postal_code):
    # initialize your variable to None
    lat_lng_coords = None
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    return latitude,longitude

In [19]:
geo_df=pd.read_csv('http://cocl.us/Geospatial_data')

In [20]:
geo_df.rename(columns={'Postal Code':'PostalCode'},inplace=True)
geo_merged = pd.merge(geo_df, neighbors_merge, on='PostalCode')

In [22]:
geo_data=geo_merged[['PostalCode','Borough','Neighborhood','Latitude','Longitude']]

In [23]:
geo_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood\n, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae\n,43.773136,-79.239476
