# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import pandas as pd
import numpy as np

## Importing packages

In [2]:

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

! pip install geocoder

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


### Reading the postal code table from the Wikipedia page

In [3]:
#! conda install  -c conda-forge lxml --yes
#! conda install  -c conda-forge html5lib --yes
#! conda install  -c conda-forge BeautifulSoup4 --yes

In [4]:
from bs4 import BeautifulSoup

In [5]:

# Wikipedia page listing the Canadian postal codes that start with "M".
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of letting the
# following cells parse an error page.
page = requests.get(URL, timeout=30)
page.raise_for_status()

# Parse the downloaded HTML with the lxml parser.
soup = BeautifulSoup(page.content, "lxml")

In [6]:
# Locate the postal code table in the parsed page.
source = soup.find('table', class_='wikitable sortable')
if source is None:
    # Fail with a clear message rather than an AttributeError on None below.
    raise ValueError("Could not find a 'wikitable sortable' table in the page")

# Column accumulators: A = first cell, B = second cell, C = third cell of
# every three-cell row.
A = []
B = []
C = []

for row in source.find_all('tr'):
    cells = row.find_all('td')
    # Header rows (which use <th>) and malformed rows do not have three <td> cells.
    if len(cells) == 3:
        # get_text() joins every text node inside the cell, so values that
        # contain nested tags (e.g. links) are not truncated to their first
        # fragment, and an empty cell yields '' instead of None.
        A.append(cells[0].get_text().strip())
        B.append(cells[1].get_text().strip())
        C.append(cells[2].get_text().strip())


In [7]:
# Start from an empty frame; its columns are attached in a later cell.
df = pd.DataFrame()

In [8]:
def _drop_newlines(values):
    """Return a new list in which every newline is removed from each string."""
    return [text.replace('\n', '') for text in values]


# Clean all three scraped columns the same way.
A, B, C = (_drop_newlines(column) for column in (A, B, C))

In [9]:
# Attach the three scraped lists to the frame, in display order.
for column_label, column_values in zip(('Postal_Code', 'Borough', 'Neighborhood'), (A, B, C)):
    df[column_label] = column_values

## Deleting rows with 'Not assigned' Borough

In [10]:
# Keep only the rows whose borough is assigned. The explicit .copy() makes
# `df` an independent frame, so the column assignments in later cells do not
# trigger SettingWithCopyWarning.
df = df.loc[df['Borough'] != 'Not assigned'].copy()

## Replacing 'Not assigned' Neighborhood values with the Borough name

In [11]:
# Where a row has no neighborhood name, reuse its borough name instead.
neighborhood_missing = df['Neighborhood'] == 'Not assigned'
df['Neighborhood'] = df['Neighborhood'].mask(neighborhood_missing, df['Borough'])

In [12]:
# Sanity check: display the (rows, columns) of the cleaned frame.
df.shape

(103, 3)

## Getting latitude and longitude

In [13]:
import time

import geocoder  # import geocoder

# Upper bound on lookups per postal code, so an unreachable or failing
# geocoding service cannot hang the notebook in an endless loop.
MAX_GEOCODE_ATTEMPTS = 10
RETRY_DELAY_SECONDS = 1  # short pause between failed attempts

latitude_list = []
longitude_list = []

for pcode in df['Postal_Code']:
    # initialize your variable to None
    lat_lng_coords = None

    # retry until coordinates come back or the attempts are used up
    for attempt in range(1, MAX_GEOCODE_ATTEMPTS + 1):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(pcode))
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            break
        if attempt < MAX_GEOCODE_ATTEMPTS:
            time.sleep(RETRY_DELAY_SECONDS)
    else:
        # The loop finished without a break: every attempt returned no result.
        raise RuntimeError(
            'Could not geocode postal code {} after {} attempts'.format(
                pcode, MAX_GEOCODE_ATTEMPTS
            )
        )

    # latlng is indexed as [latitude, longitude]
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]

    latitude_list.append(latitude)
    longitude_list.append(longitude)
    
    


## Final data frame with coordinates

In [14]:
# Add the geocoded coordinates as two new columns (one entry per row).
df['Latitude'], df['Longitude'] = latitude_list, longitude_list

# Show the frame's dimensions, then preview its first rows.
print(df.shape)
df.head()

(103, 5)


Unnamed: 0,Postal_Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.752935,-79.335641
3,M4A,North York,Victoria Village,43.728102,-79.31189
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939
