# Applied Data Science Capstone: Week 3 Final Assignment (Segmenting and Clustering Neighborhoods in Toronto) - Part 2

## NOTE: This notebook starts off with Part 1 of the assignment. Please scroll to the bottom for Part 2

## PART 1

In [1]:
# Importing libraries

import requests
import lxml.html as lh
import pandas as pd
import numpy as np

### Obtaining data from the provided Wikipedia page and transforming into a pandas dataframe

In [2]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

page = requests.get(url)

doc = lh.fromstring(page.content)

tr_elements = doc.xpath('//tr')

### Checking if the dataframe consists of three columns: PostalCode, Borough, and Neighborhood

In [3]:
[len(T) for T in tr_elements[:12]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [4]:
tr_elements = doc.xpath('//tr')

col=[]
i=0

for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print ('%d:"%s"'%(i,name))
    col.append((name,[]))

1:"Postal code
"
2:"Borough
"
3:"Neighborhood
"


### Creating the pandas dataframe

In [5]:
for j in range(1,len(tr_elements)):
    T=tr_elements[j]
    
    
    if len(T)!=3:
        break
   
    i=0
    
    
    for t in T.iterchildren():
        data=t.text_content() 
        
        if i>0:
        
            try:
                data=int(data)
            except:
                pass
       
        col[i][1].append(data)
       
        i+=1

In [6]:
[len(C) for (title,C) in col]

[181, 181, 181]

In [7]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

In [8]:
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n


### Cleaning the data in the dataframe

In [9]:
df = df.replace('\n',' ', regex=True)

df.head(10)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
7,M8A,Not assigned,
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge


### Dropping "Not assigned" Borough rows

In [10]:
df=df[~df["Borough\n"].str.contains("Not assigned")]
df=df.reset_index(drop=True)

df.head(10)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,Malvern / Rouge
7,M3B,North York,Don Mills
8,M4B,East York,Parkview Hill / Woodbine Gardens
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Grouping by neighborhoods based on Postal codes and Boroughs

In [11]:
df = df.groupby(['Postal code\n', 'Borough\n'])['Neighborhood\n'].apply(','.join).reset_index()
#df.columns = ['Postal code\n','Borough\n','Neighborhood\n']

df['Neighborhood\n'] = df['Neighborhood\n'].str.replace('/',',')

df.head(100)

Unnamed: 0,Postal code,Borough,Neighborhood
0,,Canadian postal codes,
1,M1B,Scarborough,"Malvern , Rouge"
2,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
3,M1E,Scarborough,"Guildwood , Morningside , West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae
6,M1J,Scarborough,Scarborough Village
7,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park"
8,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge"
9,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village W..."


### Replacing 'Not assigned' neighborhoods with the name of the Borough

In [12]:
df['Neighborhood\n'] = df['Neighborhood\n'].str.strip()
df.loc[df['Neighborhood\n'] == 'Not assigned', 'Neighborhood\n'] = df['Borough\n']
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,,Canadian postal codes,
1,M1B,Scarborough,"Malvern , Rouge"
2,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
3,M1E,Scarborough,"Guildwood , Morningside , West Hill"
4,M1G,Scarborough,Woburn


In [13]:
df=df[~df["Borough\n"].str.contains("Canadian postal codes")]
df=df.reset_index(drop=True)
df.head(10)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park"
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge"
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff , Cliffside West"


### Printing the number of rows of the dataframe using Shape method

In [14]:
df.shape

(103, 3)

# **************************** PART 2 starts here ****************************

In [15]:
import json

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 


Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1

### Reading Geospatial data from the csv file as the functioning of geocoder can be unreliable

In [80]:
# Read the cvs file and convert it to a dataframe

url='http://cocl.us/Geospatial_data'
df_postalcodes=pd.read_csv(url)
df_postalcodes.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Renaming Postal Code column

In [81]:
df_postalcodes.columns = ['Postal code', 'Latitude', 'Longitude']
df_postalcodes.head()

Unnamed: 0,Postal code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [82]:
df.columns = ['Postal code', 'Borough', 'Neighborhood']
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Sorting dataframe by postal codes and merging with values read from the csv file

In [83]:
dfs=df.sort_values(by='Postal code', ascending=True)
dfs.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Cleaning main dataframe values to ensure a clean merge with the other dataframe (postalcodes, latitude, and longitude)

In [84]:
dfs['Postal code'] = dfs['Postal code'].str.strip()
dfs['Borough'] = dfs['Borough'].str.strip()
dfs['Neighborhood'] = dfs['Neighborhood'].str.strip()


### Merging columns into the dataframe

In [85]:
geo_df=pd.merge(dfs,df_postalcodes, how='right', on='Postal code')
geo_df.head(15)

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff , Cliffside West",43.692657,-79.264848


### Checking the dimensions of the new dataframe

In [87]:
geo_df.shape

(103, 5)