# Data Science Capstone Week 3: Toronto Neighborhood Data

In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as soup

In [2]:
# Wikipedia page "List of postal codes of Canada: M", the data source for this notebook.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

**Using requests to call the url and get the data**

In [3]:
# Fetch the page. The timeout (seconds) keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() fails here on an HTTP error instead
# of letting an error page flow into the parsing step.
wiki_url = requests.get(url, timeout=30)
wiki_url.raise_for_status()

## Using pandas to create the DataFrame

In [4]:
# read_html returns a list with one DataFrame per HTML table it finds.
# The page text is wrapped in StringIO because passing a literal HTML string
# directly is deprecated in recent pandas releases.
df_wiki = pd.read_html(StringIO(wiki_url.text))
len(df_wiki), type(df_wiki)

(3, list)

In [5]:
# Take the first parsed table. Work on a copy so that later cleaning cells never
# modify the parsed original held in df_wiki, and re-running this cell always
# starts again from the raw table.
torono_data = df_wiki[0].copy()
torono_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [6]:
# (rows, columns) of the table before any cleaning.
torono_data.shape

(180, 3)

**Remove the rows with no assigned borough**

In [7]:
# Keep only the rows that have a real borough.
# A boolean filter replaces torono_data["Borough"].replace(..., inplace=True):
# that form operates on a selected column object and is not guaranteed to update
# the DataFrame (it does nothing under pandas Copy-on-Write).
# Filtering on "Borough" alone also avoids discarding rows that merely have a
# missing value in some other column.
has_borough = torono_data["Borough"].notna() & (torono_data["Borough"] != "Not assigned")
torono_data = torono_data.loc[has_borough].reset_index(drop=True)
torono_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


**Replace "Not assigned" neighborhoods with the corresponding borough name**

In [8]:
# Index labels of the rows whose neighborhood is still the placeholder text.
torono_data.loc[torono_data["Neighborhood"].eq("Not assigned")].index

Int64Index([], dtype='int64')

There are no rows whose Neighborhood column contains "Not assigned".

In [9]:
# Vectorised, label-based replacement: select the rows whose neighborhood is the
# placeholder and copy the borough name into them. Column names are used instead
# of positional indices so the cell does not depend on column order.
unassigned = torono_data["Neighborhood"] == "Not assigned"
torono_data.loc[unassigned, "Neighborhood"] = torono_data.loc[unassigned, "Borough"]
count = int(unassigned.sum())  # number of rows that were rewritten
print("Replaced {} rows".format(count))

Replaced 0 rows


**Combine the Neighbourhoods with the same Postal Code**

In [10]:
# Index labels of rows whose postal code already appeared on an earlier row.
is_repeat = torono_data.duplicated(subset=["Postal Code"])
torono_data.loc[is_repeat].index

Int64Index([], dtype='int64')

In [18]:
def _join_distinct(values):
    """Join the distinct non-null entries of a column with ", " in first-seen order."""
    return ", ".join(dict.fromkeys(values.dropna().astype(str)))


# True for every row whose postal code occurs more than once in the table.
# An explicit duplicate test is required here: taking len() of a comparison
# result is truthy for any non-empty table, so it can never detect duplicates.
repeated_codes = torono_data["Postal Code"].duplicated(keep=False)

if not repeated_codes.any():  # check if there are any rows to modify
    print("No rows with repeating postal codes")
else:  # Combine the rows with the same postal code
    # Collapse to one row per postal code, keeping first-appearance order.
    # Distinct boroughs and neighborhoods are merged into comma-separated
    # strings, with no leading separator and no repeated entries.
    torono_data = (
        torono_data
        .groupby("Postal Code", sort=False, as_index=False)
        .agg({"Borough": _join_distinct, "Neighborhood": _join_distinct})
    )

No rows with repeating postal codes


**Final Shape of the DataFrame**

In [19]:
# (rows, columns) of the table after all cleaning steps above.
torono_data.shape

(103, 3)