# City of Toronto Data
### Using Wikipedia, I scraped data to explore, segment, and cluster the neighborhoods in the city of Toronto

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np


In [2]:
# URL of the Wikipedia page that lists the postal codes scraped in this notebook
wiki_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Download the page, reusing wiki_link instead of repeating the URL literal.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error so that later
# cells never try to parse an error page.
wiki_page = requests.get(wiki_link, timeout=30)
wiki_page.raise_for_status()

#### Downloading and Extracting Data From Wikipedia
##### Assumption: the Wikipedia data is correct


In [5]:
# The response's text attribute holds the downloaded page markup (HTML) as a string
page = wiki_page.text

# Markers delimiting the table to extract
table_open_tag = '<table class="wikitable sortable">'
table_close_tag = '</tbody></table>'

beginning = page.find(table_open_tag)
# Look for the closing marker only after the opening one, so a table that
# closes earlier in the page cannot be picked up by mistake.
end = page.find(table_close_tag, beginning) if beginning != -1 else -1

# str.find returns -1 on a miss; used as a slice bound that would silently
# yield the wrong text, so fail loudly instead.
if beginning == -1 or end == -1:
    raise ValueError('Could not locate the postal code table in the downloaded page')

wiki_table_text = page[beginning:end]

In [6]:
# Parse the extracted table markup and build a DataFrame with one row per table row
soup = BeautifulSoup(wiki_table_text, "html.parser")
table_rows = soup.find_all('tr')

res = []
for tr in table_rows:
    # Rows without any <td> (e.g. a header row made of <th> cells) give an empty list
    cells = [td.text.strip() for td in tr.find_all('td')]
    # Blank cells are kept in position so every value stays under its own column;
    # rows with no content at all are skipped.
    if any(cells):
        res.append(cells)

df = pd.DataFrame(res, columns=["Postcode", "Borough", "Neighbourhood"])

# Drop rows whose Borough is 'Not assigned'
df_filtered = df.query("Borough != 'Not assigned'")


#### Cleaning the Data
##### Assumptions: Rows with an unassigned borough are dropped; unassigned neighbourhoods are replaced with the borough name

In [7]:
# Aggregator used when grouping by Postcode and Borough: combines a group's neighbourhoods into one string
def create_list(neighbourhood_list):
    """Join neighbourhood names into a single comma-separated string.

    Parameters
    ----------
    neighbourhood_list : iterable of str
        The neighbourhood names to combine.

    Returns
    -------
    str
        The names joined with ``', '``; an empty string when no names are given.
    """
    # str.join replaces the manual concatenate-then-trim loop and yields the same text
    return ', '.join(neighbourhood_list)

# One group per (Postcode, Borough) pair; each group's Neighbourhood values
# are collapsed into a single string by create_list.
postcode_groups = df_filtered.groupby(by=['Postcode', 'Borough'])
df_new = postcode_groups.agg({'Neighbourhood': create_list})
df_new

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M1B,Scarborough,"Rouge, Malvern"
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
M1N,Scarborough,"Birch Cliff, Cliffside West"


In [17]:
# Move the Postcode/Borough group keys out of the index and back into ordinary columns.
# The guard makes the cell safe to re-run: once the keys are columns, a second
# reset_index() would otherwise add a spurious "index" column.
if 'Postcode' not in df_new.columns:
    df_new = df_new.reset_index()
df_new.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [20]:
# Where the neighbourhood is exactly 'Not assigned', fall back to the borough name;
# every other neighbourhood value is kept unchanged.
is_unassigned = df_new['Neighbourhood'].eq('Not assigned')
df_new['Neighbourhood'] = np.where(
    is_unassigned,
    df_new['Borough'],
    df_new['Neighbourhood'],
)
df_new

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [21]:
# Show the size of the cleaned table as (rows, columns)
n_rows, n_cols = df_new.shape
(n_rows, n_cols)

(103, 3)