# Introduction.

## Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto.

### This notebook contains the first exercise: scraping the Toronto postal-code table from Wikipedia into a pandas DataFrame.

## 1. Import the required dependencies.

import numpy as np
import pandas as pd 
import requests 
from bs4 import BeautifulSoup 

## 2. Download Dataset

In [21]:
# Wikipedia page that lists the Canadian postal codes starting with "M".
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# requests waits indefinitely by default; a timeout keeps this cell from
# hanging forever on a stalled connection.
res = requests.get(URL, timeout=30)
# Stop here on an HTTP error status (4xx/5xx) instead of silently parsing an
# error page in the following cells.
res.raise_for_status()

## 3. Scraping...

In [23]:
# Parse the downloaded HTML with Python's built-in parser so the table can be
# queried with CSS selectors.
scraped_data = BeautifulSoup(markup=res.text, features='html.parser')

## 4. Cleaning...

In [24]:
# Collect every <tr> nested inside an element that carries the "wikitable"
# class; the first match is treated as the header row by the next cell.
data_rows = scraped_data.select('.wikitable tr')

In [25]:
# An empty selection means the page layout changed or the download did not
# return the expected article; say so explicitly instead of failing with a
# bare "list index out of range".
if not data_rows:
    raise IndexError(
        "No '.wikitable tr' rows were found in the downloaded page; "
        "cannot locate the postal-code table."
    )

# The first row holds the column headers, every following row holds data.
data_header = data_rows[0]
data_content = data_rows[1:]

In [26]:
# Column labels: the text of each <th> cell in the header row, with the
# newline characters removed.
data_columns = [
    header_cell.getText().replace('\n', '')
    for header_cell in data_header.select('th')
]

data_columns

['Postal code', 'Borough', 'Neighborhood']

In [27]:
# One inner list per table row, holding the text of each <td> cell with the
# newline characters removed.
final_data = [
    [cell.getText().replace('\n', '') for cell in table_row.select('td')]
    for table_row in data_content
]

## 5. Creating Pandas DataFrame

In [29]:
# Label the scraped rows with the header names, then preview the first five.
torontoDF = pd.DataFrame(final_data, columns=data_columns)
torontoDF.head(5)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [30]:
# Preview the end of the table (tail() shows the last five rows by default).
torontoDF.tail()

Unnamed: 0,Postal code,Borough,Neighborhood
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...
179,M9Z,Not assigned,


## 6. Cleaning Pandas DataFrame
##### 1. Drop the rows whose borough is "Not assigned".
##### 2. More than one neighborhood can exist in one postal code area. The source table lists them in a single cell separated by " / "; that separator is replaced with a comma so the neighborhoods are comma-separated.
##### 3. Reset the row index so that it runs from 0 without gaps.

In [34]:
# 1. Keep only the rows with an assigned borough. The explicit copy makes the
#    result an independent frame, so the column assignment below does not
#    trigger pandas' SettingWithCopyWarning.
torontoDF = torontoDF[torontoDF['Borough'] != 'Not assigned'].copy()

# 2. Turn the " / " separator between neighborhoods into ", ". The vectorised
#    string accessor does a literal (non-regex) replacement and leaves missing
#    values untouched instead of raising on them.
torontoDF['Neighborhood'] = torontoDF['Neighborhood'].str.replace(' / ', ', ', regex=False)

# 3. Renumber the rows from 0; the old index is discarded rather than kept as
#    a column.
torontoDF = torontoDF.reset_index(drop=True)
torontoDF.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## 7. Saving result to csv file

In [35]:
# Write the cleaned table to disk. index=True is the pandas default, spelled
# out here because it adds the row index as a first, unnamed column.
torontoDF.to_csv('Toronto_pc.csv', index=True)

## 8. Number of rows and columns

In [36]:
# (number of rows, number of columns) of the cleaned table.
torontoDF.shape

(103, 3)