##### Importing necessary packages

In [441]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

##### Reading the data from the wiki page into a variable

In [450]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of letting an
# error page be parsed as if it contained the postal-code table.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

##### Using BeautifulSoup, we can take a look at the underlying structure of the wiki page

In [None]:
# Parse the downloaded HTML with the html5lib parser, then print an indented
# rendering of the whole document so its structure can be inspected.
soup = BeautifulSoup(markup=source, features='html5lib')
page_markup = soup.prettify()
print(page_markup)

##### Using the `.find` method on the variable `soup`, we isolate the specific table that holds the required data about Boroughs and Neighborhoods, and then write it out as a text file.

In [None]:
# Locate the first <table> whose class attribute is exactly "wikitable sortable".
table = soup.find('table', attrs={'class': 'wikitable sortable'})

# find() returns None when nothing matches (for example if the page markup
# changes). Fail here with a clear message rather than with an AttributeError
# in the next cell.
if table is None:
    raise ValueError("No <table class='wikitable sortable'> found in the downloaded page")

table

In [None]:
# Dump the table to a text file: one line per <tr>, with every <td> padded to
# 25 characters so the file can be read back as fixed-width columns.
# The encoding is pinned to UTF-8 because open() otherwise falls back to the
# platform's locale encoding, while pandas reads text files as UTF-8 by
# default; non-ASCII text could then fail to write or be misread.
# NOTE(review): ljust() pads but never truncates, so a cell longer than 25
# characters runs into the next column - confirm the width suits the data.
with open('toronto.txt', 'w', encoding='utf-8') as out_file:
    for row in table.find_all('tr'):
        for cell in row.find_all('td'):
            out_file.write(cell.text.ljust(25))
        out_file.write('\n')
    

##### Read in the text file generated above with the data table and set column names

In [452]:
# Load the fixed-width text file; read_fwf infers the column boundaries.
df = pd.read_fwf('toronto.txt', skip_blank_lines=True)

# Column names must be a flat list. A nested list ([[...]]) makes pandas build
# a MultiIndex, so the columns would no longer be addressed by plain string
# labels.
df.columns = ['PostalCode', 'Borough', 'Neighborhoods']

# Preview the first rows only instead of rendering the whole frame.
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,,,
2,M2A,Not assigned,Not assigned
3,,,
4,M3A,North York,Parkwoods
5,,,
6,M4A,North York,Victoria Village
7,,,
8,M5A,Downtown Toronto,Harbourfront
9,,,


##### In the following step, we deal with the NAs that were generated in the dataframe by calling `.dropna()` with `thresh=3`, which keeps only the rows that have at least 3 non-NA values (i.e. rows where all three columns are populated)

In [453]:
# Keep only rows that have at least 3 non-NA values. Reassigning the result
# (instead of inplace=True) makes the data flow explicit.
df = df.dropna(thresh=3)

# A bare `df.shape` in the middle of a cell is evaluated but never shown, so
# print it explicitly; the trailing expression still renders the preview.
print("Shape after dropping incomplete rows:", df.shape)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
4,M3A,North York,Parkwoods
6,M4A,North York,Victoria Village
8,M5A,Downtown Toronto,Harbourfront
10,M5A,Downtown Toronto,Regent Park
12,M6A,North York,Lawrence Heights
14,M6A,North York,Lawrence Manor
16,M7A,Queen's Park,Not assigned
18,M8A,Not assigned,Not assigned


##### Dealing with the "Not assigned" Values

In [461]:
# Blank out the "Not assigned" placeholder text.
# The pattern and the replacement are passed as plain strings, which are
# documented argument types for DataFrame.replace; sets are not, and they
# carry no defined ordering. regex=True is kept so the text is still matched
# as a regular-expression pattern.
df = df.replace('Not assigned', '', regex=True)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhoods
0,M1A,,
1,M1B,Scarborough,"Rouge, Malvern"
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae
6,M1J,Scarborough,Scarborough Village
7,M1K,Scarborough,"East Birchmount Par, Ionview, Kennedy Park"
8,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
9,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village"


##### For records that have a valid Borough but no value in the Neighborhoods column, we use the borough name as the neighborhood. The next step takes care of that using NumPy's `np.where` function.

In [462]:
# Make sure the frame carries flat, plain-string column labels.
df.columns = ['PostalCode', 'Borough', 'Neighborhoods']

# Rows whose neighborhood is an empty string take their borough name instead;
# every other row keeps its existing neighborhood value.
missing_neighborhood = df['Neighborhoods'].eq("")
df['Neighborhoods'] = np.where(
    missing_neighborhood,
    df['Borough'],
    df['Neighborhoods'],
)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhoods
0,M1A,,
1,M1B,Scarborough,"Rouge, Malvern"
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae
6,M1J,Scarborough,Scarborough Village
7,M1K,Scarborough,"East Birchmount Par, Ionview, Kennedy Park"
8,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
9,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village"


##### Using the groupby function, we combine the Neighborhoods that share the same PostalCode and Borough into a single comma-separated entry

In [466]:
# Group the neighborhood names by (PostalCode, Borough), join each group's
# names into one comma-separated string, and turn the group keys back into
# ordinary columns.
neighborhoods_by_code = df.groupby(['PostalCode', 'Borough'])['Neighborhoods']
df = neighborhoods_by_code.apply(lambda names: ', '.join(names)).reset_index()
df

Unnamed: 0,PostalCode,Borough,Neighborhoods
0,M1A,,
1,M1B,Scarborough,"Rouge, Malvern"
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae
6,M1J,Scarborough,Scarborough Village
7,M1K,Scarborough,"East Birchmount Par, Ionview, Kennedy Park"
8,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
9,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village"


##### Printing shape of the final dataframe

In [468]:
# Report the (rows, columns) dimensions of the final frame.
final_shape = df.shape
print("Shape of the dataframe is:", final_shape)

Shape of the dataframe is: (180, 3)
