# Part 1. Web scraping Wikipedia page

### Importing needed libraries

In [1]:
import sys
from io import StringIO

import bs4, requests
import numpy as np
import pandas as pd

### Downloading the page and creating dataframe for the table

In [2]:
# Wikipedia page that lists the Toronto ("M") postal codes
page_addr = r'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Request the page above; the timeout keeps the cell from hanging forever
# if the server never answers, and raise_for_status stops on HTTP errors.
res = requests.get(page_addr, timeout=30)
res.raise_for_status()

In [3]:
# Parse the downloaded HTML text into a BeautifulSoup tree
wikiSoup = bs4.BeautifulSoup(markup=res.text, features='html.parser')

In [4]:
# Find every <table> element whose class attribute is "wikitable sortable"
Table = wikiSoup.find_all('table', attrs={'class': 'wikitable sortable'})

In [5]:
# Create the pandas DataFrame from the first matching table.
# Only that table's HTML is parsed (not the string form of the whole result list),
# and it is wrapped in StringIO because passing literal HTML strings to
# read_html is deprecated in recent pandas versions.
toronto_df = pd.read_html(StringIO(str(Table[0])))[0]
toronto_df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


## Prettifying the dataframe to match the requirements of the assignment

In [6]:
# Replace the ' /' separator in Neighborhood with a comma, as requested in the instructions.
# The pattern is a raw string so '\s' reaches the regex engine without an
# invalid-escape warning, and the result is assigned back to the column:
# inplace=True on a column selection does not reliably update the parent frame.
toronto_df['Neighborhood'] = toronto_df['Neighborhood'].replace(
    to_replace=r'\s/', value=',', regex=True)


Drop the rows whose Borough value is 'Not assigned' by changing that value to NaN and then dropping the NaN values from the DataFrame:

In [7]:
# Mark 'Not assigned' boroughs as NaN, drop exactly those rows, and renumber the index.
# dropna is restricted to the Borough column so a row that has a borough but a
# missing value in another column (e.g. Neighborhood) is not discarded as well.
# A new frame is built instead of using inplace=True on a column selection,
# which does not reliably update the parent frame.
toronto_df = (
    toronto_df
    .assign(Borough=toronto_df['Borough'].replace('Not assigned', np.nan))
    .dropna(subset=['Borough'])
    .reset_index(drop=True)
)

In [8]:
# Display the dataframe to inspect the result of the clean-up cells above
toronto_df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


Check dataframe dimensions:

In [9]:
# (rows, columns) of toronto_df at this point in the notebook
toronto_df.shape

(103, 3)

# Part 2. Adding coordinates to dataframe

In [19]:
# Download the CSV with the coordinates; the timeout keeps the cell from
# hanging forever if the server never answers, and raise_for_status stops
# on HTTP errors before the body is used.
geo_csv = requests.get('http://cocl.us/Geospatial_data', timeout=30)
geo_csv.raise_for_status()
# Keep the response body as text so it can be parsed as CSV later
coord_file = geo_csv.text


`coord_file` now contains a string with all the geo data. Let's convert it to a StringIO object and create a new dataframe from it.

In [45]:
# Wrap the downloaded CSV text in a file-like object and parse it.
# StringIO is imported in the notebook's import cell; the comma separator
# is read_csv's default, so it is not spelled out.
coordinates_df = pd.read_csv(StringIO(coord_file))
coordinates_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [44]:
# (rows, columns) of the coordinates dataframe
coordinates_df.shape

(103, 3)

Now we have two dataframes. The first, called "toronto_df", contains all our borough and neighborhood data. The second contains the coordinates for all postal codes. Let's merge them together and create a new dataframe with all the combined columns.

In [41]:
# Join the two frames on their postal-code columns (the column names differ
# only in capitalisation), then discard the duplicate key column that came
# from the coordinates frame.
toronto_joined_df = (
    toronto_df
    .merge(coordinates_df, left_on='Postal code', right_on='Postal Code')
    .drop(columns='Postal Code')
)
toronto_joined_df.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [42]:
# Check that the dimensions are correct: (rows, columns) of the merged dataframe
toronto_joined_df.shape

(103, 5)