# Capstone Project

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Print a greeting message for the course.
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


## I. Scrape Data from Wikipedia

In [13]:
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1272 sha256=884970902e45eb0068f85e29237eb30376a049571d50f97b8aee546b037ca576
  Stored in directory: /tmp/wsuser/.cache/pip/wheels/0a/9e/ba/20e5bbc1afef3a491f0b3bb74d508f99403aabe76eda2167ca
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1


In [18]:
import requests
from bs4 import BeautifulSoup
import lxml.etree as xml
import re

# Browser-like request headers sent along with the page request.
# There is deliberately no 'authority' entry: that value is a browser-internal
# HTTP/2 pseudo-header, and the target host is already taken from the URL.
headers = {
    'dnt': '1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}

# Wikipedia page listing the Canadian postal codes that start with "M".
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [32]:
def _extractData(URL):
    """Download a web page and return its first "wikitable" as a DataFrame.

    Parameters
    ----------
    URL : str
        Address of the page to fetch. The request is sent with the
        notebook-level ``headers`` dict.

    Returns
    -------
    pandas.DataFrame
        The first ``<table class="wikitable">`` found on the page, as parsed
        by ``pd.read_html``.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    ValueError
        If the page contains no table with class "wikitable".
    """
    # Local import so the function carries its own dependency.
    from io import StringIO

    # Let failures surface with their real cause instead of printing them and
    # then failing later on an unassigned result variable.
    response = requests.get(URL, headers=headers, timeout=30)
    response.raise_for_status()

    web_page = BeautifulSoup(response.text, 'html5lib')
    tables = web_page.find_all(name="table", attrs={"class": "wikitable"})
    if not tables:
        raise ValueError('No table with class "wikitable" found at {}'.format(URL))

    # Hand read_html a file-like object rather than a raw HTML string, which
    # newer pandas versions deprecate.
    return pd.read_html(StringIO(str(tables[0])))[0]


# Scrape the postal-code table once; the following cells read it from `dataframe`.
dataframe = _extractData(URL)

## II. Preprocessing the Data
1. Delete the rows where "Borough" is "Not assigned".
2. If a "Neighbourhood" is "Not assigned", it takes the same value as its "Borough".

In [36]:
# Display the scraped table as-is, before any cleaning.
dataframe

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [101]:
# Keep only the rows whose Borough is assigned, ordered by postal code.
pro_dataframe = (
    dataframe[dataframe["Borough"] != "Not assigned"]
    .sort_values(by="Postal Code")
)
pro_dataframe

Unnamed: 0,Postal Code,Borough,Neighbourhood
9,M1B,Scarborough,"Malvern, Rouge"
18,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
27,M1E,Scarborough,"Guildwood, Morningside, West Hill"
36,M1G,Scarborough,Woburn
45,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
107,M9P,Etobicoke,Westmount
116,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
143,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


**Rows whose Borough is "Not assigned" have been removed from the dataframe.**

In [54]:
# Show any remaining rows whose Neighbourhood is still "Not assigned".
pro_dataframe[pro_dataframe["Neighbourhood"].eq("Not assigned")]

Unnamed: 0,Postal Code,Borough,Neighbourhood


**No remaining row has a "Not assigned" Neighbourhood, so every Neighbourhood is assigned.**

In [63]:
# Report the dimensions of the cleaned table.
print("The shape of this Dataframe is : ", pro_dataframe.shape)
print("With", len(pro_dataframe), "rows and", len(pro_dataframe.columns), "columns")

The shape of this Dataframe is :  (103, 3)
With 103 rows and 3 columns


## III. Geocoding

**Google's geocoding API is NOT WORKING**

In [68]:
import geocoder

# Upper bound on lookups per postal code, so the cell always terminates even
# when the geocoding service never returns coordinates.
MAX_GEOCODE_ATTEMPTS = 3

# Postal codes to geocode, taken from the cleaned table.
postal_codes = pro_dataframe["Postal Code"].tolist()

# initialize your variable to None
lat_lng_coords = None

# One [postal code, latitude, longitude] entry per successfully geocoded code.
latslongs = []

for p in postal_codes:
    # Reset for each code so a previous success cannot mask a failure here.
    lat_lng_coords = None

    # Retry this postal code until coordinates come back or attempts run out.
    for _attempt in range(MAX_GEOCODE_ATTEMPTS):
        try:
            g = geocoder.google('{}, Toronto, Ontario'.format(p))
            lat_lng_coords = g.latlng
        except Exception as e:
            print("Error ", e)
        if lat_lng_coords is not None:
            break

    if lat_lng_coords is None:
        # Skip codes that could not be geocoded instead of looping forever.
        print("Error ", "no coordinates returned for {}".format(p))
        continue

    latslongs.append([p, lat_lng_coords[0], lat_lng_coords[1]])

**Use the geospatial data file instead**

In [102]:
# Latitude/longitude per postal code, read from a local CSV file.
# To download the file first, run:
# !wget https://cocl.us/Geospatial_data
df_coords = pd.read_csv('Geospatial_data').sort_values(by="Postal Code")
df_coords

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


### Merge the two dataframes

In [104]:
# Attach the coordinates to the cleaned table. With no `on=` given, pandas
# joins on the columns the two frames share (here "Postal Code", per the
# displayed outputs).
final_df = pd.merge(pro_dataframe, df_coords)
final_df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


## IV. Visualization