# Data Scraping Wikipedia for Capstone #

Installing Packages and Importing Libraries

In [1]:
!pip install beautifulsoup4
!pip install lxml



In [2]:
import numpy as np
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup

Assigning URL and using Beautiful Soup to import data

In [3]:
url = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&direction=prev&oldid=926287641"

In [4]:
page = urllib.request.urlopen(url)

In [5]:
soup = BeautifulSoup(page, "lxml")

Assign a variable with all tables on web page 

In [6]:
all_tables=soup.find_all("table")
#all_tables

Pulling out the table we want

In [7]:
right_table=soup.find('table', class_='wikitable sortable')

Extracting the rows from the table and adding values to different lists. Each list represents a Column on the table.

In [71]:
A=[]
B=[]
C=[]

for row in right_table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

Turning lists into a Dataframe

In [72]:
df=pd.DataFrame(A,columns=['Postcode'])
df['Borough']=B
df['Neighborhood']=C

Removing cells that are not assigned a value (as indicated with a "Not assigned" string in table)

In [73]:
df = df[~df.Borough.str.contains("Not assigned")]

Grouping the data together and making it easier to read

In [77]:
df = df.groupby(['Postcode', 'Borough'])['Neighborhood'].unique().to_frame()

Getting shape of DataFrame

In [68]:
df.shape

(103, 1)

In [80]:
df.reset_index(inplace=True)

In [81]:
df_2 = pd.read_csv('http://cocl.us/Geospatial_data')

In [82]:
df_2

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [83]:
df_2.rename(columns = {'Postal Code':'Postcode'}, inplace = True)

In [120]:
df_toronto = df.merge(df_2, how='inner', on='Postcode')

In [121]:
df_toronto['Neighborhood']

0                                       [Rouge, Malvern]
1               [Highland Creek, Rouge Hill, Port Union]
2                  [Guildwood\n, Morningside, West Hill]
3                                               [Woburn]
4                                          [Cedarbrae\n]
                             ...                        
98                                              [Weston]
99                                         [Westmount\n]
100    [Kingsview Village, Martin Grove Gardens\n, Ri...
101    [Albion Gardens\n, Beaumond Heights, Humbergat...
102                                        [Northwest\n]
Name: Neighborhood, Length: 103, dtype: object

In [122]:
df_neighborhoods = df_toronto['Neighborhood'].str.join(',')

In [123]:
df_neighborhoods = df_neighborhoods.replace('\n','', regex=True)

In [124]:
df_toronto['Neighborhood'] = df_neighborhoods

In [125]:
df_toronto

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437


In [88]:
from geopy.geocoders import Nominatim

In [93]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, Ontario {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, Ontario 43.6534817, -79.3839347.


In [90]:
from sklearn.cluster import KMeans
import folium

In [92]:
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='purple',
        fill=True,
        fill_color='green',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto