# Capstone Project
### IBM Data Science Specialization - Course 9
_This notebook is used for the Capstone Project – Battling Neighbourhoods._

## Importing libraries

In [195]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import lxml
import geocoder

## Web scraping with BeautifulSoup library

### Part 1 - retrieving data from Wikipedia, preparing a DataFrame

In [196]:
#getting the source html of the webpage
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error (4xx/5xx) instead of
# silently parsing an error page as if it were the postal-code table.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

#creating the BS object to work with
soup = BeautifulSoup(source, 'lxml')

In [197]:
#fetching the table component from the html wikipedia page
table = soup.find("table")
if table is None:
    # Fail with a readable message instead of an AttributeError on None below.
    raise ValueError("No <table> element found in the downloaded page")

#each <tr> element represents a row in the future DataFrame
cells = table.find_all("tr")

#we will make DataFrame from these arrays
PostalCode   = list()
Borough      = list()
Neighborhood = list()

#going through the rows and filling out the arrays
for row in cells:
    current_elems = row.find_all("td")

    # Rows without three <td> cells (e.g. a header row made of <th> cells)
    # carry no data, so skip them rather than fail on indexing.
    if len(current_elems) < 3:
        continue

    # strip() removes surrounding whitespace/newlines from every column;
    # blindly cutting the last character would damage a value that has no
    # trailing newline.
    PostalCode.append(current_elems[0].text.strip())
    Borough.append(current_elems[1].text.strip())
    Neighborhood.append(current_elems[2].text.strip())

In [198]:
# Assemble the three scraped column lists into one DataFrame and preview it.
data_ = dict(PostalCode=PostalCode, Borough=Borough, Neighborhood=Neighborhood)
df = pd.DataFrame(data_)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [199]:
#Processing the DataFrame according to given conditions

#1: all entries with Borough = Not assigned must be dropped
# drop=True discards the old row labels instead of inserting them as an extra
# 'index' column that would then have to be removed again.
df = (
    df.loc[df['Borough'] != 'Not assigned', ['PostalCode', 'Borough', 'Neighborhood']]
      .reset_index(drop=True)
)

#2: If neighborhood is unavailable, we copy borough into it
# where() keeps the neighborhood wherever it is assigned and falls back to the
# borough on the remaining rows.
df['Neighborhood'] = df['Neighborhood'].where(
    df['Neighborhood'] != 'Not assigned', df['Borough']
)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [200]:
#3: grouping neighborhoods by postal code

# Rows per postal code (most frequent first), then the number of distinct codes.
rows_per_code = df['PostalCode'].value_counts().tolist()
print(rows_per_code)
print(len(rows_per_code))

[8, 8, 7, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
103


In [201]:
# Collapse the frame to one row per postal code: the borough comes from the
# group's first row and the neighborhood names are joined with ", ".
# A single groupby is used rather than rebuilding the frame once per row with
# DataFrame.append, which is quadratic and no longer exists in pandas 2.0.
# sort=False keeps the postal codes in order of first appearance, and
# as_index=False leaves PostalCode as a column on a fresh 0..n-1 index.
df = (
    df.groupby('PostalCode', sort=False, as_index=False)
      .agg({'Borough': 'first', 'Neighborhood': ', '.join})
)

# Pin the column order explicitly so it does not depend on the aggregation.
df = df[['PostalCode', 'Borough', 'Neighborhood']]

In [202]:
# Display the last rows of df (tail() shows five by default) to inspect the grouped result.
df.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So..."
102,M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen..."


In [203]:
# Display df's dimensions as (rows, columns).
df.shape

(103, 3)

### Part 2 - adding latitude and longitude coordinates

The `geocoder` package kept returning `None`, so the coordinates are read from a CSV file instead.

In [204]:
# Load the postal code -> latitude/longitude lookup table and preview it.
llframe = pd.read_csv(filepath_or_buffer="http://cocl.us/Geospatial_data")
llframe.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [205]:
# Attach the latitude/longitude of every postal code in df, looked up in llframe.
postalcodes = df['PostalCode'].tolist()

# Index the lookup table by postal code so each code is resolved by one indexed
# lookup instead of two full boolean-mask scans per row. If a code is repeated
# in llframe, its first row wins.
coords = llframe.drop_duplicates(subset='Postal Code').set_index('Postal Code')

# Stop with a readable message (rather than a bare IndexError) when llframe
# has no entry for one of the postal codes.
missing_codes = df.loc[~df['PostalCode'].isin(coords.index), 'PostalCode'].unique().tolist()
if missing_codes:
    raise KeyError(
        "No coordinates found for postal codes: " + ", ".join(map(str, missing_codes))
    )

# map() aligns on df's own index, so the new columns line up row for row.
df['Latitude'] = df['PostalCode'].map(coords['Latitude'])
df['Longitude'] = df['PostalCode'].map(coords['Longitude'])

# Kept as plain lists in case other cells use these names.
latitudes  = df['Latitude'].tolist()
longitudes = df['Longitude'].tolist()
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
