# IBM Applied Data Science Capstone Course by Coursera


Week-3 Part-1,2

* Build a dataframe of the postal code of each neighborhood, along with the borough name and neighborhood name, in the city of Toronto.
* Get the geographical coordinates of the neighborhoods in Toronto.






# 1.  Importing Libraries

In [65]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# json_normalize has lived in the top-level pandas namespace since pandas 1.0,
# and the old pandas.io.json location is not importable in recent releases.
# Try the current location first and fall back for older installations.
try:
    from pandas import json_normalize # transform JSON file into a pandas dataframe
except ImportError:
    from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans


from bs4 import BeautifulSoup # library to parse HTML and XML documents

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


# 2. Scraping the Wikipedia data into a data frame using Beautiful Soup.

In [66]:
# Send the GET request to retrieve the Wikipedia page containing the Toronto neighbourhood details.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status() stops
# the notebook here on an HTTP error instead of letting an error page be parsed later.

response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
dataset = response.text

In [67]:
# Build a BeautifulSoup object from the downloaded HTML text,
# using Python's built-in 'html.parser' backend.

soup = BeautifulSoup(markup=dataset, features='html.parser')

In [68]:
# Three independent, initially empty lists: one per column that will be scraped.
postalCode, borough, neighborhood = [], [], []

# Now that the Wikipedia page has been scraped successfully, we need to locate the specific data of interest: the postal code table.

   Hence we use BeautifulSoup to retrieve that table specifically.

# For each row of the table, find all the table data cells, i.e. make every column's data for that row available.

   
   



In [69]:
# Walk every row of the first table on the page and copy its first three cells
# into the postalCode / borough / neighborhood lists.

# Empty the lists first so that re-running this cell does not append duplicate rows.
postalCode.clear()
borough.clear()
neighborhood.clear()

table = soup.find('table')
if table is None:
    # Fail with a clear message rather than an AttributeError on None.
    raise ValueError("No <table> element was found in the downloaded page.")

for rows in table.find_all('tr'):
    cellData = rows.find_all('td')
    # Skip rows that do not have at least three <td> cells; indexing a shorter
    # row below would raise IndexError.
    if len(cellData) < 3:
        continue
    # Strip surrounding whitespace/newlines from every field, not only the last one,
    # so later string comparisons such as == "Not assigned" behave consistently.
    postalCode.append(cellData[0].text.strip())
    borough.append(cellData[1].text.strip())
    neighborhood.append(cellData[2].text.strip())
        
        
        

In [70]:
# Assemble the three scraped lists into a single data frame, one column per list.
toronto_columns = {
    "PostalCode": postalCode,
    "Borough": borough,
    "Neighborhood": neighborhood,
}
toronto_DF = pd.DataFrame(data=toronto_columns)

toronto_DF.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# 3. Dropping the rows whose borough is 'Not assigned'.

In [71]:
# Keep only the rows whose Borough is not "Not assigned", then renumber the index from 0.
has_borough = toronto_DF["Borough"].ne("Not assigned")
toronto_DF = toronto_DF.loc[has_borough].reset_index(drop=True)
toronto_DF.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


# 4. Group neighborhoods that share the same postal code and borough

In [72]:
def _join_with_commas(values):
    """Concatenate the given strings into one comma-separated string."""
    return ",".join(values)


# One output row per (PostalCode, Borough) pair; the remaining column values
# of each group are merged into a single comma-separated string.
postal_groups = toronto_DF.groupby(by=["PostalCode", "Borough"], as_index=False)
toronto_grouped = postal_groups.agg(_join_with_commas)

toronto_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# 5. For Neighborhood="Not assigned", make the value the same as Borough

In [73]:
# Replace every Neighborhood equal to "Not assigned" with that row's Borough.
# Writing to the row object yielded by iterrows() is not guaranteed to change the
# underlying frame (the row may be a copy), so the assignment is done directly on
# the frame through a boolean mask instead.
not_assigned = toronto_grouped["Neighborhood"] == "Not assigned"
toronto_grouped.loc[not_assigned, "Neighborhood"] = toronto_grouped.loc[not_assigned, "Borough"]

toronto_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# 6. Check that the result matches the sample data required by the assignment

In [74]:
# Build a test dataframe holding the rows for a fixed sample of postal codes,
# in the order the codes are listed.

column_names = ["PostalCode", "Borough", "Neighborhood"]

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every
# iteration; collect the per-code selections and concatenate them once instead.
# A code with no match contributes an empty selection and is simply absent.
selected_rows = [toronto_grouped[toronto_grouped["PostalCode"] == postalcode] for postalcode in test_list]

# reindex keeps the result limited to (and ordered by) the expected columns.
test_df = pd.concat(selected_rows, ignore_index=True).reindex(columns=column_names)

test_df.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Woodbine Gardens,Parkview Hill"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside


# 7. Printing the shape of the resulting dataset.

In [75]:
# Display the (number of rows, number of columns) tuple of the grouped table.
toronto_grouped.shape

(103, 3)

# 8. Getting the geospatial coordinates from the CSV file provided by Coursera.

In [76]:
import os

# NOTE(review): the working directory is switched to a hard-coded absolute path,
# so this cell only runs on a machine that has this folder — confirm before sharing.
os.chdir("E:/capstone/week 3")

# Load the coordinates file and rename its "Postal Code" column to "PostalCode",
# the column name used by the postal code table built earlier in this notebook.
coordinates = pd.read_csv("Geospatial_Coordinates.csv").rename(columns={"Postal Code": "PostalCode"})

coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# 9. Merging the two tables so that latitudes and longitudes could be added.

In [77]:
# Left-join the coordinates onto the grouped table using PostalCode as the key,
# so every row of the grouped table is kept.
toronto_grouped = pd.merge(toronto_grouped, coordinates, how="left", on="PostalCode")
toronto_grouped.head()


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# 10. Finally, check that the coordinates were added as required by the Coursera assignment

In [78]:
# Build a test dataframe holding the rows (now including coordinates) for a
# fixed sample of postal codes, in the order the codes are listed.
columns_names = ["PostalCode", "Borough", "Neighborhood", "Latitude", "Longitude"]

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every
# iteration; collect the per-code selections and concatenate them once instead.
# A code with no match contributes an empty selection and is simply absent.
selected_rows = [toronto_grouped[toronto_grouped["PostalCode"] == postalcode] for postalcode in test_list]

# reindex keeps the result limited to (and ordered by) the expected columns.
test_df = pd.concat(selected_rows, ignore_index=True).reindex(columns=columns_names)

test_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
