# <span style="color:blue">Question 1</span>
### <span style="color:blue">Use the BeautifulSoup package or any other way you are comfortable with to transform the data in the table on the Wikipedia page into the above pandas dataframe</span>

In [61]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [62]:
import requests
# Download the raw HTML of the Wikipedia page that lists the "M" postal codes.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces an HTTP error here instead of letting an error
# page be parsed later as if it contained the postal-code table.
wiki_response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
wiki_response.raise_for_status()
webseitdata = wiki_response.text

In [63]:
# Build a BeautifulSoup tree from the downloaded page using the "html.parser" backend
from bs4 import BeautifulSoup
soup = BeautifulSoup(markup=webseitdata, features="html.parser")

In [64]:
# One empty accumulator per table column (postal code, borough, neighborhood)
postalCodeList, boroughList, neighborhoodList = [], [], []

In [65]:
# Walk every <tr> of the first <table> on the page and copy its first three
# <td> cells into the accumulator lists; rows without any <td> are skipped.
for table_row in soup.find('table').find_all('tr'):
    tds = table_row.find_all('td')
    if not tds:
        continue
    postalCodeList.append(tds[0].text)
    boroughList.append(tds[1].text)
    neighborhoodList.append(tds[2].text)

In [66]:
# Strip trailing newline characters from every scraped value
postalCodeList = [code.rstrip("\n") for code in postalCodeList]
boroughList = [borough.rstrip("\n") for borough in boroughList]
neighborhoodList = [hood.rstrip("\n") for hood in neighborhoodList]

In [67]:
# Assemble the three parallel lists into one table, one column per list
table_columns = {
    "PostalCode": postalCodeList,
    "Borough": boroughList,
    "Neighborhood": neighborhoodList,
}
df = pd.DataFrame(table_columns)

In [68]:
# Preview the first rows of the scraped table
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### 3. Process only the cells that have an assigned borough: drop every row whose borough is "Not assigned"

In [69]:
# Keep only rows whose borough was assigned, then renumber the index from 0
has_borough = df["Borough"].ne("Not assigned")
df_clean1 = df.loc[has_borough].reset_index(drop=True)

In [70]:
# Preview the table after the "Not assigned" boroughs were removed
df_clean1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### 4. Combine rows that share a postal code into a single row, with the neighborhoods separated by commas

In [71]:
# Collapse rows that share a postal code and borough into a single row whose
# Neighborhood is the comma-separated list of the original neighborhood names
toronto_df_grouped = (
    df_clean1
    .groupby(["PostalCode", "Borough"], as_index=False)
    .agg({"Neighborhood": ", ".join})
)
toronto_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### 5. For Neighborhood="Not assigned", make the value the same as Borough

In [72]:
# Where the neighborhood is the placeholder "Not assigned", fall back to that
# row's borough name. Series.mask substitutes row by row (aligned on the index).
# Series.replace is not suitable here: a Series passed as `value` is treated as
# a dict-like value, which current pandas rejects for a scalar `to_replace`.
neighborhood_missing = toronto_df_grouped["Neighborhood"] == "Not assigned"
toronto_df_grouped["Neighborhood"] = toronto_df_grouped["Neighborhood"].mask(
    neighborhood_missing, toronto_df_grouped["Borough"]
)

In [73]:
# Preview the grouped table after the neighborhood fallback was applied
toronto_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [74]:
### 6. Check whether the data looks the same as the required list

In [75]:
# Postal codes to spot-check, in the order they should be displayed
test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Look all codes up with a single inner merge instead of growing a frame row by
# row with DataFrame.append (removed in pandas 2.0). An inner merge keeps the
# order of the left-hand keys (test_list) and skips codes absent from the data.
test_df = (
    pd.DataFrame({"PostalCode": test_list})
    .merge(toronto_df_grouped, how="inner", on="PostalCode")
    .loc[:, list(toronto_df_grouped.columns)]  # same column order as the source table
)

In [76]:
# Display the spot-check rows
test_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


### 7. Finally, print the number of rows of the cleaned dataframe

In [77]:
# (number of rows, number of columns) of the cleaned, grouped table
toronto_df_grouped.shape

(103, 3)

# <span style="color:blue">Question 2</span>
### <span style="color:blue">Use the Geocoder package or the csv file to create dataframe with longitude and latitude values</span> 

#### We will be using a csv file that has the geographical coordinates of each postal code: http://cocl.us/Geospatial_data

In [78]:
# Load the CSV of geographical coordinates per postal code directly from its URL
geo_url="http://cocl.us/Geospatial_data"
geo_data=pd.read_csv(geo_url)

In [79]:
# Rename the three columns positionally so the key column is called "PostalCode",
# the same name the scraped table uses (needed for the merge on "PostalCode").
# NOTE(review): assumes the CSV's columns arrive in postal code, latitude, longitude order — confirm
geo_data.columns=['PostalCode', 'Latitude', 'Longitude']

In [80]:
# Attach coordinates to each postal code; the inner join keeps only codes present in both tables
toronto_df2 = toronto_df_grouped.merge(geo_data, on="PostalCode", how="inner")

In [81]:
# Preview the table with the Latitude and Longitude columns merged in
toronto_df2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# <span style="color:blue">Question 3</span>
### <span style="color:blue">Explore and cluster the neighborhoods in Toronto</span>

In [82]:
# Report the number of distinct boroughs and the number of rows in the merged table
borough_count = len(toronto_df2['Borough'].unique())
row_count = toronto_df2.shape[0]
summary_template = 'The dataframe has {} boroughs and {} neighborhoods.'
print(summary_template.format(borough_count, row_count))

The dataframe has 10 boroughs and 103 neighborhoods.


In [83]:
# Nominatim converts an address into latitude and longitude values
from geopy.geocoders import Nominatim 
import geopy

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [84]:
# Look up the latitude and longitude of Toronto; they are used as the map centre

address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ON")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 43.6534817, -79.3839347.


In [85]:
# Draw a folium map centred on Toronto and add one circle marker per row of toronto_df2
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in toronto_df2[marker_columns].itertuples(index=False, name=None):
    # Popup text for the marker: "<neighborhood>, <borough>"
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Last expression of the cell so the notebook renders the map
map_toronto