# IBM Data Science Capstone Project

# Outline
1. Import data from website and clean the dataframe
2. Get the latitude and longitude data from the CSV file
3. Combine the two dataframes into one
4. Visualize the map using folium

In [1]:
# pandas is needed to read the HTML tables
from urllib.request import urlretrieve

import numpy as np
import pandas as pd


In [2]:
# Wikipedia page that lists the Canadian postal codes starting with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# pandas parses every HTML table on the page into its own DataFrame
# (the page had 3 tables when this notebook was written)
df_list = pd.read_html(url)

# the first table on the page is the postal-code table we want
df = df_list[0]
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


# Clean the data frame
1.    The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
2.    Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
3.    More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11  in the above table.

4.    If a cell has a borough but a Not assigned  neighborhood, then the neighborhood will be the same as the borough.
5.    Clean your Notebook and add Markdown cells to explain your work and any assumptions you are making.
6.    In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.

In [3]:
# keep only the postal codes whose borough is not the "Not assigned" placeholder
has_borough = df['Borough'].ne("Not assigned")
df = df[has_borough]
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [4]:
## Count the number of unique postal codes

In [5]:
# number of distinct postal codes (a missing value, if present, counts as one value)
df['Postal Code'].nunique(dropna=False)

103

## Check for "Not assigned" values in the Neighbourhood column

In [6]:
# per-column count of the rows whose neighbourhood is still "Not assigned"
unassigned_neighbourhood = df['Neighbourhood'].eq('Not assigned')
print(df[unassigned_neighbourhood].count())

Postal Code      0
Borough          0
Neighbourhood    0
dtype: int64


In [7]:
df.shape #no repeated postal code in the table: the row count equals the unique postal-code count above

(103, 3)

# Get the latitude and the longitude coordinates of each neighborhood. 

In [8]:
#!pip install geocoder

In [9]:
#import geocoder # import geocoder

# initialize your variable to None
#lat_lng_coords = None
#postal_code='M5G'
# loop until you get the coordinates
#while(lat_lng_coords is None):
#  g = geocoder.google('29650, greer, SC')
#  lat_lng_coords = g.latlng
#
#print('{}, Toronto, Ontario'.format(postal_code))
#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]
#lat_lng_coords

In [10]:
# Download the postal-code coordinates and keep a local copy of the CSV.
# urlretrieve replaces the `!wget -q` shell call: the cell no longer needs the
# wget binary, and a failed download raises an error instead of being silenced.
urlretrieve('https://cocl.us/Geospatial_data', 'Geospatial_Coordinates.csv')

# one row per postal code with its latitude and longitude
df_co = pd.read_csv('Geospatial_Coordinates.csv')

In [11]:
# preview the first rows of the coordinates table
df_co.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# (rows, columns) of the coordinates table
df_co.shape

(103, 3)

# Combine the two dataframes into one on the shared "Postal Code" column

In [13]:
# left join: every row of the postal-code table is kept and gains the
# Latitude/Longitude of the matching 'Postal Code' in the coordinates table
df_final = df.merge(df_co, how='left', on='Postal Code')
df_final

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [14]:
# confirm that the merge result is a pandas DataFrame
type(df_final)

pandas.core.frame.DataFrame

# Map the Toronto area

In [15]:
import folium

In [16]:
# create the base map of Toronto, centred on a fixed latitude/longitude pair
map_Toronto = folium.Map(
    location=[43.753259, -79.329656],
    zoom_start=10,
)


In [17]:
# Add one circle marker per postal code to the map.
# Columns are addressed by name rather than by position, so the loop keeps
# working if the column order of df_final changes.
# Rows without coordinates (possible after the left merge) are skipped,
# because folium refuses NaN locations.
located = df_final.dropna(subset=['Latitude', 'Longitude'])

for borough, neighbourhood, latitude, longitude in zip(
    located['Borough'],
    located['Neighbourhood'],
    located['Latitude'],
    located['Longitude'],
):
    # popup text shown when a marker is clicked: "<borough>, <neighbourhood>"
    popup_text = '{}, {}'.format(borough, neighbourhood)
    # parse_html=True escapes the text so characters such as the apostrophe
    # in "Queen's Park" cannot break the generated HTML/JavaScript
    popup = folium.Popup(popup_text, parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_Toronto)

map_Toronto