# Coursera_Capstone - Week 3 Submission

## Problem 1

### Install Required Libraries

In [None]:
"""
!pip install beautifulsoup4
!pip install lxml
!pip install html5lib
!pip install geocoder
!pip install wget
!pip install geopy
"""

### Import Libraries

In [1]:
from io import StringIO

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

### Download `index.html` From Wikipedia Using `wget`

In [None]:
# Shell escape: save the Wikipedia postal-code page as index.html in the working directory.
!wget -O index.html https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

### Read The `.html` File and Extract the `<table>` From It

In [2]:
# Parse the downloaded page and keep its first <table> element as an HTML string.
# The context manager closes the file handle as soon as parsing is finished.
with open('index.html', encoding="utf-8") as fp:
    soup = BeautifulSoup(fp, 'html5lib')
table = soup.table.prettify()

### Create a Pandas Dataframe for the `<table>`

In [3]:
# pd.read_html returns a list of DataFrames (one per table it finds); keep the
# first entry. The markup is wrapped in StringIO so it is passed as a file-like
# object, because handing read_html a literal HTML string is deprecated in
# recent pandas releases.
tables = pd.read_html(StringIO(table), na_values=[])
df = tables[0]

### Process the Dataframe with the Following Requirements:

1. The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
2. Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
3. More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.
4. If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
5. Clean your Notebook and add Markdown cells to explain your work and any assumptions you are making.
6. In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.

#### Rename `Postal code` to `PostalCode`.

In [4]:
# Give the postal-code column the name required by the assignment.
df = df.rename(columns={'Postal code': 'PostalCode'})

#### Remove any "Not assigned" from the Borough column.

In [5]:
# Keep only rows whose borough is assigned. The explicit .copy() makes the result
# an independent frame, so later modifications of `df` do not run into pandas'
# view-versus-copy (SettingWithCopy) warnings.
df = df[df.Borough != 'Not assigned'].copy()

#### Search for any "Not assigned" values in the Neighborhood column and replace each with its corresponding Borough.

In [6]:
# A neighborhood counts as missing when it is a real NaN, an empty string, or one
# of the placeholder strings 'NaN' / 'Not assigned'.
neighborhood_missing = (
    df['Neighborhood'].isna()
    | df['Neighborhood'].isin(['', 'NaN', 'Not assigned'])
)

# Report the row label of every affected entry.
for index in df.index[neighborhood_missing]:
    print('Found one at index: {}'.format(index))

# Fall back to the borough name for those rows. Building a new frame with
# assign() avoids DataFrame.set_value, which was removed in pandas 1.0.
df = df.assign(
    Neighborhood=df['Neighborhood'].where(~neighborhood_missing, df['Borough'])
)

#### Merge any duplicate postal codes into one.

**Note: Wikipedia has already merged duplicate postal codes into single rows, with the neighborhoods delimited by `"/"`. We will therefore replace `"/"` with `","`.**

In [7]:
# Flag rows whose postal code already appeared in an earlier row. The check is
# restricted to PostalCode because a bare df.duplicated() only flags rows that are
# identical in every column, which would miss a postal code listed twice with
# different neighborhoods.
duplicates = df.duplicated(subset=['PostalCode'])
duplicates  # all False means every postal code appears exactly once

2      False
3      False
4      False
5      False
6      False
       ...  
160    False
165    False
168    False
169    False
178    False
Length: 103, dtype: bool

In [8]:
# Swap the " /" separator for a comma, touching only the Neighborhood column.
df = df.replace({'Neighborhood': {r' /': ','}}, regex=True)

#### Reset the index to cleanup dataframe.

In [9]:
# Renumber the rows from 0 and discard the old index labels.
df = df.reset_index(drop=True)

#### Output the shape of the dataframe

In [10]:
# (number of rows, number of columns) of the processed dataframe
df.shape

(103, 3)

### Final Output

In [11]:
# Display the processed dataframe
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


## Problem 2

### Download `Geospatial_Coordinates.csv` From https://cocl.us/Geospatial_data

In [None]:
# Shell escape: save the coordinates file as Geospatial_Coordinates.csv in the working directory.
!wget -O Geospatial_Coordinates.csv https://cocl.us/Geospatial_data

### Read the `.csv` file into a Pandas DataFrame.

In [12]:
# Load the downloaded coordinates file into a dataframe.
coordinates_csv = 'Geospatial_Coordinates.csv'
coord = pd.read_csv(coordinates_csv)

### Use `geocoder` to obtain latitude and longitude of our postal codes.
**Note: I had an issue executing the code below when testing it with a random postal code from our DataFrame.**

In [None]:
"""
import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format('M1B'))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]"""


### Since `geocoder` was unable to execute and stalled at `Executing cell`, the `Geospatial_Coordinates.csv` file will be used to merge the coordinates with our DataFrame in lieu of `geocoder`.

#### Rename `Postal Code` to `PostalCode` for matching columns to join.

In [13]:
# Align the key column's name with the one used in `df` so the two frames can be joined.
coord = coord.rename(columns={'Postal Code': 'PostalCode'})

#### Merge

In [14]:
# Left join: keep every row of `df` and attach the matching coordinates by postal code.
result = df.merge(coord, on='PostalCode', how='left')

### Final Product

In [15]:
# Display the merged dataframe with the coordinate columns attached
result

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business reply mail Processing CentrE,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


## Problem 3

### Import Libraries

In [16]:
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim
import folium

### Obtain geographical location for Toronto, Canada

In [17]:
# Look up the coordinates of the address with Nominatim; the maps below are
# centered on them.
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, Canada are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, Canada are 43.6534817, -79.3839347.


### Create a map visual of postal codes with `Folium`.

In [18]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row of `result`, with a "neighborhood, borough" popup
for lat, lng, borough, neighborhood in zip(result['Latitude'], result['Longitude'], result['Borough'], result['Neighborhood']):
    popup_text = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(popup_text, parse_html=True)
    # parse_html is an option of folium.Popup, not of CircleMarker, so it is
    # only passed to the popup above.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)
map_toronto

### Import library required to cluster data.

In [19]:
from sklearn.cluster import KMeans

### Group the postal codes into 5 clusters

In [20]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = result.drop(['Neighborhood','Borough', 'PostalCode'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:] 

array([4, 4, 2, 0, 2, 1, 3, 4, 4, 2, 0, 1, 3, 4, 4, 2, 2, 1, 3, 4, 2, 2,
       3, 4, 2, 2, 3, 0, 0, 4, 2, 2, 3, 4, 0, 4, 2, 2, 4, 0, 0, 2, 2, 2,
       4, 0, 1, 4, 2, 1, 1, 3, 0, 1, 2, 0, 1, 1, 4, 0, 1, 0, 0, 1, 1, 4,
       0, 0, 2, 1, 1, 4, 0, 0, 2, 2, 1, 1, 3, 2, 2, 1, 4, 2, 2, 3, 2, 2,
       1, 1, 4, 2, 2, 1, 1, 3, 2, 2, 1, 2, 4, 1, 1])

In [21]:
# One label per clustered row: the length should equal the number of rows in `result`
kmeans.labels_.shape

(103,)

### Insert cluster results into our DataFrame.

In [22]:
# DataFrame.insert raises ValueError when the column already exists, so drop any
# labels left over from a previous run of this cell before inserting the new ones.
if 'Cluster Labels' in result.columns:
    result = result.drop(columns='Cluster Labels')
result.insert(0, 'Cluster Labels', kmeans.labels_)

### Render a new map with the postal code clusters color coded.

In [23]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(result['Latitude'], result['Longitude'], result['Neighborhood'], result['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters