## Importing Libraries

In [None]:
# To access system-specific parameters and functions
import sys
# A general-purpose array-processing package
import numpy as np
# A library to manage the file-related input and output operations
import io
#from IPython.display import Image

!pip install geocoder
import geocoder

# Library for data analysis
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Matplotlib and Associated Plotting Modules
import matplotlib.pyplot as plt
import matplotlib.colors as colors

# Library to Handle JSON Files
import json

# Library to Handle Requests
import requests

# installs geopy; comment this line out if geopy is already installed (e.g. from the Foursquare API lab)
!conda install -c conda-forge geopy --yes
# convert an address into latitude and longitude values
from geopy.geocoders import Nominatim

# Transform a JSON document into a flat pandas DataFrame.
# pandas >= 1.0 exposes json_normalize at the top level; the historical
# pandas.io.json location is only kept as a fallback for older installs.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

!conda install -c conda-forge scikit-learn
# import k-means from clustering stage
from sklearn.cluster import KMeans

# installs folium; comment this line out if folium is already installed (e.g. from the Foursquare API lab)
!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

!conda install -c conda-forge beautifulsoup4 --yes
from bs4 import BeautifulSoup

print('Libraries imported.')
%matplotlib inline

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: \ 

## Part 1) Create DataFrame from Wikipedia page

### Fetching the Data from Wikipedia and Creating a Table with it

In [None]:
# Read every HTML table found on Wikipedia's list of postal codes starting with "M".
read_url = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

# pd.read_html returns a list of DataFrames; the postal-code table is the first one.
df = read_url[0]

# Every postal code must be present exactly once; warn a single time otherwise.
if df["Postal Code"].duplicated().any():
    print("Attention: There is a duplicate in Postal Code!")

# Turn the "Not assigned" placeholder in the Borough column into a real missing value.
# Assigning the result back (instead of inplace=True on the selected column)
# guarantees that `df` itself is updated.
df["Borough"] = df["Borough"].replace("Not assigned", np.nan)

# Keep only rows with an assigned borough and renumber the rows from 0.
df_new = df.dropna(subset=["Borough"]).reset_index(drop=True)
df_new.head(5)

### Counting the number of "Not assigned" values that are left in the Neighbourhood column:

In [None]:
# There should be no "Not assigned" values left in the Neighbourhood column.
# Only the last expression of a cell is displayed, so the count is printed explicitly.
not_assigned_count = df_new["Neighbourhood"].isin(['Not assigned']).sum()
print("Remaining 'Not assigned' neighbourhoods:", not_assigned_count)
df_new.shape

## Part 2) Modify the created Dataframe

### Load the coordinates data and sort the dataframe by its postal code:

In [None]:
# Download the CSV that holds the coordinates for each postal code.
url = "https://cocl.us/Geospatial_data"
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as CSV.
response.raise_for_status()
s = response.content

# Parse the payload and order it by postal code, renumbering the rows from 0.
df_coords = pd.read_csv(io.StringIO(s.decode('utf-8')))
df_coords = df_coords.sort_values(by=["Postal Code"], ignore_index=True)
df_coords.head()

### Sort the DataFrame obtained from Wikipedia by its postal code as well:

In [None]:
# Order the Wikipedia-derived frame by postal code and renumber its rows from 0.
df_new = df_new.sort_values(by=["Postal Code"], ignore_index=True)
df_new.head(n=10)

### Checking if the two DataFrames are sorted the same way and if they have the same length:

In [None]:
# Compare the two postal-code columns element by element.
# np.array_equal is False when the lengths differ or when any position
# holds a different code, so both conditions in the message are really checked.
coords_codes = df_coords["Postal Code"].to_numpy()
wiki_codes = df_new["Postal Code"].to_numpy()

if np.array_equal(coords_codes, wiki_codes):
    print("The two dataframes are sorted in the same order and have the same length!")
else:
    print("The two dataframes are NOT sorted in the same order!!! Don't concate them of the coordinates will be mixed!!!")

### Drop the postal code column in df_coords and concatenate the two DataFrames:

In [None]:
# The postal code is already present in df_new, so remove it from the coordinates frame.
df_coords = df_coords.drop(columns=["Postal Code"])
df_coords.head()

In [None]:
# Cap the number of displayed rows, then put the coordinate columns
# next to the borough/neighbourhood columns (rows are matched by index).
pd.set_option("display.max_rows", 200)
df_final = pd.concat([df_new, df_coords], axis="columns")
df_final.head(103)

## Part 3) Exploring and clustering the neighborhoods in Toronto

### Creating a Map of Toronto with all the Places in our created DataFrame:

In [None]:
# Look up the coordinates of Toronto with the Nominatim geocoder.
address = 'Toronto'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; stop with a clear message
# instead of failing later with an AttributeError on `location.latitude`.
if location is None:
    raise ValueError("Could not geocode the address {!r}.".format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

In [None]:
# Base map centred on the coordinates found for Toronto.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of df_final, with "<neighbourhood>, <borough>" as popup text.
marker_rows = zip(
    df_final['Latitude'],
    df_final['Longitude'],
    df_final['Borough'],
    df_final['Neighbourhood'],
)
for point_lat, point_lng, borough_name, hood_name in marker_rows:
    popup = folium.Popup('{}, {}'.format(hood_name, borough_name), parse_html=True)
    marker = folium.CircleMarker(
        [point_lat, point_lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

### Focusing on the Downtown of Toronto:

In [None]:
# Keep only the rows whose borough is Downtown Toronto and renumber them from 0.
is_downtown = df_final['Borough'].eq('Downtown Toronto')
downtown_data = df_final.loc[is_downtown].reset_index(drop=True)
downtown_data.head(20)

In [None]:
# Look up the coordinates of Downtown Toronto with the Nominatim geocoder.
address = 'Downtown, Toronto'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; stop with a clear message
# instead of failing later with an AttributeError on `location.latitude`.
if location is None:
    raise ValueError("Could not geocode the address {!r}.".format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown, Toronto are {}, {}.'.format(latitude, longitude))

In [None]:
# Base map centred on the coordinates found for Downtown Toronto.
map_downtown = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of downtown_data, with "<neighbourhood>, <borough>" as popup text.
marker_rows = zip(
    downtown_data['Latitude'],
    downtown_data['Longitude'],
    downtown_data['Borough'],
    downtown_data['Neighbourhood'],
)
for point_lat, point_lng, borough_name, hood_name in marker_rows:
    popup = folium.Popup('{}, {}'.format(hood_name, borough_name), parse_html=True)
    marker = folium.CircleMarker(
        [point_lat, point_lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_downtown)

map_downtown

### Exploring the neighbourhood "Central Bay Street" in Downtown Toronto:

In [None]:
import os

# Foursquare credentials are read from the environment so that they never end
# up in the notebook source or its saved output. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded here has been published
# with the notebook and should be revoked/regenerated in the Foursquare console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20210228'  # Foursquare API version

if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        "Foursquare credentials are missing: set the FOURSQUARE_CLIENT_ID and "
        "FOURSQUARE_CLIENT_SECRET environment variables."
    )

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Only confirm that a secret is present; never echo it into the cell output.
print('CLIENT_SECRET:' + '*' * len(CLIENT_SECRET))



In [None]:
selected_address = "Central Bay Street"

# Search parameters for the Foursquare "explore" endpoint. They were referenced
# below without ever being defined, which raised a NameError on a fresh kernel.
LIMIT = 100   # maximum number of venues returned by the API
radius = 500  # search radius around the neighbourhood, in metres

# Locate the selected neighbourhood and fail with a clear message if it is absent.
matches = downtown_data.loc[downtown_data["Neighbourhood"] == selected_address]
if matches.empty:
    raise ValueError("Neighbourhood {!r} not found in downtown_data.".format(selected_address))

neighborhood_latitude = matches["Latitude"].iloc[0]
neighborhood_longitude = matches["Longitude"].iloc[0]

print('Latitude and longitude values of {} are {}, {}.'.format(selected_address, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

url_template = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
url = url_template.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT)

# Display the request URL with the credentials redacted so that the saved
# notebook output does not leak them; `url` itself holds the real request.
url_template.format(
    '<CLIENT_ID>',
    '<CLIENT_SECRET>',
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT)

In [None]:
# Query the Foursquare explore endpoint.
response = requests.get(url, timeout=30)
# Surface HTTP errors (e.g. invalid credentials) here rather than as a
# KeyError when the response body is inspected in later cells.
response.raise_for_status()
results = response.json()
results

In [None]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [None]:
# The venue records are the items of the first group in the API response.
venues = results['response']['groups'][0]['items']
if not venues:
    raise ValueError("Foursquare returned no venues for the selected neighbourhood.")

# Flatten the nested JSON into one column per leaf field
# (pd.json_normalize is the supported location of this function).
nearby_venues = pd.json_normalize(venues)

# Keep only the columns of interest; .copy() makes the result an independent
# frame so the column assignment below does not write into a slice.
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns].copy()

# Replace the list of categories of each row with the name of its first category.
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Strip the "venue." / "venue.location." prefixes from the column names.
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

print(len(nearby_venues))
nearby_venues.head(10)

In [None]:
# Report how many venues Foursquare returned for the selected neighbourhood.
venue_count = len(nearby_venues)
print(f"{venue_count} venues in the area of the '{selected_address}' neighbourhood have been reported from foursquare")

### Mark all the retrieved venues in the neighbourhood "Central Bay Street":

In [None]:
# Base map centred on the Central Bay Street neighbourhood.
map_nearby_venues = folium.Map(location=[neighborhood_latitude, neighborhood_longitude], zoom_start=16)

# One circle marker per venue, with "<name>, <category>" as popup text.
venue_rows = zip(
    nearby_venues['lat'],
    nearby_venues['lng'],
    nearby_venues['name'],
    nearby_venues['categories'],
)
for venue_lat, venue_lng, venue_name, venue_category in venue_rows:
    popup = folium.Popup('{}, {}'.format(venue_name, venue_category), parse_html=True)
    marker = folium.CircleMarker(
        [venue_lat, venue_lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_nearby_venues)

map_nearby_venues

### NOTICE: The venues are located along either Yonge Street or College Street. <br> Let's try to cluster these venues!
 

In [None]:
# Two-column matrix of venue coordinates: column 0 is latitude, column 1 is longitude.
coordinate_columns = [nearby_venues[name].to_numpy() for name in ("lat", "lng")]
feature_matrix = np.column_stack(coordinate_columns)
print(len(feature_matrix))
feature_matrix[:10]

In [None]:
# Split the venues into two clusters based on their coordinates.
# A fixed random_state makes the centroid initialisation, and therefore the
# resulting labels, reproducible across kernel restarts.
k_means = KMeans(init="k-means++", n_clusters=2, n_init=20, random_state=0)
k_means.fit(feature_matrix)

# Cluster index assigned to each venue, and the coordinates of each centroid.
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_

In [None]:
# Initialize the figure and its single axes with the specified dimensions.
fig, ax = plt.subplots(figsize=(15, 10))

# One colour per cluster, taken from the Spectral colour map.
# (Named cluster_colors so the imported matplotlib.colors module is not overwritten.)
n_clusters_found = len(k_means_cluster_centers)
cluster_colors = plt.cm.Spectral(np.linspace(0, 1, n_clusters_found))

# Loop over the clusters and plot their datapoints and centroid.
# feature_matrix holds latitude in column 0 and longitude in column 1;
# longitude goes on the x axis so the plot has the same orientation as the map.
for k, col in zip(range(n_clusters_found), cluster_colors):

    # Boolean mask that is True for the datapoints assigned to cluster k.
    my_members = (k_means_labels == k)

    # Centroid of cluster k as (latitude, longitude).
    cluster_center = k_means_cluster_centers[k]

    # Plot the datapoints of the cluster as dots in the cluster colour.
    ax.plot(feature_matrix[my_members, 1], feature_matrix[my_members, 0],
            linestyle='none', markerfacecolor=col, markeredgecolor='w', marker='.', markersize=10)

    # Plot the centroid in the same colour, but with a darker outline.
    ax.plot(cluster_center[1], cluster_center[0], 'o', markerfacecolor=col, markeredgecolor='k', markersize=10)

# Title and axis labels so the figure can be read on its own.
ax.set_title('KMeans')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')

# Show the plot.
plt.show()

### Successfully grouped the venues in the 'Central Bay Street' neighbourhood into two clusters