#### I have imported all necessary libraries 

In [32]:
from bs4 import BeautifulSoup
import requests 
import csv
import pandas as pd



####  First step was to use the GET request to scrape code from Wikipedia page. I used the BeautifulSoup library to pull data out of the HTML version of the webpage. I wrote the data obtained in a csv file which I've subsequently read into a DataFrame.

In [33]:
# using GET request to scrape code from Wikipedia page
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
# using BeautifulSoup library to pull data out of the HTML version of the webpage
soup = BeautifulSoup(source.content, 'html.parser')
tables = soup.find('table', class_='wikitable sortable')
rows = tables.select('tr')
header = [th.text.rstrip() for th in rows[0].find_all('th')]
# write the data obtained in a csv file
with open('output.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(header)
    for row in rows[1:]:
         data = [th.text.rstrip() for th in row.find_all('td')]
         writer.writerow(data)
# read the created csv file into a DataFrame        
csv_file='output.csv'
df = pd.read_csv(csv_file)
# display the first 5 rows
df.head(5)




Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### I have 'cleaned' the data by following the steps explained in the comments

In [34]:
# set columns to PostalCode, Borough, and Neighborhood
df.rename(columns={'Postcode':'PostalCode', 'Neighbourhood':'Neighborhood'}, inplace=True)

# only consider rows where the Borough is assigned
df = df[df.Borough != 'Not assigned']

# group the data by Postal Code, aggregate the data and use the join function to put together the string values in column 'Neighborhood'
df=df.groupby("PostalCode").agg(lambda x:','.join(set(x)))


# I've used a conditional selection with boolean arrays to replace the cell 
# where Neighborhoods are not assigned, with the value shown in the Borough cell 
df.loc[df['Neighborhood']=="Not assigned",'Neighborhood']=df.loc[df['Neighborhood']=="Not assigned",'Borough']
        

    
df.head(5)


Unnamed: 0_level_0,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern,Rouge"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Morningside,Guildwood,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


#### I've used the .shape method to print the number of rows in my dataframe

In [35]:
df.shape

(103, 2)

####  Read the csv file which contains the geographical coordinates of each postal code

In [36]:
new_df=pd.read_csv('http://cocl.us/Geospatial_data')
new_df.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Copying the Longitude, Latitude columns from the new_df into the existing DataFrame

In [37]:
df['Latitude']=new_df['Latitude'].values
df['Longitude']=new_df['Longitude'].values

df.head(5)

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
M1E,Scarborough,"Morningside,Guildwood,West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [38]:
import folium
from geopy.geocoders import Nominatim
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

#### Importing the libraries needed for the new exercise

#### I have used geopy library to get the latitude and longitude values of Toronto City

In [39]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


#### I have created the map of Toronto 

In [40]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=14)

for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### Defining Foursquare Credentials and Version

In [41]:
CLIENT_ID = 'S4KPCJKZIODIUMGINZHTYKMOHV2I2C0N1PWL4BRXKPCZGWE5' 
CLIENT_SECRET = 'MTFOEKLBB2GMG1HX0BGC2COQJJOXOLCJDCLYHJPLZ0VBZSOU' 
VERSION = '20180605' 

#### I have created the function to EXPLORE all neighborhoods in Toronto

In [42]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)

#### I have created anew DataFrame which runs the function above 

In [43]:
toronto_venues = getNearbyVenues(names=df['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

Malvern,Rouge
Highland Creek,Rouge Hill,Port Union
Morningside,Guildwood,West Hill
Woburn
Cedarbrae
Scarborough Village
Ionview,Kennedy Park,East Birchmount Park
Golden Mile,Clairlea,Oakridge
Cliffside,Scarborough Village West,Cliffcrest
Cliffside West,Birch Cliff
Scarborough Town Centre,Wexford Heights,Dorset Park
Wexford,Maryvale
Agincourt
Sullivan,Clarks Corners,Tam O'Shanter
Agincourt North,Steeles East,L'Amoreaux East,Milliken
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview,Oriole,Henry Farm
Bayview Village
Silver Hills,York Mills
Willowdale,Newtonbrook
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Don Mills South,Flemingdon Park
Downsview North,Wilson Heights,Bathurst Manor
Northwood Park,York University
CFB Toronto,Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens,Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
Riverdale,The Danforth West
The Beaches West,Indi

In [44]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")


toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### I have analized each neighborhood using onehot encoding 

In [45]:
num_top_venues = 5
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,Richmond,King----
                 venue  freq
0          Coffee Shop  0.06
1                 Café  0.05
2                  Bar  0.04
3           Steakhouse  0.04
4  American Restaurant  0.04


----Agincourt----
                venue  freq
0      Sandwich Place  0.25
1              Lounge  0.25
2      Breakfast Spot  0.25
3  Chinese Restaurant  0.25
4         Yoga Studio  0.00


----Agincourt North,Steeles East,L'Amoreaux East,Milliken----
                venue  freq
0          Playground  0.33
1    Asian Restaurant  0.33
2                Park  0.33
3         Yoga Studio  0.00
4  Mexican Restaurant  0.00


----Alderwood,Long Branch----
            venue  freq
0     Pizza Place   0.2
1    Skating Rink   0.1
2  Sandwich Place   0.1
3    Dance Studio   0.1
4             Pub   0.1


----Bayview Village----
                 venue  freq
0  Japanese Restaurant  0.25
1                 Bank  0.25
2   Chinese Restaurant  0.25
3                 Café  0.25
4    Mobile Phone Shop  0.00

#### Printing the top 5 most common venues of each neighborhood

#### Create a new dataframe to display the top 10 venues for each neighborhood.

In [46]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,Richmond,King",Coffee Shop,Café,Steakhouse,Thai Restaurant,American Restaurant,Bar,Hotel,Cosmetics Shop,Gym,Burger Joint
1,Agincourt,Lounge,Breakfast Spot,Sandwich Place,Chinese Restaurant,Electronics Store,Eastern European Restaurant,Empanada Restaurant,Dumpling Restaurant,Drugstore,Department Store
2,"Agincourt North,Steeles East,L'Amoreaux East,M...",Park,Asian Restaurant,Playground,Women's Store,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run
3,"Alderwood,Long Branch",Pizza Place,Gym,Pharmacy,Pub,Sandwich Place,Pool,Dance Studio,Skating Rink,Coffee Shop,Drugstore
4,Bayview Village,Chinese Restaurant,Café,Japanese Restaurant,Bank,Women's Store,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop


#### I have used kMeans to CLUSTER data into 4 clusters

In [47]:
kclusters = 4
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

kmeans.labels_[0:10] 

array([1, 1, 0, 1, 1, 1, 1, 1, 0, 0], dtype=int32)

#### Creating a new dataframe which includes the clusters

In [50]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df

toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() 

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353,1.0,Fast Food Restaurant,Print Shop,Deli / Bodega,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,1.0,Bar,Construction & Landscaping,Women's Store,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
M1E,Scarborough,"Morningside,Guildwood,West Hill",43.763573,-79.188711,1.0,Electronics Store,Breakfast Spot,Mexican Restaurant,Tech Startup,Intersection,Medical Center,Rental Car Location,Pizza Place,Spa,Donut Shop
M1G,Scarborough,Woburn,43.770992,-79.216917,1.0,Coffee Shop,Korean Restaurant,Convenience Store,Women's Store,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1.0,Hakka Restaurant,Thai Restaurant,Bank,Bakery,Fried Chicken Joint,Caribbean Restaurant,Athletics & Sports,Lounge,Costume Shop,Falafel Restaurant


#### Vizualizing the clusters

In [53]:

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]


markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters


TypeError: list indices must be integers or slices, not float