# Capstone Week 3 part 1: Wikipedia Web Scraper

Let's start by scraping our columns from Wikipedia's table:

In [1]:
import os
import urllib
import urllib.error
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup

# Page holding the postal-code table that is scraped below.
webpage = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Each table row contributes three <td> cells, in the order
# postal code, borough, neighborhood.
TABLE_COLUMNS = 3
# Only the first 540 cells (180 rows of 3) are read; presumably the remaining
# <td> elements on the page fall outside the postal-code table -- re-check
# this number if the page layout changes.
TABLE_CELLS = 540

try:
    # The context manager closes the HTTP response once it has been parsed.
    with urllib.request.urlopen(webpage) as doc:
        soup = BeautifulSoup(doc, 'html.parser')
except urllib.error.URLError as err:
    # Fail loudly instead of calling quit(): only genuine connection problems
    # are reported as such, and the notebook kernel stays alive.
    raise ConnectionError('Unable to connect to page, please try again') from err

# Stripped text of every cell that is read, in document order.
cells = [cell.get_text().strip() for cell in soup.find_all('td')[:TABLE_CELLS]]

# De-interleave the flat cell list into its three columns.
PostalCode = cells[0::TABLE_COLUMNS]
Borough = cells[1::TABLE_COLUMNS]
Neighborhood = cells[2::TABLE_COLUMNS]



Now that the columns are built, let's go ahead and build the DataFrame.

In [2]:
# Assemble the three scraped columns into a single table in one step.
scraped_columns = {
    'PostalCode': PostalCode,
    'Borough': Borough,
    'Neighborhood': Neighborhood,
}
df = pd.DataFrame(scraped_columns)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


Now we clean it up. It is important to note that some of the tasks are already done for us by 
Wikipedia; the assignment appears to be out of date. This means that the final shape may not match
what the course wants.

In [3]:
# Remove rows whose borough is 'Not assigned', renumber the remaining rows,
# and turn the '/' separators in the neighborhood lists into commas.
#
# The result is built as a new frame rather than edited in place: writing
# through `.values` of a filtered slice only takes effect while pandas hands
# back a writable view of the column, which is not guaranteed.
df_clean = (
    df.loc[df['Borough'] != 'Not assigned']
    .reset_index(drop=True)
    .assign(
        # regex=False: '/' is replaced literally, character for character.
        Neighborhood=lambda frame: frame['Neighborhood'].str.replace('/', ',', regex=False)
    )
)
df_clean.head(10)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District , Ryerson"


In [4]:
# Show the (rows, columns) size of the cleaned table as a quick sanity check.
df_clean.shape

(103, 3)

# End of part 1, start of part 2



In [5]:
import pandas as pd

# Read the coordinates file and rename its 'Postal Code' column so that it
# carries the same key name ('PostalCode') as the scraped table.
geodata = (
    pd.read_csv('Geospatial_Coordinates.csv')
    .rename(columns={'Postal Code': 'PostalCode'})
)
geodata.head()


Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Now we are ready to join the tables


In [6]:
# Index both tables by postal code, attach the coordinates to every cleaned
# row, then turn the postal code back into an ordinary column.
geodata_by_code = geodata.set_index('PostalCode')
complete_set = (
    df_clean
    .set_index('PostalCode')
    .join(geodata_by_code)
    .reset_index()
)
complete_set.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494


# End of part 2, start of part 3

It is now time to explore the city with Foursquare's API. Let's use the function from the New York Lab.

In [7]:
import requests
# Foursquare API settings.
# The credentials are read from the environment so that no secret is stored in
# the notebook: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel.
# NOTE(review): the key pair that used to be written here has been published
# with the notebook and should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    print('FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will not be authorised.')
VERSION = '20180605'  # sent as the `v` query parameter
LIMIT = 100  # sent as the `limit` query parameter
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues Foursquare's explore endpoint returns around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel sequences that are walked together with ``zip``; one request
        is sent per (name, latitude, longitude) triple.
    radius : number, default 500
        Forwarded unchanged as the request's ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. When no
        venue is returned at all (or the inputs are empty) the frame is empty
        but still has these columns.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers a request with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; the timeout keeps a stalled connection from
        # hanging the notebook, and raise_for_status() reports an HTTP failure
        # directly instead of as a KeyError on the missing payload
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may come back without any category
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema intact even
    # when there are no rows.
    return pd.DataFrame(venue_rows, columns=columns)

# Query the venues around every row of the joined table, using its
# neighborhood names and coordinates (default search radius).
venue_df = getNearbyVenues(
    names=complete_set['Neighborhood'],
    latitudes=complete_set['Latitude'],
    longitudes=complete_set['Longitude'],
)

venue_df.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


Now let's get everything one-hot encoded so we can put this into a clustering algorithm.

In [8]:
# One indicator column per venue category, one row per venue.
encoded = pd.get_dummies(venue_df['Venue Category'])

# Put the neighborhood's name and coordinates in front of the indicators.
# Each entry is (new column name, source column in venue_df); the list order
# is the final column order.
leading_columns = [
    ('Neighborhood_Name', 'Neighborhood'),
    ('Neighborhood Latitude', 'Neighborhood Latitude'),
    ('Neighborhood Longitude', 'Neighborhood Longitude'),
]
for position, (new_name, source_name) in enumerate(leading_columns):
    encoded.insert(position, new_name, venue_df[source_name].values)

encoded.head()

Unnamed: 0,Neighborhood_Name,Neighborhood Latitude,Neighborhood Longitude,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,43.753259,-79.329656,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,43.753259,-79.329656,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,43.725882,-79.315572,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,43.725882,-79.315572,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,43.725882,-79.315572,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Now let's group all the venues together by Neighborhood_Name and prepare to pass the data into the k-means algorithm.

In [26]:
# Average every column per neighborhood name: the indicator columns become
# the share of that neighborhood's venues in each category.
dataset = encoded.groupby('Neighborhood_Name').mean().reset_index()

# Feature matrix for clustering: everything except the name column.
# The column is named via `columns=` because passing the axis positionally
# (drop(label, 1)) is no longer accepted by current pandas.
dataset_final = dataset.drop(columns='Neighborhood_Name')
dataset_final.head()

Unnamed: 0,Neighborhood Latitude,Neighborhood Longitude,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,43.7942,-79.262029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,43.602414,-79.543484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,43.754328,-79.442259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,43.786947,-79.385975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,43.733282,-79.41975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


All that's left to do is run the model, insert the labels back into the data, and plot the Neighborhoods out on a folium map

In [27]:
from sklearn.cluster import KMeans
import folium as f

# Number of clusters and the marker outline colour used for each cluster
# label (list position == label).
N_CLUSTERS = 5
CLUSTER_COLORS = ['#FFFFFF', '#FF00FF', '#00FF00', '#FFFF00', '#00FFFF']

model = KMeans(n_clusters=N_CLUSTERS, random_state=3).fit(dataset_final)

# Attach the cluster label of every row as column 1. On a re-run of this cell
# the column already exists, so it is overwritten instead of inserted again
# (insert() would raise on the duplicate name).
if 'Labels' in dataset.columns:
    dataset['Labels'] = model.labels_
else:
    dataset.insert(1, 'Labels', model.labels_)

# Everything the map needs, taken from one frame so that each label stays on
# the same row as its name and coordinates.
mapset = dataset[['Neighborhood_Name',
                  'Labels',
                  'Neighborhood Latitude',
                  'Neighborhood Longitude']].copy()

LAT, LON = 43.6532, -79.3832  # where the map is centred
map_clusters = f.Map(location=[LAT, LON], zoom_start=11)

# One circle per row; the outline colour encodes the cluster label.
for code, lab, lat, lng in mapset.itertuples(index=False, name=None):
    f.CircleMarker(
        location=[lat, lng],
        radius=12,
        popup=code,
        color=CLUSTER_COLORS[lab],
        fill=True,
        fill_color='#001100'
    ).add_to(map_clusters)

map_clusters
