# This is my main project file. The data scraping, visualization and analysis will be done in this notebook. You can read more about the project in the report pdf found on my repository.

First, let's import the necessary libraries

In [293]:
import numpy as np
import pandas as pd
import folium 
import matplotlib.pyplot as plt
from urllib.request import urlopen
import geocoder
import requests
from geopy.geocoders import Nominatim
from IPython.display import clear_output
from sklearn.cluster import KMeans

In order to keep my Client id and secret secure, I will be reading them from a file saved on my computer that I will exclude from git. If you want to experiment with the notebook, you can create a free account of foursquare and repace CLIENT_ID and CLIENT_SECRET with your own and skip reading the file. Or you can write a txt file in the same folder as the notebook that has the CLIENT_ID on the first line and CLIENT_SECRET on the second. After making the .txt file, point to it in the client_cred variable. 

In [294]:
client_cred = 'foursquare_client_creds.txt'
file = open(client_cred, "r")
CLIENT_ID = file.readline()[:-1] #the [:-1] is used to cut out the '\n' character at the end of the line
CLIENT_SECRET = file.readline()
VERSION = '20180605' # Foursquare API version
geolocator = Nominatim(user_agent="neighborhood-recommender")

Let's try using foursquare to obtain results to make sure the connection was successful

In [295]:
lat = '40.1872'
lng = '44.5152'
radius = 500
LIMIT = 10
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)

# make the GET request
results = requests.get(url).json()["response"]['groups'][0]['items']
if results!=None:
    print("Connection was successful")
else:
    print("Something went wrong")

Connection was successful


## Now, let's scrape wikipedia pages for New York, Chicago and LA neighborhoods. 

# Chicago

In [357]:
Chicago_wiki_url = 'https://en.wikipedia.org/wiki/List_of_neighborhoods_in_Chicago'
NYC_wiki_url = 'https://en.wikipedia.org/wiki/Neighborhoods_in_New_York_City'
LA_wiki_url = 'https://en.wikipedia.org/wiki/List_of_districts_and_neighborhoods_of_Los_Angeles'

In [358]:
Chicago_neighborhoods = pd.read_html(Chicago_wiki_url, match='Woodlawn')[0]
Chicago_neighborhoods.drop(1, axis=1, inplace=True) #Drop the community area col
Chicago_neighborhoods.drop(0, axis=0, inplace=True) #Drop the first row that shouldve been the header
Chicago_neighborhoods.rename(columns = {0:'Neighborhood'}, inplace=True) #Rename the columns

In [359]:
#Get the latitude and longitude of the neighborhoods
geolocator = Nominatim(user_agent="neighborhood-recommender")
lats = []
longs = []
got_none = [] #the list of neighborhoods we got none for
got_none_ind = []
for i, neighb in enumerate(Chicago_neighborhoods['Neighborhood']):
    print(neighb)
    location = geolocator.geocode( neighb + ', Chicago')
    if location != None:
        lats.append(location.latitude)
        longs.append(location.longitude)
    else:
        got_none.append(neighb)
        got_none_ind.append(i)
        lats.append(np.nan)
        longs.append(np.nan)
    clear_output()

In [360]:
Chicago_neighborhoods['Latitude'] = lats
Chicago_neighborhoods['Longitude'] = longs

Sometimes geopy cant obtain latitude and longitude values. I explore those neighborhoods further below. Looks like a small change in the name can often fix the issue. If it doesnt work, I'll find the location manually or drop the row. 

In [361]:
print("List of neighborhoods with no location", got_none)

List of neighborhoods with no location ['Ashburn Estates', 'Belmont Heights', 'Cottage Grove Heights', 'Irving Woods', 'Jackson Park Highlands', 'Lakewood / Balmoral', 'Legends South (Robert Taylor Homes)', 'Margate Park', 'Polish Downtown', 'Ranch Triangle', "River's Edge", "Saint Ben's", 'Sheffield Neighbors', 'Sheridan Station Corridor', "Talley's Corner", 'Wacławowo', 'West Chesterfield', 'Wrightwood Neighbors']


In [362]:
#Drop the indices we got none for 
Chicago_neighborhoods.reset_index(inplace=True, drop=True)
Chicago_neighborhoods.drop(got_none_ind, inplace=True)
#Chicago_neighborhoods.reset_index(inplace=True, drop=True)

In [363]:
#Add some of the neighborhoods I could recover locations for
location = geolocator.geocode('Cottage Grove, Chicago')
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood' : 'Cottage Grove' , 'Latitude' : location.latitude, 'Longitude' : location.longitude}, ignore_index=True)
location = geolocator.geocode('Jackson Park, Chicago')
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood' : 'Jackson Park' , 'Latitude' : location.latitude, 'Longitude' : location.longitude}, ignore_index=True)
location = geolocator.geocode('Lakewood, Chicago')
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood' : 'Lakewood' , 'Latitude' : location.latitude, 'Longitude' : location.longitude}, ignore_index=True)
location = geolocator.geocode('Balmoral, Chicago')
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood' : 'Balmoral' , 'Latitude' : location.latitude, 'Longitude' : location.longitude}, ignore_index=True)
location = geolocator.geocode('Wrightwood, Chicago')
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood' : 'Wrightwood' , 'Latitude' : location.latitude, 'Longitude' : location.longitude}, ignore_index=True)
Chicago_neighborhoods = Chicago_neighborhoods.sort_values(by='Neighborhood')
Chicago_neighborhoods = Chicago_neighborhoods.reset_index(drop=True)

In [364]:
Chicago_neighborhoods.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Albany Park,41.971937,-87.716174
1,Altgeld Gardens,41.654864,-87.600446
2,Andersonville,41.977139,-87.669273
3,Archer Heights,41.811422,-87.726165
4,Armour Square,41.840033,-87.633107


In [365]:
Chicago_neighborhoods.shape

(233, 3)

In [366]:
chi_location = geolocator.geocode('Chicago, IL')

In [367]:
# create map of Toronto using latitude and longitude values
map_chicago = folium.Map(location=[chi_location.latitude, chi_location.longitude], zoom_start=10)
for lat, lng, neighborhood in zip(Chicago_neighborhoods['Latitude'], Chicago_neighborhoods['Longitude'], Chicago_neighborhoods['Neighborhood']):
    label = '{}, Chicago'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_chicago)  
    
map_chicago

## When we zoom in, we can see that some of the neighborhoods are not actually in Chicago. This is either a result of geopy's errors or a mistake from scraping neighborhood names. I'm fixing that below. 

I did so by going through the map and checking out the neighborhoods outside of chicago one by one. Some of them were just duplicated of neighborhoods with slightly different names or were names of residential properties instead of neighborhoods. Those I dropped. The others I got the location of by doing a google search or specifying the address more properly.

In [368]:
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'Central Station']

In [369]:
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'River North']
loc = geolocator.geocode('River North, Chicago, IL, USA')
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood': 'River North', 'Latitude': loc.latitude, 'Longitude': loc.longitude}, ignore_index=True)
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'University Village']
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'Old Town Triangle']
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'K-Town']
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'Lincoln Square']
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood': 'Lincoln Square', 'Latitude': 41.976049, 'Longitude': -87.7079486}, ignore_index=True)
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'Heart of Chicago']
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'Big Oaks']
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood': 'Big Oaks', 'Latitude': 41.9992963, 'Longitude': -87.7540899}, ignore_index=True)
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'Chrysler Village']
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood': 'Chrysler Village', 'Latitude': 41.7800088, 'Longitude': -87.7405553}, ignore_index=True)
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'Old Town']
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood': 'Old Town', 'Latitude': 41.9111221, 'Longitude': -87.6492029}, ignore_index=True)
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'Fifth City']
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood': 'Fifth City', 'Latitude': 41.8776055, 'Longitude': -87.7152837}, ignore_index=True)
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'New Eastside']
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'Museum Campus']
Chicago_neighborhoods = Chicago_neighborhoods.sort_values(by='Neighborhood')
Chicago_neighborhoods.reset_index(drop=True, inplace=True)

In [370]:
Chicago_neighborhoods.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Albany Park,41.971937,-87.716174
1,Altgeld Gardens,41.654864,-87.600446
2,Andersonville,41.977139,-87.669273
3,Archer Heights,41.811422,-87.726165
4,Armour Square,41.840033,-87.633107


# Let's try this again. 

In [372]:
# create map of Toronto using latitude and longitude values
map_chicago = folium.Map(location=[chi_location.latitude, chi_location.longitude], zoom_start=10)
for lat, lng, neighborhood in zip(Chicago_neighborhoods['Latitude'], Chicago_neighborhoods['Longitude'], Chicago_neighborhoods['Neighborhood']):
    label = '{}, Chicago'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_chicago)  
    
map_chicago

In [373]:
Chicago_neighborhoods.to_csv('Chicago_neighborhoods.csv')

# NYC

In [374]:
NYC = pd.read_html(NYC_wiki_url, match='Arrochar')[0]
NYC.drop([0,1,2,3], axis=1, inplace=True)
NYC.drop([0,60], axis=0, inplace=True)
NYC_neighborhoods = pd.DataFrame(columns=['Neighborhood'])
NYC_neighborhoods

Unnamed: 0,Neighborhood


In [375]:
for n in NYC[4]:
    neighbs = n.split(',')
    for nb in neighbs:
        NYC_neighborhoods = NYC_neighborhoods.append({'Neighborhood': nb.strip()}, ignore_index=True)

In [376]:
NYC_neighborhoods.sort_values(by='Neighborhood', inplace=True)
NYC_neighborhoods.reset_index(drop=True, inplace=True)

In [377]:
lats = []
longs = []
got_none = [] #the list of neighborhoods we got none for
got_none_ind = []
for i, neighb in enumerate(NYC_neighborhoods['Neighborhood']):
    print(neighb)
    location = geolocator.geocode( neighb + ', NYC')
    if location != None:
        lats.append(location.latitude)
        longs.append(location.longitude)
    else:
        got_none.append(neighb)
        got_none_ind.append(i)
        lats.append(np.nan)
        longs.append(np.nan)
    clear_output()

In [378]:
NYC_neighborhoods['Latitude'] = lats
NYC_neighborhoods['Longitude'] = longs

In [379]:
print("List of neighborhoods with no location", got_none)

List of neighborhoods with no location ['Greenwood Heights', 'Hilltop Village', 'Kew Gardens Hills', 'Meiers Corners', 'Plum Beach', 'Prospect Lefferts Gardens', 'Van Cortlandt Village']


In [380]:
print(NYC_neighborhoods.shape)
NYC_neighborhoods.drop(got_none_ind, inplace=True, axis=0)
print(NYC_neighborhoods.shape)

(328, 3)
(321, 3)


In [381]:
NYC_neighborhoods.reset_index(drop=True, inplace=True)

In [382]:
#removing duplicates
NYC_neighborhoods = NYC_neighborhoods.groupby('Neighborhood').first().reset_index()

In [383]:
NYC_neighborhoods.shape

(305, 3)

In [384]:
NYC_neighborhoods.to_csv('NYC_neighborhoods.csv')

In [385]:
nyc_location = geolocator.geocode('NYC, NY')

In [387]:
# create map of Toronto using latitude and longitude values
map_nyc = folium.Map(location=[nyc_location.latitude, nyc_location.longitude], zoom_start=10)
for lat, lng, neighborhood in zip(NYC_neighborhoods['Latitude'], NYC_neighborhoods['Longitude'], NYC_neighborhoods['Neighborhood']):
    label = '{}, NYC'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_nyc)  
    
map_nyc

## Same as before, cleaning out the neighborhoods that arent actually in New York

In [390]:
drop = ['Indian Village', 'Mount Hope']
geolocator.geocode('Indian Village, New York City, New York, USA')
for n in drop:
    NYC_neighborhoods = NYC_neighborhoods[NYC_neighborhoods.Neighborhood != n]
NYC_neighborhoods = NYC_neighborhoods[NYC_neighborhoods.Neighborhood != 'New Hyde Park']
NYC_neighborhoods = NYC_neighborhoods.append({'Neighborhood': 'New Hyde Park', 'Latitude': 40.7326609, 'Longitude': -73.6948062}, ignore_index=True)
NYC_neighborhoods = NYC_neighborhoods.sort_values(by='Neighborhood')
NYC_neighborhoods.reset_index(drop=True, inplace=True)

In [391]:
NYC_neighborhoods.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Allerton,40.866111,-73.850556
1,Alphabet City,40.725102,-73.979583
2,Annadale,40.54455,-74.176532
3,Arden Heights,40.557629,-74.188609
4,Arlington,40.632326,-74.165144


In [393]:
# create map of Toronto using latitude and longitude values
map_nyc = folium.Map(location=[nyc_location.latitude, nyc_location.longitude], zoom_start=10)
for lat, lng, neighborhood in zip(NYC_neighborhoods['Latitude'], NYC_neighborhoods['Longitude'], NYC_neighborhoods['Neighborhood']):
    label = '{}, NYC'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_nyc)  
    
map_nyc

# LA
The wikipedia page had a list of neighborhoods, not a table. So instead of using pandas to scrape the data I copy pasted it into a txt file. It needs some cleaning still. 

In [394]:
LA_txt = 'LA_neighblist_unprocessed.txt'
file = open(LA_txt, "r")
neighbs = []
while file.readline()!='\n': #I put a newline at the end of the file
    neighb = file.readline()[:-1] #the [:-1] is used to cut out the '\n' character at the end of the line
    neighbs.append(neighb.split('[')[0])

In [395]:
LA_neighborhoods = pd.DataFrame()
LA_neighborhoods['Neighborhood']=neighbs
LA_neighborhoods.shape

(98, 1)

In [396]:
lats = []
longs = []
got_none = [] #the list of neighborhoods we got none for
got_none_ind = []
for i, neighb in enumerate(LA_neighborhoods['Neighborhood']):
    print(neighb)
    location = geolocator.geocode( neighb + ', LA')
    if location != None:
        lats.append(location.latitude)
        longs.append(location.longitude)
    else:
        got_none.append(neighb)
        got_none_ind.append(i)
        lats.append(np.nan)
        longs.append(np.nan)
    clear_output()

In [397]:
print("List of neighborhoods with no location", got_none)

List of neighborhoods with no location ['Beachwood Canyon', 'Filipinotown, Historic', 'Holmby Hills', 'NoHo Arts District', 'Picfair Village']


In [398]:
LA_neighborhoods['Latitude'] = lats
LA_neighborhoods['Longitude'] = longs

In [399]:
print(LA_neighborhoods.shape)
LA_neighborhoods.drop(got_none_ind, inplace=True, axis=0)
print(LA_neighborhoods.shape)

(98, 3)
(93, 3)


In [400]:
LA_neighborhoods.reset_index(drop=True, inplace=True)

In [401]:
LA_neighborhoods.to_csv('LA_neighborhoods.csv')

In [402]:
la_location = geolocator.geocode('Los Angeles, CA')

In [403]:
# create map of Toronto using latitude and longitude values
map_la = folium.Map(location=[la_location.latitude, la_location.longitude], zoom_start=10)
for lat, lng, neighborhood in zip(LA_neighborhoods['Latitude'], LA_neighborhoods['Longitude'], LA_neighborhoods['Neighborhood']):
    label = '{}, Chicago'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_la)  
    
map_la

### The above code can take quite some time, and sometimes geopy crashes. So to avoid having to rerun the whole thing I saived the results in .csv file. The following cell loads them into dataframes. 

In [18]:
LA_neighborhoods = pd.read_csv('LA_neighborhoods.csv', header=0, index_col = 0)
NYC_neighborhoods = pd.read_csv('NYC_neighborhoods.csv', header=0, index_col = 0)
Chicago_neighborhoods = pd.read_csv('Chicago_neighborhoods.csv', header=0, index_col = 0)

In [19]:
print('LA, ', LA_neighborhoods.shape)
print('NYC, ', NYC_neighborhoods.shape)
print('Chicago, ', Chicago_neighborhoods.shape)

LA,  (93, 3)
NYC,  (305, 3)
Chicago,  (232, 3)


# Now that we have the list of neighborhoods and their locations, it's time for some feature extraction.

### First we make a dataframes with venues 

In [283]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [287]:
def getNearbyVenues(names, latitudes, longitudes):
    radius = 4000
    LIMIT = 500
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
        
        clear_output()

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)

In [288]:
NYC_venues = getNearbyVenues(names=NYC_neighborhoods['Neighborhood'],
                   latitudes=NYC_neighborhoods['Latitude'],
                   longitudes=NYC_neighborhoods['Longitude']
                  )


In [289]:
NYC_venues['Neighborhood'] = NYC_venues['Neighborhood'] + ', NYC'
NYC_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Allerton, NYC",40.866111,-73.850556,Sal & Doms Bakery,40.865377,-73.855236,Dessert Shop
1,"Allerton, NYC",40.866111,-73.850556,Nicks Pizza,40.870352,-73.846171,Pizza Place
2,"Allerton, NYC",40.866111,-73.850556,Gun Hill Brewing Co.,40.872139,-73.855698,Brewery
3,"Allerton, NYC",40.866111,-73.850556,Fratelli's,40.863019,-73.843607,Italian Restaurant
4,"Allerton, NYC",40.866111,-73.850556,Four Seasons Nails,40.869402,-73.844527,Spa


In [290]:
Chicago_venues = getNearbyVenues(names=Chicago_neighborhoods['Neighborhood'],
                   latitudes=Chicago_neighborhoods['Latitude'],
                   longitudes=Chicago_neighborhoods['Longitude']
                  )


In [291]:
Chicago_venues['Neighborhood'] = Chicago_venues['Neighborhood'] + ', Chicago'
Chicago_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Albany Park, Chicago",41.971937,-87.716174,Tre Kronor,41.975842,-87.711037,Scandinavian Restaurant
1,"Albany Park, Chicago",41.971937,-87.716174,Cairo Nights Hookah Lounge,41.975776,-87.715547,Hookah Bar
2,"Albany Park, Chicago",41.971937,-87.716174,Noon O Kabab,41.9667,-87.708332,Middle Eastern Restaurant
3,"Albany Park, Chicago",41.971937,-87.716174,Great Sea Chinese Restaurant,41.968496,-87.710678,Chinese Restaurant
4,"Albany Park, Chicago",41.971937,-87.716174,Nighthawk,41.967974,-87.713415,Cocktail Bar


In [292]:
LA_venues = getNearbyVenues(names=LA_neighborhoods['Neighborhood'],
                   latitudes=LA_neighborhoods['Latitude'],
                   longitudes=LA_neighborhoods['Longitude']
                  )


Cheviot Hills


KeyboardInterrupt: 

In [None]:
LA_venues['Neighborhood'] = LA_venues['Neighborhood'] + ', LA'
LA_venues.head()

In [None]:
# Putting all the venues together
Venues = pd.concat([NYC_venues, Chicago_venues, LA_venues])
Venues.reset_index(drop=True, inplace=True)

In [138]:
# one hot encoding
onehot = pd.get_dummies(Venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
onehot['City, Neighborhood'] = Venues['Neighborhood'] 

In [139]:
columns = onehot.columns.tolist()
columns.remove('City, Neighborhood')
columns.insert(0, 'City, Neighborhood')

In [140]:
onehot = onehot.loc[:, columns]

In [141]:
onehot.head()

Unnamed: 0,"City, Neighborhood",ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,"NYC, Allerton",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"NYC, Allerton",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"NYC, Allerton",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"NYC, Allerton",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"NYC, Allerton",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [179]:
Venues_grouped = onehot.groupby('City, Neighborhood').sum().reset_index()

In [180]:
Venues_grouped.head()

Unnamed: 0,"City, Neighborhood",ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,"Chicago, Albany Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,0
1,"Chicago, Altgeld Gardens",1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Chicago, Andersonville",0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,2,0,0
3,"Chicago, Archer Heights",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
4,"Chicago, Armour Square",0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [181]:
# Saving this dataframe for convenience. Later I won't need to rerun the code to get to it 
Venues_grouped.to_csv('Venues_grouped.csv')

## Some of the categories are very specific (e.g. wine shop and winery). Below, I merge a few of them together to get better features

In [212]:
Venues_grouped = pd.read_csv('Venues_grouped.csv', header=0, index_col = 0)

In [213]:
categories = Venues_grouped.columns.tolist()[1:]
print(*categories, sep='\n')

ATM
Accessories Store
Adult Boutique
Advertising Agency
Afghan Restaurant
African Restaurant
Airport
Airport Food Court
Airport Lounge
Airport Service
Airport Terminal
American Restaurant
Amphitheater
Animal Shelter
Antique Shop
Aquarium
Arcade
Arepa Restaurant
Argentinian Restaurant
Art Gallery
Art Museum
Arts & Crafts Store
Arts & Entertainment
Asian Restaurant
Athletics & Sports
Australian Restaurant
Austrian Restaurant
Auto Dealership
Auto Garage
Automotive Shop
BBQ Joint
Badminton Court
Bagel Shop
Bakery
Bank
Bar
Baseball Field
Baseball Stadium
Basketball Court
Basketball Stadium
Bath House
Bavarian Restaurant
Beach
Beach Bar
Bed & Breakfast
Beer Bar
Beer Garden
Beer Store
Belgian Restaurant
Big Box Store
Bike Rental / Bike Share
Bike Shop
Bike Trail
Bistro
Board Shop
Boat or Ferry
Bookstore
Botanical Garden
Boutique
Bowling Alley
Boxing Gym
Brazilian Restaurant
Breakfast Spot
Brewery
Bridal Shop
Bridge
Bubble Tea Shop
Buffet
Building
Burger Joint
Burmese Restaurant
Burrito Place


In [214]:
def join_features(list_to_join, new_name, df):
    df['temp_name'] = df[list_to_join].sum(axis=1)
    df = df.drop(columns=list_to_join)
    df = df.rename(columns={'temp_name': new_name})
    return df

In [215]:
Venues_grouped = join_features(['Wine Bar', 'Winery', 'Wine Shop'], 'Wine', Venues_grouped)

In [216]:
Venues_grouped = join_features(['Airport', 'Airport Food Court', 'Airport Lounge', 'Airport Service', 'Airport Terminal'], 'Airport', Venues_grouped)

In [217]:
Venues_grouped = join_features(['Art Gallery', 'Art Museum', 'Performing Arts Venue', 'Public Art', 'Street Art'], 'Art', Venues_grouped)

In [218]:
Venues_grouped = join_features(['Baseball Field', 'Baseball Stadium'], 'Baseball', Venues_grouped)
Venues_grouped = join_features(['Basketball Court', 'Basketball Stadium'], 'Basketball', Venues_grouped)
Venues_grouped = join_features(['Beer Bar', 'Beer Garden', 'Brewery'], 'Beer', Venues_grouped)
Venues_grouped = join_features(['Bike Rental / Bike Share', 'Bike Shop'], 'Bike Shop', Venues_grouped)
Venues_grouped = join_features(['Bus Station', 'Bus Stop'], 'Bus Stop', Venues_grouped)
Venues_grouped = join_features(['Café', 'Coffee Shop'], 'Cofee', Venues_grouped)
Venues_grouped = join_features(['Soccer Field', 'Soccer Stadium'], 'Soccer', Venues_grouped)
Venues_grouped = join_features(['Tennis Court', 'Tennis Stadium'], 'Tennis', Venues_grouped)
Venues_grouped = join_features(['Zoo', 'Zoo Exhibit'], 'Zoo', Venues_grouped)


In [223]:
#Sorting the columns
columns = Venues_grouped.columns.tolist()
columns.remove('City, Neighborhood')
columns.sort()
columns.insert(0, 'City, Neighborhood')
Venues_grouped = Venues_grouped.loc[:, columns]

In [226]:
Venues_grouped.shape

(626, 475)

# K-means 

In [232]:
kclusters = 10

Venues_grouped_clustering = Venues_grouped.drop('City, Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Venues_grouped_clustering)
kmeans.labels_[0:50] 

array([6, 4, 6, 4, 6, 4, 4, 4, 7, 7, 4, 6, 7, 7, 7, 4, 4, 4, 4, 7, 7, 4,
       4, 6, 4, 6, 7, 7, 7, 4, 0, 4, 6, 5, 4, 4, 6, 4, 7, 4, 4, 6, 4, 6,
       0, 4, 4, 4, 4, 7])

In [265]:
%%capture
Clusters_of_neighborhoods = Venues_grouped[['City, Neighborhood']]
Clusters_of_neighborhoods['Cluster label'] = kmeans.labels_
temp_df = Venues.groupby('Neighborhood').mean().reset_index()
Clusters_of_neighborhoods['Latitude'] = temp_df[['Neighborhood Latitude']]
Clusters_of_neighborhoods['Longitude'] = temp_df[['Neighborhood Longitude']]



In [266]:
Clusters_of_neighborhoods.head()

Unnamed: 0,"City, Neighborhood",Cluster label,Latitude,Longitude
0,"Chicago, Albany Park",6,41.971937,-87.716174
1,"Chicago, Altgeld Gardens",4,41.654864,-87.600446
2,"Chicago, Andersonville",6,41.977139,-87.669273
3,"Chicago, Archer Heights",4,41.811422,-87.726165
4,"Chicago, Armour Square",6,41.840033,-87.633107


In [281]:
# create map
latitude = 37.0902
longiture = 95.7129
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Clusters_of_neighborhoods['Latitude'], Clusters_of_neighborhoods['Longitude'], Clusters_of_neighborhoods['City, Neighborhood'], Clusters_of_neighborhoods['Cluster label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cmap[cluster],
        fill=True,
        fill_color=cmap[cluster],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

map_clusters

In [277]:
cmap = {0: 'red', 1: 'green', 2: 'yellow', 3: 'blue', 4: 'black', 5: 'white', 6: 'lightsteelblue', 7: 'lime', 8: 'deeppink', 9: 'purple', 10: 'plum'}

In [279]:
colors_array = [cmap[x] for x in Clusters_of_neighborhoods['Cluster label']]