# This is my main project file. The data scraping, visualization and analysis will be done in this notebook. You can read more about the project in the report pdf found on my repository.

First, let's import the necessary libraries

In [1]:
import numpy as np
import pandas as pd
import folium 
import matplotlib.pyplot as plt
from urllib.request import urlopen
import geocoder
import requests
from geopy.geocoders import Nominatim
from IPython.display import clear_output
from sklearn.cluster import KMeans

In order to keep my Client id and secret secure, I will be reading them from a file saved on my computer that I will exclude from git. If you want to experiment with the notebook, you can create a free account of foursquare and repace CLIENT_ID and CLIENT_SECRET with your own and skip reading the file. Or you can write a txt file in the same folder as the notebook that has the CLIENT_ID on the first line and CLIENT_SECRET on the second. After making the .txt file, point to it in the client_cred variable. 

In [2]:
client_cred = 'foursquare_client_creds.txt'
file = open(client_cred, "r")
CLIENT_ID = file.readline()[:-1] #the [:-1] is used to cut out the '\n' character at the end of the line
CLIENT_SECRET = file.readline()
VERSION = '20180605' # Foursquare API version
geolocator = Nominatim(user_agent="neighborhood-recommender")

Let's try using foursquare to obtain results to make sure the connection was successful

In [3]:
lat = '40.1872'
lng = '44.5152'
radius = 500
LIMIT = 10
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)

# make the GET request
results = requests.get(url).json()["response"]['groups'][0]['items']
if results!=None:
    print("Connection was successful")
else:
    print("Something went wrong")

Connection was successful


## Now, let's scrape wikipedia pages for New York, Chicago and LA neighborhoods. 

# Chicago

In [4]:
Chicago_wiki_url = 'https://en.wikipedia.org/wiki/List_of_neighborhoods_in_Chicago'
NYC_wiki_url = 'https://en.wikipedia.org/wiki/Neighborhoods_in_New_York_City'
LA_wiki_url = 'https://en.wikipedia.org/wiki/List_of_districts_and_neighborhoods_of_Los_Angeles'

In [5]:
Chicago_neighborhoods = pd.read_html(Chicago_wiki_url, match='Woodlawn')[0]
Chicago_neighborhoods.drop(1, axis=1, inplace=True) #Drop the community area col
Chicago_neighborhoods.drop(0, axis=0, inplace=True) #Drop the first row that shouldve been the header
Chicago_neighborhoods.rename(columns = {0:'Neighborhood'}, inplace=True) #Rename the columns

In [6]:
#Get the latitude and longitude of the neighborhoods
geolocator = Nominatim(user_agent="neighborhood-recommender")
lats = []
longs = []
got_none = [] #the list of neighborhoods we got none for
got_none_ind = []
for i, neighb in enumerate(Chicago_neighborhoods['Neighborhood']):
    print(neighb)
    location = geolocator.geocode( neighb + ', Chicago')
    if location != None:
        lats.append(location.latitude)
        longs.append(location.longitude)
    else:
        got_none.append(neighb)
        got_none_ind.append(i)
        lats.append(np.nan)
        longs.append(np.nan)
    clear_output()

In [7]:
Chicago_neighborhoods['Latitude'] = lats
Chicago_neighborhoods['Longitude'] = longs

Sometimes geopy cant obtain latitude and longitude values. I explore those neighborhoods further below. Looks like a small change in the name can often fix the issue. If it doesnt work, I'll find the location manually or drop the row. 

In [8]:
print("List of neighborhoods with no location", got_none)

List of neighborhoods with no location ['Ashburn Estates', 'Belmont Heights', 'Cottage Grove Heights', 'Irving Woods', 'Jackson Park Highlands', 'Lakewood / Balmoral', 'Legends South (Robert Taylor Homes)', 'Margate Park', 'Polish Downtown', 'Ranch Triangle', "River's Edge", "Saint Ben's", 'Sheffield Neighbors', 'Sheridan Station Corridor', "Talley's Corner", 'Wacławowo', 'West Chesterfield', 'Wrightwood Neighbors']


In [9]:
#Drop the indices we got none for 
Chicago_neighborhoods.reset_index(inplace=True, drop=True)
Chicago_neighborhoods.drop(got_none_ind, inplace=True)
#Chicago_neighborhoods.reset_index(inplace=True, drop=True)

In [10]:
#Add some of the neighborhoods I could recover locations for
location = geolocator.geocode('Cottage Grove, Chicago')
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood' : 'Cottage Grove' , 'Latitude' : location.latitude, 'Longitude' : location.longitude}, ignore_index=True)
location = geolocator.geocode('Jackson Park, Chicago')
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood' : 'Jackson Park' , 'Latitude' : location.latitude, 'Longitude' : location.longitude}, ignore_index=True)
location = geolocator.geocode('Lakewood, Chicago')
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood' : 'Lakewood' , 'Latitude' : location.latitude, 'Longitude' : location.longitude}, ignore_index=True)
location = geolocator.geocode('Balmoral, Chicago')
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood' : 'Balmoral' , 'Latitude' : location.latitude, 'Longitude' : location.longitude}, ignore_index=True)
location = geolocator.geocode('Wrightwood, Chicago')
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood' : 'Wrightwood' , 'Latitude' : location.latitude, 'Longitude' : location.longitude}, ignore_index=True)
Chicago_neighborhoods = Chicago_neighborhoods.sort_values(by='Neighborhood')
Chicago_neighborhoods = Chicago_neighborhoods.reset_index(drop=True)

In [11]:
Chicago_neighborhoods.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Albany Park,41.971937,-87.716174
1,Altgeld Gardens,41.654864,-87.600446
2,Andersonville,41.977139,-87.669273
3,Archer Heights,41.811422,-87.726165
4,Armour Square,41.840033,-87.633107


In [12]:
Chicago_neighborhoods.shape

(233, 3)

In [13]:
chi_location = geolocator.geocode('Chicago, IL')

In [14]:
# create map of Toronto using latitude and longitude values
map_chicago = folium.Map(location=[chi_location.latitude, chi_location.longitude], zoom_start=10)
for lat, lng, neighborhood in zip(Chicago_neighborhoods['Latitude'], Chicago_neighborhoods['Longitude'], Chicago_neighborhoods['Neighborhood']):
    label = '{}, Chicago'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_chicago)  
    
map_chicago

## When we zoom in, we can see that some of the neighborhoods are not actually in Chicago. This is either a result of geopy's errors or a mistake from scraping neighborhood names. I'm fixing that below. 

I did so by going through the map and checking out the neighborhoods outside of chicago one by one. Some of them were just duplicated of neighborhoods with slightly different names or were names of residential properties instead of neighborhoods. Those I dropped. The others I got the location of by doing a google search or specifying the address more properly.

In [15]:
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'Central Station']

In [16]:
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'River North']
loc = geolocator.geocode('River North, Chicago, IL, USA')
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood': 'River North', 'Latitude': loc.latitude, 'Longitude': loc.longitude}, ignore_index=True)
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'University Village']
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'Old Town Triangle']
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'K-Town']
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'Lincoln Square']
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood': 'Lincoln Square', 'Latitude': 41.976049, 'Longitude': -87.7079486}, ignore_index=True)
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'Heart of Chicago']
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'Big Oaks']
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood': 'Big Oaks', 'Latitude': 41.9992963, 'Longitude': -87.7540899}, ignore_index=True)
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'Chrysler Village']
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood': 'Chrysler Village', 'Latitude': 41.7800088, 'Longitude': -87.7405553}, ignore_index=True)
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'Old Town']
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood': 'Old Town', 'Latitude': 41.9111221, 'Longitude': -87.6492029}, ignore_index=True)
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'Fifth City']
Chicago_neighborhoods = Chicago_neighborhoods.append({'Neighborhood': 'Fifth City', 'Latitude': 41.8776055, 'Longitude': -87.7152837}, ignore_index=True)
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'New Eastside']
Chicago_neighborhoods = Chicago_neighborhoods[Chicago_neighborhoods.Neighborhood != 'Museum Campus']
Chicago_neighborhoods = Chicago_neighborhoods.sort_values(by='Neighborhood')
Chicago_neighborhoods.reset_index(drop=True, inplace=True)

In [17]:
Chicago_neighborhoods.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Albany Park,41.971937,-87.716174
1,Altgeld Gardens,41.654864,-87.600446
2,Andersonville,41.977139,-87.669273
3,Archer Heights,41.811422,-87.726165
4,Armour Square,41.840033,-87.633107


# Let's try this again. 

In [18]:
# create map of Toronto using latitude and longitude values
map_chicago = folium.Map(location=[chi_location.latitude, chi_location.longitude], zoom_start=10)
for lat, lng, neighborhood in zip(Chicago_neighborhoods['Latitude'], Chicago_neighborhoods['Longitude'], Chicago_neighborhoods['Neighborhood']):
    label = '{}, Chicago'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_chicago)  
    
map_chicago

In [19]:
Chicago_neighborhoods.to_csv('Chicago_neighborhoods.csv')

# NYC

In [20]:
NYC = pd.read_html(NYC_wiki_url, match='Arrochar')[0]
NYC.drop([0,1,2,3], axis=1, inplace=True)
NYC.drop([0,60], axis=0, inplace=True)
NYC_neighborhoods = pd.DataFrame(columns=['Neighborhood'])
NYC_neighborhoods

Unnamed: 0,Neighborhood


In [21]:
for n in NYC[4]:
    neighbs = n.split(',')
    for nb in neighbs:
        NYC_neighborhoods = NYC_neighborhoods.append({'Neighborhood': nb.strip()}, ignore_index=True)

In [22]:
NYC_neighborhoods.sort_values(by='Neighborhood', inplace=True)
NYC_neighborhoods.reset_index(drop=True, inplace=True)

In [23]:
lats = []
longs = []
got_none = [] #the list of neighborhoods we got none for
got_none_ind = []
for i, neighb in enumerate(NYC_neighborhoods['Neighborhood']):
    print(neighb)
    location = geolocator.geocode( neighb + ', NYC')
    if location != None:
        lats.append(location.latitude)
        longs.append(location.longitude)
    else:
        got_none.append(neighb)
        got_none_ind.append(i)
        lats.append(np.nan)
        longs.append(np.nan)
    clear_output()

In [24]:
NYC_neighborhoods['Latitude'] = lats
NYC_neighborhoods['Longitude'] = longs

In [25]:
print("List of neighborhoods with no location", got_none)

List of neighborhoods with no location ['Greenwood Heights', 'Hilltop Village', 'Kew Gardens Hills', 'Meiers Corners', 'Plum Beach', 'Prospect Lefferts Gardens', 'Van Cortlandt Village']


In [26]:
print(NYC_neighborhoods.shape)
NYC_neighborhoods.drop(got_none_ind, inplace=True, axis=0)
print(NYC_neighborhoods.shape)

(328, 3)
(321, 3)


In [27]:
NYC_neighborhoods.reset_index(drop=True, inplace=True)

In [28]:
#removing duplicates
NYC_neighborhoods = NYC_neighborhoods.groupby('Neighborhood').first().reset_index()

In [29]:
NYC_neighborhoods.shape

(305, 3)

In [31]:
nyc_location = geolocator.geocode('NYC, NY')

In [32]:
# create map of Toronto using latitude and longitude values
map_nyc = folium.Map(location=[nyc_location.latitude, nyc_location.longitude], zoom_start=10)
for lat, lng, neighborhood in zip(NYC_neighborhoods['Latitude'], NYC_neighborhoods['Longitude'], NYC_neighborhoods['Neighborhood']):
    label = '{}, NYC'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_nyc)  
    
map_nyc

## Same as before, cleaning out the neighborhoods that arent actually in New York

In [33]:
drop = ['Indian Village', 'Mount Hope']
geolocator.geocode('Indian Village, New York City, New York, USA')
for n in drop:
    NYC_neighborhoods = NYC_neighborhoods[NYC_neighborhoods.Neighborhood != n]
NYC_neighborhoods = NYC_neighborhoods[NYC_neighborhoods.Neighborhood != 'New Hyde Park']
NYC_neighborhoods = NYC_neighborhoods.append({'Neighborhood': 'New Hyde Park', 'Latitude': 40.7326609, 'Longitude': -73.6948062}, ignore_index=True)
NYC_neighborhoods = NYC_neighborhoods.sort_values(by='Neighborhood')
NYC_neighborhoods.reset_index(drop=True, inplace=True)

In [34]:
NYC_neighborhoods.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Allerton,40.866111,-73.850556
1,Alphabet City,40.725102,-73.979583
2,Annadale,40.54455,-74.176532
3,Arden Heights,40.557629,-74.188609
4,Arlington,40.632326,-74.165144


In [35]:
# create map of Toronto using latitude and longitude values
map_nyc = folium.Map(location=[nyc_location.latitude, nyc_location.longitude], zoom_start=10)
for lat, lng, neighborhood in zip(NYC_neighborhoods['Latitude'], NYC_neighborhoods['Longitude'], NYC_neighborhoods['Neighborhood']):
    label = '{}, NYC'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_nyc)  
    
map_nyc

In [36]:
print('NYC, ', NYC_neighborhoods.shape)
print('Chicago, ', Chicago_neighborhoods.shape)

NYC,  (303, 3)
Chicago,  (226, 3)


In [37]:
NYC_neighborhoods.to_csv('NYC_neighborhoods.csv')

# Now that we have the list of neighborhoods and their locations, it's time for some feature extraction.

In [84]:
NYC_neighborhoods = pd.read_csv('NYC_neighborhoods.csv', header=0, index_col = 0)
NYC_neighborhoods.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Allerton,40.866111,-73.850556
1,Alphabet City,40.725102,-73.979583
2,Annadale,40.54455,-74.176532
3,Arden Heights,40.557629,-74.188609
4,Arlington,40.632326,-74.165144


In [85]:
Chicago_neighborhoods = pd.read_csv('Chicago_neighborhoods.csv', header=0, index_col = 0)
Chicago_neighborhoods.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Albany Park,41.971937,-87.716174
1,Altgeld Gardens,41.654864,-87.600446
2,Andersonville,41.977139,-87.669273
3,Archer Heights,41.811422,-87.726165
4,Armour Square,41.840033,-87.633107


### First we make a dataframes with venues 

In [86]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [87]:
def getNearbyVenues(names, latitudes, longitudes):
    radius = 2000
    LIMIT = 500
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
        
        clear_output()

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)

In [88]:
NYC_venues = getNearbyVenues(names=NYC_neighborhoods['Neighborhood'],
                   latitudes=NYC_neighborhoods['Latitude'],
                   longitudes=NYC_neighborhoods['Longitude']
                  )


In [89]:
NYC_venues['Neighborhood'] = NYC_venues['Neighborhood'] + ', NYC'
NYC_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Allerton, NYC",40.866111,-73.850556,Sal & Doms Bakery,40.865377,-73.855236,Dessert Shop
1,"Allerton, NYC",40.866111,-73.850556,Nicks Pizza,40.870352,-73.846171,Pizza Place
2,"Allerton, NYC",40.866111,-73.850556,Fratelli's,40.863019,-73.843607,Italian Restaurant
3,"Allerton, NYC",40.866111,-73.850556,Gun Hill Brewing Co.,40.872139,-73.855698,Brewery
4,"Allerton, NYC",40.866111,-73.850556,Four Seasons Nails,40.869402,-73.844527,Spa


In [90]:
Chicago_venues = getNearbyVenues(names=Chicago_neighborhoods['Neighborhood'],
                   latitudes=Chicago_neighborhoods['Latitude'],
                   longitudes=Chicago_neighborhoods['Longitude']
                  )


In [91]:
Chicago_venues['Neighborhood'] = Chicago_venues['Neighborhood'] + ', Chicago'
Chicago_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Albany Park, Chicago",41.971937,-87.716174,Cairo Nights Hookah Lounge,41.975776,-87.715547,Hookah Bar
1,"Albany Park, Chicago",41.971937,-87.716174,Tre Kronor,41.975842,-87.711037,Scandinavian Restaurant
2,"Albany Park, Chicago",41.971937,-87.716174,Nighthawk,41.967974,-87.713415,Cocktail Bar
3,"Albany Park, Chicago",41.971937,-87.716174,Great Sea Chinese Restaurant,41.968496,-87.710678,Chinese Restaurant
4,"Albany Park, Chicago",41.971937,-87.716174,Popeyes Louisiana Kitchen,41.968459,-87.713156,Fried Chicken Joint


In [92]:
# Putting all the venues together
Venues = pd.concat([NYC_venues, Chicago_venues])
Venues.reset_index(drop=True, inplace=True)

In [93]:
# one hot encoding
onehot = pd.get_dummies(Venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
onehot['City, Neighborhood'] = Venues['Neighborhood'] 

In [94]:
columns = onehot.columns.tolist()
columns.remove('City, Neighborhood')
columns.insert(0, 'City, Neighborhood')

In [95]:
onehot = onehot.loc[:, columns]

In [96]:
onehot.head()

Unnamed: 0,"City, Neighborhood",ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,"Allerton, NYC",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Allerton, NYC",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Allerton, NYC",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Allerton, NYC",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Allerton, NYC",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [97]:
Venues_grouped = onehot.groupby('City, Neighborhood').sum().reset_index()

In [98]:
Venues_grouped.head()

Unnamed: 0,"City, Neighborhood",ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,"Albany Park, Chicago",0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1,"Allerton, NYC",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Alphabet City, NYC",0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,3,0,0
3,"Altgeld Gardens, Chicago",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Andersonville, Chicago",0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0


In [99]:
# Saving this dataframe for convenience. Later I won't need to rerun the code to get to it 
Venues_grouped.to_csv('Venues_grouped.csv')
Venues.to_csv('Venues.csv')

## Some of the categories are very specific (e.g. cafe and coffee shop). Below, I merge a few of them together to get better features

In [100]:
Venues_grouped = pd.read_csv('Venues_grouped.csv', header=0, index_col = 0)
Venues = pd.read_csv('Venues.csv', header=0, index_col = 0)

In [101]:
categories = Venues_grouped.columns.tolist()[1:]
print(*categories, sep='\n')

ATM
Accessories Store
Adult Boutique
Afghan Restaurant
African Restaurant
Airport
Airport Food Court
Airport Lounge
Airport Service
Airport Terminal
American Restaurant
Amphitheater
Animal Shelter
Antique Shop
Aquarium
Arcade
Arepa Restaurant
Argentinian Restaurant
Art Gallery
Art Museum
Arts & Crafts Store
Arts & Entertainment
Asian Restaurant
Athletics & Sports
Auditorium
Australian Restaurant
Austrian Restaurant
Auto Garage
Automotive Shop
BBQ Joint
Bagel Shop
Bakery
Bank
Bar
Baseball Field
Baseball Stadium
Basketball Court
Basketball Stadium
Bath House
Bavarian Restaurant
Bay
Beach
Beach Bar
Bed & Breakfast
Beer Bar
Beer Garden
Beer Store
Big Box Store
Bike Rental / Bike Share
Bike Shop
Bike Trail
Bistro
Board Shop
Boat or Ferry
Bookstore
Border Crossing
Botanical Garden
Boutique
Bowling Alley
Boxing Gym
Brazilian Restaurant
Breakfast Spot
Brewery
Bridge
Bubble Tea Shop
Buffet
Building
Burger Joint
Burmese Restaurant
Burrito Place
Bus Station
Bus Stop
Business Center
Business Servi

In [102]:
def join_features(list_to_join, new_name, df):
    df['temp_name'] = df[list_to_join].sum(axis=1)
    df = df.drop(columns=list_to_join)
    df = df.rename(columns={'temp_name': new_name})
    return df

In [68]:
Venues_grouped = join_features(['Wine Bar', 'Wine Shop'], 'Wine', Venues_grouped)

In [103]:
Venues_grouped = join_features(['Airport Food Court', 'Airport Terminal', 'Airport Lounge', 'Airport Service'], 'Airport', Venues_grouped)

In [104]:
Venues_grouped = join_features(['Art Gallery', 'Art Museum', 'Performing Arts Venue', 'Public Art', 'Street Art'], 'Art', Venues_grouped)

In [105]:
Venues_grouped = join_features(['Baseball Field', 'Baseball Stadium'], 'Baseball', Venues_grouped)
Venues_grouped = join_features(['Soccer Field', 'Soccer Stadium'], 'Soccer', Venues_grouped)
Venues_grouped = join_features(['Basketball Court', 'Basketball Stadium'], 'Basketball', Venues_grouped)
Venues_grouped = join_features(['Beer Bar', 'Beer Garden', 'Brewery'], 'Beer', Venues_grouped)
Venues_grouped = join_features(['Bike Rental / Bike Share', 'Bike Shop'], 'Bike Shop', Venues_grouped)
Venues_grouped = join_features(['Bus Station', 'Bus Stop'], 'Bus Stop', Venues_grouped)
Venues_grouped = join_features(['Café', 'Coffee Shop'], 'Cofee', Venues_grouped)
Venues_grouped = join_features(['Tennis Court', 'Tennis Stadium'], 'Tennis', Venues_grouped)
Venues_grouped = join_features(['Zoo', 'Zoo Exhibit'], 'Zoo', Venues_grouped)


In [106]:
#Sorting the columns
columns = Venues_grouped.columns.tolist()
columns.remove('City, Neighborhood')
columns.sort()
columns.insert(0, 'City, Neighborhood')
Venues_grouped = Venues_grouped.loc[:, columns]

In [107]:
Venues_grouped.shape

(528, 469)

# K-means 

In [115]:
kclusters = 20

Venues_grouped_clustering = Venues_grouped.drop('City, Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Venues_grouped_clustering)
kmeans.labels_[0:50] 

array([16,  5,  4, 11, 16, 19,  0, 11, 11, 15, 19, 11,  0,  9, 14, 16, 14,
        3,  3,  0,  1,  4, 19, 19,  9, 19, 11, 19, 16, 11, 10, 17, 19, 18,
        5,  4,  4,  4, 10, 19,  5,  5,  0,  5, 11, 11,  3,  7, 16,  4])

In [116]:
%%capture
Clusters_of_neighborhoods = Venues_grouped[['City, Neighborhood']]
Clusters_of_neighborhoods['Cluster label'] = kmeans.labels_
temp_df = Venues.groupby('Neighborhood').mean().reset_index()
Clusters_of_neighborhoods['Latitude'] = temp_df[['Neighborhood Latitude']]
Clusters_of_neighborhoods['Longitude'] = temp_df[['Neighborhood Longitude']]



In [117]:
Clusters_of_neighborhoods.head()

Unnamed: 0,"City, Neighborhood",Cluster label,Latitude,Longitude
0,"Albany Park, Chicago",16,41.971937,-87.716174
1,"Allerton, NYC",5,40.866111,-73.850556
2,"Alphabet City, NYC",4,40.725102,-73.979583
3,"Altgeld Gardens, Chicago",11,41.654864,-87.600446
4,"Andersonville, Chicago",16,41.977139,-87.669273


In [118]:
cmap = {0: 'red', 1: 'green', 2: 'yellow', 3: 'blue', 4: 'black', 5: 'white', 6: 'lightsteelblue', 7: 'lime', 8: 'deeppink', 9: 'purple', 10: 'plum',
       11: 'orange', 12: 'darkcyan', 13: 'olive', 14: 'violet', 15: 'magenta', 16: 'salmon', 17: 'khaki', 18: 'gray', 19: 'indigo', 20: 'crimson'}
colors_array = [cmap[x] for x in Clusters_of_neighborhoods['Cluster label']]

# The map below shows the clusters. This is a good tool for freely exploring the neighborhoods. The ones with the same colors are likely to be similar. Zoom to explore

In [119]:
Clusters_of_neighborhoods

Unnamed: 0,"City, Neighborhood",Cluster label,Latitude,Longitude
0,"Albany Park, Chicago",16,41.971937,-87.716174
1,"Allerton, NYC",5,40.866111,-73.850556
2,"Alphabet City, NYC",4,40.725102,-73.979583
3,"Altgeld Gardens, Chicago",11,41.654864,-87.600446
4,"Andersonville, Chicago",16,41.977139,-87.669273
5,"Annadale, NYC",19,40.544550,-74.176532
6,"Archer Heights, Chicago",0,41.811422,-87.726165
7,"Arden Heights, NYC",11,40.557630,-74.188609
8,"Arlington, NYC",11,40.632326,-74.165144
9,"Armour Square, Chicago",15,41.840033,-87.633107


In [121]:
# create map
latitude = Clusters_of_neighborhoods['Latitude'].mean()
longitude = Clusters_of_neighborhoods['Longitude'].mean()
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=6)

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Clusters_of_neighborhoods['Latitude'], Clusters_of_neighborhoods['Longitude'], Clusters_of_neighborhoods['City, Neighborhood'], Clusters_of_neighborhoods['Cluster label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cmap[cluster],
        fill=True,
        fill_color=cmap[cluster],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## While this is a good tool for freely comparing the neighborhoods, what if I'm specifically interested in finding the neighborhoods most similar to the ones I like? We can use a content based recommender for that. We will use the cosine similarity metric between neighborhoods to give most similar neighborhoods to the ones of interest.  

# Content Based Recommender

In [122]:
from sklearn.metrics.pairwise import cosine_similarity

In [123]:
matrix = Venues_grouped_clustering.values
cosine_matrix = cosine_similarity(matrix)

In [124]:
cosine_matrix

array([[1.        , 0.46571506, 0.44865159, ..., 0.53535213, 0.47140452,
        0.38298696],
       [0.46571506, 1.        , 0.44822738, ..., 0.57731126, 0.57516521,
        0.4902343 ],
       [0.44865159, 0.44822738, 1.        , ..., 0.29427902, 0.55200557,
        0.58311072],
       ...,
       [0.53535213, 0.57731126, 0.29427902, ..., 1.        , 0.41990545,
        0.33716378],
       [0.47140452, 0.57516521, 0.55200557, ..., 0.41990545, 1.        ,
        0.64473293],
       [0.38298696, 0.4902343 , 0.58311072, ..., 0.33716378, 0.64473293,
        1.        ]])

In [125]:
Similarity =pd.DataFrame(data= cosine_matrix, columns=Venues_grouped['City, Neighborhood'])

In [126]:
Similarity.index = Venues_grouped['City, Neighborhood']

In [127]:
Similarity.head()

"City, Neighborhood","Albany Park, Chicago","Allerton, NYC","Alphabet City, NYC","Altgeld Gardens, Chicago","Andersonville, Chicago","Annadale, NYC","Archer Heights, Chicago","Arden Heights, NYC","Arlington, NYC","Armour Square, Chicago",...,"Windsor Terrace, NYC","Wingate, NYC","Woodhaven, NYC","Woodlawn, Chicago","Woodlawn, NYC","Woodrow, NYC","Woodside, NYC","Wrightwood, Chicago","Wrigleyville, Chicago","Yorkville, NYC"
"City, Neighborhood",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Albany Park, Chicago",1.0,0.465715,0.448652,0.172848,0.60786,0.413118,0.425327,0.423159,0.33693,0.611871,...,0.570189,0.360894,0.428616,0.547153,0.329846,0.427841,0.457425,0.535352,0.471405,0.382987
"Allerton, NYC",0.465715,1.0,0.448227,0.175933,0.39945,0.678243,0.512841,0.580645,0.483638,0.45857,...,0.420041,0.540971,0.696972,0.367363,0.733269,0.693375,0.345777,0.577311,0.575165,0.490234
"Alphabet City, NYC",0.448652,0.448227,1.0,0.076139,0.451166,0.417029,0.285355,0.32146,0.229133,0.381288,...,0.556038,0.38044,0.309975,0.434739,0.358581,0.334367,0.314924,0.294279,0.552006,0.583111
"Altgeld Gardens, Chicago",0.172848,0.175933,0.076139,1.0,0.089443,0.182574,0.249855,0.182574,0.351099,0.062403,...,0.136931,0.081817,0.135706,0.212169,0.059804,0.136626,0.083771,0.305386,0.066667,0.074536
"Andersonville, Chicago",0.60786,0.39945,0.451166,0.089443,1.0,0.334764,0.294866,0.326599,0.224309,0.466052,...,0.520517,0.33976,0.248828,0.449307,0.184542,0.331691,0.424586,0.298753,0.518022,0.443333


# Now, in order to see most similar neighborhoods, all we have to do is pick a row (or a column) and sort the values. 

For instance, I really like Wicker Park in Chicago

In [128]:
ranked = Similarity[['Wicker Park, Chicago']].sort_values(by='Wicker Park, Chicago')[::-1]

This is the list of neighborhoods in both Chicago and New York that are similar to it.

In [129]:
ranked

"City, Neighborhood","Wicker Park, Chicago"
"City, Neighborhood",Unnamed: 1_level_1
"Wicker Park, Chicago",1.000000
"Bucktown, Chicago",0.933344
"Pulaski Park, Chicago",0.821985
"West Town, Chicago",0.809142
"Noble Square, Chicago",0.765505
"Nortown, Chicago",0.757362
"Ukrainian Village, Chicago",0.710407
"Graceland West, Chicago",0.689196
"Lakewood, Chicago",0.672070
"Lake View East, Chicago",0.658576


Let's filter it to just show the neighborhoods in New York. 

In [130]:
ranked[['NYC' in x for x in ranked.index]]

"City, Neighborhood","Wicker Park, Chicago"
"City, Neighborhood",Unnamed: 1_level_1
"Stuyvesant Heights, NYC",0.638127
"Little Italy, NYC",0.637030
"Bedford-Stuyvesant, NYC",0.635382
"Chinatown, NYC",0.630809
"Greenpoint, NYC",0.627058
"Fish Bay, NYC",0.623035
"Mott Haven, NYC",0.621461
"Glendale, NYC",0.619521
"Long Island City, NYC",0.618257
"Steinway, NYC",0.617625


Now, let's do this for a few other neighborhoods and google the results to see if they make sense. 

In [131]:
neighborhood = 'South Loop, Chicago'
ranked = Similarity[[neighborhood]].sort_values(by=neighborhood)[::-1]
ranked[['NYC' in x for x in ranked.index]]

"City, Neighborhood","South Loop, Chicago"
"City, Neighborhood",Unnamed: 1_level_1
"Highbridge, NYC",0.565111
"Farragut, NYC",0.552278
"Livingston, NYC",0.545612
"Brooklyn Heights, NYC",0.537853
"University Heights, NYC",0.535164
"Rego Park, NYC",0.533679
"Dumbo, NYC",0.521641
"Mott Haven, NYC",0.521105
"Morris Park, NYC",0.518224
"Brooklyn Navy Yard, NYC",0.516870


In [132]:
neighborhood = 'SoHo, NYC'
ranked = Similarity[[neighborhood]].sort_values(by=neighborhood)[::-1]
ranked[['Chicago' in x for x in ranked.index]]

"City, Neighborhood","SoHo, NYC"
"City, Neighborhood",Unnamed: 1_level_1
"Park West, Chicago",0.663607
"Cabrini–Green, Chicago",0.650487
"Little Italy, Chicago",0.640262
"Lake View East, Chicago",0.638217
"Lake View, Chicago",0.638217
"Wrigleyville, Chicago",0.635552
"West Lakeview, Chicago",0.633695
"Polish Village, Chicago",0.620814
"Greektown, Chicago",0.614381
"Lakewood, Chicago",0.610070


# The results look good to me based on my experience with both cities. But I do recommend that you try it out for yourself. Below is a function that will map neighborhoods in New York or Chicago based on a favorite neighborhood in the other city

In [133]:
import matplotlib.pyplot as plt
import matplotlib.colors as color
def map_recommendations(fave_neighb, city_of_interest):
    #ranked = Similarity[[fave_neighb]].sort_values(by=fave_neighb)[::-1]
    #ranked[[city_of_interest in x for x in ranked.index]]
    #rankings = ranked[fave_neighb].tolist()
    # create map
    loc = geolocator.geocode(city_of_interest +', USA')
    latitude = loc.latitude
    longitude = loc.longitude
    map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

    # add markers to the map
    markers_colors = []
    for lat, lon, poi, r in zip(Clusters_of_neighborhoods['Latitude'], Clusters_of_neighborhoods['Longitude'], Clusters_of_neighborhoods['City, Neighborhood'], Similarity[[fave_neighb]][fave_neighb].tolist()):
        label = folium.Popup(str(poi) + ' Similarity index ' + str(r), parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=color.to_hex(plt.cm.RdYlGn(r)),
            fill=True,
            fill_color=color.to_hex(plt.cm.RdYlGn(r)),
            fill_opacity=0.7).add_to(map_clusters)

    return map_clusters

## Below are some examples
Neighborhoods similar to Wicker Park in NY

In [134]:
map_recommendations('Wicker Park, Chicago', 'NYC')

Neighborhoods similar to SoHo in NY

In [139]:
map_recommendations('SoHo, NYC', 'NYC')

Neighborhoods similar to SoHo in Chicago

In [137]:
map_recommendations('SoHo, NYC', 'Chicago')