# IBM Data Science Capstone Project

#### Import libraries

In [368]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
import geojson

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


#### Import geojson files with neighborhood data

In [369]:
with open('Boundaries - Neighborhoods.geojson') as json_data:
    ChiNeigh = json.load(json_data)
with open('amsterdam.geojson') as json_data:
    AmsNeigh = json.load(json_data)

#### Create data frame for Chicago neighborhood list

In [371]:
# define the dataframe columns
column_names = ['Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
df_chi = pd.DataFrame(columns=column_names)

for data in ChiNeigh['features']:
    neighborhood_name = data['properties']['sec_neigh']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[0][0][0][1]
    neighborhood_lon = neighborhood_latlon[0][0][0][0]
    
    df_chi = df_chi.append({'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [372]:
df_chi.drop_duplicates(inplace = True)
df_chi.size

288

In [373]:
df_chi.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,BRONZEVILLE,41.816814,-87.606708
1,PRINTERS ROW,41.874371,-87.627607
2,UNITED CENTER,41.888852,-87.667069
3,SHEFFIELD & DEPAUL,41.921661,-87.658335
4,HUMBOLDT PARK,41.887823,-87.740596


#### Create data frame for Amsterdam neighborhood list

In [374]:
# define the dataframe columns
column_names = ['Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
df_ams = pd.DataFrame(columns=column_names)

for data in AmsNeigh['features']:
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[0][0][1]
    neighborhood_lon = neighborhood_latlon[0][0][0]
    
    df_ams = df_ams.append({'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [375]:
df_ams.drop([69,89], inplace = True)
df_ams

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Burgwallen-Oude Zijde,52.3675,4.89549
1,Burgwallen-Nieuwe Zijde,52.3825,4.89877
2,Grachtengordel-West,52.3805,4.8884
3,Grachtengordel-Zuid,52.3674,4.89589
4,Nieuwmarkt/Lastage,52.3799,4.90581
5,Haarlemmerbuurt,52.3888,4.89567
6,Jordaan,52.3819,4.87963
7,De Weteringschans,52.365,4.8845
8,Weesperbuurt/Plantage,52.3692,4.90981
9,Oostelijke Eilanden/Kadijken,52.3765,4.91181


#### Enter four square credentials

In [376]:
CLIENT_ID = 'MNDV5G3FTMC10DJORDYHCEHBNQ1FS33HBLUSDJ1AYN4DSMKK' # your Foursquare ID
CLIENT_SECRET = '33NNQJMBQZX4UPM3W33EHXY0JP5G5ANK5IQPR5I1UA3GIU2O' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: MNDV5G3FTMC10DJORDYHCEHBNQ1FS33HBLUSDJ1AYN4DSMKK
CLIENT_SECRET:33NNQJMBQZX4UPM3W33EHXY0JP5G5ANK5IQPR5I1UA3GIU2O


#### Create function to determine nearby venues by neighborhood latitude and longitude

In [377]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Run getNearbyVenues on both sets of neighborhood data

In [378]:
chi_venues = getNearbyVenues(names=df_chi['Neighborhood'],
                                   latitudes=df_chi['Latitude'],
                                   longitudes=df_chi['Longitude']
                                  )

BRONZEVILLE
PRINTERS ROW
UNITED CENTER
SHEFFIELD & DEPAUL
HUMBOLDT PARK
GARFIELD PARK
NORTH LAWNDALE
LITTLE VILLAGE
ARMOUR SQUARE,CHINATOWN
AVALON PARK,CALUMET HEIGHTS
CHATHAM,BURNSIDE
BELMONT CRAIGIN,HERMOSA
IRVING PARK,AVONDALE
LOGAN SQUARE
AVALON PARK,CALUMET HEIGHTS
SOUTHEAST SIDE
WEST PULLMAN
MIDWAY AIRPORT
BACK OF THE YARDS
ENGLEWOOD
SOUTH SHORE, GRAND CROSSING
ASHBURN
MOUNT GREENWOOD,MORGAN PARK
MOUNT GREENWOOD,MORGAN PARK
OHARE
JACKSON PARK
LOOP
PULLMAN
RIVERDALE
GREEKTOWN
BRONZEVILLE
MUSEUM CAMPUS
EDGEWATER
LAKE VIEW
LINCOLN PARK
STREETERVILLE
LINCOLN SQUARE
OAKLAND,KENWOOD
GRANT PARK
WEST LOOP
BACK OF THE YARDS
ANDERSONVILLE
WOODLAWN
PORTAGE PARK
RUSH & DIVISION
LITTLE ITALY, UIC
KENWOOD,OAKLAND
ROGERS PARK
JEFFERSON PARK
SAUGANASH,FOREST GLEN
NORTH PARK,ALBANY PARK
NORTH PARK,ALBANY PARK
IRVING PARK,AVONDALE
DUNNING
WEST RIDGE
UPTOWN
NORWOOD PARK
STREETERVILLE
SOUTH SHORE, GRAND CROSSING
CHATHAM,BURNSIDE
SOUTH CHICAGO
WASHINGTON HEIGHTS,ROSELAND
NORTH CENTER
SOUTHEAST SIDE
W

In [380]:
ams_venues = getNearbyVenues(names=df_ams['Neighborhood'],
                                   latitudes=df_ams['Latitude'],
                                   longitudes=df_ams['Longitude']
                                  )

Burgwallen-Oude Zijde
Burgwallen-Nieuwe Zijde
Grachtengordel-West
Grachtengordel-Zuid
Nieuwmarkt/Lastage
Haarlemmerbuurt
Jordaan
De Weteringschans
Weesperbuurt/Plantage
Oostelijke Eilanden/Kadijken
Westelijk Havengebied
Bedrijventerrein Sloterdijk
Houthavens
Spaarndammer- en Zeeheldenbuurt
Staatsliedenbuurt
Centrale Markt
Frederik Hendrikbuurt
Da Costabuurt
Kinkerbuurt
Van Lennepbuurt
Helmersbuurt
Overtoomse Sluis
Vondelbuurt
Sloterdijk
Landlust
Erasmuspark
De Kolenkit
De Krommert
Van Galenbuurt
Hoofdweg en omgeving
Westindische buurt
Spieringhorn
Slotermeer-Noordoost
Slotermeer-Zuidwest
Geuzenveld
Eendracht
Lutkemeer en Ookmeer
Osdorp-Oost
Osdorp-Midden
De Punt
Middelveldsche Akerpolder en Sloten
Slotervaart
Overtoomse Veld
Westlandgracht
Sloten- en Riekerpolder
Oude Pijp
Nieuwe Pijp
Diamantbuurt
Hoofddorppleinbuurt
Schinkelbuurt
Willemspark
Museumkwartier
Stadionbuurt
Apollobuurt
Duivelseiland
Scheldebuurt
IJselbuurt
Rijnbuurt
Station-Zuid WTC en omgeving
Buitenveldert-West
Buitenvel

#### Create list of venue categories for each city and limit each to categories with more than 5 observations

In [381]:
grouped_chi_venues = chi_venues.groupby('Venue Category').count()
grouped_chi_venues = grouped_chi_venues[grouped_chi_venues['Neighborhood']>5]
grouped_chi_venues.shape

(111, 6)

In [382]:
grouped_ams_venues = ams_venues.groupby('Venue Category').count()
grouped_ams_venues = grouped_ams_venues[grouped_ams_venues['Neighborhood']>5]
grouped_ams_venues.shape

(120, 6)

#### Merge list of venue categories to get master list of venue categories with more than 5 observations in each city

In [383]:
venue_list = grouped_chi_venues.merge(grouped_ams_venues, on = 'Venue Category').index
venue_list

Index(['Art Gallery', 'Arts & Crafts Store', 'Asian Restaurant',
       'Athletics & Sports', 'Bakery', 'Bar', 'Boat or Ferry', 'Bookstore',
       'Breakfast Spot', 'Brewery', 'Burger Joint', 'Café',
       'Chinese Restaurant', 'Clothing Store', 'Coffee Shop',
       'Convenience Store', 'Cosmetics Shop', 'Dance Studio', 'Deli / Bodega',
       'Dessert Shop', 'Diner', 'Fast Food Restaurant', 'Food Truck',
       'Furniture / Home Store', 'Garden', 'Gift Shop', 'Greek Restaurant',
       'Grocery Store', 'Gym', 'Gym / Fitness Center', 'Harbor / Marina',
       'Hotel', 'Hotel Bar', 'Ice Cream Shop', 'Indian Restaurant',
       'Italian Restaurant', 'Japanese Restaurant', 'Jazz Club', 'Lounge',
       'Mediterranean Restaurant', 'Mexican Restaurant', 'Movie Theater',
       'Museum', 'Music Venue', 'Park', 'Pet Store', 'Pharmacy', 'Pizza Place',
       'Playground', 'Plaza', 'Pub', 'Restaurant', 'Sandwich Place',
       'Seafood Restaurant', 'Snack Place', 'Soccer Field', 'Spa',
     

#### Separate each cities venue list into top venues and other venues. Group by neighborhood to get count of each category type. Add column for the sum of all other venues

In [384]:
chi_top_venues = chi_venues[chi_venues['Venue Category'].isin(venue_list)]
chi_other_venues = chi_venues[~chi_venues['Venue Category'].isin(venue_list)]
ams_top_venues = ams_venues[ams_venues['Venue Category'].isin(venue_list)]
ams_other_venues = ams_venues[~ams_venues['Venue Category'].isin(venue_list)]

In [385]:
# one hot encoding
chi_onehot = pd.get_dummies(chi_top_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
chi_onehot['Neighborhood'] = chi_top_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [chi_onehot.columns[-1]] + list(chi_onehot.columns[:-1])
chi_onehot = chi_onehot[fixed_columns]

chi_grouped = chi_onehot.groupby('Neighborhood').sum().reset_index()
chi_other = chi_other_venues.groupby('Neighborhood').count()['Venue']

chi_grouped = chi_grouped[chi_grouped.sum(axis = 1)>5]

chi_grouped = chi_grouped.merge(chi_other, on = 'Neighborhood')
chi_grouped.rename(columns={'Venue':'Other'},inplace = True)

chi_grouped.set_index('Neighborhood',inplace = True)
chi_grouped = chi_grouped.div(chi_grouped.sum(axis=1), axis=0)
chi_grouped.drop(['Other'],axis=1,inplace = True)
chi_grouped = chi_grouped[chi_grouped.apply(lambda x: x!=0)]

In [386]:
# one hot encoding
ams_onehot = pd.get_dummies(ams_top_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
ams_onehot['Neighborhood'] = ams_top_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [ams_onehot.columns[-1]] + list(ams_onehot.columns[:-1])
ams_onehot = ams_onehot[fixed_columns]

ams_grouped = ams_onehot.groupby('Neighborhood').sum().reset_index()
ams_other = ams_other_venues.groupby('Neighborhood').count()['Venue']

ams_grouped = ams_grouped[ams_grouped.sum(axis = 1)>5]

ams_grouped = ams_grouped.merge(ams_other, on = 'Neighborhood')
ams_grouped.rename(columns={'Venue':'Other'},inplace = True)

ams_grouped.set_index('Neighborhood',inplace = True)
ams_grouped = ams_grouped.div(ams_grouped.sum(axis=1), axis=0)
ams_grouped.drop(['Other'],axis=1,inplace = True)
ams_grouped = ams_grouped[ams_grouped.apply(lambda x: x!=0)]

#### For each neighborhood pair, calculate similarity

In [387]:
import scipy

similar_df = pd.DataFrame(columns = ams_grouped.index)

for chi_neigh, row in chi_grouped.iterrows():
    similar_df = similar_df.append(pd.Series(name=chi_neigh))
    for ams_neigh, row in ams_grouped.iterrows():
        similar_df.loc[chi_neigh][ams_neigh] = np.sqrt(np.nansum((chi_grouped.loc[chi_neigh].values.reshape(1, -1) - ams_grouped.loc[ams_neigh].values.reshape(1, -1))**2))

  


#### Output results of most similar neighborhood in the opposite city for each city

In [389]:
similar_df = similar_df.astype('float32')
similar_df.idxmax(axis=0)

Neighborhood
Apollobuurt                                       ARMOUR SQUARE,CHINATOWN
Betondorp                                         ARMOUR SQUARE,CHINATOWN
Buitenveldert-Oost                                          STREETERVILLE
Buitenveldert-West                                ARMOUR SQUARE,CHINATOWN
Burgwallen-Nieuwe Zijde                                     ANDERSONVILLE
Burgwallen-Oude Zijde                                     LOWER WEST SIDE
Centrale Markt                                                  WEST LOOP
Da Costabuurt                                               STREETERVILLE
Dapperbuurt                                               LOWER WEST SIDE
De Kolenkit                                       ARMOUR SQUARE,CHINATOWN
De Krommert                                               LOWER WEST SIDE
De Omval                                                  RUSH & DIVISION
De Weteringschans                                               WEST LOOP
Diamantbuurt             

In [390]:
similar_df.idxmax(axis=1)

ANDERSONVILLE                                    Helmersbuurt
ARMOUR SQUARE,CHINATOWN                  Slotermeer-Noordoost
AVALON PARK,CALUMET HEIGHTS              Lutkemeer en Ookmeer
BACK OF THE YARDS                     Burgwallen-Nieuwe Zijde
BELMONT CRAGIN,HERMOSA                            Vondelbuurt
BELMONT CRAIGIN,HERMOSA                           Kinkerbuurt
BEVERLY                                           Vondelbuurt
BOYSTOWN                                  Grachtengordel-West
BRIDGEPORT                                    Van Lennepbuurt
BRIGHTON PARK,MCKINLEY PARK              Lutkemeer en Ookmeer
BRONZEVILLE                              Slotermeer-Noordoost
BUCKTOWN                               Holendrecht/Reigersbos
CHATHAM,BURNSIDE                         Lutkemeer en Ookmeer
DUNNING                                  Lutkemeer en Ookmeer
ENGLEWOOD                                         Kinkerbuurt
GOLD COAST                                        Frankendael
GRANT PA