# Week 5 Assignment
### Opening a Hotel in Shanghai



- Scrape Wikipedia to get the neighborhoods of Shanghai
- Add geo data to the scraped neighborhoods
- Add venue data using Foursquare
- Cluster the neighborhoods
- Analyse which cluster area would be best for opening a hotel while avoiding too much competition

In [9]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
!conda install -c conda-forge geocoder --yes
import geocoder
!conda install -c conda-forge lxml --yes
import requests # library to handle requests
!conda install -c conda-forge bs4 --yes
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

print("Libraries imported.")

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    click-7.1.2                |     pyh9f0ad1d_0          64 KB  conda-forge
    future-0.18.2              |   py36h9f0ad1d_1         714 KB  conda-forge
    geocoder-1.38.1            |             py_1          53 KB  conda-forge
    ratelim-0.1.6              |             py_2           6 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         837 KB

The following NEW packages will be INSTALLED:

  click              conda-fo

In [5]:
# Send the GET request for the Wikipedia category page of Shanghai neighbourhoods.
# The timeout keeps the cell from hanging forever, and raise_for_status() stops
# us from silently parsing an HTTP error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/Category:Neighbourhoods_of_Shanghai',
    timeout=30,
)
response.raise_for_status()
data = response.text
# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

# the neighborhood names are the <li> items inside the first "mw-category" <div>
category_divs = soup.find_all("div", class_="mw-category")
if not category_divs:
    # fail with a clear message instead of an IndexError on [0]
    raise ValueError("No 'mw-category' section found on the Wikipedia category page")
neighborhood = [row.text for row in category_divs[0].find_all("li")]

# create DataFrame
df = pd.DataFrame({"Neighborhood": neighborhood})

df.head()

Unnamed: 0,Neighborhood
0,Anting
1,Changshou Road Subdistrict
2,Fengjing
3,"Gaoqiao, Shanghai"
4,"Gubei, Shanghai"


In [7]:
# show the shape (rows, columns) of the dataframe
df.shape

(19, 1)

In [10]:
# get coordinates
def get_latlng(neighborhood, max_attempts=5, geocode=None):
    """Return the ``[latitude, longitude]`` pair for a Shanghai neighborhood.

    The address ``"<neighborhood>, Shanghai, China"`` is geocoded, and the
    lookup is repeated while the service reports no coordinates. The number
    of lookups is bounded so an address that never resolves cannot hang the
    notebook in an endless loop.

    Args:
        neighborhood: Neighborhood name to look up.
        max_attempts: Maximum number of lookups before giving up.
        geocode: Callable that takes the address string and returns an object
            exposing a ``latlng`` attribute. Defaults to ``geocoder.arcgis``.

    Returns:
        The ``latlng`` value reported by the geocoder.

    Raises:
        RuntimeError: If no coordinates were obtained within ``max_attempts``
            lookups.
    """
    if geocode is None:
        # resolved lazily so the default stays the ArcGIS provider
        geocode = geocoder.arcgis

    address = '{}, Shanghai, China'.format(neighborhood)
    for _ in range(max_attempts):
        lat_lng_coords = geocode(address).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords

    raise RuntimeError(
        'No coordinates found for {!r} after {} attempts'.format(address, max_attempts)
    )

# Look up the coordinates of every scraped neighborhood, in row order
coordinates = list(map(get_latlng, df["Neighborhood"].tolist()))
# Tabulate the [lat, lng] pairs ...
df_temp = pd.DataFrame(coordinates, columns=['Latitude', 'Longitude'])
# ... and copy both columns onto the main dataframe
for column in ('Latitude', 'Longitude'):
    df[column] = df_temp[column]
# NOTE(review): in the recorded output several rows share 31.22222, 121.45806 --
# possibly a city-level fallback from the geocoder; verify before relying on them.
# Report the resulting shape and display the dataframe
print(df.shape)
df

(19, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Anting,31.2989,121.1576
1,Changshou Road Subdistrict,30.91604,121.15409
2,Fengjing,30.89019,121.01195
3,"Gaoqiao, Shanghai",31.22222,121.45806
4,"Gubei, Shanghai",31.22222,121.45806
5,"Koreatown, Shanghai",31.22222,121.45806
6,Lujiazui,31.3269,121.28482
7,"Luodian, Shanghai",31.22222,121.45806
8,Nanxiang,31.23694,121.07322
9,Qiantan International Business Zone (Shanghai),31.22222,121.45806


In [12]:
# Look up the coordinates of Shanghai itself; they are used to centre the maps

address = 'Shanghai, China'

geolocator = Nominatim(user_agent="shanghai-explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when nothing matches; fail with a clear message
    # instead of an AttributeError on the attribute access below
    raise ValueError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Shanghai, China {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Shanghai, China 31.2252985, 121.4890497.


In [13]:
# create a map of Shanghai centred on the geocoded city coordinates
shanghai_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# appearance shared by every neighborhood marker
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
)

# add one circle marker per neighborhood, with the name as its popup
for lat, lng, neighborhood in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):
    label = folium.Popup('{}'.format(neighborhood), parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=label, **marker_style)
    marker.add_to(shanghai_map)

shanghai_map

In [14]:
# save the neighborhood map as an HTML file; the relative path means it is
# written to the current working directory
shanghai_map.save('shanghai_map.html')

In [15]:
# Use Foursquare to explore venues nearby.
# The credentials are read from the environment so they never have to be
# written into the notebook; they fall back to the empty string when unset.
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [17]:
# explore up to 100 venues within a 2500 meter radius of every neighborhood
radius = 2500
LIMIT = 100

EXPLORE_URL = "https://api.foursquare.com/v2/venues/explore"
VENUE_COLUMNS = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

venues = []

for lat, long, neighborhood in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):

    # let requests build and escape the query string instead of formatting
    # the credentials into the URL by hand
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, long),
        'radius': radius,
        'limit': LIMIT,
    }

    # make the GET request; the timeout avoids hanging on a stalled connection
    # and raise_for_status() surfaces HTTP errors (e.g. rejected credentials)
    # instead of a KeyError further down
    response = requests.get(EXPLORE_URL, params=params, timeout=30)
    response.raise_for_status()
    groups = response.json().get('response', {}).get('groups', [])
    results = groups[0]['items'] if groups else []

    # keep only the relevant information for each nearby venue
    for venue in results:
        details = venue['venue']
        # a venue without any category would otherwise raise IndexError on [0]
        categories = details.get('categories') or []
        venues.append((
            neighborhood,
            lat,
            long,
            details['name'],
            details['location']['lat'],
            details['location']['lng'],
            categories[0]['name'] if categories else None))

# convert the venues list into a new DataFrame; passing the column names here
# (rather than assigning .columns afterwards) also works when no venue was found
venues_df = pd.DataFrame(venues, columns=VENUE_COLUMNS)

print(venues_df.shape)
venues_df.head()

(858, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Anting,31.2989,121.1576,Alibaba,31.297209,121.162602,German Restaurant
1,Anting,31.2989,121.1576,Wirtshaus,31.291667,121.154532,Bar
2,Anting,31.2989,121.1576,Life Hub (嘉亭荟城市生活广场),31.289792,121.157673,Shopping Mall
3,Anting,31.2989,121.1576,Starbucks (星巴克),31.291264,121.14285,Coffee Shop
4,Anting,31.2989,121.1576,KFC (肯德基),31.297443,121.158709,Fast Food Restaurant


In [18]:
# group by neighborhood and count the non-null entries of every column,
# i.e. how many venues were collected for each neighborhood
venues_df.groupby(["Neighborhood"]).count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Anting,16,16,16,16,16,16
Changshou Road Subdistrict,5,5,5,5,5,5
Fengjing,5,5,5,5,5,5
"Gaoqiao, Shanghai",100,100,100,100,100,100
"Gubei, Shanghai",100,100,100,100,100,100
"Koreatown, Shanghai",100,100,100,100,100,100
Lujiazui,7,7,7,7,7,7
"Luodian, Shanghai",100,100,100,100,100,100
Nanxiang,1,1,1,1,1,1
Qiantan International Business Zone (Shanghai),100,100,100,100,100,100


In [19]:
# Count the distinct venue categories and report the total
category_count = len(venues_df['VenueCategory'].unique())
print('There are {} uniques categories.'.format(category_count))

There are 109 uniques categories.


In [20]:
# show the first 50 unique venue categories
venues_df['VenueCategory'].unique()[:50]

array(['German Restaurant', 'Bar', 'Shopping Mall', 'Coffee Shop',
       'Fast Food Restaurant', 'History Museum', 'Park', 'Hotel',
       'Metro Station', 'Toll Plaza', 'Train Station', 'Market',
       'Asian Restaurant', 'Art Gallery', 'Farm', 'Historic Site',
       'Rest Area', 'Convenience Store', 'Toll Booth',
       'Chinese Restaurant', 'Pizza Place', 'Optical Shop',
       'Cocktail Bar', 'Turkish Restaurant', 'Club House',
       'Other Nightlife', 'Theme Restaurant', 'Shanghai Restaurant',
       'Japanese Restaurant', 'Café', 'Department Store', 'Speakeasy',
       'Dumpling Restaurant', 'Spa', 'Gym / Fitness Center', 'Multiplex',
       'Mexican Restaurant', 'Brazilian Restaurant', 'Clothing Store',
       'Hong Kong Restaurant', 'Pedestrian Plaza', 'Yoga Studio',
       'Burger Joint', 'Seafood Restaurant', 'Gastropub',
       'Electronics Store', 'Bakery', 'Wine Bar', 'Supermarket',
       'Yunnan Restaurant'], dtype=object)

In [21]:
# one-hot encode the venue categories: one indicator column per category,
# named after the category itself (no prefix)
encode = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# attach each venue's neighborhood as a new last column; it is called
# 'Neighborhoods' because 'Neighborhood' also occurs as a venue category
encode['Neighborhoods'] = venues_df['Neighborhood']

# rotate the column order so that the last column becomes the first
all_columns = encode.columns.tolist()
fixed_columns = all_columns[-1:] + all_columns[:-1]
encode = encode.loc[:, fixed_columns]

print(encode.shape)
encode.head()

(858, 110)


Unnamed: 0,Neighborhoods,Anhui Restaurant,Art Gallery,Asian Restaurant,BBQ Joint,Bakery,Bar,Basketball Court,Big Box Store,Bistro,Bookstore,Boutique,Brazilian Restaurant,Brewery,Buffet,Burger Joint,Café,Cantonese Restaurant,Cha Chaan Teng,Chinese Restaurant,Clothing Store,Club House,Cocktail Bar,Coffee Shop,Concert Hall,Convenience Store,Deli / Bodega,Department Store,Dessert Shop,Diner,Dumpling Restaurant,Electronics Store,Farm,Fast Food Restaurant,French Restaurant,Furniture / Home Store,Garden,Gas Station,Gastropub,General Travel,German Restaurant,Grocery Store,Gym / Fitness Center,Historic Site,History Museum,Hong Kong Restaurant,Hotel,Hotel Bar,Hotpot Restaurant,Ice Cream Shop,Indie Theater,Italian Restaurant,Japanese Curry Restaurant,Japanese Restaurant,Jazz Club,Juice Bar,Korean Restaurant,Lounge,Market,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Mongolian Restaurant,Movie Theater,Multiplex,Nail Salon,Neighborhood,New American Restaurant,Noodle House,Optical Shop,Other Nightlife,Park,Pedestrian Plaza,Peruvian Restaurant,Pie Shop,Pizza Place,Plaza,Rest Area,Restaurant,Sandwich Place,Scandinavian Restaurant,Sculpture Garden,Seafood Restaurant,Shanghai Restaurant,Shopping Mall,Shopping Plaza,Spa,Spanish Restaurant,Speakeasy,Sporting Goods Shop,Stadium,Steakhouse,Supermarket,Szechuan Restaurant,Taiwanese Restaurant,Thai Restaurant,Theater,Theme Restaurant,Toll Booth,Toll Plaza,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Water Park,Wine Bar,Xinjiang Restaurant,Yoga Studio,Yunnan Restaurant,Zhejiang Restaurant
0,Anting,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Anting,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Anting,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Anting,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Anting,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
# average the one-hot indicators per neighborhood, giving the share of each
# venue category among that neighborhood's venues
df_grouped = encode.groupby("Neighborhoods", as_index=False).mean()

print(df_grouped.shape)
df_grouped

(18, 110)


Unnamed: 0,Neighborhoods,Anhui Restaurant,Art Gallery,Asian Restaurant,BBQ Joint,Bakery,Bar,Basketball Court,Big Box Store,Bistro,Bookstore,Boutique,Brazilian Restaurant,Brewery,Buffet,Burger Joint,Café,Cantonese Restaurant,Cha Chaan Teng,Chinese Restaurant,Clothing Store,Club House,Cocktail Bar,Coffee Shop,Concert Hall,Convenience Store,Deli / Bodega,Department Store,Dessert Shop,Diner,Dumpling Restaurant,Electronics Store,Farm,Fast Food Restaurant,French Restaurant,Furniture / Home Store,Garden,Gas Station,Gastropub,General Travel,German Restaurant,Grocery Store,Gym / Fitness Center,Historic Site,History Museum,Hong Kong Restaurant,Hotel,Hotel Bar,Hotpot Restaurant,Ice Cream Shop,Indie Theater,Italian Restaurant,Japanese Curry Restaurant,Japanese Restaurant,Jazz Club,Juice Bar,Korean Restaurant,Lounge,Market,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Mongolian Restaurant,Movie Theater,Multiplex,Nail Salon,Neighborhood,New American Restaurant,Noodle House,Optical Shop,Other Nightlife,Park,Pedestrian Plaza,Peruvian Restaurant,Pie Shop,Pizza Place,Plaza,Rest Area,Restaurant,Sandwich Place,Scandinavian Restaurant,Sculpture Garden,Seafood Restaurant,Shanghai Restaurant,Shopping Mall,Shopping Plaza,Spa,Spanish Restaurant,Speakeasy,Sporting Goods Shop,Stadium,Steakhouse,Supermarket,Szechuan Restaurant,Taiwanese Restaurant,Thai Restaurant,Theater,Theme Restaurant,Toll Booth,Toll Plaza,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Water Park,Wine Bar,Xinjiang Restaurant,Yoga Studio,Yunnan Restaurant,Zhejiang Restaurant
0,Anting,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0625,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Changshou Road Subdistrict,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Fengjing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Gaoqiao, Shanghai",0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.02,0.02,0.0,0.0,0.01,0.01,0.01,0.06,0.07,0.01,0.0,0.0,0.01,0.0,0.01,0.04,0.01,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.08,0.01,0.02,0.01,0.0,0.01,0.0,0.03,0.0,0.01,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.01,0.02,0.01,0.0,0.0,0.01,0.01,0.0,0.02,0.0,0.0,0.01,0.01,0.01,0.04,0.0,0.05,0.01,0.02,0.01,0.0,0.01,0.01,0.0,0.0,0.01,0.02,0.01,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.01,0.0,0.01,0.01,0.0
4,"Gubei, Shanghai",0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.02,0.02,0.0,0.0,0.01,0.01,0.01,0.06,0.07,0.01,0.0,0.0,0.01,0.0,0.01,0.04,0.01,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.08,0.01,0.02,0.01,0.0,0.01,0.0,0.03,0.0,0.01,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.01,0.02,0.01,0.0,0.0,0.01,0.01,0.0,0.02,0.0,0.0,0.01,0.01,0.01,0.04,0.0,0.05,0.01,0.02,0.01,0.0,0.01,0.01,0.0,0.0,0.01,0.02,0.01,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.01,0.0,0.01,0.01,0.0
5,"Koreatown, Shanghai",0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.02,0.02,0.0,0.0,0.01,0.01,0.01,0.06,0.07,0.01,0.0,0.0,0.01,0.0,0.01,0.04,0.01,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.08,0.01,0.02,0.01,0.0,0.01,0.0,0.03,0.0,0.01,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.01,0.02,0.01,0.0,0.0,0.01,0.01,0.0,0.02,0.0,0.0,0.01,0.01,0.01,0.04,0.0,0.05,0.01,0.02,0.01,0.0,0.01,0.01,0.0,0.0,0.01,0.02,0.01,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.01,0.0,0.01,0.01,0.0
6,Lujiazui,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.142857,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Luodian, Shanghai",0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.02,0.02,0.0,0.0,0.01,0.01,0.01,0.06,0.07,0.01,0.0,0.0,0.01,0.0,0.01,0.04,0.01,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.08,0.01,0.02,0.01,0.0,0.01,0.0,0.03,0.0,0.01,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.01,0.02,0.01,0.0,0.0,0.01,0.01,0.0,0.02,0.0,0.0,0.01,0.01,0.01,0.04,0.0,0.05,0.01,0.02,0.01,0.0,0.01,0.01,0.0,0.0,0.01,0.02,0.01,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.01,0.0,0.01,0.01,0.0
8,Nanxiang,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Qiantan International Business Zone (Shanghai),0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.02,0.02,0.0,0.0,0.01,0.01,0.01,0.06,0.07,0.01,0.0,0.0,0.01,0.0,0.01,0.04,0.01,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.08,0.01,0.02,0.01,0.0,0.01,0.0,0.03,0.0,0.01,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.01,0.02,0.01,0.0,0.0,0.01,0.01,0.0,0.02,0.0,0.0,0.01,0.01,0.01,0.04,0.0,0.05,0.01,0.02,0.01,0.0,0.01,0.01,0.0,0.0,0.01,0.02,0.01,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.01,0.0,0.01,0.01,0.0


In [24]:
# number of neighborhoods whose venues include at least one hotel
len(df_grouped[df_grouped['Hotel'] > 0])

13

In [26]:
# DataFrame showcasing Hotels only: keep the neighborhood name and the
# 'Hotel' column (the per-neighborhood mean of the one-hot Hotel indicator)
df_hotel = df_grouped[['Neighborhoods','Hotel']]
df_hotel.head()

Unnamed: 0,Neighborhoods,Hotel
0,Anting,0.0625
1,Changshou Road Subdistrict,0.0
2,Fengjing,0.0
3,"Gaoqiao, Shanghai",0.08
4,"Gubei, Shanghai",0.08


In [27]:
# use k-means clustering on the hotel share of each neighborhood
kclusters = 3

# drop the name column so only the numeric feature remains; `columns=` is used
# because the positional axis argument of drop() is not accepted by pandas 2.x
df_clustering = df_hotel.drop(columns=["Neighborhoods"])

# run k-means clustering; n_init is pinned so the result does not depend on
# the scikit-learn version's default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(df_clustering)

# check cluster labels generated for the first rows in the dataframe
kmeans.labels_[0:10]

array([0, 2, 2, 0, 0, 0, 2, 0, 2, 0], dtype=int32)

In [29]:
# start from a renamed copy of the hotel frame so df_hotel itself is untouched
df_merged = df_hotel.rename(columns={"Neighborhoods": "Neighborhood"})

# attach the cluster label k-means assigned to each row
df_merged["Cluster Labels"] = kmeans.labels_
df_merged.head()

Unnamed: 0,Neighborhood,Hotel,Cluster Labels
0,Anting,0.0625,0
1,Changshou Road Subdistrict,0.0,2
2,Fengjing,0.0,2
3,"Gaoqiao, Shanghai",0.08,0
4,"Gubei, Shanghai",0.08,0


In [30]:
# bring in Latitude/Longitude by looking each neighborhood name up in df
coords_by_name = df.set_index("Neighborhood")
df_merged = df_merged.join(coords_by_name, on="Neighborhood")

print(df_merged.shape)
df_merged.head() # the coordinates appear as the last columns

(18, 5)


Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
0,Anting,0.0625,0,31.2989,121.1576
1,Changshou Road Subdistrict,0.0,2,30.91604,121.15409
2,Fengjing,0.0,2,30.89019,121.01195
3,"Gaoqiao, Shanghai",0.08,0,31.22222,121.45806
4,"Gubei, Shanghai",0.08,0,31.22222,121.45806


In [31]:
# report the shape, then order the rows by their cluster label
print(df_merged.shape)
df_merged = df_merged.sort_values(by="Cluster Labels")
df_merged

(18, 5)


Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
0,Anting,0.0625,0,31.2989,121.1576
12,Tianzifang,0.08,0,31.22222,121.45806
3,"Gaoqiao, Shanghai",0.08,0,31.22222,121.45806
4,"Gubei, Shanghai",0.08,0,31.22222,121.45806
5,"Koreatown, Shanghai",0.08,0,31.22222,121.45806
11,Songjiang Town,0.142857,0,31.03595,121.2146
7,"Luodian, Shanghai",0.08,0,31.22222,121.45806
16,Zhangjiang Town,0.108108,0,31.20861,121.60889
9,Qiantan International Business Zone (Shanghai),0.08,0,31.22222,121.45806
10,Qibao,0.113208,0,31.15267,121.35688


In [32]:
# create a map centred on Shanghai
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one marker per neighborhood, coloured by its cluster
for lat, lon, poi, cluster in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Neighborhood'], df_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    # labels start at 0, so index the palette directly (cluster-1 only worked
    # because index -1 wraps around to the last colour)
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [33]:
# save the cluster map as an HTML file; the relative path means it is
# written to the current working directory
map_clusters.save('map_clusters.html')

In [35]:
# Cluster 0: rows whose k-means label is 0
# (in the recorded output these rows have Hotel shares of about 0.06-0.14)
df_merged.loc[df_merged['Cluster Labels'] == 0]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
0,Anting,0.0625,0,31.2989,121.1576
12,Tianzifang,0.08,0,31.22222,121.45806
3,"Gaoqiao, Shanghai",0.08,0,31.22222,121.45806
4,"Gubei, Shanghai",0.08,0,31.22222,121.45806
5,"Koreatown, Shanghai",0.08,0,31.22222,121.45806
11,Songjiang Town,0.142857,0,31.03595,121.2146
7,"Luodian, Shanghai",0.08,0,31.22222,121.45806
16,Zhangjiang Town,0.108108,0,31.20861,121.60889
9,Qiantan International Business Zone (Shanghai),0.08,0,31.22222,121.45806
10,Qibao,0.113208,0,31.15267,121.35688


In [38]:
# Cluster 1: rows whose k-means label is 1
# (in the recorded output these rows have the highest Hotel shares, 0.25 and above)
df_merged.loc[df_merged['Cluster Labels'] == 1]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
13,Wusong,0.25,1,31.37566,121.49041
17,Zhujiajiao,0.285714,1,31.10757,121.05696


In [40]:
# Cluster 2: rows whose k-means label is 2
# (in the recorded output these rows have Hotel shares between 0.0 and 0.04)
df_merged.loc[df_merged['Cluster Labels'] == 2]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
2,Fengjing,0.0,2,30.89019,121.01195
1,Changshou Road Subdistrict,0.0,2,30.91604,121.15409
14,Xintiandi,0.0,2,31.76312,121.32315
15,Xujiahui,0.04,2,31.19,121.43194
6,Lujiazui,0.0,2,31.3269,121.28482
8,Nanxiang,0.0,2,31.23694,121.07322
