# Which cities in China have the greatest need for restaurants?
### To answer this question we will analyze the relative frequency of restaurants in the top 1000 venues of each city to determine which has the greatest need for a restaurant

In [1]:
import pandas as pd

### First we retrieve data regarding the most prominent cities of China

In [2]:
!wget 'https://simplemaps.com/static/data/world-cities/basic/simplemaps_worldcities_basicv1.73.zip'

--2021-02-07 16:55:32--  https://simplemaps.com/static/data/world-cities/basic/simplemaps_worldcities_basicv1.73.zip
Resolving simplemaps.com (simplemaps.com)... 104.26.13.95, 172.67.71.113, 104.26.12.95, ...
Connecting to simplemaps.com (simplemaps.com)|104.26.13.95|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2697031 (2.6M) [application/zip]
Saving to: ‘simplemaps_worldcities_basicv1.73.zip’


2021-02-07 16:55:32 (35.0 MB/s) - ‘simplemaps_worldcities_basicv1.73.zip’ saved [2697031/2697031]



In [3]:
!unzip 'simplemaps_worldcities_basicv1.73.zip'

Archive:  simplemaps_worldcities_basicv1.73.zip
  inflating: license.txt             
  inflating: worldcities.csv         
  inflating: worldcities.xlsx        


In [4]:
df = pd.read_csv('worldcities.csv')
df = df[df['country']=='China']
df.reset_index(inplace = True)
df

Unnamed: 0,index,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,5,Shanghai,Shanghai,31.1667,121.4667,China,CN,CHN,Shanghai,admin,22120000.0,1156073548
1,9,Guangzhou,Guangzhou,23.1288,113.2590,China,CN,CHN,Guangdong,admin,20902000.0,1156237133
2,10,Beijing,Beijing,39.9050,116.3914,China,CN,CHN,Beijing,primary,19433000.0,1156228865
3,17,Shenzhen,Shenzhen,22.5350,114.0540,China,CN,CHN,Guangdong,minor,15929000.0,1156158707
4,29,Nanyang,Nanyang,32.9987,112.5292,China,CN,CHN,Henan,,12010000.0,1156192287
...,...,...,...,...,...,...,...,...,...,...,...,...
688,22139,Heyin,Heyin,36.0451,101.4242,China,CN,CHN,Qinghai,minor,7642.0,1156514054
689,25588,Jinzhong,Jinzhong,26.3504,103.4167,China,CN,CHN,Yunnan,minor,5170.0,1156252749
690,25929,Kailu,Kailu,43.5837,121.2000,China,CN,CHN,Inner Mongolia,minor,2809.0,1156813834
691,26022,Linxi,Linxi,43.5171,118.0333,China,CN,CHN,Inner Mongolia,minor,679.0,1156608652


## Here we find the venues located in each major city of China

In [5]:
# The code was removed by Watson Studio for sharing.

In [6]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [7]:
import requests
radius=5000

In [8]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        
        print(name)
        
            
        # create the API request URL
        

        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        #url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&oauth_token={}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, lat, lng,ACCESS_TOKEN, VERSION, radius, LIMIT)



        # make the GET request
        request = None
        while request == None or request == {} or request["response"] == {}:
            request = requests.get(url).json()
        results = request["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['City', 
                  'City Latitude', 
                  'City Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [9]:
city_venues = getNearbyVenues( df['city'],df['lat'], df['lng'])

Shanghai
Guangzhou
Beijing
Shenzhen
Nanyang
Chengdu
Linyi
Tianjin
Shijiazhuang
Baoding
Zhoukou
Weifang
Wuhan
Heze
Ganzhou
Tongshan
Handan
Fuyang
Jining
Dongguan
Chongqing
Changchun
Zhumadian
Ningbo
Nanjing
Hefei
Nantong
Yancheng
Foshan
Nanning
Hengyang
Xi’an
Shenyang
Tangshan
Shaoyang
Changsha
Zhengzhou
Zhanjiang
Cangzhou
Maoming
Huanggang
Xinyang
Shangrao
Luoyang
Bijie
Yantai
Quanzhou
Hangzhou
Kunming
Nanchong
Zunyi
Lu’an
Yichun
Taizhou
Liaocheng
Qujing
Xiangyang
Qingdao
Changde
Dazhou
Suzhou
Jieyang
Nangandao
Tai’an
Yulin
Dezhou
Suihua
Qiqihar
Jinhua
Shantou
Weinan
Suqian
Suzhou
Fuzhou
Zhaotong
Pudong
Yongzhou
Zhangzhou
Bozhou
Nanchang
Xianyang
Taizhou
Ji’an
Mianyang
Shaoxing
Yuncheng
Pingdingshan
Huai’an
Xinpu
Guilin
Huaihua
Jiujiang
Anqing
Huanglongsi
Xiaoganzhan
Changzhou
Chenzhou
Wuxi
Zibo
Jiaxing
Dalian
Harbin
Yangzhou
Yibin
Jiangmen
Meizhou
Chifeng
Guiyang
Langfang
Zhangjiakou
Linfen
Wenzhou
Luzhou
Jiangguanchi
Neijiang
Yanjiang
Yiyang
Zhaoqing
Hengshui
Guigang
Xiaoxita
Xiamen


In [10]:
city_venues.shape

(1412, 7)

In [11]:
city_venues.head()

Unnamed: 0,City,City Latitude,City Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Shanghai,31.1667,121.4667,Centre Pompidou,31.167732,121.464084,Art Museum
1,Shanghai,31.1667,121.4667,Longyao Rd. Tunnel (龙耀路隧道),31.16782,121.463768,Tunnel
2,Shanghai,31.1667,121.4667,Seesaw Coffee,31.164759,121.463806,Coffee Shop
3,Guangzhou,23.1288,113.259,Starbucks (星巴克),23.128316,113.259682,Coffee Shop
4,Guangzhou,23.1288,113.259,Mayflower Cinema (五月花影院),23.12857,113.261608,Movie Theater


In [12]:
city_venues.groupby('City', as_index=False).count()

Unnamed: 0,City,City Latitude,City Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Ailan Mubage,4,4,4,4,4,4
1,Aksu,1,1,1,1,1,1
2,Alashankou,1,1,1,1,1,1
3,Anguo,1,1,1,1,1,1
4,Ankang,2,2,2,2,2,2
...,...,...,...,...,...,...,...
416,Zigong,6,6,6,6,6,6
417,Zijinglu,1,1,1,1,1,1
418,Zunhua,2,2,2,2,2,2
419,Zunyi,3,3,3,3,3,3


In [13]:
print('There are {} uniques categories.'.format(len(city_venues['Venue Category'].unique())))

There are 177 uniques categories.


In [14]:
# one hot encoding
venues_onehot = pd.get_dummies(city_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
venues_onehot['City'] = city_venues['City'] 

# move neighborhood column to the first column
fixed_columns = [venues_onehot.columns[-1]] + list(venues_onehot.columns[:-1])
venues_onehot = venues_onehot[fixed_columns]

venues_onehot.head()

Unnamed: 0,City,American Restaurant,Art Gallery,Art Museum,Asian Restaurant,BBQ Joint,Baby Store,Bakery,Bar,Baseball Field,...,Tea Room,Temple,Thai Restaurant,Theater,Tibetan Restaurant,Train Station,Tunnel,Xinjiang Restaurant,Yunnan Restaurant,Zhejiang Restaurant
0,Shanghai,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Shanghai,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,Shanghai,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Guangzhou,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Guangzhou,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
venues_grouped = venues_onehot.groupby('City').mean().reset_index()
venues_grouped

Unnamed: 0,City,American Restaurant,Art Gallery,Art Museum,Asian Restaurant,BBQ Joint,Baby Store,Bakery,Bar,Baseball Field,...,Tea Room,Temple,Thai Restaurant,Theater,Tibetan Restaurant,Train Station,Tunnel,Xinjiang Restaurant,Yunnan Restaurant,Zhejiang Restaurant
0,Ailan Mubage,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Aksu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Alashankou,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,Anguo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Ankang,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416,Zigong,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
417,Zijinglu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
418,Zunhua,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
419,Zunyi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
num_top_venues = 5

for city in venues_grouped['City']:
    print("----"+city+"----")
    temp = venues_grouped[venues_grouped['City'] == city].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Ailan Mubage----
                       venue  freq
0                      Hotel  0.50
1             Shopping Plaza  0.25
2             Massage Studio  0.25
3              Moving Target  0.00
4  Middle Eastern Restaurant  0.00


----Aksu----
                  venue  freq
0                Museum   1.0
1           IT Services   0.0
2    Miscellaneous Shop   0.0
3     Mobile Phone Shop   0.0
4  Mongolian Restaurant   0.0


----Alashankou----
                 venue  freq
0        Train Station   1.0
1  American Restaurant   0.0
2            Multiplex   0.0
3   Miscellaneous Shop   0.0
4    Mobile Phone Shop   0.0


----Anguo----
                  venue  freq
0        Clothing Store   1.0
1   American Restaurant   0.0
2                Museum   0.0
3     Mobile Phone Shop   0.0
4  Mongolian Restaurant   0.0


----Ankang----
                 venue  freq
0     Asian Restaurant   0.5
1   Chinese Restaurant   0.5
2  American Restaurant   0.0
3               Museum   0.0
4    Mobile Phone Sho

## Now we find the relative frequency of restaurants in each city of China

In [17]:
restaurantColumns = []
for col in venues_grouped.columns:
    name = col
    col = col.lower()
    if col.find('restaurant') > 0 or col.find('pizza') > 0 or col.find('food') > 0 or col.find('ice cream') > 0 or col.find('snack') > 0 or col.find('bbq') > 0 or col.find('noodle') > 0 or col.find('coffee') > 0 or col.find('café') > 0 or col.find('dessert') > 0 or col.find('pastry') > 0 or col.find('bakery') > 0 or col.find('breakfast') > 0 or col.find('steak') > 0:
        restaurantColumns.append(name)

In [18]:
restaurantFreq = pd.DataFrame(venues_grouped[restaurantColumns].sum(axis=1), columns=['Frequency'])
restaurantFreq['City'] = venues_grouped['City']
restaurantFreq.sort_values('Frequency',ascending=True)

Unnamed: 0,Frequency,City
0,0.0,Ailan Mubage
119,0.0,Huaibei
270,0.0,Shaowu
268,0.0,Shantou
267,0.0,Shanhu
...,...,...
199,1.0,Luanzhou
190,1.0,Lingbao Chengguanzhen
311,1.0,Wancheng
245,1.0,Qingzhen


### The following cities have relatively few restaurants

In [43]:
inNeedOfRestaurants = restaurantFreq[restaurantFreq['Frequency']==0]
population = []
for city in inNeedOfRestaurants['City']:
    population.append(df[df['city'] == city]['population'].iloc[0])
inNeedOfRestaurants['Population'] = population
inNeedOfRestaurants.sort_values('Population', ascending=False, inplace=True)
inNeedOfRestaurants.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Frequency,City,Population
263,0.0,Shanghai,22120000.0
20,0.0,Beijing,19433000.0
12,0.0,Baoding,10700000.0
323,0.0,Wuhan,8962000.0
156,0.0,Jining,8023000.0
44,0.0,Chongqing,7739000.0
274,0.0,Shenyang,7105000.0
23,0.0,Bijie,6537498.0
102,0.0,Hangzhou,6446000.0
221,0.0,Nanchong,6183000.0


# Our results indicate the above 10 cities are most in need of restaurants.
# This could be the result of a limitation of the Foursquare API, however,
# this also indicates that there are few well advertised restaurants in these
# ten cities.

## Now we group cities by similarity of venues

In [20]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [23]:
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['City']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
city_venues_sorted = pd.DataFrame(columns=columns)
city_venues_sorted['City'] = venues_grouped['City']

for ind in np.arange(venues_grouped.shape[0]):
    city_venues_sorted.iloc[ind, 1:] = return_most_common_venues(venues_grouped.iloc[ind, :], num_top_venues)

city_venues_sorted.head()

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Ailan Mubage,Hotel,Shopping Plaza,Massage Studio,Eastern European Restaurant,Food Court,Food,Flea Market,Fish & Chips Shop,Film Studio,Fast Food Restaurant
1,Aksu,Museum,Zhejiang Restaurant,Electronics Store,Food Stand,Food Court,Food,Flea Market,Fish & Chips Shop,Film Studio,Fast Food Restaurant
2,Alashankou,Train Station,Zhejiang Restaurant,Eastern European Restaurant,Food Stand,Food Court,Food,Flea Market,Fish & Chips Shop,Film Studio,Fast Food Restaurant
3,Anguo,Clothing Store,Zhejiang Restaurant,Electronics Store,Food Stand,Food Court,Food,Flea Market,Fish & Chips Shop,Film Studio,Fast Food Restaurant
4,Ankang,Asian Restaurant,Chinese Restaurant,Zhejiang Restaurant,Electronics Store,Food Stand,Food Court,Food,Flea Market,Fish & Chips Shop,Film Studio


In [24]:

from sklearn.cluster import KMeans
import numpy as np

In [25]:
# Find best number of clusters
accuracy = []

venues_grouped_clustering = venues_grouped.drop('City', 1)

for k in range(1,20):
    

    # run k-means clustering
    kmeans = KMeans(n_clusters=k, random_state=0).fit(venues_grouped_clustering)

    accuracy.append(kmeans.inertia_)
    

k = np.argmin(accuracy)+1

print(k,'clusters calculated')

kmeans = KMeans(n_clusters=k, random_state=0).fit(venues_grouped_clustering)


19 clusters calculated


In [26]:
# add clustering labels
city_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

city_merged = city_venues

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
city_merged = city_merged.join(city_venues_sorted.set_index('City'), on='City')

city_merged.head()

Unnamed: 0,City,City Latitude,City Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Shanghai,31.1667,121.4667,Centre Pompidou,31.167732,121.464084,Art Museum,10,Coffee Shop,Tunnel,Art Museum,Electronics Store,Food Stand,Food Court,Food,Flea Market,Fish & Chips Shop,Film Studio
1,Shanghai,31.1667,121.4667,Longyao Rd. Tunnel (龙耀路隧道),31.16782,121.463768,Tunnel,10,Coffee Shop,Tunnel,Art Museum,Electronics Store,Food Stand,Food Court,Food,Flea Market,Fish & Chips Shop,Film Studio
2,Shanghai,31.1667,121.4667,Seesaw Coffee,31.164759,121.463806,Coffee Shop,10,Coffee Shop,Tunnel,Art Museum,Electronics Store,Food Stand,Food Court,Food,Flea Market,Fish & Chips Shop,Film Studio
3,Guangzhou,23.1288,113.259,Starbucks (星巴克),23.128316,113.259682,Coffee Shop,10,Shopping Mall,Fast Food Restaurant,Coffee Shop,Movie Theater,Hotel,Park,Asian Restaurant,Clothing Store,Pizza Place,Bookstore
4,Guangzhou,23.1288,113.259,Mayflower Cinema (五月花影院),23.12857,113.261608,Movie Theater,10,Shopping Mall,Fast Food Restaurant,Coffee Shop,Movie Theater,Hotel,Park,Asian Restaurant,Clothing Store,Pizza Place,Bookstore


In [28]:
#!pip install folium
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 7.4 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


In [29]:
# create map
latitude = 33.98396286243095
longitude = 103.84087241028
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=4)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(city_merged['City Latitude'], city_merged['City Longitude'], city_merged['City'], city_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters