In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.colors as colors
%matplotlib inline


### 1. Import the CSV file

In [2]:
path = "/home/sissi/singapore_planning_area.csv"
df = pd.read_csv(path)
df

Unnamed: 0,X,Y,gid,Name,description,PLN_AREA_N,PLN_AREA_C,CA_IND,REGION_N,REGION_C,INC_CRC,FMEL_UPD_D
0,103.793357,1.328117,3,kml_3,,BUKIT TIMAH,BT,N,CENTRAL REGION,CR,6CCDADD1F85173E9,20191206144714
1,103.801664,1.376076,4,kml_4,,CENTRAL WATER CATCHMENT,CC,N,NORTH REGION,NR,9F30125764C74984,20191206144714
2,103.748492,1.387486,6,kml_6,,CHOA CHU KANG,CK,N,WEST REGION,WR,5224CD5C7960361F,20191206144714
3,104.049107,1.387936,14,kml_14,,NORTH-EASTERN ISLANDS,NE,N,NORTH-EAST REGION,NER,E75708EADCFF04A6,20191206144714
4,103.725202,1.362108,34,kml_34,,TENGAH,TH,N,WEST REGION,WR,0D2FF9150EC36DFE,20191206144714
5,103.913796,1.406764,18,kml_18,,PUNGGOL,PG,N,NORTH-EAST REGION,NER,51833C3BEA49A0E9,20191206144714
6,103.698202,1.312923,22,kml_22,,BOON LAY,BL,N,WEST REGION,WR,D67ECAA47DF4B485,20191206144714
7,103.892283,1.256292,38,kml_38,,MARINA EAST,ME,Y,CENTRAL REGION,CR,1B9566319261C802,20191206144714
8,103.667422,1.306725,17,kml_17,,PIONEER,PN,N,WEST REGION,WR,3202B87B465BF3BD,20191206144714
9,103.818933,1.45708,26,kml_26,,SEMBAWANG,SB,N,NORTH REGION,NR,4794FB10E1985217,20191206144714


### 2. Cleanse the data

In [3]:
sg_df = df.loc[:, ["X", "Y", "PLN_AREA_N","REGION_N"]]

sg_df.rename(columns ={"X":"Longitude", "Y":"Latitude", "PLN_AREA_N":"Neighborhood", "REGION_N":"Region"}, inplace=True)
sg_df['Neighborhood'] = sg_df['Neighborhood'].str.title()
sg_df['Region'] = sg_df['Region'].str.title()



sg_df

Unnamed: 0,Longitude,Latitude,Neighborhood,Region
0,103.793357,1.328117,Bukit Timah,Central Region
1,103.801664,1.376076,Central Water Catchment,North Region
2,103.748492,1.387486,Choa Chu Kang,West Region
3,104.049107,1.387936,North-Eastern Islands,North-East Region
4,103.725202,1.362108,Tengah,West Region
5,103.913796,1.406764,Punggol,North-East Region
6,103.698202,1.312923,Boon Lay,West Region
7,103.892283,1.256292,Marina East,Central Region
8,103.667422,1.306725,Pioneer,West Region
9,103.818933,1.45708,Sembawang,North Region


### 3. Create Singapore map using Folium

In [4]:
# get the coords of Singapore
from geopy.geocoders import Nominatim
address = "Singapore"

geolocator = Nominatim(user_agent="singapore_map")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Singapore {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Singapore 1.3408630000000001, 103.83039182212079.


In [5]:
import folium
sg_map = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, neighborhood in zip(sg_df["Latitude"], sg_df['Longitude'], sg_df['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7,
        parse_html = False).add_to(sg_map)  
    
sg_map


###  4. Use the Foursquare API to explore the neighborhoods

In [6]:
CLIENT_ID = 'MJQP45R4SJKHU0WEPOAGYVGMFA0LHN2E4PIEIAIVRVOYTSY5' # your Foursquare ID
CLIENT_SECRET = 'WJJ3H4413NZ4LM00LYUK32L24VV354GOERGQIFO3INCV0JB0' # your Foursquare Secret
VERSION = '20200811' # Foursquare API version
LIMIT = 100

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: MJQP45R4SJKHU0WEPOAGYVGMFA0LHN2E4PIEIAIVRVOYTSY5
CLIENT_SECRET:WJJ3H4413NZ4LM00LYUK32L24VV354GOERGQIFO3INCV0JB0


#### Get the top 100 venues within the radius of 1500 meters.

In [7]:
LIMIT = 100
def getNearbyVenues(names, latitudes, longitudes, radius=1500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)


In [8]:
sg_venues = getNearbyVenues(names = sg_df["Neighborhood"],
                            latitudes = sg_df["Latitude"],
                            longitudes = sg_df["Longitude"])
sg_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Bukit Timah,1.328117,103.793357,Plank Sourdough Pizza By Baker & Cook,1.32389,103.796797,Pizza Place
1,Bukit Timah,1.328117,103.793357,Brazil Churrasco,1.330798,103.795201,Churrascaria
2,Bukit Timah,1.328117,103.793357,MEAT n' CHILL,1.331425,103.794955,BBQ Joint
3,Bukit Timah,1.328117,103.793357,Simply Bread,1.330535,103.795658,Bakery
4,Bukit Timah,1.328117,103.793357,Sunny Heights,1.3347,103.794795,Dog Run


#### Check the count of venues for each neighborhood

In [9]:
sg_venues.groupby(['Neighborhood']).count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Ang Mo Kio,100,100,100,100,100,100
Bedok,22,22,22,22,22,22
Bishan,100,100,100,100,100,100
Boon Lay,28,28,28,28,28,28
Bukit Batok,78,78,78,78,78,78
Bukit Merah,100,100,100,100,100,100
Bukit Panjang,63,63,63,63,63,63
Bukit Timah,90,90,90,90,90,90
Central Water Catchment,4,4,4,4,4,4
Changi,3,3,3,3,3,3


In [10]:
print("There are {} unique categories".format(len(sg_venues["Venue Category"].unique())))

There are 303 unique categories


In [11]:
print(sg_venues['Venue Category'].value_counts())

Coffee Shop                  157
Food Court                   148
Chinese Restaurant           144
Café                         107
Japanese Restaurant           94
                            ... 
Japanese Curry Restaurant      1
Laundromat                     1
Stables                        1
Jazz Club                      1
College Gym                    1
Name: Venue Category, Length: 303, dtype: int64


#### Create a Data-frame with the 20 Most Frequently Occuring Venue_Category


In [12]:
# create a dataframe of top 10 categories
sg_top20 = sg_venues['Venue Category'].value_counts()[0:20].to_frame(name='frequency')
sg_top20 = sg_top20.reset_index()


sg_top20.rename(index=str, columns={"index": "Venue Category", "frequency": "Frequency"}, inplace=True)
sg_top20

Unnamed: 0,Venue Category,Frequency
0,Coffee Shop,157
1,Food Court,148
2,Chinese Restaurant,144
3,Café,107
4,Japanese Restaurant,94
5,Hotel,92
6,Asian Restaurant,80
7,Fast Food Restaurant,77
8,Supermarket,70
9,Bakery,58


In [13]:
import seaborn as sns
fig = plt.figure(figsize=(18,7))
s=sns.barplot(x="Venue Category", y="Frequency", data=sg_top10)
s.set_xticklabels(s.get_xticklabels(), rotation=30)
plt.title('20 Most Frequently Occuring Venues in Singapore', fontsize=15)
plt.xlabel("Venue Category", fontsize=15)
plt.ylabel ("Frequency", fontsize=15)
plt.savefig("sg_top20", dpi=300)
plt.show()

NameError: name 'sg_top10' is not defined

<Figure size 1296x504 with 0 Axes>

### 5. Analyze Each Neighborhood

#### Add one hot encoding

In [None]:
sg_onehot = pd.get_dummies(sg_venues[['Venue Category']], prefix="", prefix_sep ="")

sg_onehot["Neighborhood"] = sg_venues['Neighborhood']

fixed_columns = [sg_onehot.columns[-1]] + list(sg_onehot.columns[:-1])
sg_onehot = sg_onehot[fixed_columns]

sg_onehot.head()

#### Group by neighborhoods

In [None]:
sg_grouped = sg_onehot.groupby(["Neighborhood"]).mean().reset_index()

print(sg_grouped.shape)
sg_grouped

#### New dataframe for all the existing shopping malls in Singapore

In [None]:
mall_sg = sg_grouped[['Neighborhood', 'Shopping Mall']]
mall_sg.head()

### 6. Using K-Means clustering to cluster the neighborhoods

In [None]:
#import library
from sklearn.cluster import KMeans

#### Set into 3 clusters
kclusters = 3

sg_clustering = mall_sg.drop(["Neighborhood"], 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(sg_clustering)

kmeans.labels_[0:10]

#### Create a new DataFrame that inlcudes the cluster and the top 10 malls for each neighborhood

In [None]:
sg_merged = mall_sg.copy()

sg_merged["Cluster Labels"] = kmeans.labels_

In [None]:
sg_merged.head()

#### merge sg_grouped with sg_df to add latitude/longitude for each neighborhood

In [None]:
sg_merged = sg_merged.join(sg_df.set_index("Neighborhood"), on="Neighborhood")


#### Sort the result by Cluster Labels

In [None]:
sg_merged.sort_values(["Cluster Labels"], inplace=True)
sg_merged

#### Visualize the resulting clusters

In [None]:
# create map
map_clusters = folium.Map(location = [latitude, longitude], zoom_start = 11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i * x) ** 2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(sg_merged['Latitude'], sg_merged['Longitude'], sg_merged['Neighborhood'], sg_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius = 5,
        popup = label,
        color = rainbow[cluster - 1],
        fill = True,
        fill_color = rainbow[cluster - 1],
        fill_opacity = 0.7).add_to(map_clusters)
    
map_clusters

### 7. Examine the clusters

### First Cluster  (Cluster Labels=0)

In [None]:
sg_merged.loc[sg_merged['Cluster Labels'] == 0]

### Second Cluster  (Cluster Labels=1)

In [None]:
sg_merged.loc[sg_merged['Cluster Labels'] == 1]

### Third Cluster  (Cluster Labels=2)

In [None]:
sg_merged.loc[sg_merged['Cluster Labels'] == 2]

### 8. Results and Discussion

As showed on the top 20 most common venue categories bar chart, we can find out that there are currently 55 shopping malls in Singapore and it's ranked the 11th most popular venue category. 

By observing the above clustering data, we could find the following analysis.

- First cluster (Cluster label 0) has zero to very low numbers of shopping malls, as we can see, the neighborhoods in the first clusters are mostly in North/East/West Region. The density of population of most of these neighborhoods are not high. 
- Second cluster (Cluster label 1) has the highest rate of shopping malls. It's easy to identify most places are in central region. For example, Orchard is a well-known shopping center in Singapore. While Woodlands, even in North Region, but it's the regional center of the North Region and being located immediately across from Johor Bahru, Malaysia, Woodlands is highly accessible, connected as the border between two countries, which brings a certain amount of business including shopping malls.
- Third cluster (Cluster label 2) shows a moderate amount of shopping malls, located mostly in West Region and Central Region. 



### Recommendations

I would strongly recommend the stakeholders avoid investing in the third cluster. Reasons are as follows:<br>
1. As we can see in the second cluster, there has been already a relatively high amount of shopping malls existed, which will be really competitive. 
1. However, investing in the first cluster could be relatively risky, since most locations are in suburb area, which are less populated and are of industrial zones instead of residential areas. 
1. In the third cluster, most neighborhoods are well-located in the central and west region, and currently there are a moderate amount of shopping malls, but not too competitive as the second cluster, which creates great potential for shopping mall investments.

