# IBM Applied Data Science Capstone Course by Coursera
### Week 5 Final Report
**_Opening a New Shopping Mall in Casablanca, Morocco_**
- Build a dataframe of neighborhoods in Casablanca, Morocco by web scraping the data from a Wikipedia page
- Get the geographical coordinates of the neighborhoods
- Obtain the venue data for the neighborhoods from Foursquare API
- Explore and cluster the neighborhoods
- Select the best cluster to open a new shopping mall
***
### 1. Import libraries

In [1]:
### install those packages if needed
#!pip install BeautifulSoup4
#!pip install requests
#!pip install lxml
#!pip install html5lib
#!pip install geocoder
#!pip install geopy
#!conda install -c conda-forge folium=0.5.0 --yes

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
# import some more libraries
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize  # tranform JSON file into a pandas dataframe
import geocoder
import folium # map rendering library

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

### 2. Scrape data from Wikipedia page into a DataFrame

In [3]:
# send the GET request
# a timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed as if it were the category listing
response = requests.get(
    "https://en.wikipedia.org/wiki/Category:Neighbourhoods_of_Casablanca",
    timeout=30,
)
response.raise_for_status()
data = response.text

In [4]:
# build a BeautifulSoup tree from the downloaded HTML with Python's built-in parser
soup = BeautifulSoup(markup=data, features='html.parser')

In [5]:
# start with an empty list that will collect the scraped neighborhood names
neighborhoodList = list()

In [6]:
# take the first <div class="mw-category"> on the page and add the text of
# every <li> it contains to the list
category_div = soup.find_all("div", class_="mw-category")[0]
neighborhoodList.extend(item.text for item in category_div.find_all("li"))

In [7]:
# wrap the scraped names in a single-column DataFrame
cb_df = pd.DataFrame(data=neighborhoodList, columns=["Neighborhood"])

# preview the first rows
cb_df.head()

Unnamed: 0,Neighborhood
0,Ain Diab
1,Aïn Sebaâ
2,Anfa
3,Belvedere (Casablanca)
4,Bourgogne (Casablanca)


In [8]:
# show the dataframe's dimensions as (number of rows, number of columns)
cb_df.shape

(23, 1)

### 3. Get the geographical coordinates

In [9]:
# define a function to get coordinates
def get_latlng(neighborhood, max_attempts=5, retry_delay=1.0):
    """Geocode a Casablanca neighborhood with the ArcGIS provider.

    Parameters
    ----------
    neighborhood : str
        Neighborhood name; ", Casablanca, Morocco" is appended to it to form
        the geocoding query.
    max_attempts : int, optional
        How many times the lookup is tried before giving up. The previous
        implementation retried forever, which hung the notebook whenever a
        name could not be resolved.
    retry_delay : float, optional
        Seconds to wait between two attempts, so a failing lookup does not
        hammer the service in a tight loop.

    Returns
    -------
    The ``latlng`` value reported by the geocoder result (a
    ``[latitude, longitude]`` pair).

    Raises
    ------
    ValueError
        If no coordinates were obtained after ``max_attempts`` attempts.
    """
    import time  # local import: only needed for the pause between retries

    query = '{}, Casablanca, Morocco'.format(neighborhood)
    for attempt in range(max_attempts):
        lat_lng_coords = geocoder.arcgis(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
        # no pause after the final attempt; we are about to give up anyway
        if attempt < max_attempts - 1:
            time.sleep(retry_delay)
    raise ValueError(
        'Could not geocode {!r} after {} attempts'.format(query, max_attempts)
    )

In [10]:
# geocode every neighborhood name in order and keep the results in a list
coords = list(map(get_latlng, cb_df["Neighborhood"].tolist()))

In [11]:
# display the raw coordinate pairs returned by get_latlng, one per neighborhood
coords

[[33.596610000000055, -7.618889999999965],
 [33.60996000000006, -7.542339999999967],
 [33.588310000000035, -7.61137999999994],
 [33.595120000000065, -7.58809999999994],
 [33.602670000000046, -7.645299999999963],
 [33.57593000000003, -7.629709999999932],
 [33.57227000000006, -7.5954099999999585],
 [33.58062000000007, -7.665269999999964],
 [33.575960000000066, -7.67665999999997],
 [33.596610000000055, -7.618889999999965],
 [33.60517153454754, -7.652691025858452],
 [33.60107000000005, -7.584429999999941],
 [33.57367000000005, -7.598109999999963],
 [33.596610000000055, -7.618889999999965],
 [33.57957000000005, -7.635999999999967],
 [33.55119000000008, -7.5515799999999444],
 [33.55741000000006, -7.6815299999999525],
 [33.58921000000004, -7.640609999999981],
 [33.59946000000008, -7.583719999999971],
 [33.53825000000006, -7.55350999999996],
 [33.546910000000025, -7.575049999999976],
 [33.524820000000034, -7.650489999999934],
 [33.305240000000026, -8.356919999999946]]

In [12]:
# turn the list of coordinate pairs into a two-column helper dataframe
coordinate_columns = ['Latitude', 'Longitude']
df_coords = pd.DataFrame(data=coords, columns=coordinate_columns)

In [13]:
# copy both coordinate columns from the helper dataframe into cb_df
for coordinate_name in ('Latitude', 'Longitude'):
    cb_df[coordinate_name] = df_coords[coordinate_name]

In [14]:
# check the neighborhoods and the coordinates: print the (rows, columns) shape,
# then display the whole dataframe
print(cb_df.shape)
cb_df

(23, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Ain Diab,33.59661,-7.61889
1,Aïn Sebaâ,33.60996,-7.54234
2,Anfa,33.58831,-7.61138
3,Belvedere (Casablanca),33.59512,-7.5881
4,Bourgogne (Casablanca),33.60267,-7.6453
5,Derb Ghallef,33.57593,-7.62971
6,Derb Sultan,33.57227,-7.59541
7,Hay El Hanaa,33.58062,-7.66527
8,Hay El Hassani,33.57596,-7.67666
9,Hay Salama,33.59661,-7.61889


In [15]:
# write the neighborhoods and their coordinates to disk, without the index column
cb_df.to_csv(path_or_buf="cb_df.csv", index=False)

### 4. Create a map of Casablanca with neighborhoods superimposed on top

In [16]:
# get the coordinates of Casablanca
address = 'Casablanca, Morocco'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
# geocode() yields None when the address cannot be resolved; fail with a clear
# message instead of an AttributeError on the attribute access below
if location is None:
    raise ValueError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Casablanca, Morocco {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Casablanca, Morocco 33.5950627, -7.6187768.


In [None]:
# base map centred on the Casablanca coordinates looked up earlier
map_cb = folium.Map(location=[latitude, longitude], zoom_start=11)

# one blue circle marker per neighborhood, with the neighborhood name as popup
marker_rows = zip(cb_df['Latitude'], cb_df['Longitude'], cb_df['Neighborhood'])
for lat, lng, neighborhood in marker_rows:
    label = folium.Popup('{}'.format(neighborhood), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_cb)

# display the map as the cell output
map_cb

In [None]:
# export the neighborhood map to a standalone HTML file
map_cb.save(outfile='map_cb.html')

### 5. Use the Foursquare API to explore the neighborhoods

In [None]:
# define Foursquare Credentials and Version
# The credentials are read from environment variables so that the secret is
# never stored in (or published with) the notebook itself.
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not CLIENT_ID or not CLIENT_SECRET:
    print('Warning: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before calling the Foursquare API.')

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# mask the secret so it does not end up in the saved cell output
print('CLIENT_SECRET:' + '*' * len(CLIENT_SECRET))

**Now, let's get the top 100 venues that are within a radius of 2000 meters.**

In [None]:
radius = 2000
LIMIT = 100

venues = []

explore_url = "https://api.foursquare.com/v2/venues/explore"

for lat, long, neighborhood in zip(cb_df['Latitude'], cb_df['Longitude'], cb_df['Neighborhood']):

    # let requests build and URL-encode the query string instead of formatting
    # the credentials and coordinates into the URL by hand
    params = {
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "v": VERSION,
        "ll": "{},{}".format(lat, long),
        "radius": radius,
        "limit": LIMIT,
    }

    # make the GET request; the timeout avoids hanging on a stalled connection
    # and raise_for_status() surfaces HTTP errors before the JSON is indexed
    response = requests.get(explore_url, params=params, timeout=30)
    response.raise_for_status()
    results = response.json()["response"]['groups'][0]['items']

    # return only relevant information for each nearby venue
    for venue in results:
        details = venue['venue']
        categories = details['categories']
        if not categories:
            # a venue without any category has nothing to contribute to the
            # category analysis and would raise IndexError below
            continue
        venues.append((
            neighborhood,
            lat,
            long,
            details['name'],
            details['location']['lat'],
            details['location']['lng'],
            categories[0]['name']))

In [None]:
# convert the venues list into a new DataFrame
# the column names are passed to the constructor so that an empty venues list
# still produces a correctly labelled 0-row frame; assigning seven names to
# the column-less frame built from an empty list raises a ValueError
venue_columns = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
venues_df = pd.DataFrame(venues, columns=venue_columns)

print(venues_df.shape)
venues_df.head()

**Let's check how many venues were returned for each neighborhood**

In [None]:
# count the non-null entries of every column, per neighborhood
venues_df.groupby("Neighborhood").count()

**Let's find out how many unique categories can be curated from all the returned venues**

In [None]:
# number of distinct venue categories across all returned venues
category_count = venues_df['VenueCategory'].unique().size
print('There are {} uniques categories.'.format(category_count))

In [None]:
# show the distinct venue categories (only the first 50 are displayed)
venues_df['VenueCategory'].unique()[:50]

In [None]:
# check if the results contain "Shopping Mall"
# (the later cells select a "Shopping Mall" column, so this is the category
# whose presence matters; the previous check looked for "Neighborhood")
"Shopping Mall" in venues_df['VenueCategory'].unique()

### 6. Analyze Each Neighborhood

In [None]:
# one indicator column per venue category, named after the category itself
cb_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# attach the neighborhood each venue row belongs to
cb_onehot['Neighborhoods'] = venues_df['Neighborhood']

# rotate the column order so that the last column becomes the first one
fixed_columns = list(cb_onehot.columns)
fixed_columns.insert(0, fixed_columns.pop())
cb_onehot = cb_onehot[fixed_columns]

print(cb_onehot.shape)
cb_onehot.head()

**Next, let's group the rows by neighborhood, taking the mean of the frequency of occurrence of each category**

In [None]:
# average every category indicator within each neighborhood, then turn the
# group key back into an ordinary column
neighborhood_means = cb_onehot.groupby("Neighborhoods").mean()
cb_grouped = neighborhood_means.reset_index()

print(cb_grouped.shape)
cb_grouped

In [None]:
# number of neighborhoods whose mean "Shopping Mall" frequency is above zero
has_mall = cb_grouped["Shopping Mall"] > 0
int(has_mall.sum())

**Create a new DataFrame for Shopping Mall data only**

In [None]:
# keep only the neighborhood name and its mean "Shopping Mall" frequency
mall_columns = ["Neighborhoods", "Shopping Mall"]
cb_mall = cb_grouped[mall_columns]

In [None]:
# preview the first rows of the "Shopping Mall" frequency table
cb_mall.head()

### 7. Cluster Neighborhoods
Run k-means to cluster the neighborhoods in Casablanca into 3 clusters.

In [None]:
# set number of clusters
kclusters = 3

# keep only the feature column for clustering; `columns=` replaces the
# positional axis argument (`drop([...], 1)`), which pandas 2.0 rejects
cb_clustering = cb_mall.drop(columns=["Neighborhoods"])

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(cb_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

In [None]:
# copy the "Shopping Mall" table and attach the k-means cluster label of each row
cb_merged = cb_mall.assign(**{"Cluster Labels": kmeans.labels_})

In [None]:
# use the same key column name as cb_df ("Neighborhood")
cb_merged = cb_merged.rename(columns={"Neighborhoods": "Neighborhood"})
cb_merged.head()

In [None]:
# look up each neighborhood in cb_df to add its latitude/longitude to cb_merged
coordinates_by_name = cb_df.set_index("Neighborhood")
cb_merged = cb_merged.join(coordinates_by_name, on="Neighborhood")

print(cb_merged.shape)
cb_merged.head() # check the last columns!

In [None]:
# report the shape, then order the rows by cluster label
print(cb_merged.shape)
cb_merged = cb_merged.sort_values(by="Cluster Labels")
cb_merged

**Finally, let's visualize the resulting clusters**

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(cb_merged['Latitude'], cb_merged['Longitude'], cb_merged['Neighborhood'], cb_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    # labels start at 0, so index the palette directly; `cluster-1` only
    # worked because index -1 wraps around to the last color
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [None]:
# export the cluster map to a standalone HTML file
map_clusters.save(outfile='map_clusters.html')

### 8. Examine Clusters

#### Cluster 0

In [None]:
# rows whose cluster label is 0
cb_merged[cb_merged['Cluster Labels'].eq(0)]

#### Cluster 1

In [None]:
# rows whose cluster label is 1
cb_merged[cb_merged['Cluster Labels'].eq(1)]

#### Cluster 2

In [None]:
# rows whose cluster label is 2
cb_merged[cb_merged['Cluster Labels'].eq(2)]