In [None]:
!pip install geopy
!pip install yellowbrick
!pip install matplotlip

# Clustering for Vancouver

In [None]:
import pandas as pd
import re
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Data import
## load total income of each Vancouver neighbourhood

In [None]:

# Income table: the rows are indexed by the 'Total income' column of the CSV.
df_income = pd.read_csv('dataset_Vancouver/income_Vancouver1.csv', index_col='Total income')
# Append a 'Total' row holding the sum of every column; it serves as the
# denominator when the weighted average income is computed further down.
df_income.loc['Total', :] = df_income.sum(axis=0)
df_income.head()

In [None]:
# This row only repeats information carried by other rows, so it is removed.
df_income = df_income.drop(index="$100.000 and over")

In [None]:
# Add a column with one representative income per row.
lst = [10000 * step + 5000 for step in range(10)]   # 5000, 15000, ..., 95000
lst += [125000, 250000]                             # the two highest values
# The final entry is the plain mean of the values above; df_income.insert needs
# one value for every row of the table.
lst.append(sum(lst) / len(lst))
df_income.insert(0, 'average income', lst)
df_income

In [None]:
# Calculate the weighted average income of every column:
#     sum over rows of (row's 'average income' * cell value) / column's 'Total'
# Rows and columns are addressed by label, so the result does not depend on the
# table having exactly 13 rows with 'Total' sitting at position 12.
bracket_income = df_income['average income'].drop('Total')
counts = df_income.drop(columns='average income')
column_totals = counts.loc['Total']

weighted = (
    counts.drop(index='Total')
    .mul(bracket_income, axis=0)     # weight every row by its representative income
    .div(column_totals, axis=1)      # normalise every column by its own total
)

# Collapse to a single row labelled 'weighted income average', keeping the
# original name of the row axis.
row_axis_name = df_income.index.name
df_income = weighted.sum(axis=0).to_frame('weighted income average').T
df_income.index.name = row_axis_name
df_income

## load citizenship dataset of Vancouver

In [None]:

# read_csv already returns a DataFrame; no intermediate "path" variable is needed.
df_citizen = pd.read_csv('dataset_Vancouver/citizenship_Vancouver1.csv')
df_citizen

# Preprocessing

In [None]:
# Remove the 'ID' column and flip the table (rows become columns and vice versa).
df_citizen = df_citizen.drop(columns=['ID']).T
# After the flip the first row holds the labels: promote it to the column header ...
df_citizen.columns = df_citizen.iloc[0]
# ... and keep only the data rows below it.
df_citizen = df_citizen.iloc[1:]
df_citizen

In [None]:
# Work on an independent copy: df_citizen was produced by slicing another frame.
df_citizen = df_citizen.copy()

# Place-of-birth columns, addressed by position. The group names are the ones this
# notebook itself uses for these positions.
# NOTE(review): the positions depend on the layout of citizenship_Vancouver1.csv - confirm.
us_americans = df_citizen.iloc[:, 10]
europeans = df_citizen.iloc[:, 12]
asians = df_citizen.iloc[:, 40]
chinese = df_citizen.iloc[:, 43]
japanese = df_citizen.iloc[:, 48]
south_east_asians = df_citizen.iloc[:, 52] + df_citizen.iloc[:, 56]
canadians = df_citizen.iloc[:, 59]

# Denominator of every share below: the sum of columns 0, 12, 30, 40 and 59.
df_citizen['Total Population'] = (
    df_citizen.iloc[:, 0] + europeans + df_citizen.iloc[:, 30] + asians + canadians
)


def percentage_of_total(group):
    """Return `group` as a percentage of df_citizen['Total Population'], row by row."""
    return (group / df_citizen['Total Population']) * 100


# Western citizens contain Canadians + US-Americans + Europeans.
# (The previous formula added the US-American column twice and left Europeans out.)
df_citizen['Percentage of Western'] = percentage_of_total(us_americans + canadians + europeans)
df_citizen['Percentage of Asians'] = percentage_of_total(asians)
df_citizen['Percentage of Japanese'] = percentage_of_total(japanese)
df_citizen['Percentage of Chinese'] = percentage_of_total(chinese)
df_citizen['Percentage of South-East-Asians'] = percentage_of_total(south_east_asians)

# Keep only the derived columns: drop the 60 original place-of-birth columns.
df_citizen = df_citizen.drop(columns=df_citizen.columns[:60])
df_citizen.head()

In [None]:
df_citizen = df_citizen.transpose()
df_citizen

In [None]:
# Stack the citizenship shares on top of the income row, then flip the table.
df_new = pd.concat([df_citizen, df_income]).transpose()
# Build the geocoding query string for every row from its index label.
df_new['Neighbourhood'] = [f"{name},Vancouver,BC,Canada" for name in df_new.index]
# Remove the rows at positions 22 and 23.
# NOTE(review): presumably aggregate rows rather than real neighbourhoods - confirm.
df_new = df_new.drop(df_new.index[[22, 23]])
df_new

In [None]:
# Geopy provides a class for popular mapping services. Nominatim is the service behind
# OpenStreetMap and geocodes for free, provided its usage policy is respected:
# identify the application through a meaningful user agent and throttle requests.
locator = Nominatim(user_agent="vancouver-neighbourhood-clustering")

# 1 - convenient function to delay between geocoding calls (at least 1 s apart)
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)


In [None]:
# 2 - create location column: every 'Neighbourhood' query string is passed through the
# rate-limited geocoder (one call per row, at least one second apart, so this cell is slow)
df_new['location'] = df_new['Neighbourhood'].apply(geocode)

In [None]:
# 3 - create longitude, latitude and altitude from location column (returns tuple)
# rows whose location is falsy (no geocoding result) get None instead of a tuple
df_new['point'] = df_new['location'].apply(lambda loc: tuple(loc.point) if loc else None)

In [None]:
# 4 - split point column into latitude, longitude and altitude columns
# (the temporary frame reuses df_new's index so the three new columns align row by row)
df_new[['lat', 'long', 'altitude']] = pd.DataFrame(df_new['point'].tolist(), index=df_new.index)
df_new

In [None]:
# Folium mapping
import folium

# Only rows with coordinates can be drawn; folium rejects NaN locations.
located = df_new.dropna(subset=['lat', 'long'])

# Centre the map on the mean of the geocoded neighbourhood coordinates
# (latitude / longitude were previously used here without ever being defined).
latitude, longitude = located['lat'].mean(), located['long'].mean()
map_Vancouver = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one marker per neighbourhood, with its query string as the popup text
for lat, lng, neighbourhood in zip(located['lat'], located['long'], located['Neighbourhood']):
    folium.CircleMarker(
        [lat, lng],
        radius=8,
        color='blue',
        popup=folium.Popup('{}'.format(neighbourhood)),
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_Vancouver)

map_Vancouver

# preprocessing of Restaurant dataset
## load restaurants of Vancouver dataset

In [None]:
# read_csv already returns a DataFrame; the file is decoded as latin.
df_rest = pd.read_csv('dataset_Vancouver/Vancouver_restaurant_data.csv', encoding="latin")
df_rest.head()

In [None]:
# Remove the features that are not used in the rest of the notebook.
unused_columns = [
    'Rating.1',
    'number-of-reviews',
    'veg /Non- Veg',
    'Rating',
    'Total No of Ratings',
    'Current Status',
    'Opening time',
    'Dine in availability',
    'Takeaway type',
    'Delivery availability',
    'Description of the Restaurent',
    'Restaurent Name',
    'Cost',
]
df_rest = df_rest.drop(columns=unused_columns)
df_rest.head()


In [None]:
 # there are duplicates in the dataframe -> remove them
df_rest[df_rest.duplicated(keep = False)]

In [None]:
# Keep the first occurrence of every repeated row and renumber the rows from 0.
df_rest = df_rest.drop_duplicates().reset_index(drop=True)
# Check: no duplicated rows should be listed any more.
df_rest[df_rest.duplicated(keep=False)]

In [None]:
# 'Landmark' and 'Location and Address' have the same meaning: fold them into 'Place'.

# Step 1: missing values become empty strings, so the concatenation never yields NaN.
values = dict.fromkeys(["Landmark", "Location and Address"], '')
df_rest = df_rest.fillna(value=values)

# Step 2: concatenate both columns into the new one, then remove the two sources.
df_rest['Place'] = df_rest['Landmark'] + df_rest['Location and Address']
df_rest = df_rest.drop(columns=['Landmark', 'Location and Address'])

df_rest.shape

In [None]:
# Frequency of every cuisine type. DataFrame.value_counts expects column labels as
# `subset`, not a Series; counting on the column itself is the supported form.
df_rest["Type of Cusine"].value_counts()

In [None]:
# Problem: the cuisine type "Restaurant" carries no meaning.
# For those rows a category is inferred from keywords in the restaurant's name;
# rows whose name matches no keyword are removed.
# The rules are tried in order and the first match wins.
GENERIC_TYPE_RULES = [
    ("Latin", re.compile("Mexican|Taco|Brazil")),
    ("Western", re.compile("French|Italian|Steak|Provence|Mangez|Mediterran|Pizza")),
    ("Asian", re.compile("Asia|Thai")),
    ("Japanese", re.compile("Ramen")),
]


def infer_cuisine_from_name(rest_name):
    """Return the category of the first rule whose pattern occurs in `rest_name`.

    Returns None when no rule matches or when `rest_name` is not a string
    (for example a missing value), which marks the row for removal.
    """
    if not isinstance(rest_name, str):
        return None
    for category, pattern in GENERIC_TYPE_RULES:
        if pattern.search(rest_name):
            return category
    return None


is_generic = df_rest["Type of Cusine"] == "Restaurant"
inferred = df_rest.loc[is_generic, "Name of Restaurent "].map(infer_cuisine_from_name)

# Relabel the rows that could be classified ...
resolved = inferred.dropna()
if not resolved.empty:
    df_rest.loc[resolved.index, "Type of Cusine"] = resolved
# ... and drop the ones that could not, in one pass instead of one drop per row.
df_rest = df_rest.drop(index=inferred.index[inferred.isna()]).reset_index(drop=True)
print(df_rest.shape)

In [None]:
# Frequency of every cuisine type after relabelling the generic "Restaurant" rows.
# Counting on the column itself avoids passing a Series as `subset`.
df_rest["Type of Cusine"].value_counts()

In [None]:
# Put the types of cuisine together into four groups; every other type is removed.
# The rules are tried in order and the first match wins.
# Changes to the keyword lists compared with the earlier version of this cell:
#   * "Western" was added to the Western group, so rows relabelled "Western" from the
#     restaurant name are kept instead of being silently dropped here;
#   * "Cantonese" moved from the Western group to the Chinese group.
# NOTE(review): some other keywords look debatable (e.g. "Seafood" -> Japanese,
# "Lebanese" / "Ethiop" -> Asian); they are left as they were - confirm the intent.
CUISINE_GROUP_RULES = [
    ("Western Restaurant", re.compile(
        "French|Italian|Vegetarian|European|Steak|Mediterran|Pizza|Hawaii|Americ|Canad"
        "|Salad|Spanish|Tapas|Pacific Northwest|German|Irish|cois|Tuscan|Continental"
        "|Hamburger|Fast|Breakfast|Sandwich|Western")),
    ("Asian Restaurant", re.compile(
        "Asia|Cambodia|Pho|Fusion|Thai|Vietnamese|Lebanese|Afgha|Ethiop|Persia")),
    ("Japanese Restaurant", re.compile(
        "Ramen|Sushi|Terri|Izakaya|Yakin|Japanese|Teppan|Seafood")),
    ("Chinese Restaurant", re.compile("Chines|Cantonese")),
]


def group_cuisine(cuisine_type):
    """Return the group of the first rule whose pattern occurs in `cuisine_type`.

    Returns None when no rule matches or when `cuisine_type` is not a string
    (for example a missing value), which marks the row for removal.
    """
    if not isinstance(cuisine_type, str):
        return None
    for group, pattern in CUISINE_GROUP_RULES:
        if pattern.search(cuisine_type):
            return group
    return None


df_rest["Type of Cusine"] = df_rest["Type of Cusine"].map(group_cuisine)
# Rows without a group are dropped in one pass instead of one drop per row.
df_rest = df_rest.dropna(subset=["Type of Cusine"]).reset_index(drop=True)

In [None]:
# Frequency of every cuisine group. Counting on the column itself avoids passing a
# Series as `subset` of DataFrame.value_counts.
df_rest["Type of Cusine"].value_counts()

In [None]:
df_rest.shape

In [None]:
# Frequency of every distinct 'Place' string. Counting on the column itself avoids
# passing a Series as `subset` of DataFrame.value_counts.
df_rest["Place"].value_counts()

In [None]:
# A 'Place' that names only a city is prefixed with the restaurant's name,
# giving "<name>,<city>, BC, Canada".
CITY_ONLY_PLACES = [
    "Vancouver, BC, Canada",
    "Richmond, BC, Canada",
    "West Vancouver, BC, Canada",
]
city_only = df_rest['Place'].isin(CITY_ONLY_PLACES)
df_rest.loc[city_only, 'Place'] = (
    df_rest.loc[city_only, 'Name of Restaurent '] + "," + df_rest.loc[city_only, 'Place']
)

In [None]:
# put addresses in a proper way so that we can feed it into geopy
# Each 'Place' string is rewritten by the FIRST branch below that applies, so the
# order of the branches matters. Rows are addressed by label 0..len-1.
for row in range(len(df_rest)):
    address = df_rest.loc[row,'Place']
    address_lst = []
    # "<a>ú In<b>": keep the part before the marker and append the city ...
    if address.find("ú In") != -1:
        address_lst = address.split("ú In")
        # ... unless that first part is exactly this string; then the part after
        # the marker is kept instead.
        # NOTE(review): this literal has no spaces after the commas, unlike every other
        # "Vancouver, BC, Canada" literal in this notebook - confirm it can ever match.
        if address_lst[0] == "Vancouver,BC,Canada":
            df_rest.loc[row,"Place"] = address_lst[1] + ", Vancouver,BC, CA"
        else:
            df_rest.loc[row,"Place"] = address_lst[0] + ",Vancouver, BC, Canada"
    # "<a>ú Near<b>": keep the part before the marker and append the city
    elif address.find("ú Near") != -1:
        address_lst = address.split("ú Near")
        df_rest.loc[row,"Place"] = address_lst[0] + ",Vancouver, BC, Canada"
    # "<a>#<b>": keep the part before the first '#' and append the city
    elif address.find("#") != -1:
        address_lst = address.split("#")
        df_rest.loc[row,"Place"] = address_lst[0] + ",Vancouver, BC, Canada"
    # any remaining string containing "ancouver" is replaced by the bare city string
    elif address.find("ancouver") != -1:
        df_rest.loc[row,"Place"] = "Vancouver, BC, Canada"
    # everything else that does not mention "BC, Canada" gets the city appended
    elif address.find("BC, Canada") == -1 :
        df_rest.loc[row,"Place"] = address + ",Vancouver, BC, Canada"
# no rows are removed in the loop above; the index is simply renumbered
df_rest.reset_index(drop=True, inplace=True)


In [None]:
# Second clean-up pass over 'Place'. Only the FIRST branch that applies is executed
# for each row, so at most one of the substitutions below is made per address.
for row in range(len(df_rest)):
    address = df_rest.loc[row,'Place']
    # contains '#': remove every "# <digits>" occurrence
    if address.find("#") != -1:
        address_a = re.sub(r"# \d+", '', address)
        df_rest.loc[row,"Place"] = address_a
    # contains "Suite": remove every "Suite <digits>" occurrence
    elif address.find("Suite") != -1:
        address_a = re.sub(r"Suite \d+", '', address)
        df_rest.loc[row,"Place"] = address_a
    # contains "St": remove every "g<digits>" occurrence
    # NOTE(review): the pattern is unrelated to the "St" test - confirm the intent.
    elif address.find("St") != -1:
        address_a = re.sub(r"g\d+", '', address)
        df_rest.loc[row,"Place"] = address_a
    # a doubled city suffix collapses to the bare city string
    elif address.find("Vancouver, BC, Canada ,Vancouver, BC, Canada") != -1:
        df_rest.loc[row,"Place"] = "Vancouver, BC, Canada"

In [None]:
# Rows whose 'Place' is only the bare city string are prefixed with the restaurant's
# name, giving "<name>,Vancouver, BC, Canada".
bare_city = df_rest['Place'] == "Vancouver, BC, Canada"
df_rest.loc[bare_city, 'Place'] = (
    df_rest.loc[bare_city, 'Name of Restaurent '] + ",Vancouver, BC, Canada"
)

In [None]:
# Frequency of every distinct 'Place' string after the clean-up. Counting on the column
# itself avoids passing a Series as `subset` of DataFrame.value_counts.
df_rest["Place"].value_counts()

In [None]:
# 1 - convenient function to delay between geocoding calls
# (re-creates the throttled geocoder around the existing `locator`: at least 1 s between calls)
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

In [None]:
# 2 - create location column: every 'Place' string is passed through the rate-limited
# geocoder (one call per restaurant, at least one second apart, so this cell is slow)
df_rest['location'] = df_rest['Place'].apply(geocode)

In [None]:
# Discard the restaurants without a geocoding result (location is None) in one pass
# and renumber the remaining rows from 0.
df_rest = df_rest.dropna(subset=['location']).reset_index(drop=True)

In [None]:
# 3 - create longitude, latitude and altitude from location column (returns tuple)
# a falsy location would yield None instead of a tuple
df_rest['point'] = df_rest['location'].apply(lambda loc: tuple(loc.point) if loc else None)

In [None]:
# 4 - split point column into latitude, longitude and altitude columns
# (the temporary frame reuses df_rest's index so the three new columns align row by row)
df_rest[['lat', 'long', 'altitude']] = pd.DataFrame(df_rest['point'].tolist(), index=df_rest.index)

In [None]:
df_rest

In [None]:
# map of the remaining restaurants of Vancouver
import folium

# Only rows with coordinates can be drawn; folium rejects NaN locations.
located_rest = df_rest.dropna(subset=['lat', 'long'])

# Centre the map on the mean of the restaurant coordinates
# (latitude / longitude were previously used here without ever being defined).
latitude, longitude = located_rest['lat'].mean(), located_rest['long'].mean()
map_Vancouver_rest = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one marker per restaurant, with its 'Place' string as the popup text
for lat, lng, place in zip(located_rest['lat'], located_rest['long'], located_rest['Place']):
    folium.CircleMarker(
        [lat, lng],
        radius=8,
        color='blue',
        popup=folium.Popup('{}'.format(place)),
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_Vancouver_rest)

map_Vancouver_rest

In [None]:
# Working table for the neighbourhood assignment: a new frame without the geocoding helpers.
df_final_rest = df_rest.drop(columns=['Place', 'location', 'point', 'altitude'])
df_final_rest.head()

## Haversine Distance
Now we need to merge the two datasets according to their neighbourhood.
1. The first task is to find out in which neighbourhood each restaurant is located.

In [None]:
# Haversine Distance is used to find the closest location

from math import radians, cos, sin, asin, sqrt
def dist(lat1, long1, lat2, long2, radius_km=6371):
    """Great-circle distance between two points, computed with the haversine formula.

    Parameters
    ----------
    lat1, long1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, long2 : float
        Latitude and longitude of the second point, in decimal degrees.
    radius_km : float, optional
        Radius of the sphere in kilometres; defaults to 6371 (Earth).

    Returns
    -------
    float
        Distance in kilometres. NaN if any coordinate is NaN.
    """
    # convert decimal degrees to radians
    lat1, long1, lat2, long2 = map(radians, [lat1, long1, lat2, long2])
    # haversine formula
    dlon = long2 - long1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # antipodal points), which would make sqrt/asin raise a domain error.
    # Plain comparisons are used so that a NaN passes through unchanged.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    c = 2 * asin(sqrt(a))
    return radius_km * c

In [None]:
# find closest neighbourhood
def find_nearest(lat, long):
    """Return the 'Neighbourhood' value of the df_new row closest to (lat, long).

    Closeness is measured with `dist` between the given point and each row's
    'lat' / 'long' values.
    """
    def distance_to(row):
        return dist(lat, long, row['lat'], row['long'])

    closest_label = df_new.apply(distance_to, axis=1).idxmin()
    return df_new.loc[closest_label, 'Neighbourhood']

In [None]:
# Tag every restaurant with the neighbourhood returned by find_nearest for its coordinates.
df_final_rest['Neighbourhood'] = [
    find_nearest(rest_lat, rest_long)
    for rest_lat, rest_long in zip(df_final_rest['lat'], df_final_rest['long'])
]
df_final_rest.head()

In [None]:
df_final_rest.groupby('Neighbourhood').count()

## Analyze neighbourhood -> one hot encoding for type of cuisines

In [None]:
# one hot encoding: one indicator column per cuisine type, named after the type itself
cuisine_indicators = pd.get_dummies(df_final_rest[['Type of Cusine']], prefix='', prefix_sep='')

# put the neighbourhood in front of the indicator columns
vancouver_onehot = pd.concat([df_final_rest[['Neighbourhood']], cuisine_indicators], axis=1)
vancouver_onehot.head()

Parameter that selects which population group (and the matching cuisine type) the clustering analyses

In [None]:
# If you want to run the clustering for another group, just adjust this string.
# It is used to build both the '<group> Restaurant' column name and the
# 'Percentage of <group>' column name.
str_cluster = "Chinese"
percentage_people = 'Percentage of ' + str_cluster

In [None]:
# Per neighbourhood, the mean of the indicator column of the selected cuisine type;
# the result is indexed by 'Neighbourhood' and holds that single column.
vancouver_grouped = vancouver_onehot.groupby('Neighbourhood').mean()[[str_cluster + ' Restaurant']]
vancouver_grouped.head()

In [None]:
# Neighbourhood-level inputs of the clustering: position, selected population share, income.
df_x = (
    df_new[['Neighbourhood', 'lat', 'long', percentage_people, 'weighted income average']]
    .reset_index(drop=True)
)
df_x.head()

In [None]:
# Final merge of the two dataframes on 'Neighbourhood', with the columns put in a fixed order.
final_columns = [
    "Neighbourhood",
    "lat",
    "long",
    "weighted income average",
    percentage_people,
    str_cluster + " Restaurant",
]
df_final = df_x.merge(vancouver_grouped, on='Neighbourhood')[final_columns]
df_final.head()

## Visualization of the distribution of the restaurants
Plotting a bar graph to showcase the frequency distribution of Chinese restaurants in each neighbourhood.

In [None]:
# sort by descending order
vancouver_grouped.sort_values(by=str_cluster +' Restaurant', ascending=False, inplace=True)

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.style.use('ggplot')
vancouver_grouped.plot(kind='bar', figsize=(15,5))

plt.title('Mean Frequency of ' + str_cluster + ' Restaurants in Each Neighbourhood in Vancouver')
plt.xlabel('Neighbourhood')
plt.ylabel('Mean frequency of '+ str_cluster+' Restaurants')

plt.show()

In [None]:
# Plotting a bar graph to illustrate the distribution of Chinese population group in each neighbourhood

In [None]:
# Share of the selected population group per neighbourhood, highest first.
df_ethnic = (
    df_new[['Neighbourhood', percentage_people]]
    .set_index('Neighbourhood')
    .sort_values(by=percentage_people, ascending=False)
)

# bar graph of the sorted shares
ax = df_ethnic.plot(kind='bar', figsize=(13, 5))
ax.set_title('Distribution of ' + str_cluster + ' in Each Neighbourhood in Vancouver')
ax.set_xlabel('Neighbourhood')
ax.set_ylabel(percentage_people)

plt.show()

In [None]:
# Weighted average income per neighbourhood, highest first.
# A separate name is used so the df_income table built earlier is not overwritten.
income_by_neighbourhood = (
    df_new[['Neighbourhood', 'weighted income average']]
    .set_index('Neighbourhood')
    .sort_values(by='weighted income average', ascending=False)
)

# Bar graph. The labels name the quantity that is actually plotted: the column is the
# weighted average income computed in this notebook, not a median household income.
income_by_neighbourhood.plot(kind='bar', figsize=(13,5))
plt.title('Distribution of Weighted Average Income in Each Neighbourhood in Vancouver')
plt.xlabel('Neighbourhood')
plt.ylabel('Weighted Average Income')

plt.show()

# Cluster Neighbourhoods

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Feature matrix of the clustering, selected by name (the last three columns of df_final).
feature_columns = ["weighted income average", percentage_people, str_cluster + " Restaurant"]
# Convert to a float array first: np.nan_to_num returns non-float input (such as the
# object array produced by df_final.values) unchanged, so NaNs would survive it.
X = df_final[feature_columns].to_numpy(dtype=float)
X = np.nan_to_num(X)
Clus_dataSet = StandardScaler().fit_transform(X)
Clus_dataSet

In [None]:
# Standardised features as a DataFrame with readable column names.
normalized_names = [
    'weighted income average',
    '% ' + str_cluster,
    'No. of ' + str_cluster + ' Restaurants',
]
df_normalized = pd.DataFrame(Clus_dataSet, columns=normalized_names)
df_normalized.head()

In [None]:
from sklearn.cluster import KMeans

k_values = range(3, 11)
error_cost = []

for k in k_values:
    try:
        # fixed seed so the curve is the same on every run
        KM = KMeans(n_clusters=k, max_iter=100, random_state=0).fit(df_normalized)
    except ValueError as err:
        # A failed fit has no inertia_: record a gap for this k instead of reading
        # an attribute that does not exist.
        print('KMeans failed for k =', k, '-', err)
        error_cost.append(float('nan'))
        continue

    # squared error of the clustered points (scaled down by 100)
    error_cost.append(KM.inertia_ / 100)

# plot the K values against the squared error cost
plt.figure(figsize=(13,7))
plt.plot(k_values, error_cost, color='r', linewidth=3)
plt.xlabel('Number of k clusters')
plt.ylabel('Squared Error (Cost)')
plt.grid(color='white', linestyle='-', linewidth=2)

plt.show()

In [None]:
from yellowbrick.cluster import KElbowVisualizer

# Instantiate the clustering model and visualizer
model = KMeans(random_state=0)
visualizer = KElbowVisualizer(model, k=(3,11))

# Fit on the standardised features - the same data the other KMeans fits in this
# notebook use - rather than on the raw, unscaled X.
visualizer.fit(df_normalized)
visualizer.show();

In [None]:
# number of clusters
kclusters = 4

# k-means clustering on the standardised features, with a fixed seed
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(df_normalized)

# cluster labels of the first ten rows
kmeans.labels_[:10]

In [None]:
# Build the clustered table from df_final: df_normalized (and so kmeans.labels_) was
# computed from df_final's rows, so neighbourhood, scaled value and label line up
# one-to-one. df_x can hold neighbourhoods that the merge with the restaurant data
# dropped, which would shift every label after the first missing row.
restaurant_column = 'No. of ' + str_cluster + ' Restaurants'
df_clustered = (
    df_final[['Neighbourhood', 'lat', 'long', percentage_people, 'weighted income average']]
    .reset_index(drop=True)
)
# df_normalized is left untouched, so this cell can be re-run.
df_clustered[restaurant_column] = df_normalized[restaurant_column].to_numpy()
df_clustered.insert(0, 'Cluster Label', kmeans.labels_)

In [None]:
df_clustered.head()

In [None]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# Only rows with coordinates can be drawn; folium rejects NaN locations.
located_clusters = df_clustered.dropna(subset=['lat', 'long'])

# create map, centred on the mean of the clustered neighbourhood coordinates
# (latitude / longitude were previously used here without ever being defined)
latitude, longitude = located_clusters['lat'].mean(), located_clusters['long'].mean()
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# one colour per cluster label, evenly spaced over the rainbow colour map
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add markers to the map
for lat, lon, poi, cluster in zip(located_clusters['lat'], located_clusters['long'],
                                  located_clusters['Neighbourhood'], located_clusters['Cluster Label']):
    # index the palette with the label itself, not label - 1
    cluster_colour = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True),
        color=cluster_colour,
        fill=True,
        fill_color=cluster_colour,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [None]:
# examine clusters

In [None]:
# Cluster 0
df_clustered.loc[df_clustered['Cluster Label'] == 0]

In [None]:
# Cluster 1
df_clustered.loc[df_clustered['Cluster Label'] == 1]

In [None]:
# Cluster 2
df_clustered.loc[df_clustered['Cluster Label'] == 2]

In [None]:
# Cluster 3
df_clustered.loc[df_clustered['Cluster Label'] == 3]

In [None]:
# Cluster 4
# NOTE(review): kclusters is set to 4 above, so the labels presumably run from 0 to 3
# and this selection comes back empty - confirm, and remove the cell if so.
df_clustered.loc[df_clustered['Cluster Label'] == 4]