In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import math
from sklearn.model_selection import train_test_split, validation_curve, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import silhouette_score 
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score

In [None]:
# # Mount Collab to Drive
# from google.colab import drive
# drive.mount('/content/drive/')

# Load Data

In [None]:
# file = '/content/drive/MyDrive/[3-NavSafe] INDENG 290 DATA-X/alternative_classification_data_localized.csv'
file = 'alternative_classification_data_localized.csv'
df_data = pd.read_csv(file, index_col=0)
df_data.head()

In [None]:
# neighborhood_file = '/content/drive/MyDrive/[3-NavSafe] INDENG 290 DATA-X/data_neighborhood_safety.csv'
neighborhood_file = 'data_neighborhood_safety.csv'
neighborhood = pd.read_csv(neighborhood_file)
neighborhood.head()

In [None]:
# Attach the neighbourhood-level safety ratings to every record.
# validate='many_to_one' guarantees each record matches at most one neighbourhood
# row, so df_all keeps exactly the rows (and row order) of df_data. Later cells
# rely on that when they write cluster labels back onto df_data positionally.
df_all = (
    df_data
    .merge(neighborhood, how='left', left_on='Analysis Neighborhood',
           right_on='Neighborhood', validate='many_to_one')
    .drop(['Analysis Neighborhood', 'Neighborhood'], axis=1)
)
df_all.head()

In [None]:
def safety_calc(row):
    """Return the safety rating that applies to the row's time segment.

    'Morning' reads 'Average of safe_day', 'Afternoon' reads
    'Average of safe_rate', and every other segment reads
    'Average of safe_night'.
    """
    column_by_segment = {
        'Morning': 'Average of safe_day',
        'Afternoon': 'Average of safe_rate',
    }
    # Any segment not listed above falls back to the night-time rating.
    return row[column_by_segment.get(row['Time Seg'], 'Average of safe_night')]

# Collapse the three neighbourhood ratings into one 'Safe' column chosen by
# time segment, then discard the source columns.
safe_source_cols = ['Average of safe_day', 'Average of safe_night', 'Average of safe_rate']
df_all['Safe'] = df_all.apply(safety_calc, axis=1)
df_all = df_all.drop(columns=safe_source_cols)
df_all.head()

In [None]:
# Rule-based label: a record is flagged Avoid=1 when any of the count columns
# '1.0', '2.0' or '3.0' exceeds its threshold (75 / 100 / 200).
# NOTE(review): thresholds are hand-picked; an earlier variant also used the safety rating.
exceeds_threshold = (df_all['1.0'] > 75) | (df_all['2.0'] > 100) | (df_all['3.0'] > 200)
df_all['Avoid'] = 0
df_all.loc[exceeds_threshold, 'Avoid'] = 1
df_all.head()

In [None]:
# One-hot encode the time segment (first level dropped) and remove the
# coordinates and raw segment label to obtain the modelling frame.
time = pd.get_dummies(df_all['Time Seg'], drop_first=True)
model_inputs = df_all.drop(columns=['NewLat', 'NewLon', 'Time Seg'])
df_train = pd.concat([time, model_inputs], axis=1)
df_train.head()

# Supervised - Logistic Regression

This is the old modeling method, which suffers from the quasi-complete separation problem. We keep it here to show the learning path, but it is replaced by unsupervised methods later.

In [None]:
x_train, x_test, y_train, y_test = train_test_split(df_train.drop('Avoid',axis=1), df_train['Avoid'], test_size=0.3, random_state=10)

In [None]:
def plot_cv_curve(hyperparm_grid,train_scores,val_scores):
    """Plot mean +/- std of train and validation scores against a hyperparameter grid.

    train_scores and val_scores are reduced along axis 1 (one row per grid
    value). Returns the matplotlib Axes so callers can relabel it.
    """
    ax = plt.subplot(111)
    for fold_scores, series_name in ((train_scores, "train"), (val_scores, "validation")):
        ax.errorbar(hyperparm_grid,
                    np.mean(fold_scores, axis=1),
                    yerr=np.std(fold_scores, axis=1),
                    label=series_name)
    ax.set_xlabel('Hyperparameter')
    ax.set_ylabel('Score')
    ax.legend()
    ax.grid()
    return ax

In [None]:
# 5-fold validation curve over the inverse regularisation strength C.
kf = KFold(5, shuffle=True, random_state=10)
C_grid = np.logspace(-2, 2, 10)

# Only the count columns are scaled (no centring); everything else passes through.
features = ['1.0', '2.0', '3.0', '4.0', '5.0', '6.0']
scale_counts = ColumnTransformer(
    [('keep', StandardScaler(with_mean=False), features)],
    remainder='passthrough',
)
logit_pipe = Pipeline([
    ('columns', scale_counts),
    ('logit', LogisticRegression(max_iter=5000, solver='newton-cg')),
])
train_scores, val_scores = validation_curve(
    logit_pipe, x_train, y_train,
    param_name='logit__C', param_range=C_grid, cv=kf,
)

ax = plot_cv_curve(C_grid, train_scores, val_scores)
ax.set_xlabel('C')
ax.set_ylabel('Accuracy')
ax.set_xscale('log')

In [None]:
# Refit with the chosen C=10 and threshold the positive-class probability at 0.5.
logit_final = Pipeline([
    ('columns', ColumnTransformer([('keep', StandardScaler(with_mean=False), features)],
                                  remainder='passthrough')),
    ('logit', LogisticRegression(max_iter=5000, solver='newton-cg', C=10)),
])
logit_final.fit(x_train, y_train)
pred = logit_final.predict_proba(x_test)[:, 1]
y_pred = [int(p >= 0.5) for p in pred]

In [None]:
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
cm

In [None]:
print ("\nPrecision:", tp/(tp+fp))
print ("\nRecall:", tp/(tp+fn))

# Unsupervised - Clustering

We realized that this is really an unsupervised machine learning problem. In this section, we explore several clustering methods and tune their hyperparameters.

In [None]:
# https://machinelearningmastery.com/clustering-algorithms-with-python/

In [None]:
df_cluster = df_train.drop(['Avoid'], axis=1)

In [None]:
df_cluster

## K-means Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt

### Tuning

In this sub-section, we use the elbow method with several metrics to tune the number of clusters.

In [None]:
# Elbow data for K-means: for each k record the distortion (mean distance from
# each sample to its nearest centroid) and the inertia (within-cluster sum of squares).
distortions = []
inertias = []
mapping1 = {}   # k -> distortion
mapping2 = {}   # k -> inertia
K = range(1, 15)

for k in K:
    # Fit once per k; the fixed seed makes the curve reproducible across runs.
    kmeanModel = KMeans(n_clusters=k, random_state=10).fit(df_cluster)

    nearest_center_dist = cdist(df_cluster, kmeanModel.cluster_centers_, 'euclidean').min(axis=1)
    distortion = nearest_center_dist.sum() / df_cluster.shape[0]

    distortions.append(distortion)
    inertias.append(kmeanModel.inertia_)
    mapping1[k] = distortion
    mapping2[k] = kmeanModel.inertia_

In [None]:
for key, val in mapping1.items():
    print(f'{key} : {val}')

In [None]:
plt.plot(K, inertias, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('The Elbow Method using Inertia')
plt.show()

In [None]:
# Silhouette score for K-means over k = 2, 4, ..., 14
from yellowbrick.cluster import KElbowVisualizer

model = KMeans(random_state=10)
visualizer = KElbowVisualizer(
    model,
    k=range(2, 15, 2),
    metric='silhouette',
    timings=True,
    locate_elbow=True,
)
visualizer.fit(df_cluster)
plt.xlabel('Values of K')
plt.ylabel('silhouette score')
plt.title('The Elbow Method using silhouette')
plt.show()

# Higher silhouette is better: report every k that attains the maximum.
df3 = pd.DataFrame({'centers': visualizer.k_values_, 'scores': visualizer.k_scores_})
df4 = df3.loc[df3['scores'].eq(df3['scores'].max())]
print('Optimal number of clusters based on silhouette score:', df4['centers'].tolist())

In [None]:
# Calinski-Harabasz score for K-means over k = 2, 4, ..., 14
from yellowbrick.cluster import KElbowVisualizer

model = KMeans(random_state=10)
visualizer = KElbowVisualizer(
    model,
    k=range(2, 15, 2),
    metric='calinski_harabaz',
    timings=True,
    locate_elbow=True,
)
visualizer.fit(df_cluster)
plt.xlabel('Values of K')
plt.ylabel('calinski harabasz score')
plt.title('The Elbow Method using calinski harabasz')
plt.show()

# Higher Calinski-Harabasz is better: report every k that attains the maximum.
df3 = pd.DataFrame({'centers': visualizer.k_values_, 'scores': visualizer.k_scores_})
df4 = df3.loc[df3['scores'].eq(df3['scores'].max())]
print('Optimal number of clusters based on calinski harabasz:', df4['centers'].tolist())


In [None]:
# Distortion score for K-means over k = 2, 4, ..., 14
from yellowbrick.cluster import KElbowVisualizer

model = KMeans(random_state=10)
visualizer = KElbowVisualizer(model, k=range(2, 15, 2), metric='distortion',
                              timings=True, locate_elbow=True)
visualizer.fit(df_cluster)

# Table of the raw scores, kept for inspection.
df3 = pd.DataFrame({'centers': visualizer.k_values_, 'scores': visualizer.k_scores_})

# Distortion is an error measure that shrinks as k grows, so its maximum is
# always the smallest k tried. The meaningful choice is the elbow of the curve,
# which the visualizer locates (it is None when no elbow is detected).
if visualizer.elbow_value_ is None:
    print('Optimal number of clusters based on distortion: no elbow detected')
else:
    print('Optimal number of clusters based on distortion:', [visualizer.elbow_value_])


Since different metrics give quite different k results, we choose k=4 and k=10 as the possible optimal k.

### Train

In [None]:
from numpy import unique
from numpy import where

First, k=4

In [None]:
# K-means with k=4. The seed is fixed (same value as the tuning cells) because
# later cells refer to clusters by their integer id, and K-means ids change
# from run to run when the initialisation is random.
kmean_model = KMeans(n_clusters=4, random_state=10)
kmean_model.fit(df_cluster)
# Cluster id for every row of df_cluster, in row order.
kmean_yhat = kmean_model.predict(df_cluster)
# Distinct cluster ids that were actually assigned.
clusters = unique(kmean_yhat)

In [None]:
score_kmean_s = silhouette_score(df_cluster, kmean_model.labels_, metric='euclidean')
score_kmean_c = calinski_harabasz_score(df_cluster, kmean_model.labels_)
score_kmean_d = davies_bouldin_score(df_cluster, kmean_yhat)
print('Silhouette Score: %.4f' % score_kmean_s)
print('Calinski Harabasz Score: %.4f' % score_kmean_c)
print('Davies Bouldin Score: %.4f' % score_kmean_d)

In [None]:
# Write the model outputs back onto the original records. df_cluster is derived
# row-for-row from df_data, and the labels are positional, so 'Safe' is written
# positionally as well. Index-aligned assignment would silently misplace values
# if df_data's CSV index were not 0..n-1.
assert len(df_data) == len(df_cluster), 'df_cluster must have one row per df_data row'
df_kmean = df_data.copy()
df_kmean['Safe'] = df_cluster['Safe'].to_numpy()
df_kmean['Cluster'] = kmean_yhat

In [None]:
pd.Series(kmean_yhat).value_counts()

In [None]:
# check what this cluster looks like
df_kmean[df_kmean['Cluster']==2].describe()

Then, k=10

In [None]:
# K-means with k=10. The seed is fixed so cluster ids (referenced by number in
# later cells) are reproducible across kernel restarts.
kmean_model_10 = KMeans(n_clusters=10, random_state=10)
kmean_model_10.fit(df_cluster)
# Cluster id for every row of df_cluster, in row order.
kmean_yhat_10 = kmean_model_10.predict(df_cluster)
clusters = unique(kmean_yhat_10)

# Labels are positional, so 'Safe' is written positionally too.
assert len(df_data) == len(df_cluster), 'df_cluster must have one row per df_data row'
df_kmean_10 = df_data.copy()
df_kmean_10['Safe'] = df_cluster['Safe'].to_numpy()
df_kmean_10['Cluster'] = kmean_yhat_10

In [None]:
score_kmean_s = silhouette_score(df_cluster, kmean_model_10.labels_, metric='euclidean')
score_kmean_c = calinski_harabasz_score(df_cluster, kmean_model_10.labels_)
score_kmean_d = davies_bouldin_score(df_cluster, kmean_yhat_10)
print('Silhouette Score: %.4f' % score_kmean_s)
print('Calinski Harabasz Score: %.4f' % score_kmean_c)
print('Davies Bouldin Score: %.4f' % score_kmean_d)

The result is very random: it depends on the choice of initial points.

## Agglomerative Clustering

### Tuning

In this sub-section, we use the elbow method with several metrics to tune the number of clusters.

In [None]:
import scipy.cluster.hierarchy as sch
dendrogram = sch.dendrogram(sch.linkage(df_cluster, method  = "ward"))
plt.title('Dendrogram')
plt.ylabel('Euclidean distances')
plt.show()

In [None]:
# Silhouette score for agglomerative clustering over k = 2, 4, ..., 14
# AgglomerativeClustering is imported here because this is its first use in the
# notebook; without it a fresh-kernel run stops with a NameError.
from sklearn.cluster import AgglomerativeClustering
from yellowbrick.cluster import KElbowVisualizer

model = AgglomerativeClustering()
visualizer = KElbowVisualizer(model, k=range(2, 15, 2), metric='silhouette',
                              timings=True, locate_elbow=True)
visualizer.fit(df_cluster)
plt.xlabel('Values of K')
plt.ylabel('silhouette score')
plt.title('The Elbow Method using silhouette')
plt.show()

# Higher silhouette is better: report every k that attains the maximum.
df3 = pd.DataFrame(visualizer.k_values_, columns=['centers'])
df3['scores'] = visualizer.k_scores_
df4 = df3[df3.scores == df3.scores.max()]
print('Optimal number of clusters based on silhouette score:', df4['centers'].tolist())

In [None]:
# Calinski-Harabasz score for agglomerative clustering over k = 2, 4, ..., 14
# Imported locally so this cell runs on a fresh kernel: the notebook's only
# other import of AgglomerativeClustering comes later, in the training section.
from sklearn.cluster import AgglomerativeClustering
from yellowbrick.cluster import KElbowVisualizer

model = AgglomerativeClustering()
visualizer = KElbowVisualizer(model, k=range(2, 15, 2), metric='calinski_harabaz',
                              timings=True, locate_elbow=True)
visualizer.fit(df_cluster)
plt.xlabel('Values of K')
plt.ylabel('calinski_harabaz score')
plt.title('The Elbow Method using calinski_harabaz')
plt.show()

# Higher Calinski-Harabasz is better: report every k that attains the maximum.
df3 = pd.DataFrame(visualizer.k_values_, columns=['centers'])
df3['scores'] = visualizer.k_scores_
df4 = df3[df3.scores == df3.scores.max()]
print('Optimal number of clusters based on calinski_harabaz score:', df4['centers'].tolist())

In [None]:
# Distortion score for agglomerative clustering over k = 2, 4, ..., 14
# Imported locally so this cell runs on a fresh kernel: the notebook's only
# other import of AgglomerativeClustering comes later, in the training section.
from sklearn.cluster import AgglomerativeClustering
from yellowbrick.cluster import KElbowVisualizer

model = AgglomerativeClustering()
visualizer = KElbowVisualizer(model, k=range(2, 15, 2), metric='distortion',
                              timings=True, locate_elbow=True)
visualizer.fit(df_cluster)
plt.xlabel('Values of K')
plt.ylabel('distortion score')
plt.title('The Elbow Method using distortion')
plt.show()

# Table of the raw scores, kept for inspection.
df3 = pd.DataFrame(visualizer.k_values_, columns=['centers'])
df3['scores'] = visualizer.k_scores_

# Distortion shrinks as k grows, so its maximum is always the smallest k tried.
# Report the elbow located by the visualizer instead (None when none is found).
if visualizer.elbow_value_ is None:
    print('Optimal number of clusters based on distortion score: no elbow detected')
else:
    print('Optimal number of clusters based on distortion score:', [visualizer.elbow_value_])

Since different metrics give quite different k results, we choose k=3 and k=10 as the possible optimal k.

### Train

agg, k=3

In [None]:
# Agglomerative clustering with k=3
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import AgglomerativeClustering
from matplotlib import pyplot

agg_model = AgglomerativeClustering(n_clusters=3)
# Fitting stores one label per row of df_cluster on the estimator.
agg_yhat = agg_model.fit(df_cluster).labels_
# Distinct cluster ids that were assigned.
clusters = unique(agg_yhat)

In [None]:
score_agg_s = silhouette_score(df_cluster, agg_model.labels_, metric='euclidean')
score_agg_c = calinski_harabasz_score(df_cluster, agg_model.labels_)
score_agg_d = davies_bouldin_score(df_cluster, agg_yhat)
print('Silhouette Score: %.4f' % score_agg_s)
print('Calinski Harabasz Score: %.4f' % score_agg_c)
print('Davies Bouldin Score: %.4f' % score_agg_d)

In [None]:
# Write the agglomerative (k=3) labels back onto the original records.
# Labels are positional, so 'Safe' is written positionally too. Index-aligned
# assignment would misplace values if df_data's index were not 0..n-1.
assert len(df_data) == len(df_cluster), 'df_cluster must have one row per df_data row'
df_agg = df_data.copy()
df_agg['Safe'] = df_cluster['Safe'].to_numpy()
df_agg['Cluster'] = agg_yhat

In [None]:
df_agg['Cluster'].value_counts()

In [None]:
df_agg[df_agg['Cluster']==1].describe()

agg, k=10

In [None]:
# Agglomerative clustering with k=10
agg_model_10 = AgglomerativeClustering(n_clusters=10)
# Fit and obtain one cluster id per row of df_cluster, in row order.
agg_yhat_10 = agg_model_10.fit_predict(df_cluster)
clusters = unique(agg_yhat_10)

# Labels are positional, so 'Safe' is written positionally too.
assert len(df_data) == len(df_cluster), 'df_cluster must have one row per df_data row'
df_agg_10 = df_data.copy()
df_agg_10['Safe'] = df_cluster['Safe'].to_numpy()
df_agg_10['Cluster'] = agg_yhat_10

In [None]:
score_kmean_s = silhouette_score(df_cluster, agg_model_10.labels_, metric='euclidean')
score_kmean_c = calinski_harabasz_score(df_cluster, agg_model_10.labels_)
score_kmean_d = davies_bouldin_score(df_cluster, agg_yhat_10)
print('Silhouette Score: %.4f' % score_kmean_s)
print('Calinski Harabasz Score: %.4f' % score_kmean_c)
print('Davies Bouldin Score: %.4f' % score_kmean_d)

Similar algorithm as kmeans, but has a stable and fixed outcome

## Gaussian mixture model

In [None]:
# gaussian mixture clustering
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.mixture import GaussianMixture
from matplotlib import pyplot

### Tuning

In this sub-section, we use the elbow method with several metrics to tune the number of clusters.

In [None]:

# AIC/BIC curves for Gaussian mixtures with 2, 4, ..., 18 components.
# The seed is fixed (same value as the grid-search cells) so the curves are
# reproducible; EM initialisation is otherwise random.
n_components = np.arange(2, 20, 2)
models = [GaussianMixture(n_components=n, random_state=10).fit(df_cluster)
          for n in n_components]

plt.plot(n_components, [m.bic(df_cluster) for m in models], label='BIC')
plt.plot(n_components, [m.aic(df_cluster) for m in models], label='AIC')
plt.legend(loc='best')
plt.xlabel('n_components');

In [None]:
# Silhouette curve for Gaussian mixtures with 2..19 components.
# Seeded so the curve is reproducible across runs.
n_components = np.arange(2, 20)
models = [GaussianMixture(n_components=n, random_state=10).fit(df_cluster)
          for n in n_components]
# Hard cluster assignment of every sample under each fitted mixture.
m_yhat = [m.predict(df_cluster) for m in models]

silhouette = [silhouette_score(df_cluster, labels, metric='euclidean') for labels in m_yhat]

plt.plot(n_components, silhouette, label='silhouette')
plt.legend(loc='best')
plt.xlabel('n_components');

In [None]:
# Grid search over covariance type and component count, scored by BIC (lower is better).
n_components = range(2, 20, 2)
covariance_type = ['spherical', 'tied', 'diag', 'full']
score = []
for cov in covariance_type:
    for n_comp in n_components:
        gmm = GaussianMixture(n_components=n_comp, covariance_type=cov,
                              random_state=10, max_iter=10000)
        gmm.fit(df_cluster)
        score.append((cov, n_comp, gmm.bic(df_cluster)))
score_1 = pd.DataFrame(score)
score_1.columns = ['Covariance_Type', 'N_Components', 'BIC_Score']
# Keep the configuration(s) with the minimum BIC.
score_2 = score_1.loc[score_1['BIC_Score'].eq(score_1['BIC_Score'].min())]

score_2.head(n=2)

In [None]:
# Grid search over covariance type and component count, scored by silhouette (higher is better).
n_components = range(2, 20, 2)
covariance_type = ['spherical', 'tied', 'diag', 'full']
score = []
for cov in covariance_type:
    for n_comp in n_components:
        gmm = GaussianMixture(n_components=n_comp, covariance_type=cov,
                              random_state=10, max_iter=10000)
        model = gmm.fit(df_cluster)
        # Hard labels for the training data under this mixture.
        model_2 = model.predict(df_cluster)
        score_s = silhouette_score(df_cluster, model_2, metric='euclidean')
        score.append((cov, n_comp, score_s))
score_1 = pd.DataFrame(score)
score_1.columns = ['Covariance_Type', 'N_Components', 'Silhouette_Score']
# Keep the configuration(s) with the maximum silhouette.
score_2 = score_1.loc[score_1['Silhouette_Score'].eq(score_1['Silhouette_Score'].max())]
score_2.head(n=2)

In [None]:

# Grid search over covariance type and component count, scored by Calinski-Harabasz (higher is better).
n_components = range(2, 20, 2)
covariance_type = ['spherical', 'tied', 'diag', 'full']
score = []
for cov in covariance_type:
    for n_comp in n_components:
        gmm = GaussianMixture(n_components=n_comp, covariance_type=cov,
                              random_state=10, max_iter=10000)
        model = gmm.fit(df_cluster)
        # Hard labels for the training data under this mixture.
        model_2 = model.predict(df_cluster)
        score_c = calinski_harabasz_score(df_cluster, model_2)
        score.append((cov, n_comp, score_c))
score_1 = pd.DataFrame(score)
score_1.columns = ['Covariance_Type', 'N_Components', 'Calinski_Harabasz_Score']
# Keep the configuration(s) with the maximum score.
score_2 = score_1.loc[score_1['Calinski_Harabasz_Score'].eq(score_1['Calinski_Harabasz_Score'].max())]
score_2.head(n=5)

### Train

gmm, k=4

In [None]:
# Gaussian mixture with 4 components and a tied covariance matrix.
# Seeded (same value as the grid search) so component ids, which later cells
# refer to by number, are reproducible across kernel restarts.
gmm_model = GaussianMixture(n_components=4, covariance_type='tied', random_state=10)
gmm_model.fit(df_cluster)
# Most likely component for every row of df_cluster, in row order.
gmm_yhat = gmm_model.predict(df_cluster)

In [None]:
score_gmm_s = silhouette_score(df_cluster, gmm_yhat, metric='euclidean')
score_gmm_c = calinski_harabasz_score(df_cluster, gmm_yhat)
score_gmm_d = davies_bouldin_score(df_cluster, gmm_yhat)
print('Silhouette Score: %.4f' % score_gmm_s)
print('Calinski Harabasz Score: %.4f' % score_gmm_c)
print('Davies Bouldin Score: %.4f' % score_gmm_d)

In [None]:
# Write the GMM (k=4) labels back onto the original records.
# Labels are positional, so 'Safe' is written positionally too. Index-aligned
# assignment would misplace values if df_data's index were not 0..n-1.
assert len(df_data) == len(df_cluster), 'df_cluster must have one row per df_data row'
df_gmm = df_data.copy()
df_gmm['Safe'] = df_cluster['Safe'].to_numpy()
df_gmm['Cluster'] = gmm_yhat
df_gmm['Cluster'].value_counts()

In [None]:
df_gmm[df_gmm['Cluster'].isin([1])].describe()

gmm, k=16

In [None]:
# Gaussian mixture with 16 components and full covariance matrices.
# Seeded so component ids are reproducible across kernel restarts.
gmm_model_16 = GaussianMixture(n_components=16, covariance_type='full', random_state=10)
gmm_model_16.fit(df_cluster)
# Most likely component for every row of df_cluster, in row order.
gmm_yhat_16 = gmm_model_16.predict(df_cluster)

# Labels are positional, so 'Safe' is written positionally too.
assert len(df_data) == len(df_cluster), 'df_cluster must have one row per df_data row'
df_gmm_16 = df_data.copy()
df_gmm_16['Safe'] = df_cluster['Safe'].to_numpy()
df_gmm_16['Cluster'] = gmm_yhat_16

In [None]:
score_kmean_s = silhouette_score(df_cluster, gmm_yhat_16, metric='euclidean')
score_kmean_c = calinski_harabasz_score(df_cluster, gmm_yhat_16)
score_kmean_d = davies_bouldin_score(df_cluster, gmm_yhat_16)
print('Silhouette Score: %.4f' % score_kmean_s)
print('Calinski Harabasz Score: %.4f' % score_kmean_c)
print('Davies Bouldin Score: %.4f' % score_kmean_d)

# Map visualization

In [None]:
!pip install gmaps
!pip install ipywidgets
!pip install widgetsnbextension
import gmaps 
import ipywidgets as widgets
from ipywidgets.embed import embed_minimal_html
import IPython

gmaps.configure(api_key='AIzaSyDgJrLjmtTKlpLjwAfmseJJ-w8ZEy_YHeM')

Assign weights [0.4, 0.3, 0.2, 0.1] to group0-group3 for visualization.

In [None]:
# rules = [0.4, 0.3, 0.2, 0.1] for group0 to group3
def assign_weights(df, rules):
  """Attach a per-cluster 'Weight' column to df (in place) and return df.

  For every distinct value of df['Cluster'], the weight is the rule-weighted
  sum of the cluster's column means: rules[0] * mean('1.0') +
  rules[1] * mean('2.0') + ... . Every row of the cluster receives that weight.

  Parameters:
    df: DataFrame with a 'Cluster' column and columns named '1.0', '2.0', ...
        (one per entry in rules).
    rules: sequence of numeric coefficients; rules[j] applies to column str(float(j+1)).

  Returns:
    The same DataFrame object, with 'Weight' added or overwritten.
  """
  feature_cols = [str(float(j + 1)) for j in range(len(rules))]
  # Start from a float sentinel: the weights written below are floats, and
  # writing floats into an int column is a deprecated incompatible-dtype assignment.
  df['Weight'] = -1.0
  for cluster_id in df['Cluster'].unique():
    members = df['Cluster'] == cluster_id
    weight = 0
    for coef, col in zip(rules, feature_cols):
      weight += coef * df.loc[members, col].mean()
    df.loc[members, 'Weight'] = weight
  return df

## kmeans, k=4

In [None]:
df_1 = assign_weights(df_kmean, [0.4, 0.3, 0.2, 0.1])
df_1

In [None]:
df_1['Weight'].value_counts()

In [None]:
# Evening records of all four k-means clusters, one point per distinct location.
centers = df_1[(df_1['Cluster'].isin([2,1,3,0])) & (df_1['Time Seg'].isin(['Evening']))][['NewLat','NewLon','Time Seg']].drop_duplicates()
# Index-aligned: each kept row picks up its own weight from df_1.
centers['Weight'] = df_1['Weight']

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and add that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

## kmeans, k=10

In [None]:
df_1_10 = assign_weights(df_kmean_10, [0.4, 0.3, 0.2, 0.1])
df_1_10

In [None]:
df_1_10['Weight'].value_counts()

In [None]:
# Evening records of all ten k-means clusters, one point per distinct location.
centers = df_1_10[(df_1_10['Cluster'].isin([9,5,4,6,1,3,8,2,7,0])) & (df_1_10['Time Seg'].isin(['Evening']))][['NewLat','NewLon','Time Seg']].drop_duplicates()
# Index-aligned: each kept row picks up its own weight from df_1_10.
centers['Weight'] = df_1_10['Weight']

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and add that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

## agglomerative, k=3

In [None]:
df_2 = assign_weights(df_agg, [0.4, 0.3, 0.2, 0.1])
df_2

In [None]:
df_2['Cluster'].value_counts()

In [None]:
# Evening records of all three agglomerative clusters, one point per distinct location.
centers = df_2[(df_2['Cluster'].isin([0,2,1])) & (df_2['Time Seg']=='Evening')][['NewLat','NewLon','Time Seg']].drop_duplicates()
# Index-aligned: each kept row picks up its own weight from df_2.
centers['Weight'] = df_2['Weight']

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and add that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

## agglomerative, k=10

In [None]:
df_2_10 = assign_weights(df_agg_10, [0.4, 0.3, 0.2, 0.1])
df_2_10['Cluster'].value_counts()

In [None]:
score_kmean_s = silhouette_score(df_cluster, agg_model_10.labels_, metric='euclidean')
score_kmean_c = calinski_harabasz_score(df_cluster, agg_model_10.labels_)
score_kmean_d = davies_bouldin_score(df_cluster, agg_yhat_10)
print('Silhouette Score: %.4f' % score_kmean_s)
print('Calinski Harabasz Score: %.4f' % score_kmean_c)
print('Davies Bouldin Score: %.4f' % score_kmean_d)

In [None]:
# Evening records from the agglomerative k=10 result, one point per distinct location.
centers = df_2_10[(df_2_10['Time Seg'].isin(['Evening']))][['NewLat','NewLon','Time Seg']].drop_duplicates()
# Index-aligned: each kept row picks up its own weight from df_2_10.
centers['Weight'] = df_2_10['Weight']

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and add that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

## gaussian, k=4

In [None]:
df_3 = assign_weights(df_gmm, [0.4, 0.3, 0.2, 0.1])
df_3

In [None]:
df_3['Cluster'].value_counts()

In [None]:
# Evening records from the GMM k=4 result, one point per distinct location.
centers = df_3[(df_3['Time Seg']=='Evening')][['NewLat','NewLon','Time Seg']].drop_duplicates()
# Index-aligned: each kept row picks up its own weight from df_3.
centers['Weight'] = df_3['Weight']

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and add that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

## gaussian, k=16

In [None]:
df_3_16 = assign_weights(df_gmm_16, [0.4, 0.3, 0.2, 0.1])
df_3_16['Cluster'].value_counts()

In [None]:
# Evening records from the GMM k=16 result, one point per distinct location.
centers = df_3_16[(df_3_16['Time Seg'].isin(['Evening']))][['NewLat','NewLon','Time Seg']].drop_duplicates()
# Index-aligned: each kept row picks up its own weight from df_3_16.
centers['Weight'] = df_3_16['Weight']

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and add that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

## average weights

for each unit, use the average of weights from all models to visualize the result

In [None]:
df_avg = df_1.copy()

In [None]:
# Average the per-record weights of the six clustering results.
# errors='ignore' keeps the cell re-runnable after 'Cluster' has been removed,
# and the divisor follows the list so adding or removing a model cannot leave
# a stale constant behind.
df_avg = df_avg.drop(columns=['Cluster'], errors='ignore')
model_frames = [df_1, df_1_10, df_2, df_2_10, df_3, df_3_16]
df_avg['Weight'] = sum(frame['Weight'] for frame in model_frames) / len(model_frames)
df_avg

In [None]:
# Evening records with the model-averaged weight, one point per distinct location.
centers = df_avg[(df_avg['Time Seg'].isin(['Evening']))][['NewLat','NewLon','Time Seg']].drop_duplicates()
# Index-aligned: each kept row picks up its own weight from df_avg.
centers['Weight'] = df_avg['Weight']

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and add that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

# Compare with test set

We create a test set with 120+ records to test the previous 7 models. We will classify the test set with each model result and generate scores for each model to evaluate the performance.

In [None]:
from sklearn.metrics import confusion_matrix
from itertools import compress, product

In [None]:
# test_file = '/content/drive/MyDrive/[3-NavSafe] INDENG 290 DATA-X/data_testset.csv'
test_file = 'data_testset.csv'
df_test = pd.read_csv(test_file)

In [None]:
# Drop the index column saved with the CSV. Non-in-place with errors='ignore'
# so the cell can be re-run without raising once the column is gone.
df_test = df_test.drop(columns=['Unnamed: 0'], errors='ignore')
df_test.head()

In [None]:
df_test.shape

In [None]:
# print model clusters & prediction
def compare_model1(df, df_test):
  """Join the test set to a clustering result and print summary counts.

  Prints the size of each cluster in df, left-joins df onto df_test by
  ('NewLat', 'NewLon', 'Time Seg'), and prints the counts of each
  (Avoid, Cluster) pair in the joined frame.

  Returns:
    (joined frame, cluster-size Series from df['Cluster'].value_counts()).
  """
  cluster_sizes = df['Cluster'].value_counts()
  print(cluster_sizes)

  join_keys = ['NewLat', 'NewLon', 'Time Seg']
  joined = df_test.merge(df, on=join_keys, how='left')

  label_vs_cluster = joined[['Avoid', 'Cluster']].value_counts()
  print(label_vs_cluster)
  return joined, cluster_sizes

In [None]:
# print accuracy / recall / precision
def compare_model2(df_comp, rules = [2]):
  """Score a cluster-to-label rule against the test labels.

  A record is predicted 0 (not avoid) when its 'Cluster' is listed in rules and
  1 (avoid) otherwise; the prediction is stored in df_comp['pred'] (in place)
  and compared with df_comp['Avoid'].

  Parameters:
    df_comp: DataFrame with 'Cluster' and 'Avoid' columns.
    rules: collection of cluster ids treated as "not avoid". Only read, never mutated.

  Returns:
    (accuracy, recall, precision). Recall or precision is NaN when its
    denominator is zero.
  """
  df_comp['pred'] = df_comp['Cluster'].apply(lambda x: 0 if x in rules else 1)

  # labels=[0, 1] forces a 2x2 matrix even when only one class occurs in both
  # the truth and the prediction; without it ravel() yields a single value and
  # the four-way unpacking fails.
  tn, fp, fn, tp = confusion_matrix(df_comp['Avoid'], df_comp['pred'], labels=[0, 1]).ravel()
  accuracy = (tn+tp)/(tn+fp+fn+tp)
  recall = (tp)/(fn+tp) if (fn+tp) else float('nan')
  precision = (tp)/(fp+tp) if (fp+tp) else float('nan')
  return accuracy, recall, precision

In [None]:
# define the rules for classifying: combinations of all clusters except the last one
def find_rules(df_cluster_info, end=-1):
  """Enumerate candidate "not avoid" cluster sets.

  The first entry of df_cluster_info is always included. Every subset of the
  entries from position 1 up to (but excluding) position `end` is appended in
  turn, giving 2**len(entries) rules. Each rule is a list of cluster ids.
  """
  avoid_0 = [df_cluster_info[0]]
  items = df_cluster_info[1:end]
  rules = []
  # One 0/1 mask per subset of `items`.
  for mask in product([0, 1], repeat=len(items)):
    chosen = set(compress(items, mask))
    rules.append(avoid_0 + list(chosen))
  return rules

In [None]:
# score every rule and pick out the one(s) with the highest recall
def check_rules(df_comp, rules):
  """Evaluate each candidate rule with compare_model2.

  Args:
    df_comp: merged test/model frame passed through to compare_model2.
    rules: list of rules (each a list of cluster labels treated as safe).

  Returns:
    (best, table): `table` has one row per rule with columns 'rule',
    'accuracy', 'recall' and 'precision'; `best` is the subset of rows whose
    recall equals the maximum recall in the table.
  """
  scores = [compare_model2(df_comp, rule) for rule in rules]
  table = pd.DataFrame({
    'rule': rules,
    'accuracy': [acc for acc, _, _ in scores],
    'recall': [rec for _, rec, _ in scores],
    'precision': [prec for _, _, prec in scores],
  })
  best = table[table['recall'] == table['recall'].max()]
  return best, table

In [None]:
# %load_ext google.colab.data_table

## kmeans, k=4


In [None]:
# Attach the k-means (k=4) labels to the test set; compare_model1 also prints the cluster sizes.
df_1_comp, df_1_cluster_info = compare_model1(df_1, df_test)

# Candidate rules are built from the cluster labels in value_counts order.
df_1_rules = find_rules(df_1_cluster_info.index.tolist())
print(df_1_rules)

# df_1_result holds the rule(s) with the highest recall; df_1_table scores every rule.
df_1_result, df_1_table = check_rules(df_1_comp, df_1_rules)
print(df_1_result)
display(df_1_table)

In [None]:
# Size the x-axis from the table itself instead of a hard-coded rule count, so
# the plot still works if the number of candidate rules changes.
plt.plot(range(df_1_table.shape[0]), df_1_table['accuracy'], label='accuracy')
plt.plot(range(df_1_table.shape[0]), df_1_table['recall'], label='recall')
plt.plot(range(df_1_table.shape[0]), df_1_table['precision'], label='precision')
plt.legend(loc='best')
plt.xlabel('rules');

## kmeans, k=10

In [None]:
# Attach the k-means (k=10) labels to the test set; compare_model1 also prints the cluster sizes.
df_1_10_comp, df_1_10_cluster_info = compare_model1(df_1_10, df_test)

# Candidate rules are built from the cluster labels in value_counts order.
df_1_10_rules = find_rules(df_1_10_cluster_info.index.tolist())
print(df_1_10_rules)

# df_1_10_result holds the rule(s) with the highest recall; df_1_10_table scores every rule.
df_1_10_result, df_1_10_table = check_rules(df_1_10_comp, df_1_10_rules)
print(df_1_10_result)
display(df_1_10_table)

In [None]:
# Size the x-axis from the table itself instead of a hard-coded rule count, so
# the plot still works if the number of candidate rules changes.
plt.plot(range(df_1_10_table.shape[0]), df_1_10_table['accuracy'], label='accuracy')
plt.plot(range(df_1_10_table.shape[0]), df_1_10_table['recall'], label='recall')
plt.plot(range(df_1_10_table.shape[0]), df_1_10_table['precision'], label='precision')
plt.legend(loc='best')
plt.xlabel('rules');

In [None]:
df_1_10_table.loc[192]

## agglomerative, k=3

In [None]:
# Attach the agglomerative (k=3) labels to the test set; compare_model1 also prints the cluster sizes.
df_2_comp, df_2_cluster_info = compare_model1(df_2, df_test)

# Candidate rules are built from the cluster labels in value_counts order.
df_2_rules = find_rules(df_2_cluster_info.index.tolist())
print(df_2_rules)

# df_2_result holds the rule(s) with the highest recall; df_2_table scores every rule.
df_2_result, df_2_table = check_rules(df_2_comp, df_2_rules)
print(df_2_result)
display(df_2_table)

## agglomerative, k=10

In [None]:
# Attach the agglomerative (k=10) labels to the test set; compare_model1 also prints the cluster sizes.
df_2_10_comp, df_2_10_cluster_info = compare_model1(df_2_10, df_test)

# Candidate rules are built from the cluster labels in value_counts order.
df_2_10_rules = find_rules(df_2_10_cluster_info.index.tolist())
print(df_2_10_rules)

# df_2_10_result holds the rule(s) with the highest recall; df_2_10_table scores every rule.
df_2_10_result, df_2_10_table = check_rules(df_2_10_comp, df_2_10_rules)
print(df_2_10_result)
display(df_2_10_table)

In [None]:
# Accuracy / recall / precision of every agglomerative (k=10) rule, by row position in the table.
for metric in ('accuracy', 'recall', 'precision'):
  plt.plot(range(df_2_10_table.shape[0]), df_2_10_table[metric], label=metric)
plt.legend(loc='best')
plt.xlabel('rules');

In [None]:
print(df_2_10_table.loc[128])
print(df_2_10_table.loc[192])

## gaussian, k=4

In [None]:
# Attach the Gaussian mixture (k=4) labels to the test set; compare_model1 also prints the cluster sizes.
df_3_comp, df_3_cluster_info = compare_model1(df_3, df_test)

# Candidate rules are built from the cluster labels in value_counts order.
df_3_rules = find_rules(df_3_cluster_info.index.tolist())
print(df_3_rules)

# df_3_result holds the rule(s) with the highest recall; df_3_table scores every rule.
df_3_result, df_3_table = check_rules(df_3_comp, df_3_rules)
print(df_3_result)
display(df_3_table)

## gaussian, k=16

In [None]:
# Attach the Gaussian mixture (k=16) labels to the test set; compare_model1 also prints the cluster sizes.
df_3_16_comp, df_3_16_cluster_info = compare_model1(df_3_16, df_test)

# Candidate rules are built from the cluster labels in value_counts order.
# The rule count doubles with every optional cluster, so this is the largest table in the notebook.
df_3_16_rules = find_rules(df_3_16_cluster_info.index.tolist())
print(df_3_16_rules)

# df_3_16_result holds the rule(s) with the highest recall; df_3_16_table scores every rule.
df_3_16_result, df_3_16_table = check_rules(df_3_16_comp, df_3_16_rules)
print(df_3_16_result)
display(df_3_16_table)

In [None]:
print(df_3_16_table.loc[12288])
print(df_3_16_table.loc[14336])
print(df_3_16_table.loc[15360])

In [None]:
# Accuracy / recall / precision of every Gaussian mixture (k=16) rule, by row position in the table.
for metric in ('accuracy', 'recall', 'precision'):
  plt.plot(range(df_3_16_table.shape[0]), df_3_16_table[metric], label=metric)
plt.legend(loc='best')
plt.xlabel('rules');

## average model

In [None]:
# Bucket the average model's 'Weight' into 30 equal-width bins; labels=False stores
# the integer bin number, which then serves as the cluster label for this model.
df_avg['Cluster'] = pd.cut(df_avg['Weight'], 30, labels=False)

In [None]:
# Attach the average-model bins to the test set; compare_model1 also prints the bin sizes.
df_avg_comp, df_avg_cluster_info = compare_model1(df_avg, df_test)

# end=9 restricts the optional clusters to positions 1..8 of the value_counts
# order, so 2**8 = 256 candidate rules are generated.
df_avg_rules = find_rules(df_avg_cluster_info.index.tolist(), 9)
print(df_avg_rules)

# df_avg_result holds the rule(s) with the highest recall; df_avg_table scores every rule.
df_avg_result, df_avg_table = check_rules(df_avg_comp, df_avg_rules)
print(df_avg_result)
display(df_avg_table)

In [None]:
print(df_avg_table.loc[128])
print(df_avg_table.loc[192])

## compare result

In [None]:
# %unload_ext google.colab.data_table

In [None]:
# One column per model, each taken from a single row of that model's rule table.
# NOTE(review): the row indices (0, 128, 192, 12288) appear to be hand-picked from
# the tables and plots above - confirm they still point at the intended rules
# whenever the clustering is re-run.
df_all_compare = pd.DataFrame({'k,k=4':df_1_table.loc[0],
                               'k,k=10':df_1_10_table.loc[192],
                               'agg,k=3':df_2_table.loc[0],
                               'agg,k=10':df_2_10_table.loc[128],
                               'gmm,k=4':df_3_table.loc[0],
                               'gmm,k=16':df_3_16_table.loc[12288],
                               'avg':df_avg_table.loc[128]})
df_all_compare

agg,10 performs better than k,4;

gmm,4 performs better than avg;

agg,3 performs similarly to k,10, but more stable;


---



accuracy: gmm,16 -> gmm,4 -> agg,3 ->agg,10

recall: agg,10 -> gmm,4 -> agg,3 / gmm,16

precision: gmm,16 -> gmm,4 -> agg,3 -> agg,10



---



**If we want best recall, agg,10**

**If we want a balance overall, gmm,4 / gmm,16**

# Predict

In [None]:
def predict_avoid(df, safe_cluster_num):
  """Label each row as safe (0) or avoid (1) based on cluster size.

  The `safe_cluster_num` most frequent 'Cluster' values (value_counts order)
  are treated as safe; rows in any other cluster get pred = 1.

  Args:
    df: DataFrame with a 'Cluster' column. A 'pred' column is added in place.
    safe_cluster_num: number of the largest clusters to treat as safe.

  Returns:
    The same DataFrame object that was passed in, now carrying 'pred'.
  """
  by_size = df['Cluster'].value_counts().index.tolist()
  safe = by_size[:safe_cluster_num]
  df['pred'] = df['Cluster'].map(lambda label: int(label not in safe))
  return df

In [None]:
# The second argument is how many of the largest clusters are treated as safe.
# predict_avoid adds 'pred' to the frame it receives and returns that same frame,
# so each *_pred name below is an alias of its source frame, not a copy.
df_1_pred = predict_avoid(df_1, 1)
df_1_10_pred = predict_avoid(df_1_10, 3)
df_2_pred = predict_avoid(df_2, 1)
df_2_10_pred = predict_avoid(df_2_10, 2)
df_3_pred = predict_avoid(df_3, 1)
df_3_16_pred = predict_avoid(df_3_16, 3)
df_avg_pred = predict_avoid(df_avg, 2)

In [None]:
# df_2_10_pred.to_csv('final_prediction_agg_10.csv')

In [None]:
df_1_10_pred['pred'].value_counts()

# Compare by routes

The following section tests each model on the sample routes. Each subsection generates the required input (the areas to avoid) for the HERE API, which then returns the safe route.

In [None]:
def find_active_area(start_lat, start_lon, end_lat, end_lon, val):
  """Pad a route's start/end coordinates by a margin `val`.

  For each axis, the coordinate with the larger absolute value is moved `val`
  further from zero and the other one `val` closer to zero. When both
  coordinates of an axis share a sign, this widens the span between them by
  `val` on each side.

  Args:
    start_lat, start_lon: route start.
    end_lat, end_lon: route end.
    val: margin in the same unit as the coordinates.

  Returns:
    (act_start_lat, act_start_lon, act_end_lat, act_end_lon)
  """
  def _pad(start, end):
    # keep each value's sign, adjust only its magnitude
    if abs(start) > abs(end):
      return (abs(start)+val)*np.sign(start), (abs(end)-val)*np.sign(end)
    return (abs(start)-val)*np.sign(start), (abs(end)+val)*np.sign(end)

  act_start_lat, act_end_lat = _pad(start_lat, end_lat)
  act_start_lon, act_end_lon = _pad(start_lon, end_lon)

  # Returned unconditionally: the result must not depend on which longitude
  # branch was taken.
  return act_start_lat, act_start_lon, act_end_lat, act_end_lon

In [None]:
def find_related_cluster(act_start_lat, act_start_lon, act_end_lat, act_end_lon, time_seg, df):
  """Select the rows of `df` inside the active area for one time segment.

  Args:
    act_start_lat, act_start_lon, act_end_lat, act_end_lon: corners of the
      active area; start/end may be given in either order.
    time_seg: value matched against the 'Time Seg' column.
    df: DataFrame with 'Time Seg', 'NewLat' and 'NewLon' columns.

  Returns:
    The rows whose 'Time Seg' equals `time_seg` and whose 'NewLat' / 'NewLon'
    fall within the latitude / longitude bounds (bounds inclusive).
  """
  df_time = df[df['Time Seg']==time_seg]

  lat_lo, lat_hi = sorted((act_start_lat, act_end_lat))
  # The longitude bounds must come from the longitude arguments; using the
  # latitude values here makes the filter reject every real longitude.
  lon_lo, lon_hi = sorted((act_start_lon, act_end_lon))

  in_lat = df_time['NewLat'].between(lat_lo, lat_hi)
  in_lon = df_time['NewLon'].between(lon_lo, lon_hi)
  return df_time[in_lat & in_lon]

**Caltrain to Brenda's Soul Food**	
start: 37.7766711	-122.3970318	
end: 37.781409	-122.4178537	
time: 7:00 PM - afternoon

**16th St BART to Dolores Park**	
start:37.7646383	-122.4201503  
end:37.761652	-122.423218	
time:10:00 PM - evening

## test route 1 

In [None]:
act_start_lat, act_start_lon, act_end_lat, act_end_lon = find_active_area(37.7766711,-122.3970318,37.781409,-122.4178537, 0.005)

### kmeans, k=4

In [None]:
df_related_cluster_1 = find_related_cluster(act_start_lat, act_start_lon, act_end_lat, act_end_lon, 'Afternoon', df_1_pred)
df_related_cluster_1

In [None]:
# One row per grid point in the active area. Weight is multiplied by pred, so
# points predicted safe (pred == 0) carry zero weight in the heatmap.
centers = (
    df_related_cluster_1[['NewLat','NewLon']]
    .drop_duplicates()
    .assign(Weight=df_related_cluster_1['Weight']*df_related_cluster_1['pred'])
)

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and attach that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

In [None]:
avoid_area = centers.sort_values(by=['Weight'], ascending=False).head(10)
avoid_area

In [None]:
# Each avoid area is a 0.0025-degree square centred on its grid point, written
# as 'bbox:lon_min,lat_min,lon_max,lat_max'.
avoid_list = avoid_area[['NewLat','NewLon']].apply(
    lambda pt: 'bbox:' + ','.join(str(edge) for edge in (
        pt['NewLon']-0.00125, pt['NewLat']-0.00125,
        pt['NewLon']+0.00125, pt['NewLat']+0.00125)),
    axis=1).values
avoid_param = '|'.join(avoid_list)
# The same boxes as (lat_max, lon_min, lat_min, lon_max) float tuples, parsed
# back from the strings above.
avoid_rec = [
    (float(lat_max), float(lon_min), float(lat_min), float(lon_max))
    for lon_min, lat_min, lon_max, lat_max in (box[5:].split(',') for box in avoid_list)
]

avoid_param

In [None]:
avoid_rec

### kmeans, k=10

In [None]:
df_related_cluster_1_10 = find_related_cluster(act_start_lat, act_start_lon, act_end_lat, act_end_lon, 'Afternoon', df_1_10_pred)
df_related_cluster_1_10

In [None]:
# One row per grid point in the active area. Weight is multiplied by pred, so
# points predicted safe (pred == 0) carry zero weight in the heatmap.
centers = (
    df_related_cluster_1_10[['NewLat','NewLon']]
    .drop_duplicates()
    .assign(Weight=df_related_cluster_1_10['Weight']*df_related_cluster_1_10['pred'])
)

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and attach that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

In [None]:
avoid_area = centers.sort_values(by=['Weight'], ascending=False).head(10)
avoid_area

In [None]:
# Each avoid area is a 0.0025-degree square centred on its grid point, written
# as 'bbox:lon_min,lat_min,lon_max,lat_max'.
avoid_list = avoid_area[['NewLat','NewLon']].apply(
    lambda pt: 'bbox:' + ','.join(str(edge) for edge in (
        pt['NewLon']-0.00125, pt['NewLat']-0.00125,
        pt['NewLon']+0.00125, pt['NewLat']+0.00125)),
    axis=1).values
avoid_param = '|'.join(avoid_list)
# The same boxes as (lat_max, lon_min, lat_min, lon_max) float tuples, parsed
# back from the strings above.
avoid_rec = [
    (float(lat_max), float(lon_min), float(lat_min), float(lon_max))
    for lon_min, lat_min, lon_max, lat_max in (box[5:].split(',') for box in avoid_list)
]

avoid_param

In [None]:
avoid_rec

### agg, k=3

In [None]:
df_related_cluster_2 = find_related_cluster(act_start_lat, act_start_lon, act_end_lat, act_end_lon, 'Afternoon', df_2_pred)
df_related_cluster_2

In [None]:
# One row per grid point in the active area. Weight is multiplied by pred, so
# points predicted safe (pred == 0) carry zero weight in the heatmap.
centers = (
    df_related_cluster_2[['NewLat','NewLon']]
    .drop_duplicates()
    .assign(Weight=df_related_cluster_2['Weight']*df_related_cluster_2['pred'])
)

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and attach that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

In [None]:
avoid_area = centers.sort_values(by=['Weight'], ascending=False).head(10)
avoid_area

In [None]:
# Each avoid area is a 0.0025-degree square centred on its grid point, written
# as 'bbox:lon_min,lat_min,lon_max,lat_max'.
avoid_list = avoid_area[['NewLat','NewLon']].apply(
    lambda pt: 'bbox:' + ','.join(str(edge) for edge in (
        pt['NewLon']-0.00125, pt['NewLat']-0.00125,
        pt['NewLon']+0.00125, pt['NewLat']+0.00125)),
    axis=1).values
avoid_param = '|'.join(avoid_list)
# The same boxes as (lat_max, lon_min, lat_min, lon_max) float tuples, parsed
# back from the strings above.
avoid_rec = [
    (float(lat_max), float(lon_min), float(lat_min), float(lon_max))
    for lon_min, lat_min, lon_max, lat_max in (box[5:].split(',') for box in avoid_list)
]

avoid_param

In [None]:
avoid_rec

### agg, k=10

In [None]:
df_related_cluster_2_10 = find_related_cluster(act_start_lat, act_start_lon, act_end_lat, act_end_lon, 'Afternoon', df_2_10_pred)
df_related_cluster_2_10

In [None]:
# One row per grid point in the active area. Weight is multiplied by pred, so
# points predicted safe (pred == 0) carry zero weight in the heatmap.
centers = (
    df_related_cluster_2_10[['NewLat','NewLon']]
    .drop_duplicates()
    .assign(Weight=df_related_cluster_2_10['Weight']*df_related_cluster_2_10['pred'])
)

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and attach that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

In [None]:
avoid_area = centers.sort_values(by=['Weight'], ascending=False).head(10)
avoid_area

In [None]:
# Each avoid area is a 0.0025-degree square centred on its grid point, written
# as 'bbox:lon_min,lat_min,lon_max,lat_max'.
avoid_list = avoid_area[['NewLat','NewLon']].apply(
    lambda pt: 'bbox:' + ','.join(str(edge) for edge in (
        pt['NewLon']-0.00125, pt['NewLat']-0.00125,
        pt['NewLon']+0.00125, pt['NewLat']+0.00125)),
    axis=1).values
avoid_param = '|'.join(avoid_list)
# The same boxes as (lat_max, lon_min, lat_min, lon_max) float tuples, parsed
# back from the strings above.
avoid_rec = [
    (float(lat_max), float(lon_min), float(lat_min), float(lon_max))
    for lon_min, lat_min, lon_max, lat_max in (box[5:].split(',') for box in avoid_list)
]

avoid_param

In [None]:
avoid_rec

### gmm, k=4

In [None]:
df_related_cluster_3 = find_related_cluster(act_start_lat, act_start_lon, act_end_lat, act_end_lon, 'Afternoon', df_3_pred)
df_related_cluster_3

In [None]:
# One row per grid point in the active area. Weight is multiplied by pred, so
# points predicted safe (pred == 0) carry zero weight in the heatmap.
centers = (
    df_related_cluster_3[['NewLat','NewLon']]
    .drop_duplicates()
    .assign(Weight=df_related_cluster_3['Weight']*df_related_cluster_3['pred'])
)

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and attach that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

In [None]:
avoid_area = centers.sort_values(by=['Weight'], ascending=False).head(10)
avoid_area

In [None]:
# Each avoid area is a 0.0025-degree square centred on its grid point, written
# as 'bbox:lon_min,lat_min,lon_max,lat_max'.
avoid_list = avoid_area[['NewLat','NewLon']].apply(
    lambda pt: 'bbox:' + ','.join(str(edge) for edge in (
        pt['NewLon']-0.00125, pt['NewLat']-0.00125,
        pt['NewLon']+0.00125, pt['NewLat']+0.00125)),
    axis=1).values
avoid_param = '|'.join(avoid_list)
# The same boxes as (lat_max, lon_min, lat_min, lon_max) float tuples, parsed
# back from the strings above.
avoid_rec = [
    (float(lat_max), float(lon_min), float(lat_min), float(lon_max))
    for lon_min, lat_min, lon_max, lat_max in (box[5:].split(',') for box in avoid_list)
]

avoid_param

In [None]:
avoid_rec

### gmm,k =16

In [None]:
df_related_cluster_3_16 = find_related_cluster(act_start_lat, act_start_lon, act_end_lat, act_end_lon, 'Afternoon', df_3_16_pred)
df_related_cluster_3_16

In [None]:
# One row per grid point in the active area. Weight is multiplied by pred, so
# points predicted safe (pred == 0) carry zero weight in the heatmap.
centers = (
    df_related_cluster_3_16[['NewLat','NewLon']]
    .drop_duplicates()
    .assign(Weight=df_related_cluster_3_16['Weight']*df_related_cluster_3_16['pred'])
)

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and attach that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

In [None]:
avoid_area = centers.sort_values(by=['Weight'], ascending=False).head(10)
avoid_area

In [None]:
# Each avoid area is a 0.0025-degree square centred on its grid point, written
# as 'bbox:lon_min,lat_min,lon_max,lat_max'.
avoid_list = avoid_area[['NewLat','NewLon']].apply(
    lambda pt: 'bbox:' + ','.join(str(edge) for edge in (
        pt['NewLon']-0.00125, pt['NewLat']-0.00125,
        pt['NewLon']+0.00125, pt['NewLat']+0.00125)),
    axis=1).values
avoid_param = '|'.join(avoid_list)
# The same boxes as (lat_max, lon_min, lat_min, lon_max) float tuples, parsed
# back from the strings above.
avoid_rec = [
    (float(lat_max), float(lon_min), float(lat_min), float(lon_max))
    for lon_min, lat_min, lon_max, lat_max in (box[5:].split(',') for box in avoid_list)
]

avoid_param

In [None]:
avoid_rec

### avg

In [None]:
df_related_cluster_avg = find_related_cluster(act_start_lat, act_start_lon, act_end_lat, act_end_lon, 'Afternoon', df_avg_pred)
df_related_cluster_avg

In [None]:
# One row per grid point in the active area. Weight is multiplied by pred, so
# points predicted safe (pred == 0) carry zero weight in the heatmap.
centers = (
    df_related_cluster_avg[['NewLat','NewLon']]
    .drop_duplicates()
    .assign(Weight=df_related_cluster_avg['Weight']*df_related_cluster_avg['pred'])
)

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and attach that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

In [None]:
avoid_area = centers.sort_values(by=['Weight'], ascending=False).head(10)
avoid_area

In [None]:
# Each avoid area is a 0.0025-degree square centred on its grid point, written
# as 'bbox:lon_min,lat_min,lon_max,lat_max'.
avoid_list = avoid_area[['NewLat','NewLon']].apply(
    lambda pt: 'bbox:' + ','.join(str(edge) for edge in (
        pt['NewLon']-0.00125, pt['NewLat']-0.00125,
        pt['NewLon']+0.00125, pt['NewLat']+0.00125)),
    axis=1).values
avoid_param = '|'.join(avoid_list)
# The same boxes as (lat_max, lon_min, lat_min, lon_max) float tuples, parsed
# back from the strings above.
avoid_rec = [
    (float(lat_max), float(lon_min), float(lat_min), float(lon_max))
    for lon_min, lat_min, lon_max, lat_max in (box[5:].split(',') for box in avoid_list)
]

avoid_param

In [None]:
avoid_rec

## test route 2

In [None]:
act_start_lat, act_start_lon, act_end_lat, act_end_lon = find_active_area(37.7646383,-122.4201503,37.761652,-122.423218, 0.005)

### kmeans, k=4

In [None]:
df_related_cluster_1 = find_related_cluster(act_start_lat, act_start_lon, act_end_lat, act_end_lon, 'Evening', df_1_pred)

In [None]:
df_related_cluster_1

In [None]:
# One row per grid point in the active area. Weight is multiplied by pred, so
# points predicted safe (pred == 0) carry zero weight in the heatmap.
centers = (
    df_related_cluster_1[['NewLat','NewLon']]
    .drop_duplicates()
    .assign(Weight=df_related_cluster_1['Weight']*df_related_cluster_1['pred'])
)

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and attach that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

In [None]:
avoid_area = centers.sort_values(by=['Weight'], ascending=False).head(10)

In [None]:
avoid_area

In [None]:
# Each avoid area is a 0.0025-degree square centred on its grid point, written
# as 'bbox:lon_min,lat_min,lon_max,lat_max'.
avoid_list = avoid_area[['NewLat','NewLon']].apply(
    lambda pt: 'bbox:' + ','.join(str(edge) for edge in (
        pt['NewLon']-0.00125, pt['NewLat']-0.00125,
        pt['NewLon']+0.00125, pt['NewLat']+0.00125)),
    axis=1).values

In [None]:
avoid_param = '|'.join(avoid_list)
#'avoid[areas]':'bbox:-122.406046,37.781438,-122.404866,37.782328|bbox:-122.413149,37.780536,-122.410864,37.781469|bbox:-122.406594,37.777967,-122.404946,37.779213',

In [None]:
avoid_param

In [None]:
# Parse every 'bbox:lon_min,lat_min,lon_max,lat_max' string back into a
# (lat_max, lon_min, lat_min, lon_max) float tuple - the argument order used by
# the H.geo.Rect(...) call sketched for the HERE map front end.
avoid_rec = [
    (float(lat_max), float(lon_min), float(lat_min), float(lon_max))
    for lon_min, lat_min, lon_max, lat_max in (box[5:].split(',') for box in avoid_list)
]

avoid_rec

### kmeans, k=10

In [None]:
df_related_cluster_1_10 = find_related_cluster(act_start_lat, act_start_lon, act_end_lat, act_end_lon, 'Evening', df_1_10_pred)
df_related_cluster_1_10

In [None]:
# One row per grid point in the active area. Weight is multiplied by pred, so
# points predicted safe (pred == 0) carry zero weight in the heatmap.
centers = (
    df_related_cluster_1_10[['NewLat','NewLon']]
    .drop_duplicates()
    .assign(Weight=df_related_cluster_1_10['Weight']*df_related_cluster_1_10['pred'])
)

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and attach that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

In [None]:
avoid_area = centers.sort_values(by=['Weight'], ascending=False).head(10)
avoid_area

In [None]:
# Each avoid area is a 0.0025-degree square centred on its grid point, written
# as 'bbox:lon_min,lat_min,lon_max,lat_max'.
avoid_list = avoid_area[['NewLat','NewLon']].apply(
    lambda pt: 'bbox:' + ','.join(str(edge) for edge in (
        pt['NewLon']-0.00125, pt['NewLat']-0.00125,
        pt['NewLon']+0.00125, pt['NewLat']+0.00125)),
    axis=1).values
avoid_param = '|'.join(avoid_list)
# The same boxes as (lat_max, lon_min, lat_min, lon_max) float tuples, parsed
# back from the strings above.
avoid_rec = [
    (float(lat_max), float(lon_min), float(lat_min), float(lon_max))
    for lon_min, lat_min, lon_max, lat_max in (box[5:].split(',') for box in avoid_list)
]

avoid_param

In [None]:
avoid_rec

### agg, k=3

In [None]:
df_related_cluster_2 = find_related_cluster(act_start_lat, act_start_lon, act_end_lat, act_end_lon, 'Evening', df_2_pred)
df_related_cluster_2

In [None]:
# One row per grid point in the active area. Weight is multiplied by pred, so
# points predicted safe (pred == 0) carry zero weight in the heatmap.
centers = (
    df_related_cluster_2[['NewLat','NewLon']]
    .drop_duplicates()
    .assign(Weight=df_related_cluster_2['Weight']*df_related_cluster_2['pred'])
)

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and attach that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

In [None]:
avoid_area = centers.sort_values(by=['Weight'], ascending=False).head(10)
avoid_area

In [None]:
# Each avoid area is a 0.0025-degree square centred on its grid point, written
# as 'bbox:lon_min,lat_min,lon_max,lat_max'.
avoid_list = avoid_area[['NewLat','NewLon']].apply(
    lambda pt: 'bbox:' + ','.join(str(edge) for edge in (
        pt['NewLon']-0.00125, pt['NewLat']-0.00125,
        pt['NewLon']+0.00125, pt['NewLat']+0.00125)),
    axis=1).values
avoid_param = '|'.join(avoid_list)
# The same boxes as (lat_max, lon_min, lat_min, lon_max) float tuples, parsed
# back from the strings above.
avoid_rec = [
    (float(lat_max), float(lon_min), float(lat_min), float(lon_max))
    for lon_min, lat_min, lon_max, lat_max in (box[5:].split(',') for box in avoid_list)
]

avoid_param

In [None]:
avoid_rec

### agg, k=10

In [None]:
df_related_cluster_2_10 = find_related_cluster(act_start_lat, act_start_lon, act_end_lat, act_end_lon, 'Evening', df_2_10_pred)
df_related_cluster_2_10

In [None]:
# One row per grid point in the active area. Weight is multiplied by pred, so
# points predicted safe (pred == 0) carry zero weight in the heatmap.
centers = (
    df_related_cluster_2_10[['NewLat','NewLon']]
    .drop_duplicates()
    .assign(Weight=df_related_cluster_2_10['Weight']*df_related_cluster_2_10['pred'])
)

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and attach that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

In [None]:
avoid_area = centers.sort_values(by=['Weight'], ascending=False).head(10)
avoid_area

In [None]:
# Each avoid area is a 0.0025-degree square centred on its grid point, written
# as 'bbox:lon_min,lat_min,lon_max,lat_max'.
avoid_list = avoid_area[['NewLat','NewLon']].apply(
    lambda pt: 'bbox:' + ','.join(str(edge) for edge in (
        pt['NewLon']-0.00125, pt['NewLat']-0.00125,
        pt['NewLon']+0.00125, pt['NewLat']+0.00125)),
    axis=1).values
avoid_param = '|'.join(avoid_list)
# The same boxes as (lat_max, lon_min, lat_min, lon_max) float tuples, parsed
# back from the strings above.
avoid_rec = [
    (float(lat_max), float(lon_min), float(lat_min), float(lon_max))
    for lon_min, lat_min, lon_max, lat_max in (box[5:].split(',') for box in avoid_list)
]

avoid_param

In [None]:
avoid_rec

### gmm, k=4

In [None]:
df_related_cluster_3 = find_related_cluster(act_start_lat, act_start_lon, act_end_lat, act_end_lon, 'Evening', df_3_pred)
df_related_cluster_3

In [None]:
# One row per grid point in the active area. Weight is multiplied by pred, so
# points predicted safe (pred == 0) carry zero weight in the heatmap.
centers = (
    df_related_cluster_3[['NewLat','NewLon']]
    .drop_duplicates()
    .assign(Weight=df_related_cluster_3['Weight']*df_related_cluster_3['pred'])
)

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and attach that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

In [None]:
avoid_area = centers.sort_values(by=['Weight'], ascending=False).head(10)
avoid_area

In [None]:
# Each avoid area is a 0.0025-degree square centred on its grid point, written
# as 'bbox:lon_min,lat_min,lon_max,lat_max'.
avoid_list = avoid_area[['NewLat','NewLon']].apply(
    lambda pt: 'bbox:' + ','.join(str(edge) for edge in (
        pt['NewLon']-0.00125, pt['NewLat']-0.00125,
        pt['NewLon']+0.00125, pt['NewLat']+0.00125)),
    axis=1).values
avoid_param = '|'.join(avoid_list)
# The same boxes as (lat_max, lon_min, lat_min, lon_max) float tuples, parsed
# back from the strings above.
avoid_rec = [
    (float(lat_max), float(lon_min), float(lat_min), float(lon_max))
    for lon_min, lat_min, lon_max, lat_max in (box[5:].split(',') for box in avoid_list)
]

avoid_param

In [None]:
avoid_rec

### gmm,k =16

In [None]:
df_related_cluster_3_16 = find_related_cluster(act_start_lat, act_start_lon, act_end_lat, act_end_lon, 'Evening', df_3_16_pred)
df_related_cluster_3_16

In [None]:
# One row per grid point in the active area. Weight is multiplied by pred, so
# points predicted safe (pred == 0) carry zero weight in the heatmap.
centers = (
    df_related_cluster_3_16[['NewLat','NewLon']]
    .drop_duplicates()
    .assign(Weight=df_related_cluster_3_16['Weight']*df_related_cluster_3_16['pred'])
)

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and attach that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

In [None]:
avoid_area = centers.sort_values(by=['Weight'], ascending=False).head(10)
avoid_area

In [None]:
# Each avoid area is a 0.0025-degree square centred on its grid point, written
# as 'bbox:lon_min,lat_min,lon_max,lat_max'.
avoid_list = avoid_area[['NewLat','NewLon']].apply(
    lambda pt: 'bbox:' + ','.join(str(edge) for edge in (
        pt['NewLon']-0.00125, pt['NewLat']-0.00125,
        pt['NewLon']+0.00125, pt['NewLat']+0.00125)),
    axis=1).values
avoid_param = '|'.join(avoid_list)
# The same boxes as (lat_max, lon_min, lat_min, lon_max) float tuples, parsed
# back from the strings above.
avoid_rec = [
    (float(lat_max), float(lon_min), float(lat_min), float(lon_max))
    for lon_min, lat_min, lon_max, lat_max in (box[5:].split(',') for box in avoid_list)
]

avoid_param

In [None]:
avoid_rec

### avg

In [None]:
df_related_cluster_avg = find_related_cluster(act_start_lat, act_start_lon, act_end_lat, act_end_lon, 'Evening', df_avg_pred)
df_related_cluster_avg

In [None]:
# One row per grid point in the active area. Weight is multiplied by pred, so
# points predicted safe (pred == 0) carry zero weight in the heatmap.
centers = (
    df_related_cluster_avg[['NewLat','NewLon']]
    .drop_duplicates()
    .assign(Weight=df_related_cluster_avg['Weight']*df_related_cluster_avg['pred'])
)

locations = centers[['NewLat', 'NewLon']]
weights = centers['Weight']
fig = gmaps.figure()
# Build the layer once and attach that same object to the figure.
heatmap_layer = gmaps.heatmap_layer(locations, weights=weights)
fig.add_layer(heatmap_layer)

embed_minimal_html('export.html', views=[fig])
IPython.display.HTML(filename="export.html")

In [None]:
avoid_area = centers.sort_values(by=['Weight'], ascending=False).head(10)
avoid_area

In [None]:
# Each avoid area is a 0.0025-degree square centred on its grid point, written
# as 'bbox:lon_min,lat_min,lon_max,lat_max'.
avoid_list = avoid_area[['NewLat','NewLon']].apply(
    lambda pt: 'bbox:' + ','.join(str(edge) for edge in (
        pt['NewLon']-0.00125, pt['NewLat']-0.00125,
        pt['NewLon']+0.00125, pt['NewLat']+0.00125)),
    axis=1).values
avoid_param = '|'.join(avoid_list)
# The same boxes as (lat_max, lon_min, lat_min, lon_max) float tuples, parsed
# back from the strings above.
avoid_rec = [
    (float(lat_max), float(lon_min), float(lat_min), float(lon_max))
    for lon_min, lat_min, lon_max, lat_max in (box[5:].split(',') for box in avoid_list)
]

avoid_param

In [None]:
avoid_rec

# Conclusion

Based on the results from the "Compare w test set" section, we conclude that:

If we want best recall, agg,10 is the best;

If we want a balance overall, gmm,4 / gmm,16 is the best;

Based on the results from "Compare by routes", we conclude that:

Either agg, 10 or gmm, 16 provides the best route result.

However, given how the two algorithms work, agg is more stable than gmm, so we choose agg, 10 as our final model.