# Location Intelligence Data Clustering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Modelling

In [None]:
def metrics_plots(X, max_k=10):
    """Fit KMeans for k = 2 .. max_k - 1 and plot four cluster-validation metrics.

    For every k a ``KMeans(n_clusters=k, random_state=101)`` model is fitted
    on ``X`` and four values are recorded: the within-cluster sum of squares
    (``inertia_``), the silhouette score (Euclidean metric), the
    Calinski-Harabasz score and the Davies-Bouldin score. Each metric is then
    drawn in its own figure with k on the x-axis.

    Parameters
    ----------
    X : array-like
        Data passed to ``KMeans.fit_predict`` and to the metric functions.
    max_k : int, default 10
        Exclusive upper bound of the tried cluster counts (``range(2, max_k)``).

    Returns
    -------
    None
        The function only produces plots.
    """
    k_values = list(range(2, max_k))

    wcss = []
    score_kmeans_s = []
    score_kmeans_c = []
    score_kmeans_d = []

    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=101)
        predictions = kmeans.fit_predict(X)
        # inertia_ is the non-negative within-cluster sum of squares.
        # kmeans.score(X) is its negative, which contradicted the
        # 'Within-cluster sum of squares' axis label and flipped the elbow curve.
        wcss.append(kmeans.inertia_)
        score_kmeans_s.append(silhouette_score(X, predictions, metric='euclidean'))
        score_kmeans_c.append(calinski_harabasz_score(X, predictions))
        score_kmeans_d.append(davies_bouldin_score(X, predictions))

    list_scores = [wcss, score_kmeans_s, score_kmeans_c, score_kmeans_d]
    # The first plot is the elbow-method curve; the others are validation indices.
    list_title = ['Within-cluster sum of squares', 'Silhouette Score', 'Calinski Harabasz', 'Davies Bouldin']
    for title, values in zip(list_title, list_scores):
        plt.plot(k_values, values, 'bx-')
        plt.xlabel('k')
        plt.ylabel(title)
        plt.title('Optimal k')
        plt.show()

In [None]:
# Plot the four validation metrics on X_train; with max_k=15 the function tries k = 2 .. 14.
metrics_plots(X_train, max_k=15)

#### Helper functions for results

In [None]:
def data_geo(data):
    """Return the longitude/latitude columns of ``data`` after both pipelines.

    ``data`` is passed through ``feature_engineering_pipeline.transform`` and
    then ``pipeline_preprocessing.fit_transform``. The result is wrapped in a
    DataFrame labelled with the feature names of the pipeline's
    'preprocessor' step, and only the two coordinate columns are returned.

    Parameters
    ----------
    data
        Input accepted by ``feature_engineering_pipeline.transform``.

    Returns
    -------
    pandas.DataFrame
        Columns ``remainder__longitude`` and ``remainder__latitude``.
    """
    engineered = feature_engineering_pipeline.transform(data)
    # NOTE(review): fit_transform re-fits the preprocessing pipeline on every
    # call - confirm this is intended rather than a plain transform.
    processed = pipeline_preprocessing.fit_transform(engineered)
    feature_names = pipeline_preprocessing.named_steps['preprocessor'].get_feature_names_out()
    frame = pd.DataFrame(processed, columns=feature_names)
    return frame[["remainder__longitude", "remainder__latitude"]]

def drawMap(data, labels):
    """Plot the points on a world map, coloured by ``labels``.

    Parameters
    ----------
    data
        Data used to build the GeoDataFrame that is plotted.
    labels
        Values passed as ``column`` to the plot, i.e. what the points are
        coloured by.

    Notes
    -----
    Point coordinates are read from the module-level ``X_train_lat_long``
    (columns ``remainder__longitude`` / ``remainder__latitude``), not from
    ``data``. Presumably both have the same row order - verify against the
    caller.
    """
    points = gpd.points_from_xy(
        X_train_lat_long['remainder__longitude'],
        X_train_lat_long['remainder__latitude'],
    )
    gdf = gpd.GeoDataFrame(data, geometry=points)

    # Background layer: all countries except Antarctica.
    countries = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    countries = countries[countries.name != "Antarctica"]

    # Draw the points on the same axes as the background map.
    base_ax = countries.plot(figsize=(15, 7), color='lightgray', edgecolor='white')
    gdf.plot(ax=base_ax, marker='o', column=labels, markersize=9, legend=True)
    plt.title('Businesses on world map')
    plt.show()

def CountClasters(labels):
    """Draw a bar chart of how many samples carry each cluster label.

    Parameters
    ----------
    labels : array-like
        One cluster label per sample.

    Returns
    -------
    None
        The function only produces a plot.
    """
    # A Series is enough for counting; no intermediate DataFrame is needed.
    value_counts = pd.Series(labels, name="labels").value_counts()

    plt.figure(figsize=(10, 5))
    sns.barplot(x=value_counts.index, y=value_counts.values, palette='viridis')
    plt.title('Distribution of Clusters')
    plt.xlabel('Cluster number')
    # Axis label previously read 'Bisinesses' (typo shown on the chart).
    plt.ylabel('Number of Businesses')
    plt.show()

def calculateScores(data, labels):
    """Return a table of three cluster-validation scores for a labelling.

    Parameters
    ----------
    data
        Samples passed to each metric function.
    labels
        Cluster label of each sample.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Score name'`` and ``'score value'`` with one row each for
        the silhouette, Calinski-Harabasz and Davies-Bouldin scores.
    """
    metric_functions = [
        ('Silhouette Score', silhouette_score),
        ('Calinski-Harabaz Index', calinski_harabasz_score),
        ('Davies-Bouldin Index', davies_bouldin_score),
    ]
    rows = [(name, metric(data, labels)) for name, metric in metric_functions]
    return pd.DataFrame(rows, columns=['Score name', 'score value'])


#### KMeans

In [None]:
# Fit a 5-cluster KMeans on X_train, then assign every row of X_train to a cluster.
kmeans = KMeans(n_clusters=5, random_state=42)
labels = kmeans.fit(X_train).predict(X_train)

In [None]:
# Show the labels computed above on the world map and as a cluster-size bar chart.
drawMap(X_train, labels)
CountClasters(labels)

#### KMedoids

In [None]:
# Fit a 5-cluster KMedoids on X_train, then assign every row of X_train to a cluster.
medoids_model = KMedoids(n_clusters=5, random_state=0)
labels = medoids_model.fit(X_train).predict(X_train)

In [None]:
# Show the labels computed above on the world map and as a cluster-size bar chart.
drawMap(X_train, labels)
CountClasters(labels)

In [None]:
# Tabulate the silhouette, Calinski-Harabasz and Davies-Bouldin scores for the current labels.
calculateScores(X_train, labels)

#### Single Linkage

In [None]:
# Build the single-linkage hierarchy of X_train and draw its dendrogram.
# Z is reused by the following cell to redraw the dendrogram on a taller figure.
Z = hierarchy.linkage(X_train, method='single')
plt.figure(figsize=(10, 5), dpi= 200, facecolor='w', edgecolor='k')
hierarchy.dendrogram(Z)
plt.show()

In [None]:
# Redraw the dendrogram of the current Z on a taller (10 x 30) figure.
plt.figure(figsize=(10, 30), dpi= 200, facecolor='w', edgecolor='k')
hierarchy.dendrogram(Z)
plt.show()

In [None]:
# Single-linkage agglomerative clustering with no fixed cluster count:
# with n_clusters=None the number of clusters follows from distance_threshold.
# NOTE(review): 1.4 was presumably read off the dendrogram above - confirm.
model = AgglomerativeClustering(n_clusters=None, linkage='single', distance_threshold=1.4)
labels = model.fit_predict(X_train)

In [None]:
# Show the labels computed above on the world map and as a cluster-size bar chart.
drawMap(X_train, labels)
CountClasters(labels)

In [None]:
# Tabulate the silhouette, Calinski-Harabasz and Davies-Bouldin scores for the current labels.
calculateScores(X_train, labels)

#### Complete Linkage

In [None]:
# Build the complete-linkage hierarchy of X_train and draw its dendrogram.
Z = hierarchy.linkage(X_train, method='complete')
plt.figure(figsize=(10, 20), dpi= 200, facecolor='w', edgecolor='k')
hierarchy.dendrogram(Z)
plt.show()

In [None]:
# Complete-linkage agglomerative clustering with no fixed cluster count:
# with n_clusters=None the number of clusters follows from distance_threshold.
# NOTE(review): 4 was presumably read off the dendrogram above - confirm.
model = AgglomerativeClustering(n_clusters=None, linkage='complete', distance_threshold=4)
labels = model.fit_predict(X_train)

In [None]:
# Show the labels computed above on the world map and as a cluster-size bar chart.
drawMap(X_train, labels)
CountClasters(labels)

In [None]:
# Tabulate the silhouette, Calinski-Harabasz and Davies-Bouldin scores for the current labels.
calculateScores(X_train, labels)