# DBSCAN Example
Based on https://www.reneshbedre.com/blog/dbscan-python.html

## Prepare data

In [None]:
import pandas as pd

# t-SNE score table hosted alongside the reference blog post
# (the original notebook labels this as genomic data).
TSNE_SCORES_URL = "https://reneshbedre.github.io/assets/posts/tsne/tsne_scores.csv"

df = pd.read_csv(TSNE_SCORES_URL)
# Preview the first rows to confirm the load worked.
df.head()

In [None]:
# Check the shape of the dataset: a (rows, columns) tuple.
df.shape

In [None]:
import matplotlib.pyplot as plt

# Raw scatter of the two t-SNE columns, before any clustering.
x, y = df["t-SNE-1"], df["t-SNE-2"]
fig, ax = plt.subplots()
ax.scatter(x, y)
plt.show()

## Select hyperparameters

In [None]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Every query point is also part of the fitted data, so its first "neighbor" is the
# point itself at distance 0 (column 0). Asking for 5 neighbors therefore returns the
# point plus its 4 nearest other points.
# The metric can be changed; the default is Minkowski, which is Euclidean when p = 2.
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(df)

# Distances to, and row indices of, the neighbors of every point.
neigh_dist, neigh_ind = nbrs.kneighbors(df)
for preview in (neigh_dist, neigh_ind):
    print(preview[:5])

In [None]:
# Sort every column independently (axis=0 runs down the rows), smallest distance first.
# Work on a copy so neigh_dist itself keeps its original row order.
sort_neigh_dist = neigh_dist.copy()
sort_neigh_dist.sort(axis=0)
print(sort_neigh_dist[:5])

In [None]:
import matplotlib.pyplot as plt

# Column 4 holds the distance to the 4th nearest neighbor (column 0 is the point itself).
# Plotted in ascending order the curve shows a "knee"; points below the knee are those
# in the cluster.
k_dist = sort_neigh_dist[:, 4]

fig, ax = plt.subplots()
ax.plot(k_dist)
# Dashed horizontal guide at the knee height read off the curve.
ax.axhline(y=2.5, linewidth=1, linestyle='dashed', color='k')
ax.set_ylabel("k-NN distance")
ax.set_xlabel("Sorted observations (4th NN)")
plt.show()

2.5 looks like the value of the knee

Use this value as `eps` for DBSCAN, i.e. the maximum distance between two points for them to be considered neighbors.

## Fit DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

# eps is the knee height read off the k-distance plot.
dbscan = DBSCAN(eps=2.5, min_samples=4)
clusters = dbscan.fit(df)
# Show the cluster label assigned to each row of df.
clusters.labels_

In [None]:
from collections import Counter

# Tally how many points carry each cluster label.
cluster_sizes = Counter(clusters.labels_)
cluster_sizes

22 clusters found (label -1 marks outliers and is not a cluster)

Cluster 1 has 1,501 elements

## DBSCAN Visualization

In [None]:
import seaborn as sns

# Build a palette with exactly one colour per label. The named "deep" palette has only
# 10 colours, so with more labels than that seaborn cycles it and different clusters
# end up drawn in the same colour.
n_labels = len(set(clusters.labels_))
cluster_palette = sns.color_palette("husl", n_colors=n_labels)

# Passing the palette as a list keeps the hue mapping categorical (one colour per label).
p = sns.scatterplot(data=df, x="t-SNE-1", y="t-SNE-2", hue=clusters.labels_, legend="full", palette=cluster_palette)
# Anchor the legend's upper-right corner at (1.17, 1.2), i.e. outside the plot area.
sns.move_legend(p, "upper right", bbox_to_anchor=(1.17, 1.2), title='Clusters')
plt.show()