In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
from collections import Counter,deque
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_openml
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import cdist
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score



In [2]:
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

X = newsgroups.data
y = newsgroups.target

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y
)

X_train = X_train.toarray()
print(f"Training Data Shape: {X_train.shape}, Training Labels Shape: {y_train.shape}")
print(f"Test Data Shape: {X_test.shape}, Test Labels Shape: {y_test.shape}")


Training Data Shape: (15076, 5000), Training Labels Shape: (15076,)
Test Data Shape: (3770, 5000), Test Labels Shape: (3770,)


In [11]:
class DBScanCustom:
    def __init__(self,minPts,epsilon):
        self.minPts = minPts
        self.epsilon = epsilon
        self.labels = None
        self.dataset = None

    def preProcessing(self,dataset):
        distMatrix = cdist(dataset,dataset,metric='euclidean') 
        data = []
        for i in range(len(dataset)):
            neighbors = [str(j) for j in range(len(dataset)) if i!=j and distMatrix[i][j] <= self.epsilon]
            data.append([i] + [neighbors] + [len(neighbors)])
        self.dataset = data
        

    def expand(self,currPt,currClusterId):

        self.labels[currPt] = currClusterId
        queue = deque([currPt])
        visited = set()

        while queue:
            inQueue = queue.popleft()
            inQueueNeighborsCnt = self.dataset[inQueue][2]
            visited.add(int(inQueue))

            if inQueueNeighborsCnt>=self.minPts:
                for n in self.dataset[inQueue][1]:
                    n = int(n)
                    if n in visited:
                        continue
                    if self.labels[n] == -1 and self.dataset[n][2] >= self.minPts:
                        self.labels[n] = currClusterId
                        queue.append(n)
                    
    def runDBSCAN(self,dataset):  
        self.labels = [-1]*len(dataset)
        startClusterId = 0
        self.preProcessing(dataset)
 
        for i in range(len(self.dataset)):
            if self.labels[i] != -1:
                continue
            
            neighbors = self.dataset[i][1]
            if len(neighbors) >= self.minPts:
                startClusterId += 1
                self.expand(i,startClusterId)
            
        return self.labels
        
    

In [14]:
subset_size = 15000
indices = np.random.choice(len(X_train), subset_size, replace=False)
X_subset = X_train[indices]
dbscan = DBScanCustom(epsilon=0.8, minPts=3)
labels = dbscan.runDBSCAN(X_subset)

subset_df = pd.DataFrame(X_train[indices].copy())
subset_df['cluster'] = labels

In [15]:
subset_df['cluster'].value_counts()

cluster
-1     14325
 1       470
 3        35
 4        13
 6        10
 12        9
 21        7
 7         6
 26        6
 24        6
 10        6
 35        5
 31        5
 23        5
 19        5
 8         5
 5         5
 11        5
 15        4
 2         4
 41        4
 32        4
 29        4
 28        4
 16        4
 27        4
 25        4
 9         4
 13        4
 14        4
 17        4
 42        3
 30        2
 20        2
 33        2
 36        2
 37        2
 39        1
 40        1
 22        1
 38        1
 34        1
 18        1
 43        1
Name: count, dtype: int64

In [16]:
import numpy as np

valid_indices = np.where(np.array(labels) != -1)[0]

X_valid = X_subset[valid_indices]
X_valid = X_valid.reshape(-1, X_valid.shape[-1])

labels_valid = np.array(labels)[valid_indices].flatten()

if len(np.unique(labels_valid)) > 1:
    score = silhouette_score(X_valid, labels_valid)
    print(f"Silhouette Score: {score}")
else:
    print("Silhouette Score cannot be computed with less than 2 clusters.")


Silhouette Score: 0.8336304560640431


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA