<a href="https://colab.research.google.com/github/krishatuladhar/Data-Warehousing-and-Data-Mining/blob/main/lab4dwm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

* Write a program to implement the k-means algorithm.
* Write a program to implement the k-means ++ algorithm.
* Write a program to implement k-medoids algorithm.
* Write a program to implement an agglomerative hierarchical  clustering algorithm.
* Write a program to implement a divisive algorithm.
* Write a program to implement the DBSCAN algorithm.
* Write a program to perform outlier analysis using the Z-score method.
* Write a program to perform outlier detection using IQR method.
* Write a program to implement a mini-batch k-means algorithm.


Note:
You are expected to create a custom function that implements the specified algorithm..
Use “data.csv” for 1-5.
Use “data2.csv” for 6.
Use “data3.csv” for outlier analysis.
Use “data4.csv” for 9.


In [1]:
import numpy as np
import pandas as pd

def euclidean(a, b):
    return np.sqrt(np.sum((a - b) ** 2))

def kmeans(data, k, max_iter=100):
    n_samples, n_features = data.shape
    centroids = data[np.random.choice(n_samples, k, replace=False)]

    for _ in range(max_iter):
        labels = np.array([np.argmin([euclidean(x, c) for c in centroids]) for x in data])
        new_centroids = np.array([data[labels == i].mean(axis=0) for i in range(k)])

        if np.all(centroids == new_centroids):
            break
        centroids = new_centroids

    return centroids, labels

data = pd.read_csv('/content/drive/MyDrive/datawarehousinglab/data.csv').values
centroids, labels = kmeans(data, k=3)
print("Centroids:", centroids)


Centroids: [[1.         6.        ]
 [1.90763796 1.92106347]
 [5.98432967 5.31352018]]


In [2]:
import numpy as np
import pandas as pd

def kmeans_pp_init(data, k):
    n_samples = data.shape[0]
    centroids = [data[np.random.randint(n_samples)]]

    for _ in range(1, k):
        distances = np.array([min([np.linalg.norm(x - c) for c in centroids])**2 for x in data])
        probs = distances / distances.sum()
        cumulative_probs = np.cumsum(probs)
        r = np.random.rand()
        for i, p in enumerate(cumulative_probs):
            if r < p:
                centroids.append(data[i])
                break
    return np.array(centroids)

def kmeans_pp(data, k, max_iter=100):
    centroids = kmeans_pp_init(data, k)

    for _ in range(max_iter):
        labels = np.array([np.argmin([np.linalg.norm(x - c) for c in centroids]) for x in data])
        new_centroids = np.array([data[labels == i].mean(axis=0) for i in range(k)])
        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids

    return centroids, labels

data = pd.read_csv('/content/drive/MyDrive/datawarehousinglab/data.csv').values
centroids, labels = kmeans_pp(data, k=3)
print("Centroids:", centroids)


Centroids: [[1.82512542 2.29187589]
 [5.88276264 5.7448722 ]
 [7.         1.        ]]


In [3]:
import numpy as np
import pandas as pd

def k_medoids(data, k, max_iter=100):
    m, n = data.shape
    medoids_idx = np.random.choice(m, k, replace=False)
    medoids = data[medoids_idx]

    for _ in range(max_iter):
        labels = np.argmin([[np.sum(np.abs(x - m)) for m in medoids] for x in data], axis=1)

        new_medoids = []
        for i in range(k):
            cluster = data[labels == i]
            costs = [np.sum(np.abs(point - cluster).sum(axis=1)) for point in cluster]
            new_medoids.append(cluster[np.argmin(costs)])

        new_medoids = np.array(new_medoids)
        if np.allclose(medoids, new_medoids):
            break
        medoids = new_medoids

    return medoids, labels

data = pd.read_csv('/content/drive/MyDrive/datawarehousinglab/data.csv').values
medoids, labels = k_medoids(data, k=3)
print("Medoids:", medoids)


Medoids: [[2.24835708 1.93086785]
 [1.49358444 2.15712367]
 [5.57955292 5.79581438]]


In [4]:

import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

def agglomerative_clustering(data, k):
    clusters = [[i] for i in range(len(data))]
    distances = squareform(pdist(data))

    np.fill_diagonal(distances, np.inf)

    while len(clusters) > k:
        i, j = np.unravel_index(np.argmin(distances), distances.shape)
        new_cluster = clusters[i] + clusters[j]
        clusters.append(new_cluster)

        # Update distances
        new_dist = np.minimum(distances[i], distances[j])
        distances = np.vstack([distances, new_dist])
        new_dist = np.append(new_dist, np.inf)
        distances = np.column_stack([distances, new_dist])

        # Remove merged clusters
        distances = np.delete(distances, [i, j], axis=0)
        distances = np.delete(distances, [i, j], axis=1)
        clusters.pop(max(i, j))
        clusters.pop(min(i, j))

    return clusters

data = pd.read_csv('/content/drive/MyDrive/datawarehousinglab/data.csv').values
clusters = agglomerative_clustering(data, k=3)
print("Clusters:", clusters)


Clusters: [[21], [15, 10, 13, 19, 12, 14, 17, 18, 11, 16], [20, 1, 3, 6, 7, 9, 4, 8, 0, 2, 5]]


In [5]:
import numpy as np
import pandas as pd

def divisive(data, k):
    clusters = [list(range(len(data)))]

    while len(clusters) < k:
        # Find cluster with largest diameter
        max_diameter, split_idx = -1, -1
        for idx, cluster in enumerate(clusters):
            points = data[cluster]
            diam = np.max(pdist(points)) if len(points) > 1 else 0
            if diam > max_diameter:
                max_diameter = diam
                split_idx = idx

        to_split = clusters.pop(split_idx)
        points = data[to_split]
        # Use simple 2-means to split
        c1, c2 = np.mean(points, axis=0), points[np.argmax(np.linalg.norm(points - np.mean(points, axis=0), axis=1))]
        labels = [0 if np.linalg.norm(p - c1) < np.linalg.norm(p - c2) else 1 for p in points]

        cluster1 = [to_split[i] for i in range(len(to_split)) if labels[i]==0]
        cluster2 = [to_split[i] for i in range(len(to_split)) if labels[i]==1]

        clusters.extend([cluster1, cluster2])

    return clusters

from scipy.spatial.distance import pdist
data = pd.read_csv('/content/drive/MyDrive/datawarehousinglab/data.csv').values
clusters = divisive(data, k=3)
print("Clusters:", clusters)


Clusters: [[21], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 18, 20], [10, 11, 12, 13, 14, 15, 16, 17, 19]]


In [6]:
import numpy as np
import pandas as pd

def dbscan(data, eps, min_pts):
    labels = np.full(len(data), -1)
    cluster_id = 0

    def region_query(p):
        return [i for i, q in enumerate(data) if np.linalg.norm(p - q) < eps]

    for i in range(len(data)):
        if labels[i] != -1:
            continue
        neighbors = region_query(data[i])

        if len(neighbors) < min_pts:
            labels[i] = -1  # Noise
        else:
            labels[i] = cluster_id
            seeds = neighbors.copy()
            seeds.remove(i)
            while seeds:
                j = seeds.pop()
                if labels[j] == -1:
                    labels[j] = cluster_id
                if labels[j] != -1:
                    continue
                labels[j] = cluster_id
                new_neighbors = region_query(data[j])
                if len(new_neighbors) >= min_pts:
                    seeds.extend(new_neighbors)
            cluster_id += 1
    return labels

data = pd.read_csv('/content/drive/MyDrive/datawarehousinglab/data2.csv').values
labels = dbscan(data, eps=0.5, min_pts=5)
print("Cluster Labels:", labels)


Cluster Labels: [ 0  1  1  2  3  3  4  2  5  6  4  5  2  0  5  5  6  2  4  4  3  1  2  0
  4  4  4  4  3  3  1  1  6  4  2  7  3  3  2  4  4  8  6  1  3  8  1  1
  2  0  3  5  1  3  3  4  3  0  5  0  5  0  6  1  6  3  6  5  0  3  6  3
  2  6  3  3  1  9  6  3  1  4  6  5  5  0  2  6  5  4  1  3  7  6  1  8
  2  1  6  3  1 10  3  2  3  2  1  4  3  2  2  2  8  2  4  3  4  6  2  1
  6  3  2  2  1  1  5  2  0  6  2  6  0  2  6  3  1  7  6  3  5  5  5  6
  0  6  1  3  0  0  5  5  0  1  1  4  5  3  1  4  5  3  2  5  4  4  7  6
  5  4  6  0  1  4  2  5  3  6  2  5  2  6  2  3  2  6  0  3  0  4  3  8
  1  5  4  7  1  0  5  7 -1 -1 -1]


In [7]:
import pandas as pd
import numpy as np

def z_score_outliers(data, threshold=3):
    mean = np.mean(data)
    std = np.std(data)
    z_scores = (data - mean) / std
    return np.where(np.abs(z_scores) > threshold)

data = pd.read_csv('/content/drive/MyDrive/datawarehousinglab/data3.csv').values.flatten()
outliers = z_score_outliers(data)
print("Outliers at indices:", outliers)


Outliers at indices: (array([], dtype=int64),)


In [8]:
import pandas as pd
import numpy as np

def iqr_outliers(data):
    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    return np.where((data < lower) | (data > upper))

data = pd.read_csv('/content/drive/MyDrive/datawarehousinglab/data3.csv').values.flatten()
outliers = iqr_outliers(data)
print("Outliers at indices:", outliers)


Outliers at indices: (array([ 6, 14]),)


In [9]:
import numpy as np
import pandas as pd

def mini_batch_kmeans(data, k, batch_size=10, max_iter=100):
    n_samples, n_features = data.shape
    centroids = data[np.random.choice(n_samples, k, replace=False)]

    for _ in range(max_iter):
        batch_idx = np.random.choice(n_samples, batch_size, replace=False)
        batch = data[batch_idx]
        labels = np.array([np.argmin([np.linalg.norm(x - c) for c in centroids]) for x in batch])

        for i in range(k):
            points = batch[labels == i]
            if len(points):
                centroids[i] = centroids[i] + 0.1 * (points.mean(axis=0) - centroids[i])  # update rule

    return centroids

data = pd.read_csv('/content/drive/MyDrive/datawarehousinglab/data4.csv').values
centroids = mini_batch_kmeans(data, k=3)
print("Centroids:", centroids)


Centroids: [[-6.92138485 -6.86424713]
 [ 4.62294348  2.08871658]
 [-2.49590217  9.03427446]]
