



*   Distribute the dataset across multiple workers.
*   Perform k-means clustering on each subset of the data.
*   Aggregate the results to form a global clustering solution.



In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=dbef1097a39293474ece1fa218e8d1ed7213e486297d6a5c4a6d5a6a00bc3cd6
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
import numpy as np
import pandas as pd

# Initialize Spark session
spark = SparkSession.builder.appName("DistributedKMeans").getOrCreate()

# Sample data (use your dataset here)
data = pd.DataFrame({
    'x': np.random.rand(1000),
    'y': np.random.rand(1000)
})

# Convert to Spark DataFrame
sdf = spark.createDataFrame(data)

# Assemble features into a single vector column
assembler = VectorAssembler(inputCols=["x", "y"], outputCol="features")
dataset = assembler.transform(sdf).select("features")

# Define k-means parameters
k = 3
max_iter = 10

# Define the number of partitions (workers)
num_partitions = 4

# Repartition the dataset
partitioned_data = dataset.repartition(num_partitions)

# Function to perform k-means on each partition
def kmeans_partition(partition, k, max_iter):
    # Remove SparkSession creation here - use the existing 'spark' from the global scope
    partition_df = spark.createDataFrame(partition, schema=["features"]) # Use the existing 'spark' session
    kmeans = KMeans(k=k, maxIter=max_iter)
    model = kmeans.fit(partition_df)
    centers = model.clusterCenters()
    return centers

# Apply k-means to each partition
partition_centers = partitioned_data.rdd.mapPartitions(
    lambda partition: kmeans_partition(partition, k, max_iter)
).collect()

# Aggregate centers from all partitions
global_centers = np.array(partition_centers)

# Perform final k-means clustering on the aggregated centers
final_kmeans = KMeans(k=k, maxIter=max_iter)
final_model = final_kmeans.fit(spark.createDataFrame(pd.DataFrame(global_centers, columns=["x", "y"])))

# Print final cluster centers
print("Final Cluster Centers: ")
for center in final_model.clusterCenters():
    print(center)

# Stop Spark session
spark.stop()

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pyspark/serializers.py", line 459, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
  File "/usr/local/lib/python3.10/dist-packages/pyspark/cloudpickle/cloudpickle_fast.py", line 73, in dumps
    cp.dump(obj)
  File "/usr/local/lib/python3.10/dist-packages/pyspark/cloudpickle/cloudpickle_fast.py", line 632, in dump
    return Pickler.dump(self, obj)
  File "/usr/local/lib/python3.10/dist-packages/pyspark/context.py", line 466, in __getnewargs__
    raise PySparkRuntimeError(
pyspark.errors.exceptions.base.PySparkRuntimeError: [CONTEXT_ONLY_VALID_ON_DRIVER] It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.


PicklingError: Could not serialize object: PySparkRuntimeError: [CONTEXT_ONLY_VALID_ON_DRIVER] It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.

In [14]:
import math
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

# Node class representing each node in the distributed system
class Node:
    def __init__(self, neighbors, degree):
        self.neighbors = neighbors
        self.degree = degree
        self.data = None
        self.centers = None
        self.local_coreset = None
        self.weights = None
        self.message_received = {}
        self.X = None
        self.cost_of_each_data = None

    def set_data(self, data):
        self.data = data

    def set_centers(self, centers):
        self.centers = centers

    def set_cost_of_each_data(self, c):
        self.cost_of_each_data = c

    def set_local_coreset(self, S):
        self.local_coreset = S

    def set_weights(self, weights):
        self.weights = weights

    def set_X(self, X):
        self.X = X

# Initialize communication and cost tracking
c_cost = np.zeros(8)
k_means_cost = np.zeros(8)
communication_cost = 0

# Functions to create different types of graph topologies
def create_random_graph(no_of_nodes, probability):
    G = nx.erdos_renyi_graph(no_of_nodes, probability)
    while not nx.is_connected(G):
        G = create_random_graph(no_of_nodes, probability)
    return G

def create_preferential_graph(n, m):
    G = nx.barabasi_albert_graph(n, m)
    while not nx.is_connected(G):
        G = create_preferential_graph(n, m)
    return G

def create_grid_graph(n, m):
    return nx.grid_2d_graph(n, m)

# Generate a sequence for node traversal based on DFS
def node_sequence(G):
    seq = []
    l = list(nx.dfs_edges(G, 0))
    for i in range(len(l) - 1):
        if l[i][1] == l[i+1][0]:
            seq.append(l[i][0])
        else:
            seq.append(l[i][0])
            seq.append(l[i][1])
            p = nx.shortest_path(G, l[i][1], l[i+1][0])
            for k in range(1, len(p) - 1):
                seq.append(p[k])
    seq.append(l[-1][0])
    seq.append(l[-1][1])
    return seq

# Methods for partitioning data across nodes
def uniform_partitioning(df, nodes):
    temp_df = df.copy(deep=True)
    size_of_pi = math.floor(df.shape[0] / len(nodes))
    for node in nodes:
        if node != nodes[-1]:
            node_dict[node].data = temp_df.sample(size_of_pi)
            temp_df.drop(node_dict[node].data.index, inplace=True)
        else:
            node_dict[node].data = temp_df

def similarity_partitioning(df, nodes):
    temp_df = df.copy(deep=True)
    spec = SpectralClustering(n_clusters=len(nodes), gamma=1.0)
    c_id = spec.fit_predict(temp_df)
    for i in range(len(nodes)):
        node_dict[nodes[i]].data = temp_df[c_id == i]

def weighted_partitioning(df, nodes):
    temp_df = df.copy(deep=True)
    s = np.random.normal(0, 1, len(nodes))
    s = (abs(s) / np.sum(abs(s))) * temp_df.shape[0]
    for i in range(len(nodes)):
        if nodes[i] != nodes[-1]:
            node_dict[nodes[i]].data = temp_df.sample(int(round(s[i], 0)))
            temp_df.drop(node_dict[nodes[i]].data.index, inplace=True)
        else:
            node_dict[nodes[i]].data = temp_df

def degree_partitioning(df, nodes):
    temp_df = df.copy(deep=True)
    s = []
    for node in nodes:
        s.append(float(G.degree(node)))
    s = (s / np.sum(s)) * temp_df.shape[0]
    for i in range(len(nodes)):
        if nodes[i] != nodes[-1]:
            node_dict[nodes[i]].data = temp_df.sample(round(s[i], 0))
            temp_df.drop(node_dict[nodes[i]].data.index, inplace=True)
        else:
            node_dict[nodes[i]].data = temp_df

# K-Means clustering algorithm
def clustering_algo(data, no_of_centers):
    # Ensure data is 2D
    if data.ndim == 1:
        data = data.reshape(-1, 1)

    # Check if the number of samples is at least the number of clusters
    if data.shape[0] < no_of_centers:
        print(f"Warning: Reducing number of clusters to {data.shape[0]} due to insufficient samples.")
        no_of_centers = data.shape[0]

    kmeans = KMeans(n_clusters=no_of_centers, init='random', random_state=0).fit(data)
    return kmeans

# Message Passing algorithm to communicate data between nodes
def Message_Passing(message, neighbors, node):
    global communication_cost
    if node not in node_dict[node].message_received:
        node_dict[node].message_received[node] = message
    for neighbor in neighbors:
        for i, message in node_dict[node].message_received.items():
            if i not in node_dict[neighbor].message_received:
                if isinstance(message, pd.DataFrame) and message.ndim == 1:
                    message = message.to_frame().T  # Convert to 2D by transposing

                node_dict[neighbor].message_received[i] = message

                # Calculate communication cost based on message type and dimensions
                if np.isscalar(message):
                    communication_cost += 1
                elif len(np.shape(message)) == 1:
                    communication_cost += message.shape[0]
                elif len(np.shape(message)) == 2:
                    communication_cost += message.shape[0] * message.shape[1]


# Function to calculate the cost of clustering
def get_cost(data, centers):
    distanceMatrix = euclidean_distances(data, centers)
    return pd.DataFrame(distanceMatrix).min(axis=1)

# Distributed coreset construction algorithm
def distributed_coreset_construction(nodes, t, no_of_centers):
    for node in list(set(nodes)):
        node_dict[node].centers = pd.DataFrame(clustering_algo(node_dict[node].data, no_of_centers).cluster_centers_)
        node_dict[node].centers.columns = node_dict[node].data.columns
        node_dict[node].cost_of_each_data = get_cost(node_dict[node].data, node_dict[node].centers)
        node_dict[node].cost_of_each_data.index = node_dict[node].data.index

    for node in nodes:
        Message_Passing(node_dict[node].cost_of_each_data.sum(), node_dict[node].neighbors, node)

    for node in list(reversed(nodes)):
        Message_Passing(node_dict[node].cost_of_each_data.sum(), node_dict[node].neighbors, node)

    for node in list(set(nodes)):
        t_i = int(math.floor((t * node_dict[node].message_received[node]) / sum(node_dict[node].message_received.values())))
        m_p = 2 * (node_dict[node].cost_of_each_data + 1e-31)
        m_p.index = node_dict[node].data.index
        S_i = node_dict[node].data.sample(n=t_i, weights=m_p)
        w_q = sum(node_dict[node].message_received.values()) / (t * m_p[S_i.index])
        w_b = []
        for index, b in node_dict[node].centers.iterrows():
            temp_cost = get_cost(node_dict[node].data, b)
            temp_cost.index = node_dict[node].data.index
            Pb = node_dict[node].data[temp_cost == node_dict[node].cost_of_each_data]
            w_b.append(Pb.shape[0] - sum(w_q[S_i.index.intersection(Pb.index)]))
        node_dict[node].message_received = {}
        node_dict[node].set_local_coreset(pd.concat([S_i, node_dict[node].centers]))
        node_dict[node].set_weights(w_q.append(pd.Series(w_b)))

# Distributed clustering on graph algorithm
def distributed_clustering_on_graph(nodes, t, no_of_centers):
    distributed_coreset_construction(nodes, t, no_of_centers)
    for v_i in nodes:
        Message_Passing(node_dict[v_i].local_coreset, node_dict[v_i].neighbors, v_i)
    for v_i in list(reversed(nodes)):
        Message_Passing(node_dict[v_i].local_coreset, node_dict[v_i].neighbors, v_i)

    combined_data = pd.concat(list(node_dict[v_i].message_received.values()))

    # Ensure combined_data is 2D
    if combined_data.ndim == 1:
        combined_data = combined_data.to_frame().T  # Transpose to ensure it's in correct format

    # Avoid clustering if there are fewer samples than clusters
    if combined_data.shape[0] < no_of_centers:
        print(f"Skipping clustering for {combined_data.shape[0]} samples with {no_of_centers} clusters.")
        return None

    cluster_details = clustering_algo(combined_data, no_of_centers)
    return cluster_details


# Example Usage

n = 4
m = 3
G = create_preferential_graph(n, m)
nodes = node_sequence(G)
node_dict = {j: Node(list(G.neighbors(j)), G.degree(j)) for j in G.nodes()}

# Load the dataset directly from the UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
df = pd.read_csv(url, header=None)
df = df.fillna(0)
df = df.iloc[:, 1:]

weighted_partitioning(df, list(G.nodes()))

t = np.arange(0.1, 0.8, 0.1) * df.shape[0]
no_of_centers = 2

for i in range(len(t)):
    for node in G.nodes():
        node_dict[node].centers = None
        node_dict[node].local_coreset = None
        node_dict[node].weights = None
        node_dict[node].message_received = {}
        node_dict[node].X = None
        node_dict[node].cost_of_each_data = None

    cluster_details = distributed_clustering_on_graph(nodes, t[i], no_of_centers)

    if cluster_details is not None:
        c_cost[i] = communication_cost
        centralised_cluster = KMeans(n_clusters=no_of_centers, init='random', random_state=0).fit(np.array(df))
        k_means_cost[i] = pd.DataFrame(euclidean_distances(df, cluster_details.cluster_centers_)).min(axis=1).sum() / pd.DataFrame(euclidean_distances(df, centralised_cluster.cluster_centers_)).min(axis=1).sum()

c_cost = c_cost[:-1]
k_means_cost = k_means_cost[:-1]

fig11 = plt.figure()
plt.xlabel('Communication Cost (*10^6)')
plt.plot(np.arange(len(c_cost)), list(k_means_cost), '*-b')
plt.xticks(np.arange(len(c_cost)), list(np.round(c_cost / 10**6, 2)))
plt.ylabel('K Means cost')
plt.show()
fig11.savefig('spam_random_degree1.png')

print('Communication cost is', communication_cost)
print('Euclidean distance is', pd.DataFrame(euclidean_distances(df, cluster_details.cluster_centers_)).min(axis=1).sum() / pd.DataFrame(euclidean_distances(df, centralised_cluster.cluster_centers_)).min(axis=1).sum())


ValueError: Expected 2D array, got 1D array instead:
array=[ 1.61304348e-01  4.76956522e-01  5.86956522e-03  4.00434783e-01
  2.18043478e-01  9.95652174e-02  1.66304348e-01  3.50217391e-01
  3.47391304e-01  1.94347826e-01  5.81739130e-01  2.33043478e-01
  2.65434783e-01  3.80000000e-01  3.13913043e-01  2.00000000e-01
  3.72173913e-01  1.84565217e+00  4.24130435e-01  9.31521739e-01
  3.10652174e-01  3.72173913e-01  2.11521739e-01  4.39130435e-02
  1.26086957e-02  8.47826087e-03  1.08695652e-03  6.73913043e-03
  3.91304348e-03 -1.38777878e-17 -3.46944695e-17  1.67608696e-01
 -2.77555756e-17  2.82608696e-03  6.45652174e-02  1.06304348e-01
  1.52173913e-03  1.34782609e-02  4.50000000e-02  1.26086957e-02
  3.91304348e-03  3.56521739e-02  2.28260870e-02  4.39130435e-02
  6.82608696e-02  2.60869565e-03  2.21739130e-02  2.00652174e-02
  9.80217391e-02  3.62608696e-02  3.10695652e-01  1.86760870e-01
  8.23260870e-02  3.43565870e+01  3.78304348e+02  2.24915217e+03
  7.82608696e-01].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [15]:
import numpy as np
from scipy.spatial import distance

# Simulated distributed data
data_node1 = np.array([[1, 2], [2, 3], [3, 4]])  # Node 1 data
data_node2 = np.array([[5, 5], [6, 6], [7, 8]])  # Node 2 data
labels_node1 = np.array([0, 0, 0])  # Node 1 labels
labels_node2 = np.array([1, 1, 1])  # Node 2 labels

new_point = np.array([4, 4])  # New point to classify
k = 3  # Number of neighbors

# Local distance calculation at each node
distances_node1 = distance.cdist([new_point], data_node1, 'euclidean')[0]
distances_node2 = distance.cdist([new_point], data_node2, 'euclidean')[0]

# Combine distances and labels
all_distances = np.concatenate((distances_node1, distances_node2))
all_labels = np.concatenate((labels_node1, labels_node2))

# Find the k-nearest neighbors
nearest_indices = np.argsort(all_distances)[:k]
nearest_labels = all_labels[nearest_indices]

# Perform majority voting
from collections import Counter
label = Counter(nearest_labels).most_common(1)[0][0]

print(f"The new point is classified as: {label}")


The new point is classified as: 0


In [16]:
import numpy as np

# Simulated distributed data
data_node1 = np.array([[1, 2], [2, 3], [3, 4]])  # Node 1 data
data_node2 = np.array([[5, 5], [6, 6], [7, 8]])  # Node 2 data

k = 2  # Number of clusters
iterations = 5

# Initialize centroids randomly from all data points (here, we're using only node1's data for simplicity)
centroids = np.array([[2, 3], [6, 6]])

for iteration in range(iterations):
    # Local assignment at each node
    distances_node1 = np.linalg.norm(data_node1[:, np.newaxis] - centroids, axis=2)
    distances_node2 = np.linalg.norm(data_node2[:, np.newaxis] - centroids, axis=2)

    cluster_assignments_node1 = np.argmin(distances_node1, axis=1)
    cluster_assignments_node2 = np.argmin(distances_node2, axis=1)

    # Local centroid update
    new_centroids_node1 = np.array([data_node1[cluster_assignments_node1 == i].mean(axis=0) for i in range(k)])
    new_centroids_node2 = np.array([data_node2[cluster_assignments_node2 == i].mean(axis=0) for i in range(k)])

    # Global centroid update (average of all nodes)
    centroids = (new_centroids_node1 + new_centroids_node2) / 2

    print(f"Iteration {iteration+1}: Centroids: {centroids}")

print(f"Final centroids: {centroids}")


Iteration 1: Centroids: [[nan nan]
 [nan nan]]
Iteration 2: Centroids: [[4.         4.66666667]
 [       nan        nan]]
Iteration 3: Centroids: [[       nan        nan]
 [4.         4.66666667]]
Iteration 4: Centroids: [[4.         4.66666667]
 [       nan        nan]]
Iteration 5: Centroids: [[       nan        nan]
 [4.         4.66666667]]
Final centroids: [[       nan        nan]
 [4.         4.66666667]]


In [18]:
import numpy as np
from sklearn.svm import SVC

# Simulated distributed data
data_node1 = np.array([[1, 2], [2, 3], [3, 4]])  # Node 1 data
# Include both class labels in node1's data
labels_node1 = np.array([0, 0, 1])  # Node 1 labels

data_node2 = np.array([[5, 5], [6, 6], [7, 8]])  # Node 2 data
labels_node2 = np.array([1, 1, 1])  # Node 2 labels

# Initialize the SVM model with a linear kernel
model = SVC(kernel='linear', C=1.0)

# Local training at each node
model.fit(data_node1, labels_node1)  # Train on Node 1 data
model.fit(data_node2, labels_node2)  # Train on Node 2 data (updating the model)

# Test the model on a new point
new_point = np.array([[4, 4]])
predicted_label = model.predict(new_point)

print(f"The new point is classified as: {predicted_label[0]}")

ValueError: The number of classes has to be greater than one; got 1 class

In [19]:
import numpy as np
from scipy.spatial import distance
from collections import Counter

# Simulated distributed data
data_node1 = np.array([[1, 2], [2, 3], [3, 4]])  # Node 1 data
data_node2 = np.array([[5, 5], [6, 6], [7, 8]])  # Node 2 data
labels_node1 = np.array([0, 0, 0])  # Node 1 labels
labels_node2 = np.array([1, 1, 1])  # Node 2 labels

new_point = np.array([4, 4])  # New point to classify
k = 3  # Number of neighbors

# Local distance calculation at each node
distances_node1 = distance.cdist([new_point], data_node1, 'euclidean')[0]
distances_node2 = distance.cdist([new_point], data_node2, 'euclidean')[0]

# Combine distances and labels from both nodes
all_distances = np.concatenate((distances_node1, distances_node2))
all_labels = np.concatenate((labels_node1, labels_node2))

# Find the k-nearest neighbors by sorting distances
nearest_indices = np.argsort(all_distances)[:k]
nearest_labels = all_labels[nearest_indices]

# Perform majority voting among the k-nearest neighbors
label = Counter(nearest_labels).most_common(1)[0][0]

print(f"The new point is classified as: {label}")


The new point is classified as: 0


In [20]:
import numpy as np

# Simulated distributed data
data_node1 = np.array([[1, 2], [2, 3], [3, 4]])  # Node 1 data
data_node2 = np.array([[5, 5], [6, 6], [7, 8]])  # Node 2 data

k = 2  # Number of clusters
iterations = 5  # Number of iterations for convergence

# Initialize centroids randomly from all data points (using node1's data for simplicity)
centroids = np.array([[2, 3], [6, 6]])

for iteration in range(iterations):
    # Local assignment of data points to the nearest centroid at each node
    distances_node1 = np.linalg.norm(data_node1[:, np.newaxis] - centroids, axis=2)
    distances_node2 = np.linalg.norm(data_node2[:, np.newaxis] - centroids, axis=2)

    # Determine the nearest centroid for each point
    cluster_assignments_node1 = np.argmin(distances_node1, axis=1)
    cluster_assignments_node2 = np.argmin(distances_node2, axis=1)

    # Local centroid update with check for empty clusters
    new_centroids_node1 = np.array([
        data_node1[cluster_assignments_node1 == i].mean(axis=0) if np.any(cluster_assignments_node1 == i) else centroids[i]
        for i in range(k)
    ])
    new_centroids_node2 = np.array([
        data_node2[cluster_assignments_node2 == i].mean(axis=0) if np.any(cluster_assignments_node2 == i) else centroids[i]
        for i in range(k)
    ])

    # Global centroid update by averaging centroids from all nodes
    centroids = (new_centroids_node1 + new_centroids_node2) / 2

    print(f"Iteration {iteration+1}: Centroids: {centroids}")

# Final centroids after all iterations
print(f"Final centroids: {centroids}")


Iteration 1: Centroids: [[2.         3.        ]
 [6.         6.16666667]]
Iteration 2: Centroids: [[2.   3.  ]
 [6.   6.25]]
Iteration 3: Centroids: [[2.         3.        ]
 [6.         6.29166667]]
Iteration 4: Centroids: [[2.     3.    ]
 [6.     6.3125]]
Iteration 5: Centroids: [[2.         3.        ]
 [6.         6.32291667]]
Final centroids: [[2.         3.        ]
 [6.         6.32291667]]


In [21]:
import numpy as np
from sklearn.svm import SVC

# Simulated distributed data
data_node1 = np.array([[1, 2], [2, 3], [3, 4]])  # Node 1 data
labels_node1 = np.array([0, 0, 0])  # Node 1 labels

data_node2 = np.array([[5, 5], [6, 6], [7, 8]])  # Node 2 data
labels_node2 = np.array([1, 1, 1])  # Node 2 labels

# Initialize the SVM model with a linear kernel
model = SVC(kernel='linear', C=1.0)

# Local training at each node
model.fit(data_node1, labels_node1)  # Train on Node 1 data
model.fit(data_node2, labels_node2)  # Train on Node 2 data (updating the model)

# Test the model on a new point
new_point = np.array([[4, 4]])
predicted_label = model.predict(new_point)

print(f"The new point is classified as: {predicted_label[0]}")


ValueError: The number of classes has to be greater than one; got 1 class

In [22]:
import numpy as np
from scipy.spatial import distance
from collections import Counter

# Simulated distributed data
data_node1 = np.array([[1, 2], [2, 3], [3, 4]])  # Node 1 data
data_node2 = np.array([[5, 5], [6, 6], [7, 8]])  # Node 2 data
labels_node1 = np.array([0, 0, 0])  # Node 1 labels
labels_node2 = np.array([1, 1, 1])  # Node 2 labels

new_point = np.array([4, 4])  # New point to classify
k = 3  # Number of neighbors

# Local distance calculation at each node
distances_node1 = distance.cdist([new_point], data_node1, 'euclidean')[0]
distances_node2 = distance.cdist([new_point], data_node2, 'euclidean')[0]

# Combine distances and labels from both nodes
all_distances = np.concatenate((distances_node1, distances_node2))
all_labels = np.concatenate((labels_node1, labels_node2))

# Find the k-nearest neighbors by sorting distances
nearest_indices = np.argsort(all_distances)[:k]
nearest_labels = all_labels[nearest_indices]

# Perform majority voting among the k-nearest neighbors
label = Counter(nearest_labels).most_common(1)[0][0]

print(f"The new point is classified as: {label}")


The new point is classified as: 0


In [23]:
import numpy as np

# Simulated distributed data
data_node1 = np.array([[1, 2], [2, 3], [3, 4]])  # Node 1 data
data_node2 = np.array([[5, 5], [6, 6], [7, 8]])  # Node 2 data

k = 2  # Number of clusters
iterations = 5  # Number of iterations for convergence

# Initialize centroids randomly from all data points (using node1's data for simplicity)
centroids = np.array([[2, 3], [6, 6]])

for iteration in range(iterations):
    # Local assignment of data points to the nearest centroid at each node
    distances_node1 = np.linalg.norm(data_node1[:, np.newaxis] - centroids, axis=2)
    distances_node2 = np.linalg.norm(data_node2[:, np.newaxis] - centroids, axis=2)

    # Determine the nearest centroid for each point
    cluster_assignments_node1 = np.argmin(distances_node1, axis=1)
    cluster_assignments_node2 = np.argmin(distances_node2, axis=1)

    # Local centroid update with check for empty clusters
    new_centroids_node1 = np.array([
        data_node1[cluster_assignments_node1 == i].mean(axis=0) if np.any(cluster_assignments_node1 == i) else centroids[i]
        for i in range(k)
    ])
    new_centroids_node2 = np.array([
        data_node2[cluster_assignments_node2 == i].mean(axis=0) if np.any(cluster_assignments_node2 == i) else centroids[i]
        for i in range(k)
    ])

    # Global centroid update by averaging centroids from all nodes
    centroids = (new_centroids_node1 + new_centroids_node2) / 2

    print(f"Iteration {iteration+1}: Centroids: {centroids}")

# Final centroids after all iterations
print(f"Final centroids: {centroids}")


Iteration 1: Centroids: [[2.         3.        ]
 [6.         6.16666667]]
Iteration 2: Centroids: [[2.   3.  ]
 [6.   6.25]]
Iteration 3: Centroids: [[2.         3.        ]
 [6.         6.29166667]]
Iteration 4: Centroids: [[2.     3.    ]
 [6.     6.3125]]
Iteration 5: Centroids: [[2.         3.        ]
 [6.         6.32291667]]
Final centroids: [[2.         3.        ]
 [6.         6.32291667]]


In [24]:
import numpy as np
from sklearn.svm import SVC

# Simulated distributed data
data_node1 = np.array([[1, 2], [2, 3], [3, 4]])  # Node 1 data
labels_node1 = np.array([0, 0, 0])  # Node 1 labels

data_node2 = np.array([[5, 5], [6, 6], [7, 8]])  # Node 2 data
labels_node2 = np.array([1, 1, 1])  # Node 2 labels

# Initialize the SVM model with a linear kernel
model = SVC(kernel='linear', C=1.0)

# Local training at each node
model.fit(data_node1, labels_node1)  # Train on Node 1 data
model.fit(data_node2, labels_node2)  # Train on Node 2 data (updating the model)

# Test the model on a new point
new_point = np.array([[4, 4]])
predicted_label = model.predict(new_point)

print(f"The new point is classified as: {predicted_label[0]}")


ValueError: The number of classes has to be greater than one; got 1 class