In [None]:
#Import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import io
import time
from numba import jit, cuda, float32, int32
import math

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# read data
data=pd.read_csv('/content/drive/My Drive/data_standardize.csv')
df_Standardize=pd.DataFrame(data)
df_Standardize

Unnamed: 0.1,Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,0,-0.745369,-0.248785,-0.465787,-0.397088,-0.365854,-0.476844,-0.805116,-0.677987,-0.706172,-0.675708,-0.482201,-0.520766,-0.965868,-0.549205,-0.324378,-0.525663,0.360754
1,1,0.822978,0.134866,-0.516362,-0.397088,-0.479339,2.682990,-1.220469,-0.677987,-0.915974,0.576272,0.114210,-0.603824,0.705048,0.877253,0.123457,0.234367,0.360754
2,2,0.472101,0.518517,-0.106471,0.140262,-0.479339,-0.476844,1.271643,2.681612,-0.915974,-0.675708,-0.482201,-0.105476,0.844291,-0.395538,-0.090206,-0.525663,0.360754
3,3,0.061121,-1.016087,0.278323,0.644711,-0.479339,-0.375919,-1.012794,-0.398022,-0.915974,-0.258383,-0.333098,-0.562295,0.844291,-0.622992,-0.391354,-0.525663,0.360754
4,4,-0.360018,0.518517,-0.507880,-0.385968,-0.479339,-0.476844,-1.012794,-0.398022,-0.915974,-0.675708,-0.482201,-0.562295,-0.910171,-0.374964,-0.273834,-0.525663,0.360754
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8926,8926,-0.751524,0.518517,-0.362027,-0.397088,-0.133031,-0.476844,1.271643,-0.677987,1.182060,-0.675708,-0.482201,-0.354650,-0.965868,-0.503941,-0.367885,1.184407,-4.124481
8927,8927,-0.756142,0.518517,-0.357319,-0.397088,-0.122468,-0.476844,1.271643,-0.677987,1.182060,-0.675708,-0.482201,-0.354650,-0.965868,-0.522125,-0.258918,-0.525663,-4.124481
8928,8928,-0.754051,-0.184845,-0.439810,-0.397088,-0.307565,-0.476844,0.856290,-0.677987,0.762454,-0.675708,-0.482201,-0.396179,-0.965868,-0.593276,-0.351786,0.329372,-4.124481
8929,8929,-0.758982,-0.184845,-0.516362,-0.397088,-0.479339,-0.458914,-1.220469,-0.677987,-0.915974,0.158947,-0.183995,-0.603824,-1.105111,-0.603778,-0.364587,0.329372,-4.124481


In [None]:
# Convert DataFrame to numpy array
data_points = df_Standardize.values

In [None]:
K = 3

In [None]:
# Function to initialize centers
def kmeans_init_centers(data_points, K):
    centers = []
    for i in range(K):
        index = np.random.choice(len(data_points))
        centers.append(data_points[index])
    return np.array(centers)

In [None]:
# CUDA function to calculate distances
@cuda.jit
def kmeans_cal_distance(data_points, centers, distances):
    # Calculate the thread's unique index
    idx = cuda.grid(1)

    # Get the number of data points and number of clusters
    num_points = data_points.shape[0]
    num_clusters = centers.shape[0]

    if idx < num_points * num_clusters:
        point_idx = idx // num_clusters
        center_idx = idx % num_clusters
        sum_point = 0
        for i in range(data_points.shape[1]):
            sum_point += (data_points[point_idx, i] - centers[center_idx, i]) ** 2
        distances[point_idx, center_idx] = math.sqrt(sum_point)

In [None]:
@cuda.jit
def kmeans_assign_labels(data_points, centers, assigned_labels):
    # Calculate the thread's unique index
    idx = cuda.grid(1)

    # Get the number of data points and number of clusters
    num_points = data_points.shape[0]
    num_clusters = centers.shape[0]

    if idx < num_points:
        min_dist = float('inf')
        label = -1
        for j in range(num_clusters):
            sum_point = 0
            for k in range(data_points.shape[1]):
                sum_point += (data_points[idx, k] - centers[j, k]) ** 2
            dist = math.sqrt(sum_point)
            if dist < min_dist:
                min_dist = dist
                label = j
        assigned_labels[idx] = label

In [None]:
@cuda.jit
def kmeans_update_centroids(data_points, assigned_labels, new_centers, counts, K):
    # Calculate the thread's unique index
    idx = cuda.grid(1)

    # Get the number of data points
    num_points = data_points.shape[0]

    if idx < num_points:
        label = assigned_labels[idx]
        for i in range(data_points.shape[1]):
            cuda.atomic.add(new_centers, (label, i), data_points[idx, i])
        cuda.atomic.add(counts, label, 1)

In [None]:
# Function to check convergence
def has_converged(centers, new_centers):
    return np.allclose(centers, new_centers)

In [None]:
def kmeans(data_points, K):
    # Convert data to numpy array
    data_points = np.array(data_points)

    # Initialize centers
    centers = kmeans_init_centers(data_points, K)

    # Allocate device memory
    d_data_points = cuda.to_device(data_points)
    d_centers = cuda.to_device(centers)
    d_distances = cuda.device_array((data_points.shape[0], K))
    d_assigned_labels = cuda.device_array(data_points.shape[0], dtype=np.int32)
    d_new_centers = cuda.device_array((K, data_points.shape[1]))
    d_counts = cuda.device_array(K, dtype=np.int32)

    # Define the number of threads and blocks
    threads_per_block = 256
    blocks_per_grid = (data_points.shape[0] + threads_per_block - 1) // threads_per_block

    it = 0
    while True:
        # Calculate distances
        kmeans_cal_distance[blocks_per_grid, threads_per_block](d_data_points, d_centers, d_distances)

        # Assign labels
        kmeans_assign_labels[blocks_per_grid, threads_per_block](d_data_points, d_centers, d_assigned_labels)

        # Update centroids
        kmeans_update_centroids[blocks_per_grid, threads_per_block](d_data_points, d_assigned_labels, d_new_centers, d_counts, K)

        # Transfer new centers back to host
        new_centers = d_new_centers.copy_to_host()
        counts = d_counts.copy_to_host()

        # Calculate new centers
        for i in range(K):
            if counts[i] > 0:
                new_centers[i] /= counts[i]

        # Check for convergence
        if np.allclose(centers, new_centers):
            break

        # Update centers
        centers = new_centers
        d_centers = cuda.to_device(centers)

        it += 1
    labels = d_assigned_labels.copy_to_host()

    return centers, labels, it


In [None]:
start = time.time()
(centers, labels, it) = kmeans(data_points, K)
end = time.time()
print(f'Processing time: {end - start} s')
print('Centers found by k-means algorithm:')
print(centers)
print('='*60)

Processing time: 0.006897926330566406 s
Centers found by k-means algorithm:
[[ 1.44079358e+03  2.52597325e-01  9.45222083e-02  1.61796456e-01
   1.55837137e-01  9.60422391e-02  9.69079008e-02  5.39286403e-02
   1.79375353e-01  8.90282704e-03  1.61853664e-02  1.26610260e-02
   1.31279715e-01  3.77892363e-01  1.76486605e-01  9.19755293e-02
  -2.08939359e-02  1.40325288e-01]
 [ 7.39357726e+03 -2.79729357e-01 -1.36571554e-01 -1.54536707e-01
  -1.52938394e-01 -8.46484103e-02 -9.64609361e-02  4.73946996e-05
  -2.06470211e-01  6.17379768e-02 -3.39216980e-02 -2.91464844e-02
  -1.21437795e-01 -4.36792294e-01 -1.92458098e-01 -8.44039466e-02
   4.53155625e-02 -2.14372973e-01]
 [ 4.36937522e+03  4.42544164e-02  4.94990894e-02  2.90236487e-03
   7.02287667e-03 -5.59315960e-03  5.76713967e-03 -5.22739853e-02
   3.95097173e-02 -7.23774516e-02  1.93563454e-02  1.78380625e-02
  -1.72565454e-03  8.51139242e-02  2.78371159e-02 -1.90737050e-03
  -2.65627885e-02  8.54872571e-02]]
