In [1]:
#Import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import io
import time
from numba import jit, cuda, float32, int32
import math

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# read data
data=pd.read_csv('/content/drive/My Drive/data_standardize.csv')
df_Standardize=pd.DataFrame(data)
df_Standardize

Unnamed: 0.1,Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,0,-0.745369,-0.248785,-0.465787,-0.397088,-0.365854,-0.476844,-0.805116,-0.677987,-0.706172,-0.675708,-0.482201,-0.520766,-0.965868,-0.549205,-0.324378,-0.525663,0.360754
1,1,0.822978,0.134866,-0.516362,-0.397088,-0.479339,2.682990,-1.220469,-0.677987,-0.915974,0.576272,0.114210,-0.603824,0.705048,0.877253,0.123457,0.234367,0.360754
2,2,0.472101,0.518517,-0.106471,0.140262,-0.479339,-0.476844,1.271643,2.681612,-0.915974,-0.675708,-0.482201,-0.105476,0.844291,-0.395538,-0.090206,-0.525663,0.360754
3,3,0.061121,-1.016087,0.278323,0.644711,-0.479339,-0.375919,-1.012794,-0.398022,-0.915974,-0.258383,-0.333098,-0.562295,0.844291,-0.622992,-0.391354,-0.525663,0.360754
4,4,-0.360018,0.518517,-0.507880,-0.385968,-0.479339,-0.476844,-1.012794,-0.398022,-0.915974,-0.675708,-0.482201,-0.562295,-0.910171,-0.374964,-0.273834,-0.525663,0.360754
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8926,8926,-0.751524,0.518517,-0.362027,-0.397088,-0.133031,-0.476844,1.271643,-0.677987,1.182060,-0.675708,-0.482201,-0.354650,-0.965868,-0.503941,-0.367885,1.184407,-4.124481
8927,8927,-0.756142,0.518517,-0.357319,-0.397088,-0.122468,-0.476844,1.271643,-0.677987,1.182060,-0.675708,-0.482201,-0.354650,-0.965868,-0.522125,-0.258918,-0.525663,-4.124481
8928,8928,-0.754051,-0.184845,-0.439810,-0.397088,-0.307565,-0.476844,0.856290,-0.677987,0.762454,-0.675708,-0.482201,-0.396179,-0.965868,-0.593276,-0.351786,0.329372,-4.124481
8929,8929,-0.758982,-0.184845,-0.516362,-0.397088,-0.479339,-0.458914,-1.220469,-0.677987,-0.915974,0.158947,-0.183995,-0.603824,-1.105111,-0.603778,-0.364587,0.329372,-4.124481


In [4]:
K = 3

In [5]:
# Function to initialize centers
def kmeans_init_centers(df_Standardize, K):
    centers = []
    for i in range(K):
        index = np.random.choice(len(df_Standardize))
        centers.append(np.array(df_Standardize.iloc[index]))
    return centers

In [6]:
# CUDA function to calculate distances
@cuda.jit
def calculate_distances(data_points, centers, distances):
    idx = cuda.grid(1)
    if idx < data_points.shape[0]:
        for j in range(centers.shape[0]):
            sum_point = 0
            for k in range(data_points.shape[1]):
                sum_point += (data_points[idx, k] - centers[j, k]) ** 2
            distances[idx, j] = math.sqrt(sum_point)

In [7]:
# CUDA function to assign labels
@cuda.jit
def assign_labels(data_points, centers, labels):
    idx = cuda.grid(1)
    if idx < data_points.shape[0]:
        min_dist = float('inf')
        label = -1
        for j in range(centers.shape[0]):
            sum_point = 0
            for k in range(data_points.shape[1]):
                sum_point += (data_points[idx, k] - centers[j, k]) ** 2
            dist = math.sqrt(sum_point)
            if dist < min_dist:
                min_dist = dist
                label = j
        labels[idx] = label

In [8]:
# CUDA function to update centroids
@cuda.jit
def update_centroids(data_points, labels, new_centers, counts, K):
    idx = cuda.grid(1)
    if idx < data_points.shape[0]:
        label = labels[idx]
        for k in range(data_points.shape[1]):
            cuda.atomic.add(new_centers, (label, k), data_points[idx, k])
        cuda.atomic.add(counts, label, 1)

In [9]:
# Function to check convergence
def has_converged(centers, new_centers):
    return np.allclose(centers, new_centers)

In [10]:
# Main KMeans function
def kmeans(df_Standardize, K):
    data_points = df_Standardize.values
    centers = np.array(kmeans_init_centers(df_Standardize, K))

    threads_per_block = 256
    blocks_per_grid = (data_points.shape[0] + threads_per_block - 1) // threads_per_block

    labels = np.zeros(data_points.shape[0], dtype=np.int32)
    distances = np.zeros((data_points.shape[0], K), dtype=np.float32)

    d_data_points = cuda.to_device(data_points)
    d_centers = cuda.to_device(centers)
    d_labels = cuda.to_device(labels)
    d_distances = cuda.to_device(distances)

    it = 0
    while True:
        # Step 1: Calculate distances
        calculate_distances[blocks_per_grid, threads_per_block](d_data_points, d_centers, d_distances)
        cuda.synchronize()

        # Step 2: Assign labels
        assign_labels[blocks_per_grid, threads_per_block](d_data_points, d_centers, d_labels)
        cuda.synchronize()

        # Step 3: Update centroids
        new_centers = np.zeros((K, data_points.shape[1]), dtype=np.float32)
        counts = np.zeros(K, dtype=np.int32)

        d_new_centers = cuda.to_device(new_centers)
        d_counts = cuda.to_device(counts)

        update_centroids[blocks_per_grid, threads_per_block](d_data_points, d_labels, d_new_centers, d_counts, K)
        cuda.synchronize()

        new_centers = d_new_centers.copy_to_host()
        counts = d_counts.copy_to_host()

        for k in range(K):
            if counts[k] > 0:
                new_centers[k] /= counts[k]

        if has_converged(centers, new_centers):
            break
        else:
            centers = new_centers.copy()
            d_centers = cuda.to_device(centers)
            it += 1

    labels = d_labels.copy_to_host()
    return centers, labels, it


In [11]:
start = time.time()
(centers, labels, it) = kmeans(df_Standardize, K)
end = time.time()
print(f'Processing time: {end - start} s')
print(f'Number of loops: {it}')
print('Centers found by k-means algorithm:')
print(centers)
print('='*60)



Processing time: 6.441731691360474 s
Number of loops: 298
Centers found by k-means algorithm:
[[ 7.44250391e+03 -2.87418067e-01 -1.44301027e-01 -1.55833498e-01
  -1.55331597e-01 -8.35717916e-02 -1.00678176e-01  9.07836482e-04
  -2.12869436e-01  6.52509779e-02 -3.63672860e-02 -3.28383334e-02
  -1.20503992e-01 -4.45652127e-01 -1.95183232e-01 -9.28476155e-02
   5.08049019e-02 -2.17464596e-01]
 [ 1.48850000e+03  2.49477342e-01  9.40071940e-02  1.55887008e-01
   1.50422454e-01  9.20428485e-02  9.42834392e-02  4.59609628e-02
   1.68218136e-01  4.61190008e-03  1.74145736e-02  1.39239533e-02
   1.24705739e-01  3.71256232e-01  1.78866297e-01  8.81542787e-02
  -2.32256036e-02  1.39852867e-01]
 [ 4.46600000e+03  3.77604589e-02  5.02190925e-02 -1.56814436e-04
   4.80222516e-03 -8.53073411e-03  6.33255998e-03 -4.68842462e-02
   4.45284769e-02 -6.98414817e-02  1.89356171e-02  1.88987628e-02
  -4.28270455e-03  7.41256773e-02  1.61918346e-02  4.63243714e-03
  -2.75550056e-02  7.75052607e-02]]
