In [None]:
%pip install pandas
%pip install plotly.express
%pip install --upgrade nbformat
%pip install numpy

import pandas as pd
import plotly.express as px
import numpy as np

from data.preprocessing import preprocess_csv

We will use k-means to cluster our data into 5 clusters: very expensive low-quantity items, expensive low-quantity items, cheap low-quantity items, cheap moderate-quantity items, and cheap bulk items.

We first import our dataset.

In [None]:
# Both loaders read the same CSV, so the path is spelled once.
csv_path = 'data/data.csv'

# Raw rows exactly as stored in the file.
dataset_original = pd.read_csv(csv_path, encoding='latin1')

# Project helper; the second argument is a list of column names whose exact
# handling is defined by preprocess_csv (not visible in this notebook).
preprocessed_dataset = preprocess_csv(csv_path, ['StockCode', 'InvoiceDate'])
preprocessed_dataset

As you can see, we have a lot of data points (541909), but the only features we're currently worried about are quantity and unit price.

In [None]:
# The two numeric features the clustering works on.
feature_columns = ["Quantity", "UnitPrice"]
dataframe = preprocessed_dataset.loc[:, feature_columns]

dataframe

In [None]:
# First look at the two features, before any cleaning is applied.
fig = px.scatter(
    dataframe,
    x='Quantity',
    y='UnitPrice',
    title='Quantity vs. unit price (before cleaning)',
)
fig.show()

We need to clean up our dataset a little bit. Namely, we have some negative quantities (returned items) and many duplicates.

In [None]:
# Collapse identical (Quantity, UnitPrice) rows first, then keep only the rows
# in which every column is zero or positive.
deduplicated = dataframe.drop_duplicates()
non_negative_rows = deduplicated.ge(0).all(axis=1)
dataframe = deduplicated.loc[non_negative_rows]
dataframe

In [None]:
# Same view after dropping duplicate rows and rows with negative values.
fig = px.scatter(
    dataframe,
    x='Quantity',
    y='UnitPrice',
    title='Quantity vs. unit price (duplicates and negative values removed)',
)
fig.show()

Now deleting some outliers...

In [None]:
# Treat any row with a Quantity or UnitPrice of 3000 or more as an outlier.
below_cutoff = dataframe.lt(3000).all(axis=1)
dataframe = dataframe.loc[below_cutoff]

# Plain ndarray copy of the surviving rows for the NumPy k-means code.
npdata = dataframe.to_numpy()

dataframe

In [None]:
# Final input to the clustering: outlier rows (any value >= 3000) are gone.
fig = px.scatter(
    dataframe,
    x='Quantity',
    y='UnitPrice',
    title='Quantity vs. unit price (outliers removed)',
)
fig.show()

In [None]:
def update_assignments(centers, points):
	"""Assign every point to its nearest center (Euclidean distance).

	Parameters
	----------
	centers : array-like, shape (n_centers, n_features)
	points : array-like, shape (n_points, n_features)

	Returns
	-------
	ndarray of int, shape (n_points,)
		For each point, the row index in `centers` of the closest center.
		Ties go to the lowest index (np.argmin behaviour).
	"""
	centers = np.asarray(centers, dtype=float)
	points = np.asarray(points, dtype=float)

	# ||c - p||^2 = ||c||^2 + ||p||^2 - 2 c.p, evaluated for all pairs at once.
	centerSq = np.sum(np.square(centers), axis=1, keepdims=True)
	pointSq = np.sum(np.square(points), axis=1, keepdims=True)
	cross = np.dot(centers, points.T)
	sqDistances = centerSq + pointSq.T - 2 * cross

	# Rounding in the expansion above can leave a slightly negative value when
	# a point (almost) coincides with a center. Clamp to zero and compare
	# squared distances directly: the argmin is the same as for the true
	# distances, and no square root is taken of a negative number (NaN).
	np.maximum(sqDistances, 0, out=sqDistances)

	return np.argmin(sqDistances, axis=0)

def update_centers(points, assignments, K):
	"""Recompute each cluster center as the mean of its assigned points.

	Parameters
	----------
	points : array-like, shape (n_points, n_features)
	assignments : array-like, shape (n_points,)
		Cluster index of every point.
	K : int
		Number of cluster indices (0 .. K-1) to consider.

	Returns
	-------
	ndarray, shape (n_non_empty, n_features)
		One row per NON-EMPTY cluster, in increasing cluster-index order.
		Empty clusters are skipped, so the result can have fewer than K rows
		and row numbers then no longer match the original cluster indices.
	"""
	points = np.asarray(points)
	assignments = np.asarray(assignments)
	returnList = []

	for i in range(K):
		members = points[assignments == i]
		if members.shape[0] != 0:
			# mean() returns floats even for integer input; an in-place
			# true division of an integer sum would raise instead.
			returnList.append(members.mean(axis=0))

	if not returnList:
		# Keep the feature dimension so callers can still vstack onto this.
		return np.empty((0,) + points.shape[1:])

	return np.array(returnList)

def get_loss(points, centers, assignments):
	"""Return the k-means objective: total squared distance to assigned centers.

	`centers[assignments]` picks, for every point, the center it is assigned
	to. The norm of the whole residual array, squared, is the sum of all
	squared coordinate differences.
	"""
	residuals = points - centers[assignments]
	# With no axis given, norm() reduces the full residual array to one scalar.
	residualNorm = np.linalg.norm(residuals)
	return residualNorm ** 2

def train(data, centers, K=5, max_iters=10000, rel_tol = 1e-05, rng=None):
	"""Run k-means (Lloyd's algorithm) until the loss stops improving.

	Parameters
	----------
	data : ndarray, shape (n_points, n_features)
	centers : ndarray, shape (n_centers, n_features)
		Starting centers. If fewer than K survive an update (or fewer than K
		are given), random data points are added as replacement centers.
	K : int
		Number of clusters to maintain.
	max_iters : int
		Upper bound on the number of assign/update rounds (at least 1).
	rel_tol : float
		Stop once the relative change in loss between two consecutive
		rounds falls below this value.
	rng : None, int or numpy.random.Generator, optional
		Randomness used to pick replacement centers. None keeps the legacy
		global `np.random` state; a seed or Generator makes runs repeatable.

	Returns
	-------
	(centers, assignments, loss)
		Final centers (K rows), the cluster index of every point, and the
		loss from get_loss for that pairing.

	Raises
	------
	ValueError
		If max_iters is smaller than 1 (no round would run, so there would
		be nothing to return).
	"""
	if max_iters < 1:
		raise ValueError("max_iters must be at least 1")

	sampler = np.random if rng is None else np.random.default_rng(rng)
	iteration = 0
	currCenters = centers

	while iteration < max_iters:
		assignments = update_assignments(currCenters, data)
		currCenters = update_centers(data, assignments, K)

		# update_centers drops empty clusters, so top the set back up to K.
		# The check must look at the CURRENT centers: testing the initial
		# `centers` argument never re-seeds anything when K centers were
		# passed in, and never terminates when fewer were.
		while currCenters.shape[0] < K:
			newCenter = data[sampler.choice(data.shape[0])]
			currCenters = np.vstack((currCenters, newCenter))
			assignments = update_assignments(currCenters, data)

		currLoss = get_loss(data, currCenters, assignments)
		if iteration > 0:
			# A previous loss of exactly zero cannot be improved on, and
			# dividing by it would give NaN and keep looping to max_iters.
			if prevLoss == 0 or np.abs(prevLoss - currLoss) / prevLoss < rel_tol:
				break
		prevLoss = currLoss
		iteration += 1

	return currCenters, assignments, currLoss


# Dummy starting centers on the diagonal, (1, 1) through (5, 5). The notebook's
# author notes that the end result is the same with such dummy centers
# (not verified here).
initial_centers = np.array([[v, v] for v in range(1, 6)])
centers, assignments, loss = train(npdata, initial_centers)

# Put each point's cluster index in an extra column next to its features.
colored_data = np.column_stack((npdata, assignments))

In [None]:
colored_dataframe = pd.DataFrame(colored_data, columns=["Quantity", "UnitPrice", "Assignment"])

# "Assignment" is a numeric column, so Plotly Express colours the points on a
# continuous colour scale rather than with discrete categories.
fig = px.scatter(
    colored_dataframe,
    x="Quantity",
    y="UnitPrice",
    color="Assignment",
    title="K-means cluster assignments by quantity and unit price",
)
fig.show()

In this graph, the colors represent:  
Orange-- Very expensive items that are bought in very low quantities  
Purple-- Moderately expensive items that are bought in very low quantities  
Blue-- Cheap items that are bought in low to moderate quantities  
Pink-- Cheap items that are bought in moderate to high quantities  
Yellow-- Cheap items that are bought in very high quantities  

In [None]:
orange = []
purple = []
blue = []
pink = []
yellow = []

# Cluster id -> target list, using the colour names from the legend above.
# Any id other than 0-3 goes to `yellow`.
lists_by_cluster = {0.0: blue, 1.0: purple, 2.0: pink, 3.0: orange}

descriptions = dataset_original["Description"]
cluster_ids = colored_dataframe["Assignment"].to_numpy()

# `colored_dataframe` was built from `dataframe`'s rows in the same order, so
# the two are walked in lockstep. The position counter used to be reset to 0
# on every pass, which filed every description under the first row's cluster.
# NOTE(review): the labels in `dataframe.index` are used as row POSITIONS in
# the raw CSV; this assumes preprocess_csv keeps the raw row numbering -
# TODO confirm.
for row_label, cluster_id in zip(dataframe.index, cluster_ids):
    lists_by_cluster.get(cluster_id, yellow).append(descriptions.iloc[row_label])


Now, we have 5 lists of item descriptions -- one for each cluster.

In [None]:
#K-means evaluation using the silhouette score

def calculate_silhouette_score(points, assignments, centers):
    """Return the mean silhouette coefficient of a clustering.

    For a point i, a(i) is the mean distance to the OTHER members of its own
    cluster and b(i) is the smallest mean distance to the members of any other
    non-empty cluster. Its coefficient is (b - a) / max(a, b).

    Parameters
    ----------
    points : array-like, shape (n_points, n_features)
    assignments : array-like, shape (n_points,)
        Cluster label of every point.
    centers : ndarray
        Only its row count is used: labels 0 .. centers.shape[0]-1 are the
        candidate "other" clusters.

    Returns
    -------
    float
        Mean coefficient over all points. A point contributes 0 when it is
        alone in its cluster or when no other non-empty cluster exists.

    Notes
    -----
    Every point is compared against every other point, so the cost grows
    quadratically with the number of points.
    """
    points = np.asarray(points, dtype=float)
    assignments = np.asarray(assignments)
    n_points = points.shape[0]

    # Cluster membership does not depend on the point being scored: build once.
    members_by_cluster = [points[assignments == k] for k in range(centers.shape[0])]

    silhouette_scores = np.zeros(n_points)
    for i in range(n_points):
        current_cluster = assignments[i]
        own_points = points[assignments == current_cluster]
        own_size = own_points.shape[0]
        if own_size <= 1:
            # Singleton cluster: the coefficient is defined as 0.
            continue

        # The point's zero distance to itself is part of the sum, so divide by
        # the number of OTHER members, not by the cluster size.
        a_i = np.linalg.norm(own_points - points[i], axis=1).sum() / (own_size - 1)

        b_i = float('inf')
        for k, other_points in enumerate(members_by_cluster):
            if k != current_cluster and other_points.shape[0] > 0:
                mean_dist = np.mean(np.linalg.norm(other_points - points[i], axis=1))
                b_i = min(b_i, mean_dist)
        if b_i == float('inf'):
            # No other non-empty cluster: inf / inf would be NaN, score as 0.
            continue

        scale = max(a_i, b_i)
        if scale > 0:
            silhouette_scores[i] = (b_i - a_i) / scale
    return np.mean(silhouette_scores)
# Score the clustering produced by train() on the same points it was fitted to.
silhouette_score = calculate_silhouette_score(
    points=npdata,
    assignments=assignments,
    centers=centers,
)

silhouette_score