In [31]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
import sklearn.metrics as metrics
from collections import Counter

train_file = 'train.csv'
test_file = 'test.csv'

train_dataset = pd.read_csv(train_file)
test_dataset = pd.read_csv(test_file)

test_dataset = test_dataset.drop('id', axis='columns')  #delete id from test set

x_train, y_train = train_dataset.iloc[:,:-1], train_dataset.iloc[:,-1]
#display(test_dataset)
#display(x_train.head())
#display(y_train.head())

In [121]:
def purity_score(y_true, y_pred): 
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    # return purity
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix)

def find_cluster_majority_classes(cluster_labels, y_true):
    cluster_majority_classes = {}
    unique_clusters = set(cluster_labels)

    for cluster in unique_clusters:
        cluster_indices = [i for i, label in enumerate(cluster_labels) if label == cluster]
        cluster_class_labels = [y_true[i] for i in cluster_indices]
        class_counts = Counter(cluster_class_labels)     #count occurrences of each class label within the cluster
        majority_class = class_counts.most_common(1)[0][0]  #determine the majority class for the current cluster
        cluster_majority_classes[cluster] = majority_class

    return cluster_majority_classes

def calculate_metrics(y_true, labels, majority_classes):
    true_positives = 0
    true_negatives = 0
    false_positives = 0
    false_negatives = 0

    for i, true_class in enumerate(y_true):
        cluster_label = labels[i]   
        predicted_class = majority_classes[cluster_label]   #get predicted class for this label

        if predicted_class == true_class:
            if predicted_class in list(majority_classes.values())[:i] or predicted_class in list(majority_classes.values())[i + 1:]:
                true_positives += 1
            else:
                true_negatives += 1
        else:
            if predicted_class in list(majority_classes.values())[:i] or predicted_class in list(majority_classes.values())[i + 1:]:
                false_positives += 1
            else:
                false_negatives += 1

    return [true_positives, true_negatives, false_positives, false_negatives]

def calculate_precision(a, fp): 
    return a / (a+fp)

def calculate_recall(a, fn):
    return a / (a+fn)

def f_measure(a, fp, fn):
    precision = calculate_precision(a, fp)
    recall = calculate_recall(a, fn)
    return (1+a) / ((1/precision) + (a/recall))

In [127]:
K = [2,4,6,8,10]

for num_clusters in K: 
    total_fmeasure = 0
    kmeans = KMeans(n_clusters=num_clusters, n_init=10)
    kmeans.fit(x_train)
    labels = kmeans.labels_ 
    #display(find_cluster_majority_classes(labels, y_train))
    for cluster_label in range(num_clusters):
        #display(calculate_metrics(y_train, cluster_label, majority_classes=find_cluster_majority_classes(labels, y_train)))
        confusion_matrix = calculate_metrics(y_train, cluster_label, find_cluster_majority_classes(labels, y_train))
        total_fmeasure += f_measure(1, confusion_matrix[2], confusion_matrix[3])
    display(total_fmeasure)
    #print("Purity score for " + str(num_clusters) + " clusters: " + str(purity_score(y_train, labels)))

0.002663115845539281

0.005326231691078562

0.007989347536617843

0.010652463382157125

0.013315579227696406