In [34]:
import random
import numpy as np
import pandas as pd
from copy import deepcopy
from sklearn.model_selection import train_test_split
from itertools import islice

In [71]:
class KModes:
    def __init__(self, convert_dataset, distance, frequency_calculator, sample_frequency, get_centroid, k):
        self.k = k
        self.convert_dataset = convert_dataset
        self.distance = distance
        self.frequency_calculator = frequency_calculator
        self.sample_frequency = sample_frequency
        self.get_centroid = get_centroid
        
    def fit(self, data):
        dataset = self.convert_dataset(data)
        
        # Step 1: Get centroids
        self.centroids = []
        centroids_old = []

        for i in range(self.k):
            centroid = random.choice(dataset)

            while centroid in self.centroids:
                centroid = random.choice(dataset)

            self.centroids.append(centroid)

        error = np.ones(self.k)
        labels = [0 for x in range(len(dataset))]

        while error.all() != 0:
            # Step 2: Distance
            # Cluster labels for each point
            labels = [0 for x in range(len(dataset))]

            # Distances to each centroid
            distances = np.zeros(self.k)

            # Frequency for mode
            frequency = [deepcopy(self.sample_frequency) for x in range(self.k)]

            # Calculate distance to each centroid
            for i in range(len(dataset)):
                cluster = -1
                if dataset[i] not in self.centroids:
                    for j in range(self.k):
                        distances[j] = self.distance(dataset[i], self.centroids[j])

                    cluster = np.argmin(distances)
                else:
                    cluster = self.centroids.index(dataset[i])
                    
                labels[i] = cluster
                frequency[cluster] = self.frequency_calculator(dataset[i], frequency[cluster]) 

            # Step 3: Update centroids
            centroids_old = deepcopy(self.centroids)

            for i in range(self.k):
                proposed_centroid = self.get_centroid(frequency[i])
                temp_centroid = proposed_centroid
                best_distance = 10
                
                for j in range(len(dataset)):
                    if labels[j] == i:
                        current_dist = distance(proposed_centroid, dataset[j])
                        
                        if current_dist < best_distance:
                            temp_centroid = dataset[j]
                            best_distance = current_dist
                            
                self.centroids[i] = temp_centroid

                error[i] = self.distance(self.centroids[i], centroids_old[i])
        
        self.set_clusters(dataset, labels)

    def set_clusters(self, data, labels):
        self.clusters = [[] for x in range(self.k)]
        for i in range(len(data)):
            self.clusters[labels[i]].append(data[i])

    def inertia(self):
        total_distance = 0.0

        for i in range(self.k):
            cluster_distance = 0.0

            for sample in self.clusters[i]:
                cluster_distance += self.distance(self.centroids[i], sample)
                
            if len(self.clusters[i]) == 0:
                print(self.centroids[i])
            
            total_distance += cluster_distance/len(self.clusters[i])

        return total_distance/(sum(len(x) for x in self.clusters))
    
    def valid_clusters(self):
        clusters = 0
        
        for cluster in self.clusters:
            if len(cluster) > 0:
                clusters += 1
                
        return clusters

In [3]:
data = pd.read_csv('ml_data/clustering_data.csv')

info = data[['Author', 'Series', 'Tags']]
info.head()

Unnamed: 0,Author,Series,Tags
0,J.K. Rowling,Harry Potter,"to-read, fantasy, favorites, young-adult, fict..."
1,"J.K. Rowling, Mary GrandPré",Harry Potter,"to-read, currently-reading, fantasy, favorites..."
2,"J.K. Rowling, Mary GrandPré",Harry Potter,"to-read, currently-reading, fantasy, favorites..."
3,J.K. Rowling,Harry Potter,"to-read, currently-reading, fantasy, favorites..."
4,"J.K. Rowling, Mary GrandPré",Harry Potter,"to-read, fantasy, favorites, currently-reading..."


In [65]:
def isnan(x):
    return x != x

def convert_dataset(data):
    dataset = data.values.tolist()
    sample_data = []

    for row in dataset:
        sample = []

        sample.append(row[0])

        if isnan(row[1]):
            sample.append('none')
        else:
            sample.append(row[1])

        sample.append(row[2])
        sample_data.append(sample)

    return sample_data

[['J.K. Rowling',
  'Harry Potter',
  'to-read, fantasy, favorites, young-adult, fiction, harry-potter, own, books-i-own, owned, ya, series, favourites, magic, childrens, re-read, owned-books, adventure, children, audiobook, audiobooks, middle-grade, j-k-rowling, childhood, my-books, all-time-favorites, classics, children-s, sci-fi-fantasy, reread, default, my-library, novels, favorite-books, ya-fantasy, i-own, children-s-books, kids, favorite, fantasy-sci-fi, audio, english, paranormal, library, urban-fantasy, books, read-more-than-once, teen, re-reads, witches, british, jk-rowling, ya-fiction, mystery, novel, bookshelf, harry-potter-series, childrens-books, my-favorites, own-it, supernatural, my-bookshelf, kindle, faves, on-my-shelf, rereads, scifi-fantasy, young-adult-fiction, audible, childhood-favorites, favorite-series, ebook, audio-books, all-time-favourites, favs, favourite, made-me-cry, coming-of-age, fantasia, youth, romance, wizards, childhood-books'],
 ['J.K. Rowling, Mary 

In [42]:
#Needs work
def individual_distance(a, b):
    a_lst = a.strip('\"').split(', ')
    b_lst = b.strip('\"').split(', ')
    
    total_length = max(len(a_lst), len(b_lst))
    difference = total_length

    for ele in a_lst:
        if ele in b_lst:
            difference -= 1

    return 1.0 * difference/total_length


def distance(book_1, book_2):
    distance = 0.0

    distance += individual_distance(book_1[0], book_2[0])
    distance += individual_distance(book_1[2], book_2[2])

    if book_1[1] != 'none' and book_1[1] == book_2[1]:
        distance += 0.0
    else:
        distance += 1.0

    return distance

def get_frequency(book, cluster):
    authors = book[0].strip('\"').split(', ')

    for author in authors:
        if author in cluster['author']:
            cluster['author'][author] += 1
        else:
            cluster['author'][author] = 1

    if book[1] in cluster['series']:
        cluster['series'][book[1]] += 1
    else:
        cluster['series'][book[1]] = 1
        
    tags = book[2].strip('\"').split(', ')

    for tag in tags:
        if tag in cluster['tags']:
            cluster['tags'][tag] += 1
    else:
        cluster['tags'][tag] = 1

    length = len(tags)

    if length in cluster['length']:
        cluster['length'][length] += 1
    else:
        cluster['length'][length] = 1

    return cluster

def get_centroid(cluster):
    new_centroid = []

    new_centroid.append(max(cluster['author'], key = cluster['author'].get))
    new_centroid.append(max(cluster['series'], key = cluster['series'].get))

    tag_len = max(cluster['length'], key = cluster['length'].get)

    tags_sorted = {k: v for k, v in sorted(cluster['tags'].items(), key=lambda item: item[1])}
    tags = [x[0] for x in islice(tags_sorted.items(), 0, tag_len)]

    new_centroid += tags

    return new_centroid

sample_frequency = {'author': {}, 'series': {}, 'tags': {}, 'length': {}}


0.536144578313253

In [69]:
info_train, info_test = train_test_split(info, train_size = 0.8, test_size = 0.2, random_state=42)

model = KModes(convert_dataset, distance, get_frequency, sample_frequency, get_centroid, 5)
model.fit(info_train)

model.inertia()

0.00015728450718786773