In [72]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
from itertools import islice

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [73]:
class KModes:
    def __init__(self, distance, frequency_calculator, sample_frequency, get_centroid, k):
        self.k = k
        self.distance = distance
        self.frequency_calculator = frequency_calculator
        self.sample_frequency = sample_frequency
        self.get_centroid = get_centroid
        
    def fit(self, data):
        self.dataset = data.values.tolist()
        
        # Step 1: Get centroids
        self.centroids = []
        centroids_old = []

        for i in range(self.k):
            centroid = random.choice(self.dataset)

            while centroid in self.centroids:
                centroid = random.choice(self.dataset)

            self.centroids.append(centroid)

        error = np.ones(self.k)
        self.labels = [0 for x in range(len(self.dataset))]

        while error.all() != 0:
            # Step 2: Distance
            # Cluster labels for each point
            self.labels = [0 for x in range(len(self.dataset))]

            # Distances to each centroid
            distances = np.zeros(self.k)

            # Frequency for mode
            frequency = [deepcopy(self.sample_frequency) for x in range(self.k)]

            # Calculate distance to each centroid
            for i in range(len(self.dataset)):
                cluster = -1
                if self.dataset[i] not in self.centroids:
                    for j in range(self.k):
                        distances[j] = self.distance(self.dataset[i], self.centroids[j])

                    cluster = np.argmin(distances)
                else:
                    cluster = self.centroids.index(self.dataset[i])
                    
                self.labels[i] = cluster
                frequency[cluster] = self.frequency_calculator(self.dataset[i], frequency[cluster]) 

            # Step 3: Update centroids
            centroids_old = deepcopy(self.centroids)

            for i in range(self.k):
                proposed_centroid = self.get_centroid(frequency[i])
                temp_centroid = proposed_centroid
                best_distance = 10
                
                for j in range(len(self.dataset)):
                    if self.labels[j] == i:
                        current_dist = self.distance(proposed_centroid, self.dataset[j])
                        
                        if current_dist < best_distance:
                            temp_centroid = self.dataset[j]
                            best_distance = current_dist
                            
                self.centroids[i] = temp_centroid

                error[i] = self.distance(self.centroids[i], centroids_old[i])

    def inertia(self):
        total_distance = 0.0

        for i in range(self.k):
            for j in range(len(self.dataset)):
                if self.labels[j] == i:
                    total_distance += self.distance(self.centroids[i], self.dataset[j])

        return total_distance
    
    def predict(self, data):
        data_list = data.values.tolist()
        result = [0 for x in range(len(data))]
        
        for i in range(len(data)):
            distances = [0 for x in range(self.k)]
            for j in range(self.k):
                distances[j] = self.distance(data_list[i], self.centroids[j])

            result[i] = np.argmin(distances)
        
        return result
    
    def get_labels(self):
        return self.labels
    
    def get_centroids(self):
        return self.centroids

In [74]:
class BookClassifier:
    def __init__(self, k):
        self.k = k
        self.sample_frequency = {'author': {}, 'series': {}, 'tags': {}, 'length': {}}
        self.model = KModes(self.distance, self.get_frequency, self.sample_frequency, self.get_centroid, self.k)
        
    def individual_distance(self, a, b):
        a_lst = a.strip('\"').split(', ')
        b_lst = b.strip('\"').split(', ')

        total_length = max(len(a_lst), len(b_lst))
        difference = total_length

        for ele in a_lst:
            if ele in b_lst:
                difference -= 1

        return 1.0 * difference/total_length


    def distance(self, book_1, book_2):
        distance = 0.0

        distance += self.individual_distance(book_1[0], book_2[0])
        distance += self.individual_distance(book_1[2], book_2[2])

        if book_1[1] != 'none' and book_1[1] == book_2[1]:
            distance += 0.0
        else:
            distance += 1.0

        return distance

    def get_frequency(self, book, cluster):
        authors = book[0].strip('\"').split(', ')

        for author in authors:
            if author in cluster['author']:
                cluster['author'][author] += 1
            else:
                cluster['author'][author] = 1

        if book[1] in cluster['series']:
            cluster['series'][book[1]] += 1
        else:
            cluster['series'][book[1]] = 1

        tags = book[2].strip('\"').split(', ')

        for tag in tags:
            if tag in cluster['tags']:
                cluster['tags'][tag] += 1
        else:
            cluster['tags'][tag] = 1

        length = len(tags)

        if length in cluster['length']:
            cluster['length'][length] += 1
        else:
            cluster['length'][length] = 1

        return cluster

    def get_centroid(self, cluster):
        new_centroid = []

        new_centroid.append(max(cluster['author'], key = cluster['author'].get))
        new_centroid.append(max(cluster['series'], key = cluster['series'].get))

        tag_len = max(cluster['length'], key = cluster['length'].get)

        tags_sorted = {k: v for k, v in sorted(cluster['tags'].items(), key=lambda item: item[1])}
        tags = [x[0] for x in islice(tags_sorted.items(), 0, tag_len)]

        new_centroid += tags

        return new_centroid

    def fit(self, data, reviews):
        clustering = data[['Author', 'Series', 'Tags']]
        classification = data[['Raters', 'Reviewers', 'Pages', 'PublishYear']].values.tolist()
        review_list = reviews.values.tolist()
        
        self.model.fit(clustering)
        
        labels = self.model.get_labels()
        
        self.clusters = [[] for x in range(self.k)]
        self.cluster_labels = [[] for x in range(self.k)]
        
        for i in range(len(classification)):
            self.clusters[labels[i]].append(classification[i])
            self.cluster_labels[labels[i]].append(review_list[i])
            
    def train_classifiers(self, classifier):
        classifiers = [deepcopy(classifier) for x in range(self.k)]
        
        for i in range(self.k):
            classifiers[i].fit(self.clusters[i], self.cluster_labels[i])   
            
        return classifiers
            
    def predict(self, test, classifier):
        classifiers = self.train_classifiers(classifier)
        
        clustering = test[['Author', 'Series', 'Tags']]
        classification = test[['Raters', 'Reviewers', 'Pages', 'PublishYear']].values.tolist()
        
        labels = self.model.predict(clustering)
        
        result = [0 for x in range(len(classification))]
        
        for i in range(len(classification)):
            result[i] = classifiers[labels[i]].predict([classification[i]])[0]
        
        return result

In [75]:
def accuracy(predict, labels):
    correct = 0
    
    for i in range(len(labels)):
        if predict[i] == labels.values.tolist()[i]:
            correct += 1
            
    return correct/len(labels)
    
def confusion_matrix(k, predict, labels):
    result_matrix = np.zeros((k, k), dtype='int32')

    for i in range(len(labels)):
        result_matrix[labels.values.tolist()[i]][predict[i]] += 1
        
    return result_matrix


In [76]:
data = pd.read_csv('data/complete_data.csv')

data.tail(10)

Unnamed: 0,BookID,Title,Author,Rate,Raters,Reviewers,Pages,PublishYear,GenreLink,Series,review,Tags
97594,299919,How to Learn a Foreign Language,Graham E. Fuller,3.35,109,18,102.0,1987,/work/shelves/290993,none,1,"to-read, language, non-fiction, currently-read..."
97595,299940,It Seemed Important at the Time: A Romance Memoir,Gloria Vanderbilt,3.25,890,78,161.0,2004,/work/shelves/291014,none,1,"to-read, currently-reading, non-fiction, memoi..."
97596,299941,Once Upon a Time,Gloria Vanderbilt,3.64,240,20,352.0,1985,/work/shelves/291015,none,1,"to-read, memoir, biography, currently-reading,..."
97597,299942,A Mother's Story,Gloria Vanderbilt,3.67,247,33,160.0,1996,/work/shelves/291016,none,1,"to-read, non-fiction, memoir, currently-readin..."
97598,299934,"The Short and Bloody History of Knights, Spies...",John Farman,3.5,101,18,287.0,2003,/work/shelves/291008,none,1,"to-read, own, non-fiction, history, currently-..."
97599,299947,"X-Men: God Loves, Man Kills","Chris Claremont, Brent Anderson",4.15,18149,442,64.0,1982,/work/shelves/1410200,Marvel Graphic Novel,2,"to-read, comics, graphic-novels, currently-rea..."
97600,299948,"X-Treme X-Men, Vol. 5: God Loves, Man Kills","Chris Claremont, Igor Kordey, Salvador Larroca...",3.21,148,8,216.0,2003,/work/shelves/18626296,X-Treme X-Men (2001) (Collected Editions),1,"to-read, comics, x-men, graphic-novels, marvel..."
97601,299961,Who Moved the Stone?,"Frank Morison, Lee Strobel",3.92,623,72,193.0,1930,/work/shelves/291035,none,2,"to-read, currently-reading, christian, apologe..."
97602,299960,The Earth Moved: On the Remarkable Achievement...,Amy Stewart,3.89,2064,369,240.0,2004,/work/shelves/291034,none,2,"to-read, science, non-fiction, currently-readi..."
97603,299980,Silent Warfare: Understanding the World of Int...,"Abram N. Shulsky, Gary J. Schmitt",3.64,211,9,262.0,1991,/work/shelves/291054,none,1,"to-read, currently-reading, intelligence, non-..."


In [77]:
info = data[['Author', 'Raters', 'Reviewers', 'Pages', 'PublishYear', 'Series', 'Tags']]
labels = data['review']

info.tail()

Unnamed: 0,Author,Raters,Reviewers,Pages,PublishYear,Series,Tags
97599,"Chris Claremont, Brent Anderson",18149,442,64.0,1982,Marvel Graphic Novel,"to-read, comics, graphic-novels, currently-rea..."
97600,"Chris Claremont, Igor Kordey, Salvador Larroca...",148,8,216.0,2003,X-Treme X-Men (2001) (Collected Editions),"to-read, comics, x-men, graphic-novels, marvel..."
97601,"Frank Morison, Lee Strobel",623,72,193.0,1930,none,"to-read, currently-reading, christian, apologe..."
97602,Amy Stewart,2064,369,240.0,2004,none,"to-read, science, non-fiction, currently-readi..."
97603,"Abram N. Shulsky, Gary J. Schmitt",211,9,262.0,1991,none,"to-read, currently-reading, intelligence, non-..."


In [78]:
labels.tail()

97599    2
97600    1
97601    2
97602    2
97603    1
Name: review, dtype: int64

In [79]:
info_train, info_test, labels_train, labels_test = train_test_split(info, labels, train_size = 0.8, test_size = 0.2, random_state=42)

print(len(info_train), len(info_test))

78083 19521


In [80]:
book_classifier = BookClassifier(3)
book_classifier.fit(info_train, labels_train)

In [81]:
knn_classifier = KNeighborsClassifier(n_neighbors = 11)

main_predict = book_classifier.predict(info_test, knn_classifier)

In [82]:
knn_classifier.fit(info_train[['Raters', 'Reviewers', 'Pages', 'PublishYear']], labels_train)

knn_predict = knn_classifier.predict(info_test[['Raters', 'Reviewers', 'Pages', 'PublishYear']])

In [83]:
accuracy(main_predict, labels_test)

0.6721991701244814

In [84]:
accuracy(knn_predict, labels_test)

0.6707135904922904

In [85]:
confusion_matrix(3, main_predict, labels_test)

array([[    0,   170,   104],
       [    6,  2373,  3470],
       [    0,  2649, 10749]])

In [86]:
confusion_matrix(3, knn_predict, labels_test)

array([[    1,   125,   148],
       [    1,  1482,  4366],
       [    2,  1786, 11610]])

In [87]:
knn_classifier_2 = KNeighborsClassifier(n_neighbors = 37)

main_predict_2 = book_classifier.predict(info_test, knn_classifier_2)

In [88]:
knn_classifier_2.fit(info_train[['Raters', 'Reviewers', 'Pages', 'PublishYear']], labels_train)

knn_predict_2 = knn_classifier_2.predict(info_test[['Raters', 'Reviewers', 'Pages', 'PublishYear']])

In [89]:
print(accuracy(main_predict_2, labels_test), accuracy(knn_predict_2, labels_test))

0.6891040418011373 0.6873111008657343


In [90]:
print(confusion_matrix(3, main_predict_2, labels_test))
print(confusion_matrix(3, knn_predict_2, labels_test))

[[    0   172   102]
 [    0  2100  3749]
 [    0  2046 11352]]
[[    0   109   165]
 [    0   975  4874]
 [    0   956 12442]]


In [91]:
nb_classifier = GaussianNB()
main_predict_3 = book_classifier.predict(info_test, nb_classifier)

In [92]:
nb_classifier.fit(info_train[['Raters', 'Reviewers', 'Pages', 'PublishYear']], labels_train)
nb_predict = nb_classifier.predict(info_test[['Raters', 'Reviewers', 'Pages', 'PublishYear']])

In [93]:
print(accuracy(main_predict_3, labels_test), accuracy(nb_predict, labels_test))

0.2909687003739563 0.1564981302187388


In [94]:
print(confusion_matrix(3, main_predict_3, labels_test))
print(confusion_matrix(3, nb_predict, labels_test))

[[ 137  126   11]
 [2501 2823  525]
 [4560 6118 2720]]
[[ 241   26    7]
 [4492 1019  338]
 [8279 3324 1795]]


In [95]:
dt_classifier = DecisionTreeClassifier(random_state = 42, max_depth = 4)
main_predict_4 = book_classifier.predict(info_test, dt_classifier)

In [96]:
dt_classifier.fit(info_train[['Raters', 'Reviewers', 'Pages', 'PublishYear']], labels_train)
dt_predict = dt_classifier.predict(info_test[['Raters', 'Reviewers', 'Pages', 'PublishYear']])

In [97]:
print(accuracy(main_predict_4, labels_test), accuracy(dt_predict, labels_test))

0.6915629322268326 0.6862353363044926


In [98]:
print(confusion_matrix(3, main_predict_4, labels_test))
print(confusion_matrix(3, dt_predict, labels_test))

[[    0   126   148]
 [    0  1423  4426]
 [    0  1321 12077]]
[[    0     0   274]
 [    0     0  5849]
 [    0     2 13396]]


In [99]:
book_classifier_2 = BookClassifier(30)
book_classifier_2.fit(info_train, labels_train)

In [100]:
main_predict_2_1 = book_classifier_2.predict(info_test, knn_classifier)

In [101]:
print(accuracy(main_predict_2_1, labels_test), accuracy(knn_predict, labels_test))

0.685723067465806 0.6707135904922904


In [102]:
print(confusion_matrix(3, main_predict_2_1, labels_test))
print(confusion_matrix(3, knn_predict, labels_test))

[[    0   122   152]
 [    5  1897  3947]
 [    3  1906 11489]]
[[    1   125   148]
 [    1  1482  4366]
 [    2  1786 11610]]


In [103]:
main_predict_2_2 = book_classifier_2.predict(info_test, knn_classifier_2)

In [105]:
print(accuracy(main_predict_2_2, labels_test), accuracy(knn_predict_2, labels_test))

0.7045745607294708 0.6873111008657343


In [106]:
print(confusion_matrix(3, main_predict_2_1, labels_test))
print(confusion_matrix(3, knn_predict, labels_test))

[[    0   122   152]
 [    5  1897  3947]
 [    3  1906 11489]]
[[    1   125   148]
 [    1  1482  4366]
 [    2  1786 11610]]


In [107]:
main_predict_2_3 = book_classifier_2.predict(info_test, nb_classifier)

In [108]:
print(accuracy(main_predict_2_3, labels_test), accuracy(nb_predict, labels_test))

0.35305568362276524 0.1564981302187388


In [109]:
print(confusion_matrix(3, main_predict_2_3, labels_test))
print(confusion_matrix(3, nb_predict, labels_test))

[[ 121  135   18]
 [2219 2663  967]
 [4399 4891 4108]]
[[ 241   26    7]
 [4492 1019  338]
 [8279 3324 1795]]


In [110]:
main_predict_2_4 = book_classifier_2.predict(info_test, dt_classifier)

In [111]:
print(accuracy(main_predict_2_4, labels_test), accuracy(dt_predict, labels_test))

0.6996055529942113 0.6862353363044926


In [112]:
print(confusion_matrix(3, main_predict_2_4, labels_test))
print(confusion_matrix(3, dt_predict, labels_test))

[[    0   108   166]
 [    2  1487  4360]
 [    1  1227 12170]]
[[    0     0   274]
 [    0     0  5849]
 [    0     2 13396]]
