# Getting Started

In [1]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [None]:
%cd '/content/drive/MyDrive/ML Project - Books'

In [1]:
import random
import numpy as np
import pandas as pd
from copy import deepcopy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.random import sample_without_replacement
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# Get Data

In [53]:
books = pd.read_csv('goodreads/books.csv')
genre = pd.read_csv('goodreads/genre.csv')

print(books.shape)
print(genre.shape)

(447991, 16)
(12019830, 3)


In [54]:
books.head()

Unnamed: 0,BookID,Title,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,GenreLink,Series,PublishYear
0,3,Harry Potter and the Sorcerer's Stone,"J.K. Rowling, Mary GrandPré",4.48,4868903,1704361,631378,148016,127122,118557,309,Hardcover,English,4640799,Harry Potter,1997
1,1,Harry Potter and the Half-Blood Prince,J.K. Rowling,4.57,1738165,611768,175048,29134,13230,41832,652,Paperback,English,41335427,Harry Potter,2005
2,7,The Harry Potter Collection,"J.K. Rowling, Mary GrandPré",4.73,25140,4473,1111,230,282,921,318,Paperback,English,21457570,Harry Potter,2005
3,10,Harry Potter Collection,J.K. Rowling,4.73,25140,4473,1111,230,282,921,3342,Hardcover,English,21457570,Harry Potter,2005
4,5,Harry Potter and the Prisoner of Azkaban,"J.K. Rowling, Mary GrandPré",4.57,2028235,706012,215277,29265,14292,58998,435,Mass Market Paperback,English,2402163,Harry Potter,1999


In [55]:
genre.head()

Unnamed: 0,GenreLink,Genre,NumberOfPeople
0,17243229,fix,1
1,1888943,unfinished,11
2,940892,borrowed,21
3,4417,het-contemporary-romance-books,1
4,67763,to-read,1820


# Preprocess Data

In [56]:
def get_genre_df(books, genre):
    links = books.GenreLink.unique().tolist()

    return genre[genre.GenreLink.isin(links)]

def get_genre(link, genre):
    return genre[genre.GenreLink == link]

In [57]:
book_data = books[['BookID', 'Author', 'Rating', '5 stars',	'4 stars', '3 stars', '2 stars', '1 star', 'Number of reviewers', 'Pages', 'Book format', 'Language', 'PublishYear', 'GenreLink', 'Series']]
book_data.shape

(447991, 15)

In [58]:
scaler = MinMaxScaler()
book_data[['Rating', '5 stars',	'4 stars', '3 stars', '2 stars', '1 star', 'Number of reviewers', 'Pages', 'PublishYear']] = scaler.fit_transform(book_data[['Rating', '5 stars',	'4 stars', '3 stars', '2 stars', '1 star', 'Number of reviewers', 'Pages', 'PublishYear']])

book_data.head()

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
0,3,"J.K. Rowling, Mary GrandPré",0.884735,0.988587,0.988436,0.592227,0.261098,0.224135,0.98903,0.008351,Hardcover,English,0.767988,4640799,Harry Potter
1,1,J.K. Rowling,0.912773,0.352918,0.354791,0.164192,0.051392,0.02333,0.348972,0.017622,Paperback,English,0.771066,41335427,Harry Potter
2,7,"J.K. Rowling, Mary GrandPré",0.962617,0.005104,0.002592,0.00104,0.000406,0.000501,0.007683,0.008595,Paperback,English,0.771066,21457570,Harry Potter
3,10,J.K. Rowling,0.962617,0.005104,0.002592,0.00104,0.000406,0.000501,0.007683,0.090324,Hardcover,English,0.771066,21457570,Harry Potter
4,5,"J.K. Rowling, Mary GrandPré",0.912773,0.411815,0.409447,0.201927,0.051623,0.025202,0.492175,0.011757,Mass Market Paperback,English,0.768757,2402163,Harry Potter


# Calculate Distance

In [59]:
def jaccard_dissimalirity(a_lst, b_lst):
    intersection = list(set(a_lst) & set(b_lst))
    union = list(set(a_lst) | set(b_lst))

    jaccard_sim = (1.0 * len(intersection))/len(union)

    return abs(1 - jaccard_sim)

def hamming_distance(a, b):
    if a == b:
        return 0
    
    return 1

def euclidian_distance(diff):
    diff2 = diff**2
    diff2_sum = diff2.sum(axis=1)
    
    return np.sqrt(np.float32(diff2_sum))

def cosine_dissimilarity(v_a, v_b):
    product = np.sum(v_a * v_b)
    length_product = np.sqrt(np.float32((v_a**2).sum())) * np.sqrt(np.float32((v_b**2).sum()))

    similarity = product/length_product
    return abs(1 - similarity)

In [60]:
def book_difference1(books1, book2, genre1, genre2):
    numeric_cols = ['Rating', '5 stars', '4 stars', '3 stars', '2 stars', '1 star', 'Number of reviewers', 'Pages', 'PublishYear']
    rows = len(books1)

    numeric = books1[numeric_cols].to_numpy() - book2[numeric_cols].to_numpy()
    
    authors = book2.Author.strip('\"').split(', ')
    author_diff = np.zeros((rows, 1))
    for i in range(rows):
        book_author = books1.iloc[i].Author.strip('\"').split(', ')
        author_diff[i][0] = jaccard_dissimalirity(book_author, authors)

    lang_diff = (books1.Language != book2.Language).astype(int).to_numpy().reshape((rows, 1))
    format_diff = (books1['Book format'] != book2['Book format']).astype(int).to_numpy().reshape((rows, 1))

    series_diff = np.ones((rows, 1))
    if book2.Series != "none":
        series_diff = (books1.Series != book2.Series).astype(int).to_numpy().reshape((rows, 1))

    tags = genre2.Genre.tolist()
    tag_diff = np.zeros((rows,1))
    for i in range(rows):
        book_tags = genre1[genre1.GenreLink == books1.iloc[i].GenreLink].Genre.tolist()
        tag_diff[i][0] = jaccard_dissimalirity(book_tags, tags)

    diff = np.append(numeric, author_diff, axis=1)
    diff = np.append(diff, lang_diff, axis=1)
    diff = np.append(diff, format_diff, axis=1)
    diff = np.append(diff, series_diff, axis=1)
    diff = np.append(diff, tag_diff, axis=1)

    return euclidian_distance(diff)

# Recommendation System

In [61]:
class BookRecommender:
    def __init__(self, distance):
        self.distance = distance

    def set_dataset(self, books, genre):
        self.books = books
        self.genre = genre

    def recommend(self, client, cli_genre):
        client_size = len(client)
        rec = np.zeros((len(self.books), client_size)) 
        
        for i in range(client_size):
            rec[:, i] = self.distance(self.books, client.iloc[i], self.genre, cli_genre)

        return rec 

    def get_recommendations(self, client, cli_genre, ind, k):
        rec = self.recommend(client, cli_genre)[:, ind]
        rec_sorted = np.argsort(rec)

        rec_ind = rec_sorted[:k]

        return self.books.iloc[rec_ind]

In [62]:
sample_ind = sample_without_replacement(len(books), 200)
sample_data = book_data.iloc[sample_ind]

sample_data

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
79903,199903,Stanley Kiesel,0.809969,0.000018,0.000024,0.000020,0.000023,0.000005,0.000200,0.005595,Paperback,English,0.761447,193390,
146874,460081,"Mary Pope Osborne, Jutta Knipping",0.710280,0.001275,0.002754,0.004237,0.001875,0.000589,0.005514,0.002378,Paperback,German,0.766064,448620,Magic Tree House
321664,1241168,Hermann Hesse,0.691589,0.000314,0.001418,0.001579,0.000520,0.000065,0.002661,0.003757,Paperback,English,0.732205,2162171,
319558,1230076,Kim Barnes,0.697819,0.000029,0.000123,0.000117,0.000053,0.000018,0.000651,0.006946,Hardcover,English,0.766064,987320,
293604,1109181,Arthur Miller,0.607477,0.014744,0.065150,0.096518,0.063623,0.020100,0.066521,0.002865,Paperback,English,0.751058,1426723,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241440,891463,"Gertrude Chandler Warner, Aimee Lilly",0.716511,0.000032,0.000063,0.000095,0.000048,0.000019,0.000083,0.008162,Audio CD,English,0.769142,3039792,The Boxcar Children
171722,569139,Dore Gold,0.691589,0.000013,0.000053,0.000054,0.000018,0.000014,0.000259,0.006919,Hardcover,English,0.770296,556194,
447286,1995370,James Herriot,0.909657,0.000213,0.000228,0.000104,0.000023,0.000019,0.000818,0.013622,Hardcover,English,0.762601,1203500,
328753,1276825,Eric Jerome Dickey,0.757009,0.000255,0.000619,0.000657,0.000238,0.000039,0.000776,0.010811,Paperback,English,0.769142,1265801,


In [63]:
cli_ind = sample_without_replacement(len(books), 20)
cli_books = book_data.iloc[cli_ind]

cli_books

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
429047,1880973,Harriet Beecher Stowe,0.697819,0.013773,0.03995,0.045952,0.02549,0.008974,0.068365,0.013162,Hardcover,English,0.711812,2478635,
300874,1140384,Michael Meyer,0.682243,2.8e-05,8.1e-05,7.8e-05,5.3e-05,4.2e-05,0.000267,0.007865,Paperback,English,0.710273,1127653,
55939,124415,"Nathalie Sarraute, Maria Jolas",0.635514,5.5e-05,0.000209,0.000287,0.00018,6.3e-05,0.000884,0.001946,Paperback,English,0.745671,745430,
390287,1629471,"John Grisham, Bernhard Liesen, Bea Reiter, Imk...",0.685358,0.004156,0.014647,0.014998,0.008504,0.00354,0.043071,0.012514,Paperback,German,0.771451,3202981,
15388,28570,Jeffery Deaver,0.738318,0.000512,0.001982,0.001685,0.000466,8.8e-05,0.003153,0.012189,Paperback,English,0.768757,2166664,
441972,1958653,"Jane LaFerla, Veronika Alice Gunter",0.866044,2.3e-05,3.3e-05,2.2e-05,4e-06,5e-06,0.000125,0.00627,Paperback,English,0.770681,742735,
291318,1099559,Kathy Reichs,0.700935,0.001572,0.00593,0.006341,0.00245,0.000702,0.009043,0.009486,Hardcover,English,0.771066,2747783,Temperance Brennan
276049,1028303,"Jacky Gunn, Jim Jenkins",0.785047,2e-05,4.7e-05,2.6e-05,2.5e-05,7e-06,0.000167,0.007892,Paperback,English,0.766064,1014556,
272581,11767,Don DeLillo,0.501558,0.000258,0.001498,0.003223,0.00296,0.000934,0.006749,0.003459,Paperback,English,0.769527,1304282,
146165,456919,"Audrey Couloumbis, Gino D'Achille",0.688474,9.4e-05,0.000363,0.000372,0.000189,5.8e-05,0.001944,0.008216,Hardcover,English,0.771066,2411387,Maude March Misadventures


In [64]:
sample_gen = get_genre_df(sample_data, genre)
cli_gen = get_genre_df(cli_books, genre)

In [65]:
recommender = BookRecommender(book_difference1)
recommender.set_dataset(sample_data, sample_gen)

recommender.recommend(cli_books, cli_gen)

array([[1.72198951, 1.39993536, 1.40412903, ..., 1.99176764, 1.42588341,
        1.98583114],
       [2.20953918, 1.96849608, 1.96902132, ..., 2.20791936, 1.98032999,
        2.20834494],
       [1.70615804, 1.37926543, 1.38025844, ..., 1.97603166, 1.39193273,
        1.97791398],
       ...,
       [1.40049434, 1.72025084, 1.7263695 , ..., 1.99332678, 1.74934781,
        1.98082316],
       [1.71515203, 1.39102447, 1.39329636, ..., 1.98440075, 1.41254818,
        1.98190987],
       [1.69164371, 1.96529794, 1.9673183 , ..., 1.96636319, 1.98131049,
        1.96216869]])

In [66]:
cli_books.iloc[15]

BookID                         6240
Author                 David Malouf
Rating                     0.697819
5 stars                 0.000126089
4 stars                 0.000447138
3 stars                 0.000405213
2 stars                 0.000248722
1 star                  8.63929e-05
Number of reviewers      0.00175187
Pages                    0.00421622
Book format               Paperback
Language                    English
PublishYear                0.760677
GenreLink                   1442072
Series                         None
Name: 3383, dtype: object

In [67]:
recommender.get_recommendations(cli_books, cli_gen, 15, 3)

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
114464,325043,Charles Dickens,0.71028,0.002672,0.008749,0.00899,0.003923,0.001396,0.013456,0.01427,Paperback,English,0.707195,4993095,
243784,899399,Joan Didion,0.682243,0.000139,0.000552,0.000642,0.000277,6.5e-05,0.001977,0.006,Paperback,English,0.762986,1214244,
256032,946332,Ford Madox Ford,0.64486,0.001281,0.004493,0.005761,0.003877,0.001544,0.01374,0.00627,Paperback,English,0.736437,1881188,


# Recommendation System with Clustering

In [69]:
def clustering_distance(book1, book2, gen1, gen2):
    distance = 0.0

    authors1 = book1.Author.strip('\"').split(', ')
    authors2 = book2.Author.strip('\"').split(', ')
    distance += jaccard_dissimalirity(authors1, authors2)

    if book2.Series != "None" and book2.Series == book1.Series:
        distance += 1

    tags1 = gen1.Genre.tolist()
    tags2 = gen2.Genre.tolist()
    distance += jaccard_dissimalirity(tags1, tags2)

    return distance

def get_frequency(book, genre, cluster):
    cluster['info'] = cluster['info'].append(book[['Author', 'Series']])
    cluster['tags'] = cluster['tags'].append(genre[['GenreLink', 'Genre']])

    return cluster

def get_mode(cluster):
    new_centroid = pd.Series()

    new_centroid['Author'] = cluster['info'].Author.mode()[0]
    new_centroid['Series'] = cluster['info'].Series.mode()[0]

    length = cluster['tags'].groupby('GenreLink').size().mode()[0]
    sorted_tags = cluster['tags'].groupby('Genre', as_index=False).size().sort_values(by='size', ascending=False)
    new_genre = sorted_tags.iloc[:length]

    return (new_centroid, new_genre)

sample_frequency = {'info': pd.DataFrame(columns=['Author', 'Series']), 'tags': pd.DataFrame(columns=['GenreLink', 'Genre'])}

In [100]:
class KMedoids:
    def __init__(self, distance, frequency_calculator, sample_frequency, get_medoid, k = 3):
        self.k = k

        self.distance = distance
        self.frequency_calculator = frequency_calculator
        self.sample_frequency = sample_frequency
        self.get_medoid = get_medoid


    def get_genre(self, row, medoid=False):
        if medoid:
            return self.medoid_genre[self.medoid_genre.GenreLink == row.GenreLink]

        return self.corpus[self.corpus.GenreLink == row.GenreLink]
        

    def get_genre_df(self, data):
        links = data.GenreLink.unique().tolist()

        return self.corpus[self.corpus.GenreLink.isin(links)]

        
    def fit(self, data, tags, epoch=1000):
        self.dataset = data
        self.corpus = tags

        self.rows = len(self.dataset)
        
        # Step 1: Get medoids
        medoid_inds = []

        for i in range(self.k):
            medoid_ind = random.choice(range(self.rows))

            while medoid_ind in medoid_inds:
                medoid_ind = random.choice(range(self.rows))

            medoid_inds.append(medoid_ind)

        self.medoids = self.dataset.iloc[medoid_inds].reset_index(drop=True)
        self.medoid_genre = self.get_genre_df(self.medoids)
        print("Step 1 finished")
        #print(self.medoids)

        error = np.ones(self.k)
        self.dataset['label'] = -1

        count = 1
        while (error.any() != 0) and (count < epoch):
            # Step 2: Distance
            # Cluster labels for each point
            labels = np.array([-1 for x in range(self.rows)])

            # Distances to each medoid
            distances = np.zeros(self.k)

            # Frequency for mode
            frequency = [deepcopy(self.sample_frequency) for x in range(self.k)]

            # Calculate distance to each medoid
            for i in range(self.rows):
                cluster = -1
                row = self.dataset.iloc[i]
                gen = self.get_genre(row)

                for j in range(self.k):
                    medoid = self.medoids.iloc[j]
                    medoid_genre = self.get_genre(medoid, True)
                    distances[j] = self.distance(row, medoid, gen, medoid_genre)
                
                #print(distances)
                cluster = np.argmin(distances)
                    
                labels[i] = cluster
                frequency[cluster] = self.frequency_calculator(row, gen, frequency[cluster])
                
            self.dataset['label'] = labels
            print("Step 2." + str(count) + " finished.")
            #print(self.medoids)

            # Step 3: Update medoids
            medoid_ids = []
            for i in range(self.k):
                new_medoid_id = -1
                
                if error[i] > 0:
                    proposed_medoid, proposed_genre = self.get_medoid(frequency[i])
                    new_medoid = proposed_medoid
                    new_medoid_genre = proposed_genre
                    best_distance = 1000

                    #print("Start step 3")
                    labelset = self.dataset[self.dataset.label == i].reset_index(drop=True)

                    for j in range(len(labelset)):
                        row = labelset.iloc[j]
                        gen = self.get_genre(row)

                        current_dist = self.distance(proposed_medoid, row, proposed_genre, gen)
                        
                        if current_dist < best_distance:
                            new_medoid_id = row.BookID
                            new_medoid = row
                            new_medoid_genre = gen
                            best_distance = current_dist
                else:
                    new_medoid_id = self.medoids.iloc[i].BookID
                            
                medoid_ids.append(new_medoid_id) 
                error[i] = self.distance(new_medoid, self.medoids.iloc[i], new_medoid_genre, self.get_genre(self.medoids.iloc[i], True))

            #print(medoid_ids)
            print(error)
            self.medoids = self.dataset[self.dataset.BookID.isin(medoid_ids)].reset_index(drop=True)
            self.medoid_genre = self.get_genre_df(self.medoids)

            print("Step 3." + str(count) + " finished.")
            #print("Iteration: " + str(count))
            #print(self.medoids)

            count += 1


    def inertia(self):
        total_distance = 0.0

        for i in range(self.k):
            medoid = self.medoids.iloc[i]
            medoid_genre = self.get_genre(medoid, True)
            labelset = self.dataset[self.dataset.label == i].reset_index(drop=True)

            for ind in range(len(labelset)):
                row = labelset.iloc[ind]
                total_distance += self.distance(medoid, row, medoid_genre, self.get_genre(row))

        return total_distance

    
    def predict(self, data, genre):
        result = np.array([-1 for x in range(len(data))])
        
        for i in range(len(data)):
            distances = np.zeros(self.k, dtype=int)
            row = data.iloc[i]
            gen = genre[genre.GenreLink == row.GenreLink]

            print(self.medoid)
            for j in range(self.k):
                medoid = self.medoids.iloc[j]
                distances[j] = self.distance(row, medoid, gen, self.get_genre(medoid, True))
            print(distances)

            result[i] = np.argmin(distances)
        
        return result

    
    def get_labels(self):
        return self.dataset.label

    
    def get_medoids(self):
        return self.medoids


    def get_medoid_genre(self):
        return self.medoid_genre

In [94]:
sample_ind2 = sample_without_replacement(len(books), 1000)
sample_data2 = book_data.iloc[sample_ind2]

sample_data2

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
60349,137051,Robert Anton Wilson,0.666667,0.000069,0.000249,0.000341,0.000175,0.000032,0.000384,0.010811,Paperback,German,0.768372,351935,
176469,588942,Kim Harrison,0.819315,0.005343,0.011233,0.007323,0.002233,0.000779,0.011562,0.013784,Paperback,English,0.771451,2346717,The Hollows
267392,222078,E.H. Gombrich,0.716511,0.030463,0.044865,0.053138,0.040296,0.040058,0.008200,0.018595,Hardcover,English,0.749904,215065,
118977,343553,Barbara Ann Brennan,0.806854,0.001276,0.001984,0.001687,0.000831,0.000464,0.001235,0.007838,Paperback,English,0.764140,2126028,
275051,1024396,James C. Scott,0.800623,0.000311,0.000763,0.000466,0.000180,0.000071,0.002978,0.012054,Hardcover,English,0.768372,21381,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123644,362991,Stewart O'Nan,0.685358,0.000055,0.000271,0.000244,0.000115,0.000041,0.000818,0.006838,Paperback,German,0.767988,2941863,
326919,1266557,Agatha Christie,0.688474,0.000958,0.003712,0.004483,0.001716,0.000316,0.009794,0.007784,Hardcover,English,0.747595,6886769,
28163,57126,"C.J. Cherryh, Leslie Fish, Nancy Asire, Merced...",0.613707,0.000008,0.000021,0.000054,0.000021,0.000016,0.000025,0.021865,Hardcover,English,0.767218,55654,The Sword of Knowledge
283270,54283,Judith Kerr,0.735202,0.001015,0.003436,0.002998,0.001159,0.000323,0.009936,0.005162,Paperback,English,0.757984,1877851,Out of the Hitler Time


In [82]:
cli_ind2 = sample_without_replacement(len(books), 20)
cli_books2 = book_data.iloc[cli_ind2]

cli_books2.head()

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
394441,1659763,Bob Spitz,0.738318,0.001019,0.002446,0.001744,0.001214,0.001003,0.004271,0.026811,Hardcover,English,0.771066,1654509,
375905,1530106,Isaac Bashevis Singer,0.688474,1.7e-05,6e-05,7e-05,4.1e-05,9e-06,0.000142,0.0,Paperback,Spanish,0.766064,2267466,
12616,22680,Joseph J. Ellis,0.707165,0.001861,0.005465,0.004995,0.002803,0.00186,0.005648,0.011892,Paperback,English,0.767988,1704815,
272044,1011455,Cathy Hopkins,0.657321,2.2e-05,7.2e-05,0.000125,6.4e-05,9e-06,0.000117,0.007919,Hardcover,English,0.771066,997579,"Truth, Dare, Kiss, Promise"
125102,368189,John Dominic Crossan,0.682243,1.9e-05,7.2e-05,6.9e-05,4.4e-05,1.9e-05,0.000175,0.018595,Paperback,English,0.768372,74355,


In [95]:
sample_gen2 = get_genre_df(sample_data2, genre)
cli_gen2 = get_genre_df(cli_books2, genre)

In [102]:
model = KMedoids(clustering_distance, get_frequency, sample_frequency, get_mode, 15)
model.fit(sample_data2, sample_gen2, 10)

model.inertia()

Step 1 finished
Step 2.1 finished.
[1.64900662 1.87116564 1.76582278 0.         1.75       0.85625
 1.91156463 1.75316456 1.64864865 1.69281046 1.86627907 1.18018018
 1.60839161 1.82307692 1.5037037 ]
Step 3.1 finished.
Step 2.2 finished.
[0.         1.         0.         1.94871795 1.         0.
 0.         0.88484848 1.80357143 1.71518987 1.         0.
 0.70469799 0.         0.        ]
Step 3.2 finished.
Step 2.3 finished.
[1.85802469 1.65100671 1.84705882 0.         1.         1.90163934
 1.81871345 0.         1.         0.27118644 1.         1.86931818
 0.         1.90909091 1.91608392]
Step 3.3 finished.
Step 2.4 finished.
[1.         0.         0.79605263 1.95833333 1.         1.
 1.         1.91935484 0.         1.         1.         0.
 1.93055556 0.         0.        ]
Step 3.4 finished.
Step 2.5 finished.
[1.         1.83229814 0.         1.         1.         1.
 0.17117117 0.         1.94444444 1.         0.         1.9
 0.         1.90909091 1.91608392]
Step 3.5 finished.

1734.844025661816

In [103]:
model.predict(cli_books2, cli_gen2)

AttributeError: 'KMedoids' object has no attribute 'medoid'

In [None]:
a = sample_gen2[sample_gen2.GenreLink == 1115313].Genre.values.tolist()
b = sample_gen2[sample_gen2.GenreLink == 2064030].Genre.values.tolist()
jaccard_dissimalirity(a, b)

In [None]:
count = 0
for link1 in sample_data2.GenreLink.tolist():
    for link2 in sample_data2.GenreLink.tolist():
        a = sample_gen2[sample_gen2.GenreLink == link1].Genre.values.tolist()
        b = sample_gen2[sample_gen2.GenreLink == link2].Genre.values.tolist()
        dist = jaccard_dissimalirity(a, b)
        
        if dist <= 0:
            print(dist, link1, link2)
            count += 1
            
count

In [None]:
a1 = genre[genre.GenreLink == 2593467].Genre.unique().tolist()
print(len(a1))

In [None]:
b1 = genre[genre.GenreLink == 3295655].Genre.unique().tolist()
print(len(b1))

In [None]:
jaccard_dissimalirity(a1, b1)