# Getting Started

In [2]:
import random
import numpy as np
import pandas as pd
from copy import deepcopy

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.random import sample_without_replacement
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# Get Data

In [3]:
books = pd.read_csv('goodreads/books.csv')
genre = pd.read_csv('goodreads/genre.csv')

print(books.shape)
print(genre.shape)

(447991, 16)
(12019830, 3)


In [None]:
books.head()

Unnamed: 0,BookID,Title,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,GenreLink,Series,PublishYear
0,3,Harry Potter and the Sorcerer's Stone,"J.K. Rowling, Mary GrandPré",4.48,4868903,1704361,631378,148016,127122,118557,309,Hardcover,English,4640799,Harry Potter,1997
1,1,Harry Potter and the Half-Blood Prince,J.K. Rowling,4.57,1738165,611768,175048,29134,13230,41832,652,Paperback,English,41335427,Harry Potter,2005
2,7,The Harry Potter Collection,"J.K. Rowling, Mary GrandPré",4.73,25140,4473,1111,230,282,921,318,Paperback,English,21457570,Harry Potter,2005
3,10,Harry Potter Collection,J.K. Rowling,4.73,25140,4473,1111,230,282,921,3342,Hardcover,English,21457570,Harry Potter,2005
4,5,Harry Potter and the Prisoner of Azkaban,"J.K. Rowling, Mary GrandPré",4.57,2028235,706012,215277,29265,14292,58998,435,Mass Market Paperback,English,2402163,Harry Potter,1999


In [None]:
genre.head()

Unnamed: 0,GenreLink,Genre,NumberOfPeople
0,17243229,fix,1
1,1888943,unfinished,11
2,940892,borrowed,21
3,4417,het-contemporary-romance-books,1
4,67763,to-read,1820


# Preprocess Data

In [None]:
def get_genre_df(books, genre):
    links = books.GenreLink.unique().tolist()

    return genre[genre.GenreLink.isin(links)]

def get_genre(link, genre):
    return genre[genre.GenreLink == link]

In [None]:
book_data = books[['BookID', 'Author', 'Rating', '5 stars',	'4 stars', '3 stars', '2 stars', '1 star', 'Number of reviewers', 'Pages', 'Book format', 'Language', 'PublishYear', 'GenreLink', 'Series']]
book_data.shape

(447991, 15)

In [None]:
scaler = MinMaxScaler()
book_data[['Rating', '5 stars',	'4 stars', '3 stars', '2 stars', '1 star', 'Number of reviewers', 'Pages', 'PublishYear']] = scaler.fit_transform(book_data[['Rating', '5 stars',	'4 stars', '3 stars', '2 stars', '1 star', 'Number of reviewers', 'Pages', 'PublishYear']])

book_data.head()

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
0,3,"J.K. Rowling, Mary GrandPré",0.884735,0.988587,0.988436,0.592227,0.261098,0.224135,0.98903,0.008351,Hardcover,English,0.767988,4640799,Harry Potter
1,1,J.K. Rowling,0.912773,0.352918,0.354791,0.164192,0.051392,0.02333,0.348972,0.017622,Paperback,English,0.771066,41335427,Harry Potter
2,7,"J.K. Rowling, Mary GrandPré",0.962617,0.005104,0.002592,0.00104,0.000406,0.000501,0.007683,0.008595,Paperback,English,0.771066,21457570,Harry Potter
3,10,J.K. Rowling,0.962617,0.005104,0.002592,0.00104,0.000406,0.000501,0.007683,0.090324,Hardcover,English,0.771066,21457570,Harry Potter
4,5,"J.K. Rowling, Mary GrandPré",0.912773,0.411815,0.409447,0.201927,0.051623,0.025202,0.492175,0.011757,Mass Market Paperback,English,0.768757,2402163,Harry Potter


# Calculate Distance

In [None]:
def jaccard_dissimalirity(a_lst, b_lst):
    intersection = list(set(a_lst) & set(b_lst))
    union = list(set(a_lst) | set(b_lst))

    jaccard_sim = (1.0 * len(intersection))/len(union)

    return abs(1 - jaccard_sim)

def hamming_distance(a, b):
    if a == b:
        return 0
    
    return 1

def euclidian_distance(diff):
    diff2 = diff**2
    diff2_sum = diff2.sum(axis=1)
    
    return np.sqrt(np.float32(diff2_sum))

def cosine_dissimilarity(v_a, v_b):
    product = np.sum(v_a * v_b)
    length_product = np.sqrt(np.float32((v_a**2).sum())) * np.sqrt(np.float32((v_b**2).sum()))

    similarity = product/length_product
    return abs(1 - similarity)

In [None]:
def book_difference1(books1, book2, genre1, genre2):
    numeric_cols = ['Rating', '5 stars', '4 stars', '3 stars', '2 stars', '1 star', 'Number of reviewers', 'Pages', 'PublishYear']
    rows = len(books1)

    numeric = books1[numeric_cols].to_numpy() - book2[numeric_cols].to_numpy()
    
    authors = book2.Author.strip('\"').split(', ')
    author_diff = np.zeros((rows, 1))
    for i in range(rows):
        book_author = books1.iloc[i].Author.strip('\"').split(', ')
        author_diff[i][0] = jaccard_dissimalirity(book_author, authors)

    lang_diff = (books1.Language != book2.Language).astype(int).to_numpy().reshape((rows, 1))
    format_diff = (books1['Book format'] != book2['Book format']).astype(int).to_numpy().reshape((rows, 1))

    series_diff = np.ones((rows, 1))
    if book2.Series != "none":
        series_diff = (books1.Series != book2.Series).astype(int).to_numpy().reshape((rows, 1))

    tags = genre2.Genre.unique().tolist()
    tag_diff = np.zeros((rows,1))
    for i in range(rows):
        book_tags = genre1[genre1.GenreLink == books1.iloc[i].GenreLink].Genre.unique().tolist()
        tag_diff[i][0] = jaccard_dissimalirity(book_tags, tags)

    diff = np.append(numeric, author_diff, axis=1)
    diff = np.append(diff, lang_diff, axis=1)
    diff = np.append(diff, format_diff, axis=1)
    diff = np.append(diff, series_diff, axis=1)
    diff = np.append(diff, tag_diff, axis=1)

    return euclidian_distance(diff)

# Recommendation System

In [None]:
class BookRecommender:
    def __init__(self, distance):
        self.distance = distance

    def set_dataset(self, books, genre):
        self.books = books
        self.genre = genre

    def recommend(self, client, cli_genre):
        client_size = len(client)
        rec = np.zeros((len(self.books), client_size)) 
        
        for i in range(client_size):
            rec[:, i] = self.distance(self.books, client.iloc[i], self.genre, cli_genre)

        return rec 

    def get_recommendations(self, client, cli_genre, ind, k):
        rec = self.recommend(client, cli_genre)[:, ind]
        rec_sorted = np.argsort(rec)

        rec_ind = rec_sorted[:k]

        return self.books.iloc[rec_ind]

In [None]:
sample_ind = sample_without_replacement(len(books), 300)
sample_data = book_data.iloc[sample_ind]

sample_data

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
109834,305438,Terri Blackstock,0.778816,0.000766,0.001825,0.001491,0.000554,0.000160,0.003771,0.010811,Paperback,English,0.769912,296438,Cape Refuge
416670,1808223,Michael Moorcock,0.700935,0.001452,0.005185,0.005221,0.002498,0.000762,0.008643,0.004324,Paperback,English,0.758369,388812,The Elric Saga
774,1436,"William Shakespeare, Cyrus Hoy",0.744548,0.060419,0.148402,0.143160,0.072498,0.024647,0.113813,0.008027,Paperback,English,0.616391,1885548,
353672,1408837,"Michael Ende, Михаэль Энде",0.838006,0.006163,0.010624,0.006647,0.002595,0.000890,0.029006,0.008000,Paperback,Russian,0.758753,517524,
368954,1488575,"Kenneth R. Miller, Joseph S. Levine",0.710280,0.000013,0.000022,0.000027,0.000016,0.000023,0.000159,0.008000,Hardcover,English,0.766064,1479798,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353559,1408360,Samuel Butler,0.613707,0.000397,0.001651,0.002352,0.001549,0.000599,0.004038,0.008000,Paperback,English,0.731820,575504,
228106,842432,Charlton Griffin,0.750779,0.000039,0.000079,0.000090,0.000049,0.000012,0.000083,0.008162,Paperback,English,0.746826,17841741,
138318,422306,"Jacques Tardi, Elisabeth Bell",0.613707,0.000032,0.000155,0.000220,0.000143,0.000039,0.000417,0.001297,Paperback,English,0.759908,50239533,Adèle Blanc-Sec
321408,1239887,Søren Kierkegaard,0.732087,0.000121,0.000370,0.000344,0.000153,0.000048,0.001009,0.007919,Hardcover,English,0.709119,1228558,


In [None]:
cli_ind = sample_without_replacement(len(books), 20)
cli_books = book_data.iloc[cli_ind]

cli_books

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
245193,12786,John Berendt,0.5919,0.000522,0.003034,0.004932,0.002773,0.000617,0.014799,0.011189,Paperback,English,0.771066,737109,
374170,224439,"Hitomi Kanehara, David James Karashima",0.479751,0.000122,0.000602,0.001335,0.001392,0.000624,0.003988,0.003459,Paperback,English,0.770296,2101804,
320228,1233661,"Anna Gavalda, Ina Kronenberger",0.588785,0.000413,0.002031,0.003189,0.002274,0.000633,0.006499,0.004541,Paperback,German,0.768757,892968,
157535,921122,Desmond Morris,0.697819,5.5e-05,0.000237,0.000259,8.5e-05,1.8e-05,0.000801,0.002838,Paperback,English,0.763755,906154,Petwatching
49530,107295,Valerie Martin,0.607477,8e-06,5e-05,6.4e-05,3.4e-05,2.1e-05,0.0002,0.008,Paperback,English,0.766833,1130889,
261852,967762,"Linda Granfield, Janet Wilson",0.825545,2.5e-05,3.6e-05,3.1e-05,1.1e-05,9e-06,0.000375,0.000865,Hardcover,English,0.736437,952659,
412140,1781838,"Albert Ellis, Yasutaka Kokubu",0.682243,6.1e-05,0.000173,0.000177,0.000153,5.6e-05,0.000784,0.008595,Paperback,Japanese,0.767603,819307,
265375,981869,William King,0.760125,0.000129,0.000442,0.000322,0.000106,1.6e-05,0.000375,0.007784,Paperback,English,0.769142,966758,Gotrek & Felix
346678,1373680,William Diehl,0.632399,2.1e-05,0.000106,0.000148,7.4e-05,2.1e-05,0.00015,0.010081,Unknown Binding,English,0.761831,1363563,
282535,1057619,Margo Lanagan,0.573209,3.4e-05,0.000134,0.000192,0.000189,8.6e-05,0.001201,0.005622,Library Binding,English,0.771451,3227218,


In [None]:
sample_gen = get_genre_df(sample_data, genre)
cli_gen = get_genre_df(cli_books, genre)

In [None]:
recommender = BookRecommender(book_difference1)
recommender.set_dataset(sample_data, sample_gen)

recommender.recommend(cli_books, cli_gen)

array([[1.71120071, 1.72702813, 1.98224497, ..., 1.70527232, 1.70441806,
        1.71663857],
       [1.69855654, 1.70940924, 1.9712075 , ..., 1.69557023, 1.7053349 ,
        1.70817399],
       [1.3949424 , 1.4130373 , 1.71717525, ..., 1.38999391, 1.39571512,
        1.04814291],
       ...,
       [1.7137394 , 1.71873009, 1.9841181 , ..., 1.7140367 , 1.73530364,
        1.73075199],
       [1.70734537, 1.72010183, 1.97875011, ..., 1.70286751, 1.70852041,
        1.71092772],
       [1.41416943, 1.43565369, 1.73234236, ..., 1.40547848, 1.39933062,
        1.41804755]])

In [None]:
cli_books.iloc[17]

BookID                       1072011
Author                 Denis Johnson
Rating                      0.657321
5 stars                  0.000313089
4 stars                   0.00108798
3 stars                   0.00104211
2 stars                  0.000767332
1 star                   0.000530699
Number of reviewers       0.00291144
Pages                     0.00791892
Book format                Paperback
Language                     English
PublishYear                 0.762601
GenreLink                    2280133
Series                          None
Name: 285771, dtype: object

In [None]:
recommender.get_recommendations(cli_books, cli_gen, 17, 5)

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
209406,762107,F. Scott Fitzgerald,0.657321,0.002276,0.010136,0.012225,0.006273,0.001522,0.024084,0.011676,Paperback,English,0.73913,2432116,
279776,1045672,Paulo Coelho,0.5919,0.003332,0.012538,0.020783,0.015652,0.005693,0.032093,0.009514,Paperback,English,0.771451,13855759,
157027,509199,George Orwell,0.691589,0.001109,0.005706,0.005348,0.001958,0.000388,0.011929,0.007757,Paperback,English,0.743748,1171545,
141679,437367,"William Shakespeare, Emrys Jones",0.641745,0.001445,0.006411,0.009451,0.004551,0.000781,0.011287,0.008081,Paperback,English,0.617545,717119,
263327,973507,Dan Simmons,0.613707,0.000447,0.002345,0.00323,0.001835,0.000569,0.007508,0.008649,Paperback,English,0.763371,1623747,


# Recommendation System with Clustering

In [None]:
class KMedoids:
    def __init__(self, distance, frequency_calculator, sample_frequency, get_medoid, k = 3):
        self.k = k

        self.distance = distance
        self.frequency_calculator = frequency_calculator
        self.sample_frequency = sample_frequency
        self.get_medoid = get_medoid


    def get_genre(self, row, medoid=False):
        if medoid:
            return self.medoid_genre[self.medoid_genre.GenreLink == row.GenreLink]

        return self.corpus[self.corpus.GenreLink == row.GenreLink]
        

    def get_genre_df(self, data):
        links = data.GenreLink.unique().tolist()

        return self.corpus[self.corpus.GenreLink.isin(links)]

        
    def fit(self, data, tags, epoch=1000):
        self.dataset = data
        self.corpus = tags

        self.rows = len(self.dataset)
        
        # Step 1: Get medoids
        medoid_inds = []

        for i in range(self.k):
            medoid_ind = random.choice(range(self.rows))

            while medoid_ind in medoid_inds:
                medoid_ind = random.choice(range(self.rows))

            medoid_inds.append(medoid_ind)

        self.medoids = self.dataset.iloc[medoid_inds].reset_index(drop=True)
        self.medoid_genre = self.get_genre_df(self.medoids)
        print("Step 1 finished")
        #print(self.medoids)

        error = np.ones(self.k)
        self.dataset['label'] = -1

        count = 1
        while (error.any() != 0) and (count < epoch):
            # Step 2: Distance
            # Cluster labels for each point
            labels = np.array([-1 for x in range(self.rows)])

            # Distances to each medoid
            distances = np.zeros(self.k)

            # Frequency for mode
            frequency = [deepcopy(self.sample_frequency) for x in range(self.k)]

            # Calculate distance to each medoid
            for i in range(self.rows):
                cluster = -1
                row = self.dataset.iloc[i]
                gen = self.get_genre(row)

                for j in range(self.k):
                    medoid = self.medoids.iloc[j]
                    medoid_genre = self.get_genre(medoid, True)
                    distances[j] = self.distance(row, medoid, gen, medoid_genre)
                
                #print(distances)
                cluster = np.argmin(distances)
                    
                labels[i] = cluster
                frequency[cluster] = self.frequency_calculator(row, gen, frequency[cluster])
                
            self.dataset['label'] = labels
            print("Step 2." + str(count) + " finished.")
            #print(self.medoids)

            # Step 3: Update medoids
            medoid_ids = []
            for i in range(self.k):
                new_medoid_id = -1
                proposed_medoid, proposed_genre = self.get_medoid(frequency[i])
                new_medoid = proposed_medoid
                new_medoid_genre = proposed_genre
                best_distance = 1000

                #print("Start step 3")
                labelset = self.dataset[self.dataset.label == i].reset_index(drop=True)

                for j in range(len(labelset)):
                    row = labelset.iloc[j]
                    gen = self.get_genre(row)

                    current_dist = self.distance(proposed_medoid, row, proposed_genre, gen)
                    
                    if current_dist < best_distance:
                        new_medoid_id = row.BookID
                        new_medoid = row
                        new_medoid_genre = gen
                        best_distance = current_dist
                            
                medoid_ids.append(new_medoid_id) 
                error[i] = self.distance(new_medoid, self.medoids.iloc[i], new_medoid_genre, self.get_genre(self.medoids.iloc[i], True))

            #print(medoid_ids)
            print(error)
            print(self.dataset.groupby('label').BookID.count().to_numpy())

            self.medoids = self.dataset[self.dataset.BookID.isin(medoid_ids)].reset_index(drop=True)
            self.medoid_genre = self.get_genre_df(self.medoids)

            print("Step 3." + str(count) + " finished.")
            #print("Iteration: " + str(count))
            #print(self.medoids)

            count += 1


    def inertia(self):
        total_distance = 0.0

        for i in range(self.k):
            medoid = self.medoids.iloc[i]
            medoid_genre = self.get_genre(medoid, True)
            labelset = self.dataset[self.dataset.label == i].reset_index(drop=True)

            for ind in range(len(labelset)):
                row = labelset.iloc[ind]
                total_distance += self.distance(medoid, row, medoid_genre, self.get_genre(row))

        return total_distance

    
    def predict(self, data, genre):
        distances = np.zeros(self.k)

        for j in range(self.k):
            medoid = self.medoids.iloc[j]
            medoid_genre = self.get_genre(medoid, True)

            distances[j] = self.distance(data, medoid, genre, medoid_genre)

        result = np.argmin(distances)
        
        return result

    
    def predict_df(self, data, genre):
        result = np.array([-1 for x in range(len(data))])
        #print(self.medoids)
        
        for i in range(len(data)):
            distances = np.zeros(self.k)
            row = data.iloc[i]
            gen = genre[genre.GenreLink == row.GenreLink]

            result[i] = self.predict(row, gen)
        
        return result

    
    def get_labels(self):
        return self.dataset.label

    
    def get_medoids(self):
        return self.medoids


    def get_medoid_genre(self):
        return self.medoid_genre

In [None]:
def clustering_distance(book1, book2, gen1, gen2):
    distance = 0.0

    authors1 = book1.Author.strip('\"').split(', ')
    authors2 = book2.Author.strip('\"').split(', ')
    distance += jaccard_dissimalirity(authors1, authors2)

    if book2.Series != "None" and book2.Series == book1.Series:
        distance += 1

    tags1 = gen1.Genre.unique().tolist()
    tags2 = gen2.Genre.unique().tolist()
    distance += jaccard_dissimalirity(tags1, tags2)

    return distance

def get_frequency(book, genre, cluster):
    cluster['info'] = cluster['info'].append(book[['Author', 'Series']])
    cluster['tags'] = cluster['tags'].append(genre[['GenreLink', 'Genre']])

    return cluster

def get_mode(cluster):
    new_centroid = pd.Series()

    new_centroid['Author'] = cluster['info'].Author.mode()[0]
    new_centroid['Series'] = cluster['info'].Series.mode()[0]

    length = cluster['tags'].groupby('GenreLink').size().mode()[0]
    sorted_tags = cluster['tags'].groupby('Genre', as_index=False).size().sort_values(by='size', ascending=False)
    new_genre = sorted_tags.iloc[:length]

    return (new_centroid, new_genre)

sample_frequency = {'info': pd.DataFrame(columns=['Author', 'Series']), 'tags': pd.DataFrame(columns=['GenreLink', 'Genre'])}

In [None]:
sample_ind2 = sample_without_replacement(len(books), 500)
sample_data2 = book_data.iloc[sample_ind2]

sample_data2

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
356816,1423018,Mark Bowden,0.813084,0.000020,0.000051,0.000025,0.000012,0.000005,0.000217,0.013027,Hardcover,English,0.766833,1413453,
154805,498162,Philip K. Dick,0.707165,0.002027,0.008858,0.007813,0.002944,0.000800,0.014799,0.007784,Paperback,French,0.759138,949696,
364410,1462934,Shyam Selvadurai,0.728972,0.000339,0.001413,0.001126,0.000376,0.000083,0.004321,0.008000,Audiobook,English,0.766833,107283,
199220,704390,"Larry A. Samovar, Richard E. Porter",0.585670,0.000008,0.000028,0.000052,0.000039,0.000021,0.000117,0.009514,Paperback,English,0.765679,502464,
77358,189929,Lisa Belkin,0.819315,0.000050,0.000171,0.000054,0.000016,0.000007,0.000601,0.008946,Hardcover,English,0.768757,183608,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373728,1515973,Barbara Emberley,0.676012,0.000190,0.000382,0.000624,0.000425,0.000164,0.002544,0.000000,Hardcover,English,0.756445,2258608,
77162,189183,Stuart Avery Gold,0.563863,0.000008,0.000028,0.000056,0.000067,0.000011,0.000350,0.003459,Hardcover,English,0.771066,1496587,
271976,1011172,"Stephen King, Michèle Pressé, Serge Quadruppani",0.722741,0.007260,0.024153,0.026350,0.008037,0.001634,0.018169,0.023297,Mass Market Paperback,French,0.763371,1814,
173565,576171,Rosemary Rogers,0.660436,0.000022,0.000071,0.000094,0.000049,0.000028,0.000225,0.012784,Paperback,English,0.771066,563144,


In [None]:
cli_ind2 = sample_without_replacement(len(books), 50)
cli_books2 = book_data.iloc[cli_ind2]

cli_books2.head()

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
128268,381325,Mike Ashley,0.694704,3.8e-05,0.000135,0.000171,5.6e-05,1.9e-05,0.0004,0.013838,Paperback,English,0.771451,1827316,
447971,1999849,Miyuki Eto,0.694704,0.000107,0.00019,0.000264,0.000231,9.7e-05,0.000551,0.005622,Paperback,English,0.771451,2003668,Hell Girl
122332,357938,Carol de Chellis Hill,0.663551,1.1e-05,3.2e-05,3.8e-05,3.2e-05,1.4e-05,0.000184,0.012541,Paperback,English,0.762986,348077,
325433,1259177,V.C. Andrews,0.682243,0.000388,0.000998,0.001531,0.000868,0.000189,0.000709,0.010081,Paperback,English,0.768757,386924,Logan
303685,1153229,Vonda N. McIntyre,0.598131,5e-06,3.3e-05,6.3e-05,2.3e-05,1.1e-05,0.000209,0.004946,Paperback,English,0.763755,1140716,


In [None]:
sample_gen2 = get_genre_df(sample_data2, genre)
cli_gen2 = get_genre_df(cli_books2, genre)

In [None]:
model = KMedoids(clustering_distance, get_frequency, sample_frequency, get_mode, 5)
model.fit(sample_data2, sample_gen2, 5)

model.inertia()

Step 1 finished
Step 2.1 finished.
[1.83529412 1.85276074 1.94186047 1.875      1.8164557 ]
[354  70  15  24  37]
Step 3.1 finished.
Step 2.2 finished.
[1.81097561 1.87719298 1.         1.84705882 1.        ]
[ 21 104  18 165 192]
Step 3.2 finished.
Step 2.3 finished.
[0.         1.         1.84883721 0.         1.        ]
[241  19  48  19 173]
Step 3.3 finished.
Step 2.4 finished.
[0. 1. 0. 1. 1.]
[243  19  19 205  14]
Step 3.4 finished.


917.0713347729705

In [None]:
model.predict_df(cli_books2, cli_gen2)

array([3, 3, 3, 3, 3, 3, 0, 4, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 1,
       2, 0, 0, 0, 0, 3, 3, 0, 0, 0, 1, 3, 0, 3, 0, 3, 0, 3, 0, 0, 3, 3,
       0, 0, 0, 3, 3, 3])

# Combination

In [None]:
class KBookRecommender:
    def __init__(self, distance, distance_k, frequency_calculator, sample_frequency, get_medoid, k = 3):
        self.distance = distance
        self.KMedoids = KMedoids(distance_k, frequency_calculator, sample_frequency, get_medoid, k)


    def set_dataset(self, books, genre, size, epoch=10):
        self.books = books
        self.genre = genre

        batch_ind = sample_without_replacement(len(self.books), size)
        batch = self.books.iloc[batch_ind]
        batch_gen = get_genre_df(batch, self.genre)

        self.KMedoids.fit(batch, batch_gen, epoch)
        self.books['label'] = self.KMedoids.predict_df(self.books, self.genre)


    def calc_distance(self, client):
        books = self.books[self.books.label == client.label]

        return self.distance(books, client)


    def recommend(self, client, cli_genre, k):
        client_size = len(client)
        rec = np.zeros((len(self.books), client_size)) 

        client['label'] = self.KMedoids.predict(client, cli_genre)
        rec = self.calc_distance(client)
        rec_sorted = np.argsort(rec)
        rec_ind = rec_sorted[:k]

        labelset = self.books[self.books.label == client.label].reset_index(drop=True)

        return labelset.iloc[rec_ind]

In [None]:
def book_difference2(books1, book2):
    numeric_cols = ['Rating', '5 stars', '4 stars', '3 stars', '2 stars', '1 star', 'Number of reviewers', 'Pages', 'PublishYear']
    rows = len(books1)

    numeric = books1[numeric_cols].to_numpy() - book2[numeric_cols].to_numpy()
    lang_diff = (books1.Language != book2.Language).astype(int).to_numpy().reshape((rows, 1))
    format_diff = (books1['Book format'] != book2['Book format']).astype(int).to_numpy().reshape((rows, 1))

    diff = np.append(numeric, lang_diff, axis=1)
    diff = np.append(diff, format_diff, axis=1)

    return euclidian_distance(diff)

In [None]:
sample_ind3 = sample_without_replacement(len(books), 10000)
sample_data3 = book_data.iloc[sample_ind3]

sample_data3.head()

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
7328,13376,Nancy Farmer,0.753894,0.007022,0.0176,0.01313,0.007144,0.004184,0.054016,0.01027,Paperback,English,0.769912,868252,Matteo Alacran
429861,1885287,Liz Fielding,0.660436,1e-05,3.9e-05,4.2e-05,2.3e-05,1.8e-05,0.0001,0.010811,Mass Market Paperback,English,0.767988,24883393,The Beaumont Brides
436190,1115279,G.A. Henty,0.747664,0.000114,0.000315,0.00027,0.000123,5.1e-05,0.000717,0.008649,Paperback,English,0.724509,1926985,
83437,212855,Ann Wroe,0.638629,1.7e-05,6.6e-05,9e-05,5.8e-05,2.1e-05,0.000509,0.017297,Paperback,English,0.770296,206046,
134983,409599,Katherine Neville,0.610592,8.5e-05,0.000453,0.000648,0.000381,9.3e-05,0.000918,0.013838,Paperback,Spanish,0.766064,1105501,


In [None]:
cli_ind3 = sample_without_replacement(len(books), 200)
cli_books3 = book_data.iloc[cli_ind3]

cli_books3.head()

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
373972,1517307,Beverley Harper,0.803738,1.4e-05,4.2e-05,2.1e-05,7e-06,5e-06,0.0001,0.018595,Paperback,English,0.769912,1509054,
37290,77242,"Margaret E. Keck, Kathryn Sikkink",0.635514,8e-06,4.7e-05,6.7e-05,3.2e-05,7e-06,5.8e-05,0.006486,Paperback,English,0.768372,74661,
428269,1876184,Craig Shaw Gardner,0.613707,0.000113,0.000497,0.000832,0.000411,0.000138,0.000542,0.008108,Paperback,English,0.760677,52342,The Ebenezum Trilogy
110252,307264,Andrew M. Greeley,0.70405,5.2e-05,0.000208,0.000204,7.8e-05,2.5e-05,0.000259,0.010378,Paperback,English,0.768372,298234,Nuala Anne McGrail
346220,1371280,Diane Johnson,0.392523,6.2e-05,0.000532,0.001575,0.001949,0.000767,0.003245,0.008324,Hardcover,English,0.767988,210396,


In [None]:
sample_gen3 = get_genre_df(sample_data3, genre)
cli_gen3 = get_genre_df(cli_books3, genre)

In [None]:
krecommender = KBookRecommender(book_difference2, clustering_distance, get_frequency, sample_frequency, get_mode, 10)
krecommender.set_dataset(sample_data3, sample_gen3, 2000, 5)

Step 1 finished
Step 2.1 finished.
[1.58741259 1.87857143 1.82051282 1.53623188 1.6442953  1.71428571
 1.8707483  1.91489362 1.7607362  1.94915254]
[632  22  97 525 239 111  11   6 339  18]
Step 3.1 finished.
Step 2.2 finished.
[1.87272727 0.         1.89473684 0.         0.         1.
 0.         0.80625    1.         0.        ]
[323  49  10  11  45 790 386  65 318   3]
Step 3.2 finished.
Step 2.3 finished.
[1.90977444 0.         1.90196078 0.         0.         1.
 0.         1.         0.         0.        ]
[ 38  25  64  44 459 691 341 308  27   3]
Step 3.3 finished.
Step 2.4 finished.
[0. 0. 0. 0. 1. 0. 1. 0. 0. 0.]
[ 30  27  46 483 697 344 308  22  41   2]
Step 3.4 finished.


In [None]:
samp = cli_books3.iloc[20]
samp_gen = get_genre(samp.GenreLink, cli_gen3)

In [None]:
krecommender.recommend(samp, samp_gen, 10)

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series,label
908,773278,Ruth Lercher Bornstein,0.757009,7.6e-05,0.000164,0.000186,7.6e-05,1.9e-05,0.000759,0.000757,Board Book,English,0.759908,759329,,6
21,703193,Fiona Watt,0.76324,2.7e-05,6.5e-05,6.5e-05,1.8e-05,1.1e-05,0.000292,0.007838,Board Book,English,0.769527,689484,That's Not My...,6
679,644935,"Phil Roxbee Cox, Stephen Cartwright",0.725857,1.1e-05,3.5e-05,3.1e-05,2.3e-05,5e-06,0.000192,0.00027,Board Book,English,0.762601,631097,,6
1172,815234,"Audrey Wood, Don Wood",0.813084,0.004864,0.006871,0.006192,0.00308,0.001546,0.011763,0.000432,Board Book,English,0.762986,359311,,6
214,312187,"Fiona Watt, Rachel Wells",0.71028,1.4e-05,3.3e-05,4.6e-05,3.2e-05,5e-06,0.000209,0.00027,Board Book,English,0.771066,303100,That's Not My...,6
1368,1005366,Lois Lenski,0.707165,1.2e-05,3.2e-05,4.8e-05,1.8e-05,5e-06,0.000175,0.000865,Board Book,English,0.754521,3221508,Mr. Small,6
1370,313335,Richard Scarry,0.819315,0.000102,0.000143,0.000146,5.5e-05,1.8e-05,0.000267,0.000595,Board Book,English,0.765294,67506559,,6
281,1885410,Bill Martin Jr.,0.803738,0.017202,0.021932,0.024227,0.012955,0.005589,0.032126,0.000676,Board Book,English,0.756445,810506,Bill Martin's Bears,6
353,315129,"Laura Joffe Numeroff, Felicia Bond",0.816199,0.028931,0.036607,0.037175,0.018386,0.009077,0.034695,5.4e-05,Board Book,English,0.763371,3132746,If You Give...,6
1419,1841494,Keiko Kasza,0.853583,0.000204,0.000293,0.000197,6.4e-05,3.5e-05,0.00186,0.000811,Board Book,English,0.766064,1191587,,6
