# Getting Started

In [1]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [None]:
%cd '/content/drive/MyDrive/ML Project - Books'

In [2]:
import random
import numpy as np
import pandas as pd
from copy import deepcopy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.random import sample_without_replacement
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# Get Data

In [3]:
books = pd.read_csv('goodreads/books.csv')
genre = pd.read_csv('goodreads/genre.csv')

print(books.shape)
print(genre.shape)

(447991, 16)
(12609524, 3)


In [4]:
books.head()

Unnamed: 0,BookID,Title,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,GenreLink,Series,PublishYear
0,3,Harry Potter and the Sorcerer's Stone,"J.K. Rowling, Mary GrandPré",4.48,4868903,1704361,631378,148016,127122,118557,309,Hardcover,English,4640799,Harry Potter,1997
1,1,Harry Potter and the Half-Blood Prince,J.K. Rowling,4.57,1738165,611768,175048,29134,13230,41832,652,Paperback,English,41335427,Harry Potter,2005
2,7,The Harry Potter Collection,"J.K. Rowling, Mary GrandPré",4.73,25140,4473,1111,230,282,921,318,Paperback,English,21457570,Harry Potter,2005
3,10,Harry Potter Collection,J.K. Rowling,4.73,25140,4473,1111,230,282,921,3342,Hardcover,English,21457570,Harry Potter,2005
4,5,Harry Potter and the Prisoner of Azkaban,"J.K. Rowling, Mary GrandPré",4.57,2028235,706012,215277,29265,14292,58998,435,Mass Market Paperback,English,2402163,Harry Potter,1999


In [5]:
genre.head()

Unnamed: 0,GenreLink,Genre,NumberOfPeople
0,4640799,to-read,1082661
1,4640799,currently-reading,160630
2,4640799,fantasy,64264
3,4640799,favorites,57385
4,4640799,fiction,19352


# Preprocess Data

In [6]:
def get_genre_df(books, genre):
    links = books.GenreLink.unique().tolist()

    return genre[genre.GenreLink.isin(links)]

def get_genre(link, genre):
    return genre[genre.GenreLink == link]

In [7]:
book_data = books[['BookID', 'Author', 'Rating', '5 stars',	'4 stars', '3 stars', '2 stars', '1 star', 'Number of reviewers', 'Pages', 'Book format', 'Language', 'PublishYear', 'GenreLink', 'Series']]
book_data.shape

(447991, 15)

In [8]:
scaler = MinMaxScaler()
book_data[['Rating', '5 stars',	'4 stars', '3 stars', '2 stars', '1 star', 'Number of reviewers', 'Pages', 'PublishYear']] = scaler.fit_transform(book_data[['Rating', '5 stars',	'4 stars', '3 stars', '2 stars', '1 star', 'Number of reviewers', 'Pages', 'PublishYear']])

book_data.head()

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
0,3,"J.K. Rowling, Mary GrandPré",0.884735,0.988587,0.988436,0.592227,0.261098,0.224135,0.98903,0.008351,Hardcover,English,0.767988,4640799,Harry Potter
1,1,J.K. Rowling,0.912773,0.352918,0.354791,0.164192,0.051392,0.02333,0.348972,0.017622,Paperback,English,0.771066,41335427,Harry Potter
2,7,"J.K. Rowling, Mary GrandPré",0.962617,0.005104,0.002592,0.00104,0.000406,0.000501,0.007683,0.008595,Paperback,English,0.771066,21457570,Harry Potter
3,10,J.K. Rowling,0.962617,0.005104,0.002592,0.00104,0.000406,0.000501,0.007683,0.090324,Hardcover,English,0.771066,21457570,Harry Potter
4,5,"J.K. Rowling, Mary GrandPré",0.912773,0.411815,0.409447,0.201927,0.051623,0.025202,0.492175,0.011757,Mass Market Paperback,English,0.768757,2402163,Harry Potter


# Calculate Distance

In [33]:
def jaccard_dissimalirity(a_lst, b_lst):
    intersection = list(set(a_lst) & set(b_lst))
    union = list(set(a_lst) | set(b_lst))

    jaccard_sim = (1.0 * len(intersection))/len(union)

    return abs(1 - jaccard_sim)

def hamming_distance(a, b):
    if a == b:
        return 0
    
    return 1

def euclidian_distance(diff):
    diff2 = diff**2
    diff2_sum = diff2.sum(axis=1)
    
    return np.sqrt(np.float32(diff2_sum))

def cosine_dissimilarity(v_a, v_b):
    product = np.sum(v_a * v_b)
    length_product = np.sqrt(np.float32((v_a**2).sum())) * np.sqrt(np.float32((v_b**2).sum()))

    similarity = product/length_product
    return abs(1 - similarity)

In [10]:
def book_difference1(books1, book2, genre1, genre2):
    numeric_cols = ['Rating', '5 stars', '4 stars', '3 stars', '2 stars', '1 star', 'Number of reviewers', 'Pages', 'PublishYear']
    rows = len(books1)

    numeric = books1[numeric_cols].to_numpy() - book2[numeric_cols].to_numpy()
    
    authors = book2.Author.strip('\"').split(', ')
    author_diff = np.zeros((rows, 1))
    for i in range(rows):
        book_author = books1.iloc[i].Author.strip('\"').split(', ')
        author_diff[i][0] = jaccard_dissimalirity(book_author, authors)

    lang_diff = (books1.Language != book2.Language).astype(int).to_numpy().reshape((rows, 1))
    format_diff = (books1['Book format'] != book2['Book format']).astype(int).to_numpy().reshape((rows, 1))

    series_diff = np.ones((rows, 1))
    if book2.Series != "none":
        series_diff = (books1.Series != book2.Series).astype(int).to_numpy().reshape((rows, 1))

    tags = genre2.Genre.tolist()
    tag_diff = np.zeros((rows,1))
    for i in range(rows):
        book_tags = genre1[genre1.GenreLink == books1.iloc[i].GenreLink].Genre.tolist()
        tag_diff[i][0] = jaccard_dissimalirity(book_tags, tags)

    diff = np.append(numeric, author_diff, axis=1)
    diff = np.append(diff, lang_diff, axis=1)
    diff = np.append(diff, format_diff, axis=1)
    diff = np.append(diff, series_diff, axis=1)
    diff = np.append(diff, tag_diff, axis=1)

    return euclidian_distance(diff)

# Recommendation System

In [11]:
class BookRecommender:
    def __init__(self, distance):
        self.distance = distance

    def set_dataset(self, books, genre):
        self.books = books
        self.genre = genre

    def recommend(self, client, cli_genre):
        client_size = len(client)
        rec = np.zeros((len(self.books), client_size)) 
        
        for i in range(client_size):
            rec[:, i] = self.distance(self.books, client.iloc[i], self.genre, cli_genre)

        return rec 

    def get_recommendations(self, client, cli_genre, ind, k):
        rec = self.recommend(client, cli_genre)[:, ind]
        rec_sorted = np.argsort(rec)

        rec_ind = rec_sorted[:k]

        return self.books.iloc[rec_ind]

In [12]:
sample_ind = sample_without_replacement(len(books), 200)
sample_data = book_data.iloc[sample_ind]

sample_data

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
141011,434056,Jules Barbey d'Aurevilly,0.669782,0.000086,0.000280,0.000336,0.000203,0.000081,0.000667,0.008216,Paperback,Spanish,0.720662,1496555,
393833,1655879,Michel Rabagliati,0.800623,0.000131,0.000343,0.000210,0.000062,0.000026,0.001026,0.005622,Paperback,English,0.771451,1650541,Paul
106163,292220,Kyle Mills,0.722741,0.000077,0.000317,0.000303,0.000074,0.000016,0.000634,0.010595,Hardcover,English,0.769142,468076,Mark Beamon
148508,467242,Ntozake Shange,0.828660,0.003330,0.005190,0.003892,0.001536,0.000737,0.007583,0.002162,Hardcover,English,0.759523,505856,
77318,189750,William Morris,0.520249,0.000053,0.000294,0.000590,0.000526,0.000169,0.001360,0.008000,Paperback,English,0.726818,13352231,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196867,691985,Cameron Dokey,0.666667,0.000406,0.001503,0.002057,0.000986,0.000208,0.005631,0.005189,Paperback,English,0.771451,678324,
114454,324995,Brandon Bays,0.719626,0.000062,0.000170,0.000176,0.000088,0.000039,0.000676,0.005622,Hardcover,English,0.768757,201467,
209538,762585,"David E. Kaplan, Alec Dubro",0.700935,0.000038,0.000157,0.000145,0.000065,0.000021,0.000367,0.011405,Paperback,English,0.763755,172027,
137340,418548,Dalai Lama XIV,0.775701,0.000029,0.000071,0.000059,0.000023,0.000009,0.000142,0.005622,Paperback,English,0.769142,1494999,


In [13]:
cli_ind = sample_without_replacement(len(books), 20)
cli_books = book_data.iloc[cli_ind]

cli_books

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
78635,195386,John R. Erickson,0.76947,7.7e-05,0.000162,0.000186,5.5e-05,1.8e-05,0.000159,0.003378,Hardcover,English,0.763371,24164103,Hank the Cowdog
202987,727022,Mary Hershey,0.70405,1.1e-05,5.6e-05,4.7e-05,2.1e-05,7e-06,0.000275,0.007784,Hardcover,English,0.771835,713235,
11278,20181,James Scott Bell,0.766355,0.000419,0.001185,0.000927,0.000333,0.000109,0.004321,0.006324,Paperback,English,0.770681,21376,Write Great Fiction
440555,1949780,Haruka Fukushima,0.70405,0.000203,0.000292,0.000527,0.000349,0.000173,0.0003,0.005189,Paperback,English,0.770681,2929845,Cherry Juice
7063,12939,"William Shakespeare, Roma Gill",0.707165,0.013136,0.035752,0.04185,0.02268,0.006589,0.039484,0.005189,Paperback,English,0.61716,2342136,
66836,156155,"Suparna Damany, Jack Bellis, Martin Cherniack",0.800623,1.4e-05,2.7e-05,1.6e-05,1.4e-05,7e-06,0.0001,0.006324,Paperback,English,0.769527,150689,
215171,791048,"Fyodor Dostoyevsky, Andrew R. MacAndrew, Konst...",0.834891,0.028755,0.041475,0.028584,0.014625,0.006786,0.100115,0.025297,paperback,English,0.722586,3393910,The Brothers Karamazov
225425,833528,Terry Pratchett,0.647975,0.000624,0.002625,0.003923,0.001764,0.000309,0.005189,0.006919,Paperback,English,0.757984,583699,
145178,452154,Ann N. Martin,0.813084,1.9e-05,2.8e-05,1.8e-05,1.9e-05,9e-06,0.00025,0.005405,Paperback,English,0.767988,420051,
393022,1649749,Aline Templeton,0.744548,3.6e-05,0.000164,0.000108,2.8e-05,1.2e-05,0.000534,0.010811,Paperback,English,0.771451,1644247,DI Marjory Fleming


In [14]:
sample_gen = get_genre_df(sample_data, genre)
cli_gen = get_genre_df(cli_books, genre)

In [15]:
recommender = BookRecommender(book_difference1)
recommender.set_dataset(sample_data, sample_gen)

recommender.recommend(cli_books, cli_gen)

array([[2.1894927 , 1.94573545, 1.94780374, ..., 2.18734026, 1.70538819,
        1.67020273],
       [1.93509686, 1.93723774, 1.65672743, ..., 1.93864179, 1.96501493,
        1.65759945],
       [1.65200305, 1.65142655, 1.9310056 , ..., 1.9309994 , 2.20111966,
        1.93120158],
       ...,
       [1.92901695, 1.64816034, 1.64945912, ..., 1.92791557, 1.68427598,
        1.31168318],
       [1.9280895 , 1.65002978, 1.64850307, ..., 1.93048751, 1.6835897 ,
        1.31136203],
       [1.65405786, 1.32346892, 1.93298531, ..., 1.93902171, 1.96313202,
        1.65553129]])

In [16]:
cli_books.iloc[15]

BookID                                            764553
Author                 Alexandre Dumas, Robin Waterfield
Rating                                          0.816199
5 stars                                        0.0809184
4 stars                                         0.132266
3 stars                                        0.0997557
2 stars                                        0.0468743
1 star                                         0.0221942
Number of reviewers                             0.188443
Pages                                          0.0108108
Book format                                    Paperback
Language                                         English
PublishYear                                     0.709119
GenreLink                                         391568
Series                                              None
Name: 210067, dtype: object

In [17]:
recommender.get_recommendations(cli_books, cli_gen, 15, 3)

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
218668,246964,"Billy Wilder, Raymond Chandler, Jeffrey Meyers",0.859813,4.3e-05,7.7e-05,3.5e-05,1.2e-05,7e-06,0.000142,0.003757,Paperback,English,0.747595,798466,
34743,71672,Bruce Eckel,0.781931,0.000185,0.000423,0.000304,0.000148,6.2e-05,0.000692,0.037865,Paperback,English,0.768372,69406,
387735,1614003,Penny Jordan,0.595016,1e-05,5e-05,6.4e-05,5.6e-05,2.5e-05,0.000192,0.005054,Paperback,English,0.76414,1607832,


# Recommendation System with Clustering

In [34]:
class KMedoids:
    def __init__(self, distance, frequency_calculator, sample_frequency, get_medoid, k):
        self.k = k

        self.distance = distance
        self.frequency_calculator = frequency_calculator
        self.sample_frequency = sample_frequency
        self.get_medoid = get_medoid


    def get_genre(self, row):
        return self.corpus[self.corpus.GenreLink == row.GenreLink]

        
    def fit(self, data, tags):
        self.dataset = data
        self.corpus = tags

        self.rows = len(self.dataset)
        
        # Step 1: Get medoids
        self.medoids = []
        medoids_old = []
        medoid_inds = []

        for i in range(self.k):
            medoid_ind = random.choice(range(self.rows))

            while medoid_ind in medoid_inds:
                medoid_ind = random.choice(range(self.rows))

            medoid_inds.append(medoid_ind)
            self.medoids.append(self.dataset.iloc[medoid_ind])

        error = np.ones(self.k)
        self.dataset['label'] = 0

        self.medoid_genre = [self.get_genre(x) for x in self.medoids]
        #print("Step 1 finished")

        count = 1
        while error.all() != 0:
            # Step 2: Distance
            # Cluster labels for each point
            self.dataset.assign(labels=0)

            # Distances to each medoid
            distances = np.zeros(self.k)

            # Frequency for mode
            frequency = [deepcopy(self.sample_frequency) for x in range(self.k)]

            # Calculate distance to each medoid
            for i in range(self.rows):
                cluster = -1
                row = self.dataset.iloc[i]
                gen = self.get_genre(row)

                for j in range(self.k):
                    distances[j] = self.distance(row, self.medoids[j], gen, self.medoid_genre[j])

                cluster = np.argmin(distances)
                    
                self.dataset.iloc[i].label = cluster
                frequency[cluster] = self.frequency_calculator(row, gen, frequency[cluster]) 
            #print("Step 2." + str(count) + " finished.")

            # Step 3: Update medoids
            medoids_old = deepcopy(self.medoids)

            for i in range(self.k):
                proposed_medoid, proposed_genre = self.get_medoid(frequency[i])
                temp_medoid = proposed_medoid
                temp_genre = proposed_genre
                best_distance = 1000

                #print("Start step 3")
                labelset = self.dataset[self.dataset.label == i]

                for ind, row in labelset.iterrows():
                    gen = self.get_genre(row)

                    current_dist = self.distance(proposed_medoid, row, proposed_genre, gen)
                    
                    if current_dist < best_distance:
                        temp_medoid = row
                        temp_genre = gen
                        best_distance = current_dist
                            
                self.medoids[i] = temp_medoid
                self.medoid_genre[i] = temp_genre

                error[i] = self.distance(self.medoids[i], medoids_old[i], self.medoid_genre[i], self.get_genre(medoids_old[i]))
                #print("Step 3." + str(count) + " finished.")
                print("Iteration: " + str(count))

                count += 1


    def inertia(self):
        total_distance = 0.0

        for i in range(self.k):
            labelset = self.dataset[self.dataset.label == i]
            for ind, row in labelset.iterrows():
                total_distance += self.distance(self.medoids[i], row, self.medoid_genre[i], self.get_genre(row))

        return total_distance

    
    def predict(self, data, genre):
        result = np.zeros(len(data), dtype=int)
        
        for i in range(len(data)):
            distances = np.zeros(self.k, dtype=int)
            for j in range(self.k):
                row = data.iloc[i]
                gen = genre[genre.GenreLink == row.GenreLink]
                distances[j] = self.distance(row, self.medoids[j], gen, self.medoid_genre)

            result[i] = np.argmin(distances)
        
        return result

    
    def get_labels(self):
        return self.dataset.label

    
    def get_medoids(self):
        return self.medoids


    def get_medoid_genre(self):
        return self.medoid_genre

In [35]:
def clustering_distance(book1, book2, gen1, gen2):
    distance = 0.0

    authors1 = book1.Author.strip('\"').split(', ')
    authors2 = book2.Author.strip('\"').split(', ')
    distance += jaccard_dissimalirity(authors1, authors2)

    if book2.Series != "None" and book2.Series == book1.Series:
        distance += 1

    tags1 = gen1.Genre.tolist()
    tags2 = gen2.Genre.tolist()
    distance += jaccard_dissimalirity(tags1, tags2)

    return distance

def get_frequency(book, genre, cluster):
    cluster['info'] = cluster['info'].append(book[['Author', 'Series']])
    cluster['tags'] = cluster['tags'].append(genre[['GenreLink', 'Genre']])

    return cluster

def get_mode(cluster):
    new_centroid = pd.Series()

    new_centroid['Author'] = cluster['info'].Author.mode()[0]
    new_centroid['Series'] = cluster['info'].Series.mode()[0]

    length = cluster['tags'].groupby('GenreLink').size().mode()[0]
    sorted_tags = cluster['tags'].groupby('Genre', as_index=False).size().sort_values(by='size', ascending=False)
    new_genre = sorted_tags.iloc[:length]

    return (new_centroid, new_genre)

sample_frequency = {'info': pd.DataFrame(columns=['Author', 'Series']), 'tags': pd.DataFrame(columns=['GenreLink', 'Genre'])}

In [36]:
sample_ind2 = sample_without_replacement(len(books), 2000)
sample_data2 = book_data.iloc[sample_ind2]

sample_data2

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
267971,993530,Gertrude Stein,0.454829,0.000097,0.000471,0.001093,0.001261,0.000633,0.002861,0.009514,Mass Market Paperback,English,0.734129,61928206,
324209,1253136,Gina B. Nahai,0.747664,0.000078,0.000245,0.000183,0.000090,0.000032,0.001084,0.010378,Hardcover,English,0.768372,229674,
130987,392855,"Debbie Bertram, Susan Bloom, Michael Garland",0.647975,0.000006,0.000032,0.000050,0.000019,0.000005,0.000234,0.000865,Hardcover,English,0.771066,382427,The Best .... to Read
275755,1026943,Laurence Leamer,0.738318,0.000074,0.000216,0.000216,0.000076,0.000023,0.000384,0.025081,Paperback,English,0.769527,1013192,
414988,1799455,Charlotte Perkins Gilman,0.766355,0.008471,0.022288,0.016626,0.007342,0.002962,0.047542,0.010189,Hardcover,English,0.727588,17352354,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126206,372530,Michael Cart,0.504673,0.000008,0.000045,0.000117,0.000102,0.000021,0.000400,0.007838,Hardcover,English,0.769912,1519277,
153177,490084,Aaron Sorkin,0.809969,0.000108,0.000223,0.000174,0.000051,0.000014,0.000400,0.003459,Paperback,English,0.765294,478318,
133914,404447,Bernard Moitessier,0.834891,0.000014,0.000035,0.000015,0.000004,0.000004,0.000058,0.000000,Paperback,English,0.756445,393813,
339912,1334969,Søren Kierkegaard,0.788162,0.000668,0.001526,0.001122,0.000478,0.000162,0.001652,0.006135,Hardcover,English,0.708734,25733,


In [27]:
cli_ind2 = sample_without_replacement(len(books), 20)
cli_books2 = book_data.iloc[cli_ind2]

cli_books2

Unnamed: 0,BookID,Author,Rating,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,PublishYear,GenreLink,Series
291776,7686221,William F. Nolan,0.47352,9e-06,4.9e-05,0.000132,0.000127,5.1e-05,0.000292,0.003919,Paperback,English,0.761447,1088062,Logan
360825,178533,Joseph Conrad,0.632399,8.6e-05,0.000435,0.000538,0.000296,9.9e-05,0.001401,0.003027,Paperback,English,0.729511,1766682,
171001,566187,Sebastian Faulks,0.76324,0.00596,0.0142,0.010915,0.006128,0.002523,0.027363,0.011649,Hardcover,English,0.766449,1093016,French Trilogy
192860,670706,"Franz Kafka, Ruth Saunner",0.657321,0.001154,0.005209,0.00656,0.003031,0.000693,0.008326,0.008649,Paperback,Spanish,0.741054,935000,
250396,924612,Janelle Taylor,0.806854,4.1e-05,6.7e-05,5.5e-05,2.5e-05,1.8e-05,0.000117,0.013351,Paperback,English,0.762216,977848,Gray Eagle
208841,3760,Ian Fleming,0.629283,0.000817,0.004377,0.006448,0.002713,0.000614,0.007908,0.006216,Paperback,English,0.752212,2772203,James Bond (Original Series)
215803,794782,Alain-Fournier,0.654206,0.000584,0.002044,0.002672,0.001669,0.000495,0.006715,0.009324,Paperback,English,0.735668,51583,
394024,1657343,Dave Ellis,0.660436,2.1e-05,5.1e-05,4.9e-05,4.9e-05,4.6e-05,0.000192,0.008054,Paperback,English,0.771451,3093413,
420019,1824003,Philip Yancey,0.785047,0.003758,0.007061,0.005242,0.002764,0.002153,0.008776,0.008108,Paperback,English,0.767988,1616396,
397059,1678897,Fynn,0.778816,0.000559,0.00093,0.000853,0.00051,0.000277,0.004371,0.005135,Paperback,English,0.759138,49605,


In [37]:
sample_gen2 = get_genre_df(sample_data2, genre)
cli_gen2 = get_genre_df(cli_books2, genre)

In [38]:
model = KMedoids(clustering_distance, get_frequency, sample_frequency, get_mode, 20)
model.fit(sample_data2, sample_gen2)

model.inertia()

Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Iteration: 13
Iteration: 14
Iteration: 15
Iteration: 16
Iteration: 17
Iteration: 18
Iteration: 19
Iteration: 20
Iteration: 21


AttributeError: 'Series' object has no attribute 'GenreLink'

In [None]:
a = sample_gen2[sample_gen2.GenreLink == 1115313].Genre.values.tolist()
b = sample_gen2[sample_gen2.GenreLink == 2064030].Genre.values.tolist()
jaccard_dissimalirity(a, b)

In [None]:
count = 0
for link1 in sample_data2.GenreLink.tolist():
    for link2 in sample_data2.GenreLink.tolist():
        a = sample_gen2[sample_gen2.GenreLink == link1].Genre.values.tolist()
        b = sample_gen2[sample_gen2.GenreLink == link2].Genre.values.tolist()
        dist = jaccard_dissimalirity(a, b)
        
        if dist <= 0:
            print(dist, link1, link2)
            count += 1
            
count

In [None]:
a1 = genre[genre.GenreLink == 2593467].Genre.unique().tolist()
print(len(a1))

In [None]:
b1 = genre[genre.GenreLink == 3295655].Genre.unique().tolist()
print(len(b1))

In [None]:
jaccard_dissimalirity(a1, b1)