In [25]:
# Import library yang diperlukan
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
# Read the user-rating table from CSV and preview its first five rows.
rating_data = pd.read_csv(filepath_or_buffer="../dataset/rating_dataset.csv")
rating_data.head(n=5)

Unnamed: 0,userId,ISBN,bookRating,bookTitle,user,books
0,276747,60517794,4.5,Little Altars Everywhere,0,966
1,278843,60517794,3.5,Little Altars Everywhere,1,966
2,4017,60517794,5.0,Little Altars Everywhere,2,966
3,8961,60517794,4.0,Little Altars Everywhere,3,966
4,21788,60517794,4.5,Little Altars Everywhere,4,966


In [27]:
# Read the book-metadata table from CSV and preview its first five rows.
book_data = pd.read_csv(filepath_or_buffer="../dataset/book_dataset.csv")
book_data.head(n=5)

Unnamed: 0,bookTitle,bookRating,ISBN,bookAuthor,yearOfPublication,Publisher,url,bookImage,bookDesc,ratingCount,bookPages,bookGenres,bookGenre1,bookGenre2,bookGenre3,books
0,To Kill a Mockingbird,4.28,446310786,Harper Lee,1988,Little Brown &amp,https://www.goodreads.com/book/show/2657.To_Ki...,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4772918,324 pages,"['Classics', 'Fiction', 'Historical-Historical...",Classics,Fiction,Historical-HistoricalFiction,0
1,Pride and Prejudice,4.27,055321215X,Jane Austen,1983,Bantam,https://www.goodreads.com/book/show/44623850-p...,https://i.gr-assets.com/images/S/compressed.ph...,an alternate cover edition can be found hereIn...,3206208,446 pages,"['Classics', 'Fiction', 'Romance', 'Historical...",Classics,Fiction,Romance,1
2,Animal Farm,3.96,451526341,George Orwell,2004,Signet,https://www.goodreads.com/book/show/14060211-a...,https://i.gr-assets.com/images/S/compressed.ph...,This remarkable book has been described in man...,2927053,95 pages,"['Classics', 'Fiction', 'ScienceFiction-Dystop...",Classics,Fiction,ScienceFiction-Dystopia,2
3,Gone with the Wind,4.3,446365386,Margaret Mitchell,1993,Warner Books,https://www.goodreads.com/book/show/18405.Gone...,https://i.gr-assets.com/images/S/compressed.ph...,"Scarlett O'Hara, the beautiful, spoiled daught...",1106466,1037 pages,"['Classics', 'Historical-HistoricalFiction', '...",Classics,Historical-HistoricalFiction,Fiction,3
4,The Giving Tree,4.37,60256664,Shel Silverstein,1964,HarperCollins,https://www.goodreads.com/book/show/370493.The...,https://i.gr-assets.com/images/S/compressed.ph...,"""Once there was a tree...and she loved a littl...",952487,64 pages,"['Childrens', 'Childrens-PictureBooks', 'Class...",Childrens,Childrens-PictureBooks,Classics,4


In [28]:
# Join the three genre columns into one comma-separated string per book.
# The new 'genre' column is added to book_data itself.
book_data['genre'] = (
    book_data['bookGenre1']
    .add(',')
    .add(book_data['bookGenre2'])
    .add(',')
    .add(book_data['bookGenre3'])
)

# Keep only the identifying columns and the genre columns for the steps that follow.
book_data_filtered = book_data.loc[
    :,
    ['ISBN', 'bookTitle', 'bookAuthor', 'bookGenre1', 'bookGenre2', 'bookGenre3', 'genre'],
]

In [29]:
 # Create the TF-IDF vectorizer and fit it on the combined genre strings
 # (fit() returns the fitted vectorizer itself, so the two steps can be chained).
tf = TfidfVectorizer().fit(book_data_filtered['genre'])

# Display the learned vocabulary: the feature name for each integer feature index.
tf.get_feature_names_out()

array(['academic', 'action', 'adultfiction', 'adventure', 'africa',
       'africanamerican', 'albanianliterature', 'alternatehistory',
       'american', 'americanhistory', 'angels', 'animals', 'anthropology',
       'apocalyptic', 'art', 'arthurian', 'asia', 'asianliterature',
       'astronomy', 'australia', 'autobiography', 'baseball', 'biography',
       'biology', 'bookclub', 'booksaboutbooks', 'brazil',
       'britishliterature', 'business', 'canada', 'chicklit', 'childrens',
       'china', 'christian', 'christianfiction', 'christianity',
       'christianliving', 'christmas', 'civilwar', 'classics', 'comedy',
       'comics', 'comingofage', 'contemporary', 'contemporaryromance',
       'crime', 'cultural', 'cyberpunk', 'darkfantasy', 'dragons',
       'drama', 'drawing', 'dystopia', 'economics', 'environment',
       'epicfantasy', 'espionage', 'essays', 'europeanliterature',
       'evolution', 'fae', 'fairies', 'fairytales', 'fantasy', 'feminism',
       'fiction', 'finance

In [30]:
# Fit the vectorizer on the genre strings and transform them into the TF-IDF matrix in one call.
tfidf_matrix = tf.fit_transform(raw_documents=book_data_filtered['genre'])

# Report the matrix dimensions.
tfidf_matrix.shape

(1536, 181)

In [31]:
# Convert the TF-IDF matrix to dense form with todense() so its values are displayed.
tfidf_matrix.todense()

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.46213191],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [32]:
# Wrap the dense TF-IDF matrix in a DataFrame for inspection:
#   columns -> vocabulary terms learned from the genre strings
#   rows    -> book titles
# Only a random sample of 22 columns and 10 rows is displayed.
(
    pd.DataFrame(
        data=tfidf_matrix.todense(),
        index=book_data_filtered['bookTitle'],
        columns=tf.get_feature_names_out(),
    )
    .sample(n=22, axis='columns')
    .sample(n=10, axis='index')
)

Unnamed: 0_level_0,southern,comedy,dragons,sciencefictionfantasy,spirituality,newadult,vampires,spanishliterature,australia,spythriller,...,academic,militaryfiction,biology,indianliterature,fae,romance,christianity,suspense,presidents,poetry
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The War of the End of the World,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Prayer for Owen Meany,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Lives of Christopher Chant,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Annie's Song,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.77426,0.0,0.0,0.0,0.0
You Are Special,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
As a Man Thinketh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Bloody Chamber and Other Stories,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Band of Brothers: E Company, 506th Regiment, 101st Airborne from Normandy to Hitler's Eagle's Nest",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Silver Kiss,0.0,0.0,0.0,0.0,0.0,0.0,0.652575,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dead Babies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Compute the cosine similarity of the TF-IDF matrix against itself (Y=None is the default).
cosine_sim = cosine_similarity(X=tfidf_matrix, Y=None)
cosine_sim

array([[1.        , 0.4144056 , 0.26230236, ..., 0.05115634, 0.        ,
        0.        ],
       [0.4144056 , 1.        , 0.28344805, ..., 0.05528035, 0.        ,
        0.        ],
       [0.26230236, 0.28344805, 1.        , ..., 0.03499027, 0.        ,
        0.        ],
       ...,
       [0.05115634, 0.05528035, 0.03499027, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [34]:
# Label the similarity matrix with book titles on both the rows and the columns.
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    columns=book_data_filtered['bookTitle'],
    index=book_data_filtered['bookTitle'],
)
print('Shape:', cosine_sim_df.shape)

# Persist the labelled similarity matrix as CSV.
cosine_sim_df.to_csv('cosine.csv')

# Preview a random 10-row by 5-column slice of the similarity matrix.
(
    cosine_sim_df
    .sample(n=5, axis='columns')
    .sample(n=10, axis='index')
)

Shape: (1536, 1536)


bookTitle,The Memory of Old Jack,Tithe,The First Man in Rome,Job: A Comedy of Justice,Moll Flanders
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A Knight of the Word,0.032207,0.921502,0.033558,0.467551,0.039971
The Hero With a Thousand Faces,0.0,0.186262,0.0,0.178843,0.0
The Black Swan,0.031181,0.426378,0.032489,0.452646,0.038697
God-Shaped Hole,0.051959,0.0,0.054139,0.072072,0.064485
Close Range: Wyoming Stories,0.03367,0.0,0.035082,0.046703,0.041786
Practical Magic,0.044015,0.300943,0.045862,0.350009,0.054626
As I Lay Dying,0.052281,0.0,0.054474,0.072519,1.0
The Horse Whisperer,0.038238,0.0,0.039842,0.053039,0.047455
The Interpretation of Dreams,0.0,0.0,0.0,0.0,0.0
Pay It Forward,0.032851,0.0,0.034229,0.045567,0.04077


In [35]:
# Read the saved cosine-similarity matrix back from CSV, using the 'bookTitle'
# column as the row index.
data = pd.read_csv('cosine.csv', index_col='bookTitle')

# Preview the reloaded frame itself (not the in-memory cosine_sim_df) so this cell
# actually shows what came back from the CSV.
data.sample(5, axis=1).sample(10, axis=0)

bookTitle,The Pilgrim's Progress,The Tale of Peter Rabbit,The Happy Prince and Other Tales,The Virgin Suicides,Old Yeller
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mother of Pearl,0.030277,0.0,0.032372,0.038469,0.042017
What's Eating Gilbert Grape,0.313046,0.141526,0.334709,1.0,0.43443
Until You,0.0,0.0,0.0,0.308696,0.0
Flipped,0.05872,0.0,0.062783,0.074608,0.56124
Bury My Heart at Wounded Knee: An Indian History of the American West,0.0,0.0,0.0,0.0,0.0
The Prisoner of Zenda,0.257968,0.116626,0.27582,0.32777,0.357995
True History of the Kelly Gang,0.032382,0.0,0.034623,0.041144,0.044938
Divine Secrets of the Ya-Ya Sisterhood,0.036638,0.0,0.039173,0.421235,0.050844
"The Secret Diary of Adrian Mole, Aged 13 3/4",0.051482,0.0,0.055045,0.065413,0.492064
The Bear and the Dragon,0.029042,0.0,0.031052,0.0369,0.040303


In [36]:
# Content-based (genre) book recommendation function
def books_recommendations(bookTitle, similarity_data=None, k=10):
    """
    Recommend the books most similar to a given title.

    Parameters
    ----------
    bookTitle : str
        Title to look up; must be a column label of ``similarity_data``.
    similarity_data : pd.DataFrame, optional
        Similarity matrix with book titles as both index and columns, listed
        in the same order on both axes. When omitted, the notebook-level
        ``cosine_sim_df`` is looked up at call time, so re-running the cell
        that builds it is picked up without re-defining this function.
    k : int
        Maximum number of recommendations to return.

    Returns
    -------
    pd.DataFrame
        Single-column frame holding up to ``k`` titles ordered from most to
        least similar, with ``bookTitle`` itself excluded. Fewer than ``k``
        rows are returned when there are not enough other books.

    Raises
    ------
    KeyError
        If ``bookTitle`` is not a column of ``similarity_data``.
    ValueError
        If ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")

    if similarity_data is None:
        similarity_data = cosine_sim_df

    # Similarity of every book to the requested title.
    scores = similarity_data.loc[:, bookTitle]
    if isinstance(scores, pd.DataFrame):
        # The title labels more than one column; use the first matching column.
        scores = scores.iloc[:, 0]

    # Fully sort in descending order. A stable sort keeps tied books in their
    # original order, and sorting everything (instead of partitioning a fixed
    # number of positions) guarantees every returned position is correctly
    # ranked and also works when k exceeds the number of books.
    order = np.argsort(-scores.to_numpy(), kind='stable')
    closest = similarity_data.columns[order]

    # Remove the queried title so it is never recommended to itself.
    closest = closest.drop(bookTitle, errors='ignore')

    return pd.DataFrame(closest).head(k)

In [37]:
# Menentukan contoh buku yang digunakan untuk rekomendasi
book_data_filtered[book_data_filtered.bookTitle.eq('The Sight')]

Unnamed: 0,ISBN,bookTitle,bookAuthor,bookGenre1,bookGenre2,bookGenre3,genre
388,014250047X,The Sight,David Clement-Davies,Fantasy,YoungAdult,Animals,"Fantasy,YoungAdult,Animals"


In [38]:
# Menampilkan hasil rekomendasi buku dari buku yang ditentukan
books_recommendations('The Sight')

Unnamed: 0,bookTitle
0,Fire Bringer
1,Tailchaser's Song
2,The Horse Whisperer
3,The Loop
4,War Horse
5,Modoc: The True Story of the Greatest Elephant...
6,The Lord God Made Them All
7,Animal Liberation
8,Many Waters
9,Abhorsen
