In [119]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, sigmoid_kernel
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans

<p>Pada proses modelling, saya memakai pendekatan clustering untuk mengelompokkan data ke dalam beberapa kelompok kecil.<p>

In [120]:
items = pd.read_csv('items_final.csv')

In [121]:
items

Unnamed: 0.1,Unnamed: 0,itemID,title,author,publisher,language,general_topic,sub_topic
0,0,21310,Princess Poppy: The Big Mix Up,Janey Louise Jones,Penguin Random House Children's UK,english,children,childrenfiction
1,1,73018,Einfach zeichnen! Step by Step,Wiebke Krabbe,Schwager und Steinlein,german,arts,artstreatment
2,2,19194,Red Queen 1,Victoria Aveyard,Orion Publishing Group,english,children,childrenfiction
3,3,40250,Meine Kindergarten-Freunde (Pirat),otherauthor,Ars Edition GmbH,german,children,childrenpicture
4,4,46107,Mein großes Schablonen-Buch - Wilde Tiere,Elizabeth Golding,Edition Michael Fischer,german,lifestyle,lifestyledecoration
...,...,...,...,...,...,...,...,...
77552,77820,37678,Timeless Fairy Tales,"Brothers Grimm, Marie-Catherine Baroness D'Aulnoy",MEDIAMORPHOSIS,english,children,childrenfiction
77553,77821,68688,Demon Games,A. Witt Timothy a. Witt,iUniverse,german,fiction,fictionfantasy
77554,77822,57291,Lori and the Lion's Den,A. M. Glass,Xlibris,english,children,childrenfiction
77555,77823,78130,The Everywhere Armchair,Ersila Bee,ELOQUENT BOOKS,english,children,childrenfiction


<p>Pada tahap ini saya menentukan kalau saya akan membuat model sistem rekomendasi buku menggunakan pendekatan clustering dan berdasarkan kemiripan deskripsi bukunya seperti author, language, general_topic, dan sub_topic. Oleh karena itu saya tidak membutuhkan kolom title dan publisher.<p>

In [122]:
items.drop(['Unnamed: 0'], axis = 1, inplace=True)
items.drop(['title'], axis = 1, inplace=True)
items.drop(['publisher'], axis = 1, inplace=True)

<p>Tahap selanjutnya saya akan menggabungkan kolom author, language, general_topic, dan sub_topic menjadi satu kolom baru.<p>

In [123]:
def combine_column(data):
    column = []
    for i in range(0, data.shape[0]):
        column.append( data['author'][i] + ' ' + data['language'][i] + ' ' + data['general_topic'][i] + ' ' + data['sub_topic'][i] )
    
    return column

In [124]:
items['combined_columns'] = combine_column(items)

In [125]:
items

Unnamed: 0,itemID,author,language,general_topic,sub_topic,combined_columns
0,21310,Janey Louise Jones,english,children,childrenfiction,Janey Louise Jones english children childrenfi...
1,73018,Wiebke Krabbe,german,arts,artstreatment,Wiebke Krabbe german arts artstreatment
2,19194,Victoria Aveyard,english,children,childrenfiction,Victoria Aveyard english children childrenfiction
3,40250,otherauthor,german,children,childrenpicture,otherauthor german children childrenpicture
4,46107,Elizabeth Golding,german,lifestyle,lifestyledecoration,Elizabeth Golding german lifestyle lifestylede...
...,...,...,...,...,...,...
77552,37678,"Brothers Grimm, Marie-Catherine Baroness D'Aulnoy",english,children,childrenfiction,"Brothers Grimm, Marie-Catherine Baroness D'Aul..."
77553,68688,A. Witt Timothy a. Witt,german,fiction,fictionfantasy,A. Witt Timothy a. Witt german fiction fiction...
77554,57291,A. M. Glass,english,children,childrenfiction,A. M. Glass english children childrenfiction
77555,78130,Ersila Bee,english,children,childrenfiction,Ersila Bee english children childrenfiction


<p>Agar mesin dapat membaca natural language pada data yang ada di kolom baru, data perlu dikonversi menggunakan vectorizer agar dapat dilakukan proses clustering.<p>

In [126]:
data = items['combined_columns'].values.astype('U')

In [127]:
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(data)

<p> Pada tahap ini saya memakai jumlah cluster sebanyak 20. Sebenarnya semakin banyak jumlah cluster maka akan lebih mudah mencari kemiripan antar buku. Namun karena keterbatasan hardware, saya hanya akan menggunakan 20 cluster<p>

In [128]:
k = 20
model = KMeans(n_clusters=k)
model.fit(tfidf)

KMeans(n_clusters=20)

In [129]:
items['cluster'] = model.labels_
items.head()

Unnamed: 0,itemID,author,language,general_topic,sub_topic,combined_columns,cluster
0,21310,Janey Louise Jones,english,children,childrenfiction,Janey Louise Jones english children childrenfi...,14
1,73018,Wiebke Krabbe,german,arts,artstreatment,Wiebke Krabbe german arts artstreatment,14
2,19194,Victoria Aveyard,english,children,childrenfiction,Victoria Aveyard english children childrenfiction,14
3,40250,otherauthor,german,children,childrenpicture,otherauthor german children childrenpicture,8
4,46107,Elizabeth Golding,german,lifestyle,lifestyledecoration,Elizabeth Golding german lifestyle lifestylede...,18


<p>Simpan hasil clustering ke dalam file csv.<p>

In [131]:
items.to_csv('items_clustered.csv')

In [132]:
items = pd.read_csv('items_clustered.csv')
items.drop('Unnamed: 0', 1, inplace=True)

  items.drop('Unnamed: 0', 1, inplace=True)


In [133]:
items

Unnamed: 0,itemID,author,language,general_topic,sub_topic,combined_columns,cluster
0,21310,Janey Louise Jones,english,children,childrenfiction,Janey Louise Jones english children childrenfi...,14
1,73018,Wiebke Krabbe,german,arts,artstreatment,Wiebke Krabbe german arts artstreatment,14
2,19194,Victoria Aveyard,english,children,childrenfiction,Victoria Aveyard english children childrenfiction,14
3,40250,otherauthor,german,children,childrenpicture,otherauthor german children childrenpicture,8
4,46107,Elizabeth Golding,german,lifestyle,lifestyledecoration,Elizabeth Golding german lifestyle lifestylede...,18
...,...,...,...,...,...,...,...
77552,37678,"Brothers Grimm, Marie-Catherine Baroness D'Aulnoy",english,children,childrenfiction,"Brothers Grimm, Marie-Catherine Baroness D'Aul...",14
77553,68688,A. Witt Timothy a. Witt,german,fiction,fictionfantasy,A. Witt Timothy a. Witt german fiction fiction...,15
77554,57291,A. M. Glass,english,children,childrenfiction,A. M. Glass english children childrenfiction,14
77555,78130,Ersila Bee,english,children,childrenfiction,Ersila Bee english children childrenfiction,14


<p>Fungsi berikut merupakan fungsi untuk mencari 5 buku paling mirip dalam satu cluster untuk dijadikan sebagai buku rekomendasi.<p>

In [134]:
def give_rec(ID, sig, tf):
    idx = indices[ID]
    sig_scores = list(enumerate(sig[idx]))
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)
    sig_scores = sig_scores[1:6]
    book_indices = [i[0] for i in sig_scores]
    return tf['itemID'].iloc[book_indices]

In [135]:
evaluation = pd.read_csv('evaluation.csv')

In [140]:
i = 0
j = 0
rec1 = []
rec2 = []
rec3 = []
rec4 = []
rec5 = []
vectorizer = TfidfVectorizer()

while i < len(evaluation):
    tf = items.loc[items['itemID'] == evaluation['itemID'][i]]
    # Jika itemID pada evaluation.csv tidak ditemukan pada items.csv, maka cari 5 buku rekomendasi berdasarkan hasil clustering dan similaritynya
    if tf.empty:
        rec1.append(items['itemID'].sample().tolist()[0])
        rec2.append(items['itemID'].sample().tolist()[0])
        rec3.append(items['itemID'].sample().tolist()[0])
        rec4.append(items['itemID'].sample().tolist()[0])
        rec5.append(items['itemID'].sample().tolist()[0])
    # Jika itemID pada evaluation.csv ditemukan pada items.csv, maka 5 buku rekomendasi merupakan buku random
    else:
        cluster_number = tf['cluster'].tolist()[0]
        df = None
        df = items.loc[items['cluster'] == cluster_number]
        df.reset_index(drop=True, inplace=True)
        features = vectorizer.fit_transform(df['combined_columns'])
        features = features.astype('float32')
        sig = None
        sig = sigmoid_kernel(features, features)
        indices = pd.Series(df.index, index=df['itemID']).drop_duplicates()
        rec_list = give_rec(evaluation['itemID'][i], sig, df).tolist()
        rec1.append(rec_list[0])
        rec2.append(rec_list[1])
        rec3.append(rec_list[2])
        rec4.append(rec_list[3])
        rec5.append(rec_list[4])
    print('iteration',i+1)
    i = i + 1


iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration 77
iteratio

In [146]:
evaluation['book_recommendation_1'] = rec1
evaluation['book_recommendation_2'] = rec2
evaluation['book_recommendation_3'] = rec3
evaluation['book_recommendation_4'] = rec4
evaluation['book_recommendation_5'] = rec5

<p>save evaluation terbaru yang sudah ada book recommendationnya ke dalam file csv<p>

In [148]:
evaluation.head()

Unnamed: 0,itemID,book_recommendation_1,book_recommendation_2,book_recommendation_3,book_recommendation_4,book_recommendation_5
0,12,32729,2583,10048,41535,69791
1,45274,45274,6210,1746,15116,18662
2,10104,10143,22900,41320,47055,57759
3,41371,21540,49279,13702,38622,28401
4,14015,47952,19563,14015,40294,22184


In [149]:
evaluation.to_csv('book_recommendation.csv')