In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

### Đọc tập dữ liệu

In [22]:
#Khởi tạo đường dẫn 

books_filename = "BX-Books.csv"

ratings_filename = "BX-Book-Ratings.csv"

users_filename = "BX-Users.csv"

In [24]:
#Đọc tập dữ liệu, xử lý sang dạng DataFrame

df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author','year'],
    usecols=['isbn', 'title', 'author','year'],
    dtype=str)

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})


In [25]:
# In tập df_books
print(df_books)
print(df_ratings)

              isbn                                              title  \
0       0195153448                                Classical Mythology   
1       0002005018                                       Clara Callan   
2       0060973129                               Decision in Normandy   
3       0374157065  Flu: The Story of the Great Influenza Pandemic...   
4       0393045218                             The Mummies of Urumchi   
...            ...                                                ...   
271374  0440400988                         There's a Bat in Bunk Five   
271375  0525447644                            From One to One Hundred   
271376  006008667X  Lily Dale : The True Story of the Town that Ta...   
271377  0192126040                        Republic (World's Classics)   
271378  0767409752  A Guided Tour of Rene Descartes' Meditations o...   

                      author  year  
0         Mark P. O. Morford  2002  
1       Richard Bruce Wright  2001  
2           

### Tiền xử lý dữ liệu

In [31]:
# Loại bỏ giá trị không hợp lệ trong cột 'year'

invalid_years = df_books[~df_books['year'].str.isnumeric()]

print("Giá trị không hợp lệ trong cột 'year':")

print(invalid_years)

Giá trị không hợp lệ trong cột 'year':
              isbn                                              title  \
6450    0735201994  Peterman Rides Again: Adventures Continue with...   
43665   0330482750     Three Novels: \A Strange and Sublime Address\"   
51749   0413326608  Upon my word!: More stories from \My word!\" a...   
92036   0440500702  Schrodinger's Cat Trilogy : \The Universe Next...   
104317  0373166982           Please Say \I Do\"  (Three Weddings &amp   
121766  0894805959  The Best of the Journal of Irreproducible Resu...   
144056  8423920143  GuÃ­a del lector del \Quijote\": Ensayo psicol...   
150787  034050823X       The Double Detective: \The Blank Page\" &amp   
157126  039482492X  C is for Clown: A Circus of \C\" Words, (Brigh...   
180187  0553570722         \R\" for Revenge: Written by Kate William    
185736  096401811X  Solid as a rock \I\" stand: Inspirational poet...   
209386  085409878X                                      \Pie-powder\"   
209550  0789

In [33]:
df_books['year'] = pd.to_numeric(df_books['year'], errors='coerce')  # Chuyển đổi year từ kiểu str sang numeric, coerce - nếu không chuyển được gán NaN

df_books = df_books.dropna(subset=['year']).astype({'year': 'int32'}) # Loại bỏ giá trị NaN trong year

df_books
print("Số lượng giá trị duy nhất trong cột 'Book-Title':", df_books['title'].nunique())
print("Số lượng giá trị duy nhất trong cột 'Book-Author':", df_books['author'].nunique())

Số lượng giá trị duy nhất trong cột 'Book-Title': 242132
Số lượng giá trị duy nhất trong cột 'Book-Author': 102020


In [35]:
df_books

Unnamed: 0,isbn,title,author,year
0,0195153448,Classical Mythology,Mark P. O. Morford,2002
1,0002005018,Clara Callan,Richard Bruce Wright,2001
2,0060973129,Decision in Normandy,Carlo D'Este,1991
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999
...,...,...,...,...
271374,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988
271375,0525447644,From One to One Hundred,Teri Sloat,1991
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004
271377,0192126040,Republic (World's Classics),Plato,1996


In [37]:
# Kiểm tra dữ liệu
chech_nan = df_books.isna().any().any()
if chech_nan == True:
    print("Dữ liệu bị thiếu")
else:
    print("Dữ liệu đầy đủ")

Dữ liệu bị thiếu


In [45]:
df_books.dropna(inplace=True)
df_books

Unnamed: 0,isbn,title,author,year
0,0195153448,Classical Mythology,Mark P. O. Morford,2002
1,0002005018,Clara Callan,Richard Bruce Wright,2001
2,0060973129,Decision in Normandy,Carlo D'Este,1991
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999
...,...,...,...,...
271374,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988
271375,0525447644,From One to One Hundred,Teri Sloat,1991
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004
271377,0192126040,Republic (World's Classics),Plato,1996


In [43]:
df_ratings

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0
...,...,...,...
1149775,276704,1563526298,9.0
1149776,276706,0679447156,0.0
1149777,276709,0515107662,10.0
1149778,276721,0590442449,10.0


In [49]:
df_merge = pd.merge(df_books,df_ratings,on = "isbn",how = "inner")
df_merge

Unnamed: 0,isbn,title,author,year,user,rating
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,2,0.0
1,0002005018,Clara Callan,Richard Bruce Wright,2001,8,5.0
2,0002005018,Clara Callan,Richard Bruce Wright,2001,11400,0.0
3,0002005018,Clara Callan,Richard Bruce Wright,2001,11676,8.0
4,0002005018,Clara Callan,Richard Bruce Wright,2001,41385,0.0
...,...,...,...,...,...,...
1031125,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,276463,7.0
1031126,0525447644,From One to One Hundred,Teri Sloat,1991,276579,4.0
1031127,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,276680,0.0
1031128,0192126040,Republic (World's Classics),Plato,1996,276680,0.0


In [51]:
chech_nan = df_merge.isna().any().any()
if chech_nan == True:
    print("Dữ liệu bị thiếu")
else:
    print("Dữ liệu đầy đủ")

Dữ liệu đầy đủ


In [77]:
users=df_merge["user"].value_counts()
isbn= df_merge["isbn"].value_counts()

users=users[users>=200].index #Giữ lại các user có số lần xuất hiện từ 200 trở lên.
isbn=isbn[isbn>=100].index #Giữ lại các isbn (mã sách) xuất hiện từ 100 lần trở lên.

df_merge_new=df_merge.loc[(df_merge["user"].isin(users.values))&(df_merge["isbn"].isin(isbn.values))]
df_merge_new

Unnamed: 0,isbn,title,author,year,user,rating
103,0440234743,The Testament,John Grisham,1999,2977,0.0
105,0440234743,The Testament,John Grisham,1999,3363,0.0
108,0440234743,The Testament,John Grisham,1999,7346,9.0
114,0440234743,The Testament,John Grisham,1999,11676,9.0
117,0440234743,The Testament,John Grisham,1999,13552,8.0
...,...,...,...,...,...,...
505459,0515135739,Eleventh Hour: An FBI Thriller (FBI Thriller (...,Catherine Coulter,2004,236283,0.0
505464,0515135739,Eleventh Hour: An FBI Thriller (FBI Thriller (...,Catherine Coulter,2004,251613,0.0
505465,0515135739,Eleventh Hour: An FBI Thriller (FBI Thriller (...,Catherine Coulter,2004,252071,0.0
505468,0515135739,Eleventh Hour: An FBI Thriller (FBI Thriller (...,Catherine Coulter,2004,256407,0.0


In [81]:
df_merge_newdrop=df_merge_new.drop_duplicates(['title','user']) # Chỉ giữ lại bản ghi đầu tiên trong các bản ghi trùng lặp.
print("Ma trận merge: ")
print(df_merge_newdrop)

print("Danh sách title: ")
listcl = pd.unique(df_merge_newdrop['title'].values.tolist()) # Tạo danh sách các tiêu đề sách duy nhất
for i in range(len(listcl)):
    print("Sách {}:{}".format(i,listcl[i]))

Ma trận merge: 
              isbn                                              title  \
103     0440234743                                      The Testament   
105     0440234743                                      The Testament   
108     0440234743                                      The Testament   
114     0440234743                                      The Testament   
117     0440234743                                      The Testament   
...            ...                                                ...   
505459  0515135739  Eleventh Hour: An FBI Thriller (FBI Thriller (...   
505464  0515135739  Eleventh Hour: An FBI Thriller (FBI Thriller (...   
505465  0515135739  Eleventh Hour: An FBI Thriller (FBI Thriller (...   
505468  0515135739  Eleventh Hour: An FBI Thriller (FBI Thriller (...   
505469  0515135739  Eleventh Hour: An FBI Thriller (FBI Thriller (...   

                   author  year    user  rating  
103          John Grisham  1999    2977     0.0  
105    

  listcl = pd.unique(df_merge_newdrop['title'].values.tolist()) # Tạo danh sách các tiêu đề sách duy nhất


In [63]:
df_pivot = df_merge_newdrop.pivot_table(index='title',columns='user',values='rating',fill_value=0)
df_pivot

user,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Without Remorse,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Sử dụng thuật toán KNN

In [65]:
df_matrix = csr_matrix(df_pivot.values)

nn = NearestNeighbors(metric='cosine')

nn.fit(df_matrix)

### Xây dựng hàm gợi ý

In [73]:
def get_recommends(book = ""):
    recommended_books = [book,[]]

    distance, book_info = nn.kneighbors([df_pivot.loc[book]], 6, return_distance=True)
    
    recom_book_info = df_pivot.iloc[np.flip(book_info[0])[:-1]].index.to_list()
    recom_distance = list(np.flip(distance[0])[:-1])

    for r in zip(recom_book_info,recom_distance):
        recommended_books[1].append(list(r))
        
    return recommended_books


### Kết quả 

In [69]:
get_recommends("Breath, Eyes, Memory")

['Breath, Eyes, Memory',
 [['River, Cross My Heart', 0.78799295],
  ['We Were the Mulvaneys', 0.76867867],
  ['Jewel', 0.76008],
  ["Song of Solomon (Oprah's Book Club (Paperback))", 0.7443067],
  ['Drowning Ruth', 0.68435717]]]