In [42]:
import pandas as pd
import numpy as np

In [43]:
review_data = pd.read_csv("all_review_preprocessing.csv")

In [81]:
review_data.tail()

Unnamed: 0,Nickname,Review_Text,Book_Name,Label,Series_Name
15563,꿈이현실,주린이들이 꼭 읽어봐야 할 책 이네요. 오랜만에 주식관련 책을 재미있게 잘 읽었습니다.,세력주 투자 기술,1,세력주 투자 기술
15564,지미오양,주식 잘 모르겠으면 이것부터 읽어보시라,세력주 투자 기술,1,세력주 투자 기술
15565,플러스비,장난이 너무 심한거 아니요?,세력주 투자 기술,0,세력주 투자 기술
15566,ADHD 직장인의 일생다반사,투자에 공부에 대한 마음에 다시끔 불을 지펴준 책,세력주 투자 기술,1,세력주 투자 기술
15567,kosmes2112029,도움이 많이되는 실전 노하우를 알려주는 좋은책,세력주 투자 기술,1,세력주 투자 기술


In [85]:
book_data = pd.read_csv("all_book_preprocessing.csv")

In [86]:
book_data.tail()

Unnamed: 0,Book_Name,Book_Author,Book_Category,Completion_Percent,ReadTogether,ReviewCount,AudioBook,Series_Name
8625,150세에도 뛸 거야! 2편,유왕규,과학,0.0,1,0,X,150세에도 뛸 거야!
8626,다음 생엔 돌이 되고 싶다.,강다희,에세이,0.0,3,0,X,다음 생엔 돌이
8627,배터리 다이제스트 TOP9,선우 준,과학,0.0,5,0,X,배터리 다이제스트 TOP9
8628,배터리 다이제스트 TOP10,선우 준,과학,0.0,5,0,X,배터리 다이제스트 TOP10
8629,나로 돌아가는 여행,곽나원,에세이,0.0,4,0,X,나로 돌아가는 여행


## Content-based

In [87]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = CountVectorizer()

book_features = book_data[['Book_Name', 'Book_Author', 'Book_Category']].astype(str).agg(' '.join, axis=1)
count_matrix = vectorizer.fit_transform(book_features)

cosine_sim = cosine_similarity(count_matrix)

def get_recommendations_with_rounded_scores(book_name, cosine_sim=cosine_sim):
    idx = book_data.index[book_data['Book_Name'] == book_name].tolist()
    
    if not idx:
        return []

    idx = idx[0]

    sim_scores = list(enumerate(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:11]
    book_indices_scores = [(book_data['Book_Name'].iloc[i[0]], round(i[1], 2)) for i in sim_scores]

    return book_indices_scores

selected_book_name = "꿀벌과 천둥"
recommendations = get_recommendations_with_rounded_scores(selected_book_name)

recommendations


[('플로라', 0.4),
 ('평균의 종말', 0.4),
 ('초기업', 0.4),
 ('본심', 0.4),
 ('장내세균의 역습', 0.4),
 ('남자아이 대백과', 0.37),
 ('성차별주의는 전쟁을 불러온다', 0.37),
 ('안아주는 말들', 0.37),
 ('세계사를 바꾼  의 책', 0.37),
 ('홀로서기 심리학', 0.37)]

## SVD Collaborative Filtering Using Surprise Library

In [70]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split
import numpy as np
from surprise.model_selection import cross_validate

reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(review_data[['Nickname', 'Book_Name', 'Label']], reader)

trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

model = SVD()
model.fit(trainset)

predictions = model.test(testset)

In [71]:
cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.3198  0.3148  0.3172  0.3168  0.3129  0.3163  0.0023  
MAE (testset)     0.1960  0.1914  0.1930  0.1934  0.1900  0.1928  0.0020  
Fit time          0.24    0.23    0.23    0.24    0.23    0.23    0.01    
Test time         0.03    0.03    0.03    0.03    0.03    0.03    0.00    


{'test_rmse': array([0.3198256 , 0.31475151, 0.31723189, 0.31683047, 0.31293821]),
 'test_mae': array([0.1960279 , 0.19141342, 0.19300228, 0.19339393, 0.18999874]),
 'fit_time': (0.2416689395904541,
  0.2278900146484375,
  0.23209285736083984,
  0.24099302291870117,
  0.2297201156616211),
 'test_time': (0.025493144989013672,
  0.025828838348388672,
  0.02569413185119629,
  0.03238987922668457,
  0.02680206298828125)}

In [72]:
user = "무모한"  
bname = "꿀벌과 천둥" # Changing the book name which user read.

if user not in review_data['Nickname'].unique():
    print(f"'{user}' 사용자는 리뷰 데이터에 없습니다.")
elif bname not in review_data['Book_Name'].unique():
    print(f"'{bname}' 책은 리뷰 데이터에 없습니다.")
else:
    pred = model.predict(user, bname, verbose=True)

user: 무모한        item: 꿀벌과 천둥     r_ui = None   est = 0.95   {'was_impossible': False}


In [73]:
# The book which the user already read book
read_books = review_data[review_data['Nickname'] == user]['Book_Name'].unique()

# The book which the user does not read book
not_read_books = [bname for bname in review_data['Book_Name'].unique() if bname not in read_books]

predictions = [model.predict(user, bname) for bname in not_read_books]

top_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)[:10]

for i, pred in enumerate(top_predictions, 1):
    print(f"Book {i}: {pred.iid}, Pred Score: {pred.est}")


Book 1: 남자는 어떻게 일어서는가, Pred Score: 1
Book 2: 읽는 인간, 리터러시를 경험하라, Pred Score: 1
Book 3: 역행자 : 확장판, Pred Score: 1
Book 4: 날이 좋아요, 차를 마셔요, Pred Score: 1
Book 5: 나의 이상하고 평범한 부동산 가족, Pred Score: 1
Book 6: 엘리멘탈 그래픽노블(한글), Pred Score: 1
Book 7: 나는 왜 자꾸 내 탓을 할까, Pred Score: 1
Book 8: 기분의 디자인, Pred Score: 1
Book 9: 이코노미 조선 509호 : 2023.09.20, Pred Score: 1
Book 10: 신비 섬 제주 유산, Pred Score: 1


In [74]:
high_pred_books = [pred.iid for pred in predictions if pred.est == 1]

top_books_with_favor = book_data[book_data['Book_Name'].isin(high_pred_books)]
top_books_with_favor = top_books_with_favor.sort_values(by='ReadTogether', ascending=False)

top_books_with_favor = top_books_with_favor.head(10)

for i, row in top_books_with_favor.iterrows():
    print(f"Book {i+1}: {row['Book_Name']}, Favor Count: {row['ReadTogether']}")


Book 3669: 적막한 폭발, Favor Count: 968
Book 6086: 트러블 사전, Favor Count: 965
Book 2861: 아레나옴므플러스 Arena Homme+ 2023년 11월호, Favor Count: 934
Book 8163: 나는 정신장애 아들을 둔 아버지입니다, Favor Count: 929
Book 7492: 쫓기지 않는 50대를 사는 법, Favor Count: 908
Book 18: 66일 인문학 대화법, Favor Count: 904
Book 1761: 김씨네과일, Favor Count: 897
Book 2012: 이코노미 조선 509호 : 2023.09.20, Favor Count: 856
Book 8155: 문장의 시대, 시대의 문장, Favor Count: 845
Book 8137: 컬러는 나를 알고 있다, Favor Count: 827


## KNN Surprise

In [77]:
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import KNNBasic
from surprise import accuracy

reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(review_data[['Nickname', 'Book_Name', 'Label']], reader)

# Using KNNBasic
algo = KNNBasic()

algo.fit(trainset)

predictions = algo.test(testset)

rmse = accuracy.rmse(predictions)

mae = accuracy.mae(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.3356
MAE:  0.2072


In [78]:
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.3283  0.3445  0.3362  0.3351  0.3395  0.3367  0.0053  
MAE (testset)     0.2011  0.2096  0.2067  0.2068  0.2078  0.2064  0.0029  
Fit time          0.55    0.66    0.55    0.50    0.49    0.55    0.06    
Test time         0.18    0.16    0.17    0.17    0.16    0.17    0.01    


{'test_rmse': array([0.32831255, 0.34454528, 0.33623039, 0.33505279, 0.3394961 ]),
 'test_mae': array([0.20106116, 0.20957581, 0.20669201, 0.20675004, 0.20777613]),
 'fit_time': (0.5548522472381592,
  0.6569430828094482,
  0.5454328060150146,
  0.497769832611084,
  0.48821306228637695),
 'test_time': (0.1753687858581543,
  0.16428709030151367,
  0.17379093170166016,
  0.16584110260009766,
  0.157944917678833)}

In [84]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

user_book_matrix = review_data.pivot_table(index='Nickname', columns='Book_Name', values='Label')

user_book_matrix = user_book_matrix.fillna(0)

model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5, n_jobs=-1)

model_knn.fit(user_book_matrix)

def get_user_recommendations(user_id, user_book_matrix, model_knn, n_recommendations=5):
    user_idx = list(user_book_matrix.index).index(user_id)

    user_data = user_book_matrix.iloc[user_idx].values.reshape(1, -1)

    distances, indices = model_knn.kneighbors(user_data, n_neighbors=n_recommendations+1)

    similar_users_book_ratings = user_book_matrix.iloc[indices.flatten()[1:]]
    avg_ratings = similar_users_book_ratings.mean(axis=0)
    
    already_read = user_book_matrix.loc[user_id]
    avg_ratings = avg_ratings[already_read.isna() | (already_read == 0)]

    recommended_books = avg_ratings.sort_values(ascending=False).head(n_recommendations).index.tolist()

    return recommended_books

user_id = '무모한' 
recommended_books = get_user_recommendations(user_id, user_book_matrix, model_knn, n_recommendations=10)

print("추천된 책 목록:", recommended_books)


추천된 책 목록: ['사라진 여자들', '꿀벌의 예언 1', '나는 미니멀 유목민 입니다', '상식이 결여된 카페', '내 몸의 설계자, 호르몬 이야기', '본심', "'시'가 머물러 있는 그곳에 글자락이 메아리 치다", '용의 만화경', '우리 MBTI가 같네요!', '우등생논술 2023년 7월호']




## Combining Collaborative + Content-based

In [89]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Dataset, Reader, SVD

# Content-Based Filtering for Books
book_data['Content'] = book_data[['Book_Name', 'Book_Author', 'Book_Category']].apply(
    lambda x: ' '.join(x.dropna().astype(str)), axis=1)
count_vectorizer = CountVectorizer()
book_content_matrix = count_vectorizer.fit_transform(book_data['Content'])
book_content_similarity = linear_kernel(book_content_matrix, book_content_matrix)

def get_book_content_based_recommendations(book_name, top_n):
    index = book_data[book_data['Book_Name'] == book_name].index[0]
    similarity_scores = book_content_similarity[index]
    similar_indices = similarity_scores.argsort()[::-1][1:top_n + 1]
    recommendations = book_data.iloc[similar_indices]['Book_Name'].values
    return recommendations

# Collaborative Filtering for Books
reader = Reader(rating_scale=(review_data['Label'].min(), review_data['Label'].max()))
data = Dataset.load_from_df(review_data[['Nickname', 'Book_Name', 'Label']], reader)
algo = SVD()
trainset2 = data.build_full_trainset()
algo.fit(trainset2)

def get_book_collaborative_filtering_recommendations(nickname, top_n):
    testset = trainset.build_anti_testset()
    testset = filter(lambda x: x[0] == nickname, testset)
    predictions = algo.test(testset)
    predictions.sort(key=lambda x: x.est, reverse=True)
    recommendations = [review_data[review_data['Book_Name'] == prediction.iid]['Book_Name'].iloc[0] for prediction in predictions[:top_n]]
    return recommendations

# Hybrid Recommendation Function
def get_book_hybrid_recommendations(nickname, book_name, top_n):
    content_based_recommendations = get_book_content_based_recommendations(book_name, top_n)
    collaborative_filtering_recommendations = get_book_collaborative_filtering_recommendations(nickname, top_n)
    hybrid_recommendations = list(set(content_based_recommendations + collaborative_filtering_recommendations))
    return hybrid_recommendations[:top_n]

In [90]:
nickname = '무모한'
book_name = '꿀벌과 천둥' # Chaning book name which user read.
top_n = 10
recommendations = get_book_hybrid_recommendations(nickname, book_name, top_n)
recommendations

['성차별주의는 전쟁을 불러온다기계가 언어를 이해하는 방법 (자연어 처리 기술 이해)',
 '처음 읽는 여성 철학사감정은 습관이다',
 '디즈니의 악당들 1 : 사악한 여왕제노사이드',
 '당신은 어떤 가면을 쓰고 있나요연남동 빙굴빙굴 빨래방',
 '플로라인생에서 8가지 일에만 집중하라',
 '틀을 깨는 사고력나는 왜 자꾸 내 탓을 할까',
 '홀로서기 심리학서평 쉽게 쓰는 법',
 '남자아이 대백과나는 왜 저 인간이 싫을까?',
 '잉글사이드의 릴라비스트로 쿠킹 앳 홈',
 '처음 읽는 클래식 음악의 역사나에겐 상처받을 이유가 없다']

## Using Fastai Library wtih K-Fold

#### Not using K-Fold

In [57]:
# pip install fastai

# import pandas as pd
# from sklearn.model_selection import train_test_split
# from fastai.collab import CollabDataLoaders, collab_learner
# from sklearn.metrics import mean_squared_error, mean_absolute_error
# import numpy as np

# data = review_data[['Nickname', 'Book_Name', 'Label']]

# trainset_2, testset_2 = train_test_split(data, test_size=0.2, random_state=42)

# dls = CollabDataLoaders.from_df(trainset_2, user_name='Nickname', item_name='Book_Name', rating_name='Label', bs=64)

# learn = collab_learner(dls, n_factors=100, y_range=(0, 1), wd=0.1, use_nn=True, layers=[100])
# # learn = collab_learner(dls, n_factors=100, y_range=(0, 1), wd=0.1, use_nn=True, layers=[100, 50, 10], ps=[0.5, 0.5, 0.1])

# learn.fit_one_cycle(5, 1e-3)

# test_dl = dls.test_dl(testset_2)
# predictions, _ = learn.get_preds(dl=test_dl)

# mae = mean_absolute_error(testset_2['Label'], predictions)
# rmse = np.sqrt(mean_squared_error(testset_2['Label'], predictions))

# mae = f"{mae:.3f}"
# rmse = f"{rmse:.3f}"

# mae, rmse

epoch,train_loss,valid_loss,time
0,0.211432,0.193101,00:02
1,0.095301,0.129399,00:01
2,0.045149,0.110691,00:02
3,0.020498,0.106174,00:02
4,0.011928,0.105702,00:01


('0.182', '0.342')

#### Using K-Fold

In [80]:
# pip install fastai

from sklearn.model_selection import KFold
from fastai.collab import CollabDataLoaders, collab_learner
from sklearn.metrics import mean_squared_error, mean_absolute_error

data = review_data[['Nickname', 'Book_Name', 'Label']]

kf = KFold(n_splits=5) 
maes, rmses = [], []

for train_idx, test_idx in kf.split(data):
    train_data, test_data = data.iloc[train_idx], data.iloc[test_idx]

    dls = CollabDataLoaders.from_df(train_data, user_name='Nickname', item_name='Book_Name', rating_name='Label', bs=64)

    learn = collab_learner(dls, n_factors=100, y_range=(0, 1), wd=0.1, use_nn=True, layers=[100])
    learn.fit_one_cycle(5, 1e-3)

    test_dl = dls.test_dl(test_data)
    predictions, _ = learn.get_preds(dl=test_dl)

    mae = mean_absolute_error(test_data['Label'], predictions)
    rmse = np.sqrt(mean_squared_error(test_data['Label'], predictions))

    maes.append(mae)
    rmses.append(rmse)

avg_mae = np.mean(maes)
avg_rmse = np.mean(rmses)

print(f"Average MAE: {avg_mae:.3f}, Average RMSE: {avg_rmse:.3f}")


epoch,train_loss,valid_loss,time
0,0.203344,0.178841,00:02
1,0.091981,0.126135,00:01
2,0.045918,0.116681,00:01
3,0.027264,0.112135,00:01
4,0.016302,0.112058,00:01


epoch,train_loss,valid_loss,time
0,0.220539,0.202265,00:01
1,0.092605,0.119465,00:01
2,0.042931,0.10689,00:01
3,0.021425,0.101691,00:01
4,0.009825,0.101432,00:01


epoch,train_loss,valid_loss,time
0,0.220723,0.196333,00:01
1,0.095646,0.133659,00:01
2,0.051522,0.129558,00:01
3,0.028184,0.126793,00:01
4,0.016396,0.128135,00:01


epoch,train_loss,valid_loss,time
0,0.223586,0.195411,00:01
1,0.096942,0.1331,00:01
2,0.051057,0.113242,00:01
3,0.029381,0.114903,00:01
4,0.019267,0.111545,00:01


epoch,train_loss,valid_loss,time
0,0.220741,0.198629,00:01
1,0.102513,0.151262,00:01
2,0.049985,0.129097,00:01
3,0.027234,0.126378,00:01
4,0.016205,0.127589,00:01


Average MAE: 0.199, Average RMSE: 0.346
