In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### 데이터 전처리
- kaggle dataset 

In [3]:
book_df=pd.read_csv('./data/book_review_data.csv')

In [4]:
# 1) 필요한 컬럼 도출
# 2) 형변환 (float > int)

book_df=book_df[['user_id',"age","rating","book_title",'book_author','year_of_publication','img_m','Summary','Category','country','Language']]
book_df[['age',"year_of_publication"]]=book_df[['age',"year_of_publication"]].astype(int)

In [5]:
# 최소 평가 개수 ( 평가가 30개보다 많은 책만 추출 )
counts=book_df['book_title'].value_counts()
title = counts[counts > 30].index
book_filter_df=book_df[book_df['book_title'].isin(title)]


In [6]:
# 최소 평가 개수 ( 평가가 20개보다 많은 사용자만 추출 )
counts=book_filter_df["user_id"].value_counts()
user_id_=counts[counts>10].index
book_filter_df = book_filter_df[book_filter_df['user_id'].isin(user_id_)]

In [7]:
# 무의미한 0점 평가 제거
rating_zero=book_filter_df[book_filter_df['rating']==0].index
book_filter_df.drop(rating_zero, axis=0, inplace=True)

In [8]:
# 한 카테고리안에 책이 5권 미만인 경우 제거
counts=book_filter_df["Category"].value_counts()
categories = counts[counts > 5].index
book_filter_df = book_filter_df[book_filter_df['Category'].isin(categories)]

idx=book_filter_df[book_filter_df["Summary"]=='9'].index
book_filter_df.drop(idx,axis=0,inplace=True)

book_filter_df["book_title"]=book_filter_df['book_title'].str.lower()
book_filter_df["Summary"]=book_filter_df["Summary"].str.lower()

In [9]:
#book_filter_df.shape
len(book_filter_df['book_title'].unique())

3450

#### user_book_rating dataframe

In [10]:
user_book_df=book_filter_df.pivot_table(index='user_id',columns='book_title',values='rating',fill_value=0)
book_user_df=user_book_df.T
user_book_df.shape # user간 유사도 구할거 아니니까

(4836, 3450)

In [11]:
user_book_df.iloc[55].sort_values(ascending=False)[:10]

book_title
thinner                                                                     8.0
christine                                                                   8.0
stupid white men ...and other sorry excuses for the state of the nation!    6.0
cujo                                                                        6.0
'salem's lot                                                                0.0
sweet hereafter movie tie-in : a novel                                      0.0
sunset in st. tropez                                                        0.0
superfudge (yearling books (paperback))                                     0.0
superstitious                                                               0.0
surfacing                                                                   0.0
Name: 4334, dtype: float64

#### age_title_count dataframe

In [12]:
book_filter_df["age"].value_counts()

age
34    16829
33     1882
29     1708
28     1689
32     1585
      ...  
76        6
11        5
84        5
10        3
97        2
Name: count, Length: 78, dtype: int64

In [13]:
def make_ages(age):
    if 10<=age<20:
        return 10
    elif 20<=age<30:
        return 20
    elif 30<=age<40:
        return 30
    elif 40<=age<50:
        return 40
    elif 50<=age<60:
        return 50
    elif 60<=age<70:
        return 60
    else:
        return 70

In [14]:
book_filter_df['ages'] = book_filter_df['age'].apply(make_ages)

In [15]:
ages_book_=book_filter_df.groupby(['ages','book_title']).size().reset_index(name='count')

In [16]:
ages_book_df=ages_book_.pivot_table(index='ages',columns='book_title',values='count',fill_value=0)
book_ages_df=ages_book_df.T
book_ages_df[:100]

ages,10,20,30,40,50,60,70
book_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
'salem's lot,0.0,0.0,4.0,1.0,0.0,0.0,0.0
10 lb. penalty,0.0,1.0,5.0,4.0,1.0,1.0,1.0
101 dalmatians,0.0,2.0,0.0,1.0,0.0,0.0,0.0
16 lighthouse road,1.0,1.0,6.0,3.0,2.0,0.0,0.0
1984,3.0,26.0,18.0,8.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
a stitch in time (needlecraft mysteries),0.0,1.0,4.0,1.0,3.0,0.0,0.0
a streetcar named desire,1.0,7.0,3.0,0.0,1.0,0.0,0.0
a sudden change of heart,0.0,1.0,5.0,1.0,0.0,0.0,0.0
a suitable vengeance,0.0,0.0,4.0,1.0,2.0,0.0,0.0


In [17]:
from sklearn.metrics.pairwise import cosine_similarity
ages_similarity=cosine_similarity(book_ages_df,book_ages_df)
ages_similarity_df=pd.DataFrame(ages_similarity,index=ages_book_df.columns, columns=ages_book_df.columns)

In [18]:
ages_similarity_df.shape

(3450, 3450)

#### user별 안 읽은 책 추천

In [19]:
def get_unseen_books(user_idx):
    user_read_df=user_book_df.iloc[user_idx,:]
    return user_read_df[user_read_df==0].index

In [20]:
def ages_popular_book(user_idx):
    user_ages=book_filter_df.iloc[user_idx]["ages"]
ages_popular_book(176)

# 작가 평점 추천 전처리

In [21]:
# TODO
'''
bm_df 이름을 바꾸고 V
유저 아이디를 받아서 그 유저가 가장 높게 평을 준 작가를 저장하는 함수를 만들고 V
그 작가를 기준으로 recommend_authors_books의 작가을 대입하도록 하자. V
user_idx값을 user_id값으로 받기
'''

'\nbm_df 이름을 바꾸고 V\n유저 아이디를 받아서 그 유저가 가장 높게 평을 준 작가를 저장하는 함수를 만들고\n그 작가를 기준으로 recommend_authors_books의 작가을 대입하도록 하자.\nuser_idx값을 user_id값으로 받기\n'

In [22]:
# 책의 평점으로 평균내기
# user별 작가 평점으로 평균내기

user_author_rating_df = book_filter_df.groupby(['user_id', 'book_author'])['rating'].mean().reset_index()
print(user_author_rating_df.shape)
user_author_rating_df=user_author_rating_df.rename(columns={'rating':'author_mean_rating'})
user_author_rating_df

(44332, 3)


Unnamed: 0,user_id,book_author,author_mean_rating
0,243,Arthur Golden,10.0
1,243,Arundhati Roy,7.0
2,243,Belva Plain,6.0
3,243,JOHN SAUL,6.0
4,243,Jack Canfield,5.0
...,...,...,...
44327,278843,J. K. Rowling,8.0
44328,278843,Maeve Binchy,7.0
44329,278843,Rebecca Wells,7.0
44330,278843,Richard Carlson,8.0


In [23]:
user_author_df=user_author_rating_df.pivot_table(index="user_id",columns="book_author",values="author_mean_rating",fill_value=0)
print(user_author_df.shape)
author_user_df = user_author_df.T
print(author_user_df.shape)
author_user_df.iloc[:5, :10]

(4836, 1721)
(1721, 4836)


user_id,243,254,383,388,487,503,507,638,735,741
book_author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A. Manette Ansay,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A. N. Roquelaure,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ADRIANA TRIGIANI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ALBERT CAMUS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ALEXANDRA FULLER,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
from sklearn.metrics.pairwise import cosine_similarity

user_author_sim = cosine_similarity(author_user_df,author_user_df)

user_author_sim_df = pd.DataFrame(user_author_sim, index=author_user_df.index, columns=author_user_df.index)
user_author_sim_df['ANN BENSON'].head(10).sort_values(ascending=False)

book_author
ANN BENSON          1.000000
ALICE STEINBACH     0.122211
ALEXANDRA FULLER    0.081019
ANN PACKER          0.044426
A. Manette Ansay    0.000000
A. N. Roquelaure    0.000000
ADRIANA TRIGIANI    0.000000
ALBERT CAMUS        0.000000
AMANDA QUICK        0.000000
ANN BRASHARES       0.000000
Name: ANN BENSON, dtype: float64

In [47]:
#유저 아이디 입력받기
def input_user_id():
    user_idx = int(input('아이디를 입력하세요! : '))

    # 입력한 아이디가 데이터프레임에 존재하는지 확인
    if user_idx in user_author_rating_df['user_id'].values:
        print(f'{user_idx}님 환영합니다!')
        return user_idx
    else:
        print('올바른 아이디를 입력하세요.')
        return input_user_id()
        
user_id = input_user_id()

243님 환영합니다!


In [58]:
def recommend_authors_books(author_name, top_n=5):
    """
    특정 작가와 유사한 작가들의 책을 추천하는 함수
    :param author_name: 추천의 기준이 될 작가명 
    :param top_n: 유사도가 높은 상위 작가 수 . 미설정시 5개 기본
    :return: 유사 작가들 리스트
    """
    
        # 입력된 작가와 유사한 작가 top_n 선정
    #순서 정리
    #유저 아이디 입력받기 => 입력받은 유저가 평점 준 작가 조회 => 그 중 가장 높은 평점 조회 => 가장 높게 준 작가 기준으로 높은 유사도 조회
    
    user_ratings = user_author_rating_df[user_author_rating_df['user_id'] == user_id]
    
    # 유저가 높은 평점을 준 작가를 기준으로 정렬
    user_ratings = user_ratings.sort_values(by='author_mean_rating', ascending=False)
    
    
    top_author = user_ratings.iloc[0]['book_author']
    print(f"유저 {user_id}님이 좋아하는 작가: {top_author} 작가님입니다.")
    
        # 작가 유사도 상위 top_n 작가 선택
    similar_authors = user_author_sim_df[top_author].sort_values(ascending=False)[1:top_n+1].index

    # 유사한 작가들 추천
    recommended_books = user_author_rating_df[user_author_rating_df['book_author'].isin(similar_authors)]
    unique_authors = recommended_books['book_author'].unique()

    return display(unique_authors)

# 예시 사용
user_id = input_user_id()
recommend_authors_books(user_id, top_n=10)

507님 환영합니다!
유저 507님이 좋아하는 작가: Sue Monk Kidd 작가님입니다.


array(['Rebecca Wells', 'Dan Brown', 'Tracy Chevalier', 'Jennifer Weiner',
       'Mitch Albom', 'Gregory Maguire', 'Janet Evanovich',
       'Alexander McCall Smith', 'Leif Enger', 'Sheri Reynolds'],
      dtype=object)

# 가중평점

In [59]:

def predict_ratings(author_user_df, user_author_sim):
    """
    :param author_user_df: 사용자와 작가 간의 평점 행렬 (사용자 수 x 작가 수)
    :param user_author_sim: 사용자 간의 유사도 행렬 (사용자 수 x 사용자 수)
    """
    # 사용자-유사도 행렬과 작가 평점 행렬의 곱셈
    ratings_pred = np.dot(user_author_sim, author_user_df) / np.abs(user_author_sim).sum(axis=1)[:, np.newaxis]
    return ratings_pred

ratings_pred = predict_ratings(author_user_df, user_author_sim)
print(ratings_pred.shape)
type(ratings_pred)  # 첫 5개의 예측 평점 확인
ratings_pred_df= pd.DataFrame(ratings_pred)

(1721, 4836)


In [60]:
#MSE 측정
def get_mse(actual, pred):
    
    non_zero_idx = actual.nonzero() #0이 아닌 겂의 인덱스
    #print(non_zero_idx) #([row_idx, row_idx,...], [col_idx, col_idx,...]) 비교해서 분석
    actual = actual[non_zero_idx]
    pred = pred[non_zero_idx]
    return ((actual - pred) ** 2).mean()

get_mse(author_user_df.values,ratings_pred_df.values)

53.32529737478435

In [61]:
# 작가별 가중평점
def predict_ratings_for_user(user_idx,author_idx,topn_sim_idx):
    """
    #내용에 알맞게 주석 수적
    :param user_idx: 유저의 아이디. 고유한 번호
    :param author_idx: 예측하고자 하는 작가의 고유 번호
    :param topn_sim_idx : author_idx별 평점유사도 상위 topn건의 인덱스  
    :return: 
    """
    
    # user_idx는 user에게 부여되는 유저 번호를 받아올 것.
    # author_idx값을 구하는 받아오는 코드 작성 
    topn_sim = user_author_sim[author_idx,:][topn_sim_idx] # 가중평점 계산식의 S
    # 평점유사도 topn건에 대한 사용자 평점
    topn_rating = user_author_df.values[user_idx,:][topn_sim_idx] # 가중평점 계산식의 R
    # 예측 평점 계산
    return topn_sim.dot(topn_rating) / np.abs(topn_sim).sum()
    
topn=20
user_idx = input_user_id() # 1721개이나 0부터 시작하기 때문에 0~1720 사이의 숫자를 넣어야 한다.
author_idx = 95
topn_sim_idx = user_author_sim_df.iloc[author_idx].argsort()[:-(topn+1):-1]

pred_rating=predict_ratings_for_user(user_idx,author_idx,topn_sim_idx)

503님 환영합니다!


In [62]:
def predict_ratings_user_to_author(topn=20):
    # 최종 반환된 예측 평점 ndarray
    pred = np.zeros(user_author_df.shape)  # (1721, 4836)
    
    # 작가별 루프
    for author_idx in range(pred.shape[1]):
        # 작가별 평점 유사도 topn건 인덱스
        topn_sim_idx = user_author_sim_df.iloc[author_idx].argsort()[:-(topn + 1):-1]
        
        # 사용자 루프
        for user_idx in range(pred.shape[0]):
            # 개별 사용자와 작가에 대해 가중평점 예측
            pred[user_idx, author_idx] = predict_ratings_for_user(user_idx, author_idx, topn_sim_idx)
            
    return pred

# 예측 평점 생성
ratings_pred = predict_ratings_user_to_author(20)
print(ratings_pred.shape)


(4836, 1721)


In [63]:
"""
오류 메시지를 보면, ratings_pred 배열의 크기 (4836, 1721)이지만, author_user_df의 index와 columns는 (1721, 4836) 형식을 가지고 있다는 점에서 불일치가 발생했습니다.

문제의 원인:

ratings_pred의 크기가 (4836, 1721)인데, 이를 pred_rating_df로 변환할 때 index와 columns를 author_user_df.index와 author_user_df.columns를 그대로 사용하려고 합니다.
그러나 author_user_df.index는 1721개의 사용자, author_user_df.columns는 4836개의 작가에 해당합니다. 따라서 ratings_pred 배열의 크기 (4836, 1721)과 author_user_df의 행렬 형식이 맞지 않아서 에러가 발생한 것입니다.
해결 방법:
ratings_pred 배열의 크기 (4836, 1721)을 적절히 반영하려면, index와 columns를 반대로 지정해야 합니다.
"""
# 예측 평점을 데이터프레임으로 변환
# (4836, 1721) 행에는 유저 수. 열에는 작가 수
pred_rating_df = pd.DataFrame(ratings_pred, index=author_user_df.columns, columns=author_user_df.index)
pred_rating_df.head(100)
print(pred_rating_df.shape)

(4836, 1721)


In [64]:
ratings_pred=ratings_pred.T
get_mse(author_user_df.values, ratings_pred)

28.970193006576743

In [65]:
# 안 읽은 작가 조회
def get_unread_author(user_idx):
    user_ratings_df = user_author_df.iloc[user_idx]
    return user_ratings_df[user_ratings_df==0]

get_unread_author(150)

book_author
A. Manette Ansay         0.0
A. N. Roquelaure         0.0
ADRIANA TRIGIANI         0.0
ALBERT CAMUS             0.0
ALEXANDRA FULLER         0.0
                        ... 
Zachary Fox              0.0
Zadie Smith              0.0
Zilpha Keatley Snyder    0.0
Zlata Filipovic          0.0
Zora Neale Hurston       0.0
Name: 10241, Length: 1719, dtype: float64

In [34]:
pred_rating_df.iloc[:5, :10]

book_author,A. Manette Ansay,A. N. Roquelaure,ADRIANA TRIGIANI,ALBERT CAMUS,ALEXANDRA FULLER,ALICE STEINBACH,AMANDA QUICK,ANN BENSON,ANN BRASHARES,ANN PACKER
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
243,0.534839,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
487,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
def recommand_authors(user_idx,topn=20):
    # 안본 책 (Series로 반환 받고, index(책제목) label로 변환
    unread = get_unread_author(user_idx).index
    print(unread)
    # 예측 평점 상위 topn개의 영화 추천 | 입력받은 사용자가 안 본 것들 중에서, 평점이 높은 순으로 정렬.
    temp = pred_rating_df.iloc[user_idx].loc[unread].sort_values(ascending=False)[:topn]
    # user_rating_df = author_user_df.loc[user_idx - 1, temp.index]
    return pd.DataFrame({
        'book_author': temp.index,
        'pred_rating': temp.values,
        # 'user_rating': user_rating_df.values,
    })

recommand_authors(175,20)

Index(['A. Manette Ansay', 'A. N. Roquelaure', 'ADRIANA TRIGIANI',
       'ALBERT CAMUS', 'ALEXANDRA FULLER', 'ALICE STEINBACH', 'AMANDA QUICK',
       'ANN BENSON', 'ANN BRASHARES', 'ANN PACKER',
       ...
       'Witi Ihimaera', 'Wollstonecraft Mary Shelley', 'Yann Martel',
       'Yxta Maya Murray', 'Zachary Alan Fox', 'Zachary Fox', 'Zadie Smith',
       'Zilpha Keatley Snyder', 'Zlata Filipovic', 'Zora Neale Hurston'],
      dtype='object', name='book_author', length=1718)


Unnamed: 0,book_author,pred_rating
0,Wilbur Smith,0.671083
1,Alice Borchardt,0.444523
2,William W. Johnstone,0.384802
3,Susan Power,0.384246
4,Dawna Markova,0.358128
5,Anne Rice,0.35416
6,Larry Brown,0.344272
7,Stephen King,0.343002
8,ANNE RICE,0.342794
9,Laurie Notaro,0.330478
