In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### 데이터 전처리
- kaggle dataset 

In [2]:
book_df=pd.read_csv('./data/book_review_data.csv')

In [3]:
# 1) 필요한 컬럼 도출
# 2) 형변환 (float > int)

book_df=book_df[['user_id',"age","rating","book_title",'book_author','year_of_publication','img_m','Summary','Category','country','Language']]
book_df[['age',"year_of_publication"]]=book_df[['age',"year_of_publication"]].astype(int)

In [4]:
# 최소 평가 개수 ( 평가가 30개보다 많은 책만 추출 )
counts=book_df['book_title'].value_counts()
title = counts[counts > 30].index
book_filter_df=book_df[book_df['book_title'].isin(title)]


In [5]:
# 최소 평가 개수 ( 평가가 20개보다 많은 사용자만 추출 )
counts=book_filter_df["user_id"].value_counts()
user_id_=counts[counts>10].index
book_filter_df = book_filter_df[book_filter_df['user_id'].isin(user_id_)]

In [6]:
# 무의미한 0점 평가 제거
rating_zero=book_filter_df[book_filter_df['rating']==0].index
book_filter_df.drop(rating_zero, axis=0, inplace=True)

In [7]:
# 한 카테고리안에 책이 5권 미만인 경우 제거
counts=book_filter_df["Category"].value_counts()
categories = counts[counts > 5].index
book_filter_df = book_filter_df[book_filter_df['Category'].isin(categories)]

idx=book_filter_df[book_filter_df["Summary"]=='9'].index
book_filter_df.drop(idx,axis=0,inplace=True)

book_filter_df["book_title"]=book_filter_df['book_title'].str.lower()
book_filter_df["Summary"]=book_filter_df["Summary"].str.lower()

In [8]:
book_filter_df.shape
len(book_filter_df['book_title'].unique())

3450

#### user_book_rating dataframe

In [9]:
user_book_df=book_filter_df.pivot_table(index='user_id',columns='book_title',values='rating',fill_value=0)
book_user_df=user_book_df.T
user_book_df.shape # user간 유사도 구할거 아니니까

(4836, 3450)

In [10]:
user_book_df.iloc[55].sort_values(ascending=False)[:10]

book_title
thinner                                                                     8.0
christine                                                                   8.0
stupid white men ...and other sorry excuses for the state of the nation!    6.0
cujo                                                                        6.0
'salem's lot                                                                0.0
sweet hereafter movie tie-in : a novel                                      0.0
sunset in st. tropez                                                        0.0
superfudge (yearling books (paperback))                                     0.0
superstitious                                                               0.0
surfacing                                                                   0.0
Name: 4334, dtype: float64

#### age_title_count dataframe

In [11]:
book_filter_df["age"].value_counts()

age
34    16829
33     1882
29     1708
28     1689
32     1585
      ...  
76        6
11        5
84        5
10        3
97        2
Name: count, Length: 78, dtype: int64

In [12]:
def make_ages(age):
    if 10<=age<20:
        return 10
    elif 20<=age<30:
        return 20
    elif 30<=age<40:
        return 30
    elif 40<=age<50:
        return 40
    elif 50<=age<60:
        return 50
    elif 60<=age<70:
        return 60
    else:
        return 70

In [13]:
book_filter_df['ages'] = book_filter_df['age'].apply(make_ages)

In [14]:
ages_book_=book_filter_df.groupby(['ages','book_title']).size().reset_index(name='count')

In [15]:
ages_book_df=ages_book_.pivot_table(index='ages',columns='book_title',values='count',fill_value=0)
book_ages_df=ages_book_df.T
book_ages_df[:100]

ages,10,20,30,40,50,60,70
book_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
'salem's lot,0.0,0.0,4.0,1.0,0.0,0.0,0.0
10 lb. penalty,0.0,1.0,5.0,4.0,1.0,1.0,1.0
101 dalmatians,0.0,2.0,0.0,1.0,0.0,0.0,0.0
16 lighthouse road,1.0,1.0,6.0,3.0,2.0,0.0,0.0
1984,3.0,26.0,18.0,8.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
a stitch in time (needlecraft mysteries),0.0,1.0,4.0,1.0,3.0,0.0,0.0
a streetcar named desire,1.0,7.0,3.0,0.0,1.0,0.0,0.0
a sudden change of heart,0.0,1.0,5.0,1.0,0.0,0.0,0.0
a suitable vengeance,0.0,0.0,4.0,1.0,2.0,0.0,0.0


In [16]:
from sklearn.metrics.pairwise import cosine_similarity
ages_similarity=cosine_similarity(book_ages_df,book_ages_df)
ages_similarity_df=pd.DataFrame(ages_similarity,index=ages_book_df.columns, columns=ages_book_df.columns)

In [17]:
ages_similarity_df.shape

(3450, 3450)

#### user별 안 읽은 책 추천

In [18]:
def get_unseen_books(user_idx):
    user_read_df=user_book_df.iloc[user_idx,:]
    return user_read_df[user_read_df==0].index

In [19]:
def ages_popular_book(user_idx):
    user_ages=book_filter_df.iloc[user_idx]["ages"]
ages_popular_book(176)

# 작가 평점 추천 전처리

In [20]:
# 책의 평점으로 평균내기
# user별 작가 평점으로 평균내기
bm_df = book_filter_df.groupby(['user_id', 'book_author'])['rating'].mean().reset_index()
print(bm_df.shape)
bm_df=bm_df.rename(columns={'rating':'author_mean_rating'})
bm_df

(44332, 3)


Unnamed: 0,user_id,book_author,author_mean_rating
0,243,Arthur Golden,10.0
1,243,Arundhati Roy,7.0
2,243,Belva Plain,6.0
3,243,JOHN SAUL,6.0
4,243,Jack Canfield,5.0
...,...,...,...
44327,278843,J. K. Rowling,8.0
44328,278843,Maeve Binchy,7.0
44329,278843,Rebecca Wells,7.0
44330,278843,Richard Carlson,8.0


In [21]:
user_author_df=bm_df.pivot_table(index="user_id",columns="book_author",values="author_mean_rating",fill_value=0)
print(user_author_df.shape)
author_user_df = user_author_df.T
print(author_user_df.shape)

(4836, 1721)
(1721, 4836)


In [22]:
from sklearn.metrics.pairwise import cosine_similarity

user_author_sim = cosine_similarity(author_user_df,author_user_df)

user_author_sim_df = pd.DataFrame(user_author_sim, index=author_user_df.index, columns=author_user_df.index)
user_author_sim_df['ANN BENSON'].head(10).sort_values(ascending=False)

book_author
ANN BENSON          1.000000
ALICE STEINBACH     0.122211
ALEXANDRA FULLER    0.081019
ANN PACKER          0.044426
A. Manette Ansay    0.000000
A. N. Roquelaure    0.000000
ADRIANA TRIGIANI    0.000000
ALBERT CAMUS        0.000000
AMANDA QUICK        0.000000
ANN BRASHARES       0.000000
Name: ANN BENSON, dtype: float64

In [23]:
def recommend_authors_books(author_name, top_n=5):
    """
    특정 작가와 유사한 작가들의 책을 추천하는 함수
    :param author_name: 추천의 기준이 될 작가명 
    :param top_n: 유사도가 높은 상위 작가 수 . 미설정시 5개 기본
    :return: 유사 작가들 리스트
    """
    # 입력된 작가와 유사한 작가 top_n 선정
    if author_name not in user_author_sim_df.index:
        return f"작가 '{author_name}'를 찾을 수 없습니다."
    
    # 작가 유사도 상위 top_n 작가 선택
    similar_authors = user_author_sim_df[author_name].sort_values(ascending=False)[1:top_n+1].index

    # 유사한 작가들의 책 추천
    recommended_books = bm_df[bm_df['book_author'].isin(similar_authors)]
    unique_authors = recommended_books['book_author'].unique()

    return display(unique_authors)

# 예시 사용
print(recommend_authors_books('TONI MORRISON', top_n=10))

array(['David McCullough', 'Iris, D. Rainer', 'John Ashbery',
       'John J. Edward', 'Judith Reeves-Stevens', 'Kathleen Cross',
       'Linda Sunshine', 'MICHAEL POLLAN', 'Paul Vincent',
       'Ruthanne Lum McCunn'], dtype=object)

None


# 가중평점

In [24]:
import numpy as np

def predict_ratings(author_user_df, user_author_sim):
    """
    :param author_user_df: 사용자와 작가 간의 평점 행렬 (사용자 수 x 작가 수)
    :param user_author_sim: 사용자 간의 유사도 행렬 (사용자 수 x 사용자 수)
    """
    # 사용자-유사도 행렬과 작가 평점 행렬의 곱셈
    ratings_pred = np.dot(user_author_sim, author_user_df) / np.abs(user_author_sim).sum(axis=1)[:, np.newaxis]
    return ratings_pred

ratings_pred = predict_ratings(author_user_df, user_author_sim)
print(ratings_pred.shape)
type(ratings_pred)  # 첫 5개의 예측 평점 확인
ratings_pred_df= pd.DataFrame(ratings_pred)

(1721, 4836)


In [25]:
#MSE 측정
def get_mse(actual, pred):
    
    non_zero_idx = actual.nonzero() #0이 아닌 겂의 인덱스
    #print(non_zero_idx) #([row_idx, row_idx,...], [col_idx, col_idx,...]) 비교해서 분석
    actual = actual[non_zero_idx]
    pred = pred[non_zero_idx]
    return ((actual - pred) ** 2).mean()

get_mse(author_user_df.values,ratings_pred_df.values)

53.32529737478435

In [40]:
# 유저별 사용자별 가중평점 예측
def predict_ratings_by_user(user_idx,author_idx,topn_sim_idx):
    """
    
    :param user_idx: 작가의 아이디 고유한 번호
    :param author_idx: 가중 평점을 예측해보고자 할 작가의 고유한 번호
    :param topn_sim_idx : movie_idx별 평점유사도 상위 topn건의 인덱스  
    :return: 
    """
    


    topn_sim = user_author_sim_df.iloc[author_idx, :].values[topn_sim_idx]  # 가중평점 계산식의 S
    topn_rating = author_user_df.values[user_idx,:][topn_sim_idx] # 가중평점 계산식의 R
    # 예측 평점 계산
    return topn_sim.dot(topn_rating) / np.abs(topn_sim).sum()
topn=20
user_idx = 1721
author_idx = 2500
topn_sim_idx = user_author_sim_df.iloc[author_idx].argsort()[:-(topn+1):-1]

predict_ratings_by_user(user_idx,author_idx,topn_sim_idx)

IndexError: single positional indexer is out-of-bounds