In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.release import author
from catboost.carry import carry

#### 데이터 전처리
- kaggle dataset 

In [2]:
# Load the raw book-review table (path is relative to the notebook's working directory).
book_df=pd.read_csv('book_review_data.csv')

In [3]:
# 1) Keep only the columns used downstream.
# 2) Cast age and publication year from float to int.
# Selecting and casting in one chain produces a fresh frame, so no values are
# written into a column-subset slice (which triggers SettingWithCopyWarning).
book_df = book_df[
    ['user_id', "age", "rating", "book_title", 'book_author',
     'year_of_publication', 'img_m', 'Summary', 'Category', 'country', 'Language']
].astype({'age': int, 'year_of_publication': int})

In [4]:
# Minimum number of ratings per book: keep only titles rated more than 30 times.
counts = book_df['book_title'].value_counts()
title = counts.index[(counts > 30).to_numpy()]
book_filter_df = book_df.loc[book_df['book_title'].isin(title)]


In [5]:
# Minimum number of ratings per user: keep only users with more than 10 ratings
# among the books retained so far.
# NOTE(review): this comment used to say 20, but the code filters on 10 - confirm the intended threshold.
counts=book_filter_df["user_id"].value_counts()
user_id_=counts[counts>10].index
book_filter_df = book_filter_df[book_filter_df['user_id'].isin(user_id_)]

In [6]:
# Remove meaningless 0-point ratings.
# A boolean mask builds a new frame instead of dropping rows in place on a
# frame that was itself derived by filtering (avoids SettingWithCopyWarning).
book_filter_df = book_filter_df[book_filter_df['rating'] != 0]

In [7]:
# Drop sparse categories and placeholder rows, then normalize the text columns.
# NOTE(review): value_counts() counts rating rows, not distinct books, and the
# threshold keeps categories with MORE than 5 rows - confirm this is intended.
counts = book_filter_df["Category"].value_counts()
categories = counts[counts > 5].index

# '9' in Category / Summary is presumably a placeholder for missing text - TODO confirm.
keep_rows = (
    book_filter_df["Category"].isin(categories)
    & book_filter_df["Category"].ne('9')
    & book_filter_df["Summary"].ne('9')
)
# .copy() makes the result an independent frame so the column assignments
# below do not write into a slice of the previous frame.
book_filter_df = book_filter_df.loc[keep_rows].copy()

# Lower-case every free-text column used for matching later on.
for column in ("Category", "book_title", "Summary", "book_author"):
    book_filter_df[column] = book_filter_df[column].str.lower()

# Collapse author names ("Amy Tan" -> "amytan", "j.k. rowling" -> "jkrowling").
# regex=False is explicit: as a regular expression "." would match every character.
book_filter_df["book_author"] = (
    book_filter_df["book_author"]
    .str.replace(" ", "", regex=False)
    .str.replace(".", "", regex=False)
)

In [8]:
# NOTE: a cell only displays its last expression, so the shape below is evaluated but never shown.
book_filter_df.shape
# Number of distinct book titles remaining after filtering.
len(book_filter_df['book_title'].unique())

3442

#### category 대분류 만들기

In [9]:
# Ordered (label, keywords) rules. Matching is by substring and the FIRST rule
# with any hit wins, so the order below is part of the behavior
# (e.g. "nonfiction" must be tested before "fiction").
_CATEGORY_RULES = (
    ("nonfic", ("nonfiction",)),
    ("fiction", ("fiction", "fictitious")),
    ("culture", ('lovely', "behavior", "entertain", "book", "read", "choco", "relation",
                 "actor", "comic", "drama", "travel", "liter", "humor", "life", "cooking",
                 "art", "poetry", 'philosophy', "community", "character", "courtship",
                 "geishas", "bildung", "christmas", "boy")),
    ("child", ("children", "child", "adolescence", "school")),
    ("family", ('famil', 'aunts', 'cousin', 'fathers', 'mothers', 'parents', 'marriage',
                'brothers', 'sisters', 'divorced', 'relationship', 'adult', 'abortion',
                "sex", "married", 'orphan', 'wives')),
    ("religion", ('church', 'christian', 'religion', 'jerusalem', 'apologetics')),
    ("sf", ('dragons', 'aliens', 'imaginary', 'planet', 'adventure', 'fantasy', 'spirit')),
    ("animals", ('pets', 'dogs', 'animals', 'cats')),
    ("history", ("history", "regions", "england", "california", "boston", "brooklyn",
                 "africa", "chicago", "france", "american", "cornwall", "amsterdam",
                 "cape cod", "capitalists", "britain", "indian", "middle west", "town",
                 "soldier", "baltimore", "amerikan", "british")),
    ("sci&tech", ("science", "computers", "intelligence", "medical", "health", "engineers",
                  "automotives", "management", "bio", "nature", "mental", "social",
                  "psychology", "archit", "reference", "compulsive", "medica", "vet",
                  "auto", "cancer")),
    ("mystery", ("mystery", "crime", "haunted", "horror", "murder", "burglar", "accidents",
                 "death", "assassins", 'conspiracies')),
    ("e&b", ("econo", "business", "law")),
    ("etc", ("self-help", "audio", "brewing", "kindness", "avarice", 'curious')),
)


def categorize(categories):
    """Map a raw category value to a coarse top-level label.

    Parameters
    ----------
    categories : str
        Raw category text; each keyword is tested with ``in``, so for a string
        this is a substring match.

    Returns
    -------
    str
        The label of the first rule in ``_CATEGORY_RULES`` with a matching
        keyword, or ``"etc"`` when nothing matches or the value is missing
        (``None`` / float NaN). Previously unmatched input silently produced
        ``None`` and missing values raised ``TypeError``.
    """
    # Missing values cannot be searched with ``in``; bucket them as "etc".
    if categories is None or isinstance(categories, float):
        return "etc"
    for label, keywords in _CATEGORY_RULES:
        if any(word in categories for word in keywords):
            return label
    # Explicit fallback so every row gets a usable label.
    return "etc"

# Derive the coarse category label for every rating row.
book_filter_df["L_Category"] = book_filter_df["Category"].map(categorize)

In [10]:
# Preview the first rows, including the derived L_Category column.
book_filter_df.head()

Unnamed: 0,user_id,age,rating,book_title,book_author,year_of_publication,img_m,Summary,Category,country,Language,L_Category
31,11676,34,9,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],,en,fiction
32,29526,26,9,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],usa,en,fiction
34,46398,37,9,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],usa,en,fiction
36,49635,34,9,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],usa,en,fiction
42,148712,34,10,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],usa,en,fiction


#### user_book_rating dataframe

In [11]:
# User x book rating matrix. Pairs with no rating are filled with 0; since
# 0-point ratings were removed earlier, 0 here means "not rated".
user_book_df = pd.pivot_table(
    book_filter_df,
    values='rating',
    index='user_id',
    columns='book_title',
    fill_value=0,
)
book_user_df = user_book_df.transpose()
user_book_df.shape  # user-to-user similarity is not going to be computed

(4835, 3442)

In [12]:
# Spot-check one user (positional row 55): their ten highest entries.
# Zeros are unrated books, because the pivot fills missing pairs with 0.
user_book_df.iloc[55].sort_values(ascending=False)[:10]

book_title
thinner                                                                     8.0
christine                                                                   8.0
cujo                                                                        6.0
stupid white men ...and other sorry excuses for the state of the nation!    6.0
'salem's lot                                                                0.0
sweet anger                                                                 0.0
sundiver (the uplift saga, book 1)                                          0.0
sunset in st. tropez                                                        0.0
superfudge (yearling books (paperback))                                     0.0
superstitious                                                               0.0
Name: 4334, dtype: float64

#### age_title_count dataframe

In [13]:
# Distribution of reviewer ages (one count per rating row).
# NOTE(review): in the saved output age 34 is far more frequent than any other
# value - this looks like an imputed default; confirm against the dataset notes.
book_filter_df["age"].value_counts()

age
34    16793
33     1876
29     1699
28     1681
32     1580
      ...  
76        6
11        5
84        5
10        3
97        2
Name: count, Length: 78, dtype: int64

In [14]:
def make_ages(age):
    """Bucket an age into its decade, clamped to the range 10..70.

    Parameters
    ----------
    age : int or float
        Age in years.

    Returns
    -------
    int
        10 for ages below 20 (including ages under 10, which previously fell
        through to the ``else`` branch and were mislabeled as 70), the decade
        (20, 30, ..., 60) for ages 20-69, and 70 for everything else.
    """
    if age < 20:
        # Youngest bucket; also absorbs ages below 10.
        return 10
    if age < 70:
        return int(age // 10) * 10
    # 70 and above (and NaN, for which every comparison is False).
    return 70

In [15]:
# Attach the decade bucket of each reviewer's age.
book_filter_df['ages'] = book_filter_df['age'].map(make_ages)

In [16]:
# Number of rating rows for every (age bucket, title) pair, in long format.
ages_book_ = (
    book_filter_df
    .groupby(['ages', 'book_title'])
    .size()
    .rename('count')
    .reset_index()
)

In [17]:
# Age-bucket x book matrix of rating counts, then its transpose so that every
# book becomes one row vector over the age buckets.
ages_book_df = pd.pivot_table(
    ages_book_,
    values='count',
    index='ages',
    columns='book_title',
    fill_value=0,
)
book_ages_df = ages_book_df.transpose()
book_ages_df.head(100)

ages,10,20,30,40,50,60,70
book_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
'salem's lot,0.0,0.0,4.0,1.0,0.0,0.0,0.0
10 lb. penalty,0.0,1.0,5.0,4.0,1.0,1.0,1.0
101 dalmatians,0.0,2.0,0.0,1.0,0.0,0.0,0.0
16 lighthouse road,1.0,1.0,6.0,3.0,2.0,0.0,0.0
1984,3.0,26.0,18.0,8.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
a streetcar named desire,1.0,7.0,3.0,0.0,1.0,0.0,0.0
a sudden change of heart,0.0,1.0,5.0,1.0,0.0,0.0,0.0
a suitable vengeance,0.0,0.0,4.0,1.0,2.0,0.0,0.0
a superior death,0.0,0.0,4.0,1.0,3.0,0.0,0.0


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# Book-to-book cosine similarity of the age-bucket count vectors.
# With a single argument the rows are compared against themselves.
book_titles = book_ages_df.index
ages_similarity = cosine_similarity(book_ages_df)
ages_similarity_df = pd.DataFrame(ages_similarity, index=book_titles, columns=book_titles)

In [19]:
# Sanity check: the similarity matrix should be square (books x books).
ages_similarity_df.shape

(3442, 3442)

#### user별 안 읽은 책 추천

In [20]:
def get_unseen_books(user_idx):
    """Return the titles the given user has not rated.

    Looks up the user's row in the global ``user_book_df`` (indexed by user id)
    and returns the column labels whose stored rating equals 0.
    """
    ratings_row = user_book_df.loc[user_idx]
    not_rated = (ratings_row == 0).to_numpy()
    return ratings_row.index[not_rated]

In [21]:
def user_fav_book(user_idx):
    """Print and return the title of the user's highest-rated book.

    Reads the global ``book_filter_df``; raises ``IndexError`` when the user
    has no rows.

    NOTE(review): a later cell redefines ``user_fav_book`` with a different
    return value, so this version is overwritten once that cell runs.
    """
    rows_of_user = book_filter_df[book_filter_df['user_id'] == user_idx]
    by_rating = rows_of_user.sort_values(by='rating', ascending=False)
    user_favorite_book = by_rating['book_title'].iloc[0]
    print(user_favorite_book)
    return user_favorite_book

In [22]:
def user_fav_book(user_idx):
    """Return the user's dominant category and their best-rated book in it.

    Parameters
    ----------
    user_idx
        A value of the ``user_id`` column of the global ``book_filter_df``.

    Returns
    -------
    tuple
        ``(l_category, favorite_book)``: the most frequent ``L_Category`` among
        the user's 20 highest-rated rows, and the title of the highest-rated
        row that belongs to that category.

    Raises
    ------
    ValueError
        If the user has no rows, or none of the top rows has a category.
    """
    user_rows = (
        book_filter_df[book_filter_df['user_id'] == user_idx]
        .sort_values(by='rating', ascending=False)
    )
    top_category_counts = user_rows["L_Category"].head(20).value_counts()
    if top_category_counts.empty:
        raise ValueError(f"no categorized ratings found for user {user_idx!r}")
    l_category = top_category_counts.idxmax()

    # Take the first (highest-rated) row of the dominant category directly.
    # The previous ``while True`` scan never advanced its index, so it hung
    # whenever the top-rated row was not in the dominant category.
    in_category = user_rows["L_Category"] == l_category
    favorite_book = user_rows.loc[in_category, "book_title"].iloc[0]
    return l_category, favorite_book
# Example call for a single user id.
user_fav_book(29526)

('fiction', "bridget jones's diary")

In [38]:
def _first_value_for_title(title, column):
    """Return ``column`` from the first row of ``book_filter_df`` with this title.

    Filtering by title and taking the first hit gives the same value as
    de-duplicating the whole frame first, without the per-call full-frame pass.
    Raises ``IndexError`` when no row has the given title.
    """
    matches = book_filter_df.loc[book_filter_df["book_title"] == title, column]
    return matches.iloc[0]


def match_L_Category(title):
    """Return the coarse category (``L_Category``) recorded for ``title``."""
    return _first_value_for_title(title, "L_Category")


def match_image_link(title):
    """Return the cover image URL (``img_m``) recorded for ``title``."""
    return _first_value_for_title(title, "img_m")


def match_author(title):
    """Return the normalized author name (``book_author``) recorded for ``title``."""
    return _first_value_for_title(title, "book_author")

In [34]:
def match_imglink_author(title):
    """Return ``(L_Category, img_m, book_author)`` for the first row with ``title``.

    Reads the global ``book_filter_df``; raises ``IndexError`` when the title
    is not present.
    """
    hits = book_filter_df[book_filter_df["book_title"] == title]
    category_label = hits["L_Category"].iloc[0]
    cover_url = hits["img_m"].iloc[0]
    writer = hits["book_author"].iloc[0]
    return category_label, cover_url, writer

In [41]:
def predict_books(user_idx, n_candidates=300, top_n=5):
    """Recommend unread books similar to the user's favorite book.

    Parameters
    ----------
    user_idx
        User id understood by ``get_unseen_books`` and ``user_fav_book``.
    n_candidates : int, default 300
        How many of the most similar unread books are categorized and
        considered for recommendation.
    top_n : int, default 5
        Maximum number of rows in each returned frame.

    Returns
    -------
    tuple of DataFrame
        ``(same_category, different_category)``. Each frame has the columns
        ``book_title``, the similarity score (named after the favorite book's
        title) and ``L_Category``, ordered by descending similarity.
    """
    unseen_idx = get_unseen_books(user_idx)
    l_cate, fav_idx = user_fav_book(user_idx)

    ranked = ages_similarity_df.loc[fav_idx][unseen_idx].sort_values(ascending=False)
    # Only the top candidates get a category lookup, so only they may be
    # recommended. Previously the "different category" filter ran over ALL rows,
    # and uncategorized rows (NaN != anything) could be reported as different.
    candidates = ranked.rename_axis("book_title").reset_index().head(n_candidates).copy()
    candidates["L_Category"] = candidates["book_title"].apply(match_L_Category)

    is_same = candidates["L_Category"] == l_cate
    # Return independent copies so callers can add columns without warnings.
    predict_same_L_category = candidates[is_same].head(top_n).copy()
    predict_dif_L_category = candidates[~is_same].head(top_n).copy()
    return predict_same_L_category, predict_dif_L_category
#---------------------------------------------------------------------#
def _with_book_details(recommendations):
    """Return a copy of ``recommendations`` with ``image_link`` and ``book_author`` added."""
    titles = recommendations["book_title"]
    # assign() builds a new frame, so the frame passed in is never mutated
    # (the old column assignments wrote into slices and raised SettingWithCopyWarning).
    return recommendations.assign(
        image_link=titles.apply(match_image_link),
        book_author=titles.apply(match_author),
    )


def book_link_image(user_idx):
    """Return both recommendation frames for a user, enriched with cover URL and author.

    Returns
    -------
    tuple of DataFrame
        ``(same, dif)`` as produced by ``predict_books(user_idx)``, each with
        the extra columns ``image_link`` and ``book_author`` (in that order).
    """
    same, dif = predict_books(user_idx)
    return _with_book_details(same), _with_book_details(dif)
#---------------------------------------------------------------------#
# Smoke test: show both recommendation lists for one user.
same, dif = book_link_image(11676)
display(same, dif)

Unnamed: 0,book_title,cat and mouse,L_Category,image_link,book_author
0,reap the wind,0.99732,fiction,http://images.amazon.com/images/P/0553586122.0...,irisjohansen
1,the bestseller,0.996826,fiction,http://images.amazon.com/images/P/0061096083.0...,oliviagoldsmith
2,the devil's code,0.996744,fiction,http://images.amazon.com/images/P/0425179885.0...,johnsandford
3,wonder boys : a novel (bestselling backlist),0.99655,fiction,http://images.amazon.com/images/P/0312140940.0...,michaelchabon
4,lost in a good book: a thursday next novel,0.996219,fiction,http://images.amazon.com/images/P/0670031909.0...,jasperfforde


Unnamed: 0,book_title,cat and mouse,L_Category,image_link,book_author
5,the woman's comfort book : a self-nurturing gu...,0.995736,etc,http://images.amazon.com/images/P/0062505319.0...,jenniferlouden
8,lucky man: a memoir,0.995249,sci&tech,http://images.amazon.com/images/P/0786867647.0...,michaeljfox
10,master of the game,0.995131,e&b,http://images.amazon.com/images/P/0446802204.0...,sidneysheldon
24,sleepers,0.993805,sci&tech,http://images.amazon.com/images/P/0345404114.0...,lorenzocarcaterra
25,eden close,0.993631,mystery,http://images.amazon.com/images/P/0156005891.0...,anitashreve


#### 작가별 평점유사도

In [25]:
# Average each user's ratings per author.
bm_df = (
    book_filter_df
    .groupby(['user_id', 'book_author'])['rating']
    .mean()
    .rename('author_mean_rating')
    .reset_index()
)
print(bm_df.shape)
bm_df

(43044, 3)


Unnamed: 0,user_id,book_author,author_mean_rating
0,243,arthurgolden,10.0
1,243,arundhatiroy,7.0
2,243,belvaplain,6.0
3,243,jackcanfield,5.0
4,243,janehamilton,7.0
...,...,...,...
43039,278843,jkrowling,8.0
43040,278843,maevebinchy,7.0
43041,278843,rebeccawells,7.0
43042,278843,richardcarlson,8.0


In [26]:
# User x author matrix of mean ratings; pairs with no rating are filled with 0.
user_author_df = pd.pivot_table(
    bm_df,
    values="author_mean_rating",
    index="user_id",
    columns="book_author",
    fill_value=0,
)