In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### 데이터 전처리
- kaggle dataset 

In [81]:
book_df=pd.read_csv('csv/book_review_data.csv')

In [82]:
# 1) 필요한 컬럼 도출
# 2) 형변환 (float > int)

book_df=book_df[['user_id',"age","rating","book_title",'book_author','year_of_publication','img_m','Summary','Category','country','Language']]
book_df[['age',"year_of_publication"]]=book_df[['age',"year_of_publication"]].astype(int)

In [83]:
# 최소 평가 개수 ( 평가가 30개보다 많은 책만 추출 )
counts=book_df['book_title'].value_counts()
title = counts[counts > 30].index
book_filter_df=book_df[book_df['book_title'].isin(title)]


In [84]:
# 최소 평가 개수 ( 평가가 20개보다 많은 사용자만 추출 )
counts=book_filter_df["user_id"].value_counts()
user_id_=counts[counts>30].index
book_filter_df = book_filter_df[book_filter_df['user_id'].isin(user_id_)]

In [85]:
# 무의미한 0점 평가 제거
rating_zero=book_filter_df[book_filter_df['rating']==0].index
book_filter_df.drop(rating_zero, axis=0, inplace=True)

In [86]:
# 한 카테고리안에 책이 5권 미만인 경우 제거
counts=book_filter_df["Category"].value_counts()
categories = counts[counts > 5].index
idx=book_filter_df[book_filter_df['Category']=='9'].index
book_filter_df.drop(idx, axis=0, inplace=True)
book_filter_df = book_filter_df[book_filter_df['Category'].isin(categories)]

idx=book_filter_df[book_filter_df["Summary"]=='9'].index
book_filter_df.drop(idx,axis=0,inplace=True)


book_filter_df["Category"]=book_filter_df['Category'].str.lower()
book_filter_df["book_title"]=book_filter_df['book_title'].str.lower()
book_filter_df["Summary"]=book_filter_df["Summary"].str.lower()
book_filter_df["book_author"]=book_filter_df["book_author"].str.lower()
book_filter_df["book_author"]=book_filter_df["book_author"].str.replace(" ","")
book_filter_df["book_author"]=book_filter_df["book_author"].str.replace(".","")

In [87]:
book_filter_df.shape
len(book_filter_df['book_title'].unique())

3374

In [88]:
book_filter_df.to_csv("book_filter_df.csv")

#### category 대분류 만들기

In [89]:
def categorize(categories):
    if "nonfiction" in categories:
        return "nonfic"
    elif any(word in categories for word in ["fiction","fictitious"]):
        return "fiction"
    elif any(word in categories for word in ['lovely',"behavior","entertain","book","read","choco","relation","actor","comic","drama","travel","liter","humor","life","cooking", "art", "poetry",'philosophy',"community","character","courtship","geishas","bildung","christmas","boy"]):
        return 'culture'
    elif any(word in categories for word in ["children", "child","adolescence","school"]):
        return "child"
    elif any(word in categories for word in ['famil','aunts','cousin','fathers', 'mothers', 'parents', 'marriage', 'brothers', 'sisters', 'divorced', 'relationship','adult','abortion',"sex","married",'orphan','wives']):
        return "family"
    elif any(word in categories for word in ['church', 'christian', 'religion','jerusalem','apologetics']):
        return "religion"
    elif any(word in categories for word in ['dragons', 'aliens', 'imaginary', 'planet', 'adventure','fantasy','spirit','wonder']):
        return "sf"
    elif any(word in categories for word in ['pets', 'dogs', 'animals','cats']):
        return "animals"
    elif any(word in categories for word in ["history", "regions", "england", "california", "boston", "brooklyn", "africa", "chicago", "france", "american","cornwall", "amsterdam", "cape cod", "capitalists","britain","indian","middle west","town","soldier","baltimore","amerikan","british"]):
        return "history"
    elif any(word in categories for word in ["science", "computers", "intelligence", "medical", "health", "medical", "engineers", "automotives", "management","bio","nature","mental","social","psychology","archit","reference","compulsive","medica","vet","auto","cancer"]):
        return "sci&tech"
    elif any(word in categories for word in ["mystery", "crime", "haunted", "horror", "murder","burglar","accidents","death","assassins",'conspiracies']):
        return "mystery"
    elif any(word in categories for word in ["econo","business","law"]):
        return "e&b"
    elif any(word in categories for word in ["self-help","audio","brewing","kindness","avarice",'curious']):
        return "etc"

book_filter_df["L_Category"]=book_filter_df["Category"].apply(categorize)

In [90]:
book_filter_df.head()

Unnamed: 0,user_id,age,rating,book_title,book_author,year_of_publication,img_m,Summary,Category,country,Language,L_Category
31,11676,34,9,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],,en,fiction
32,29526,26,9,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],usa,en,fiction
34,46398,37,9,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],usa,en,fiction
42,148712,34,10,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],usa,en,fiction
54,230522,52,7,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],usa,en,fiction


#### user_book_rating dataframe

In [91]:
user_book_df=book_filter_df.pivot_table(index='user_id',columns='book_title',values='rating',fill_value=0)
book_user_df=user_book_df.T
user_book_df.shape # user간 유사도 구할거 아니니까

(1924, 3374)

In [92]:
from sklearn.metrics.pairwise import cosine_similarity
book_similarity = cosine_similarity(book_user_df, book_user_df)

book_similarity_df = pd.DataFrame(book_similarity, index=user_book_df.columns, columns=user_book_df.columns)

In [93]:
book_similarity_df.to_csv("book_similarity_df.csv")

In [94]:
user_book_df.to_csv("user_book_df.csv")

In [95]:
user_book_df.iloc[55].sort_values(ascending=False)[:10]

book_title
cruel & unusual (kay scarpetta mysteries (paperback))     9.0
desperation                                               9.0
southern cross                                            9.0
all that remains (kay scarpetta mysteries (paperback))    9.0
body of evidence (kay scarpetta mysteries (paperback))    7.0
the cat who ate danish modern (cat who... (paperback))    7.0
body of evidence                                          7.0
obsession                                                 6.0
taken                                                     0.0
sweet liar                                                0.0
Name: 8936, dtype: float64

#### age_title_count dataframe

In [96]:
book_filter_df["age"].value_counts()

age
34    11397
33     1468
29     1244
28     1211
32     1156
30     1110
26     1075
31     1036
36     1036
47      992
27      936
43      909
37      892
35      874
25      855
38      791
46      784
39      748
44      729
24      679
49      649
51      600
23      590
52      583
40      564
41      523
42      504
54      442
18      438
22      403
45      351
48      321
50      279
57      267
53      240
21      192
55      190
20      166
60      162
17      145
56      135
19      124
61      114
65      108
58      103
59       97
14       95
63       89
67       81
62       56
69       48
9        23
79       19
16       18
15       18
68       12
71        9
66        9
Name: count, dtype: int64

In [97]:
def make_ages(age):
    if 10<=age<20:
        return 10
    elif 20<=age<30:
        return 20
    elif 30<=age<40:
        return 30
    elif 40<=age<50:
        return 40
    elif 50<=age<60:
        return 50
    elif 60<=age<70:
        return 60
    else:
        return 70

In [98]:
book_filter_df['ages'] = book_filter_df['age'].apply(make_ages)


In [99]:
ages_book_=book_filter_df.groupby(['ages','book_title']).size().reset_index(name='count')

In [100]:
ages_book_df=ages_book_.pivot_table(index='ages',columns='book_title',values='count',fill_value=0)
book_ages_df=ages_book_df.T
book_ages_df[:100]

ages,10,20,30,40,50,60,70
book_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
'salem's lot,0.0,0.0,4.0,1.0,0.0,0.0,0.0
10 lb. penalty,0.0,1.0,4.0,3.0,1.0,1.0,0.0
101 dalmatians,0.0,2.0,0.0,1.0,0.0,0.0,0.0
16 lighthouse road,1.0,0.0,2.0,3.0,2.0,0.0,0.0
1984,0.0,18.0,13.0,6.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
a suitable vengeance,0.0,0.0,3.0,1.0,1.0,0.0,0.0
a superior death,0.0,0.0,3.0,1.0,3.0,0.0,0.0
a swiftly tilting planet,1.0,3.0,5.0,0.0,0.0,0.0,0.0
a tale of two cities,0.0,2.0,2.0,1.0,0.0,0.0,0.0


In [101]:
from sklearn.metrics.pairwise import cosine_similarity
ages_similarity=cosine_similarity(book_ages_df,book_ages_df)
ages_similarity_df=pd.DataFrame(ages_similarity,index=ages_book_df.columns, columns=ages_book_df.columns)
ages_similarity_df.to_csv("age_similarity_df.csv",index=True)

In [102]:
ages_similarity_df=pd.read_csv("csv/age_similarity_df.csv", index_col='book_title')



In [103]:
ages_similarity_df.index

Index([''salem's lot', '10 lb. penalty', '101 dalmatians',
       '16 lighthouse road', '1984', '1st to die: a novel',
       '2010: odyssey two', '24 hours', '4 blondes', '84 charing cross road',
       ...
       'you just don't understand', 'you shall know our velocity',
       'you'll never eat lunch in this town again',
       'your money or your life: transforming your relationship with money and achieving financial independence',
       'your oasis on flame lake (ballantine reader's circle)',
       'zen and the art of motorcycle maintenance: an inquiry into values',
       'zia', 'zlata's diary: a child's life in sarajevo',
       'zodiac: the eco-thriller', 'zoya'],
      dtype='object', name='book_title', length=3374)

#### user별 안 읽은 책 추천

In [104]:
def get_unseen_books(user_idx):
    user_read_df=user_book_df.loc[user_idx,:]
    return user_read_df[user_read_df==0].index

In [105]:
def user_fav_book(user_idx):
    user_ages=pd.DataFrame(book_filter_df[book_filter_df['user_id']==user_idx])
    user_ages.sort_values(by='rating',ascending=False,inplace=True)
    user_ages.value_counts()
    user_favorite_book=user_ages.iloc[0]['book_title']
    print(user_favorite_book)
    return user_favorite_book
user_fav_book(254)

american gods


'american gods'

In [106]:
def user_fav_book(user_idx):
    user_ages=pd.DataFrame(book_filter_df[book_filter_df['user_id']==user_idx])
    user_ages.sort_values(by='rating',ascending=False,inplace=True)
    l_category=user_ages[:20]["L_Category"].value_counts().idxmax()
    i=0
    while True:
        if user_ages.iloc[i]["L_Category"]==l_category:
            favorite_book=user_ages.iloc[i]['book_title']
            break
    return l_category, favorite_book

user_fav_book(254)

('fiction', 'american gods')

In [107]:
def match_L_Category(title):
    title_L_Category=book_filter_df[["book_title","L_Category"]].drop_duplicates()
    L_Category=title_L_Category[title_L_Category['book_title']==title]["L_Category"].iloc[0]
    return L_Category
def match_image_link(title):
    title_image=book_filter_df[["book_title","img_m"]].drop_duplicates()
    image=title_image[title_image['book_title']==title]["img_m"].iloc[0]
    return image
def match_author(title):
    title_author=book_filter_df[["book_title","book_author"]].drop_duplicates()
    author=title_author[title_author['book_title']==title]["book_author"].iloc[0]
    return author

In [108]:
def match_imglink_author(title):
    temp_df=book_filter_df[["book_title","book_author","img_m","L_Category"]].drop_duplicates()
    L_Category=temp_df[temp_df['book_title']==title]["L_Category"].iloc[0]
    image=temp_df[temp_df['book_title']==title]["img_m"].iloc[0]
    author=temp_df[temp_df["book_title"]==title]["book_author"].iloc[0]
    return L_Category,image,author

In [109]:
ages_similarity_df.loc["american gods"]

'salem's lot                                                         0.735545
10 lb. penalty                                                       0.720847
101 dalmatians                                                       0.601237
16 lighthouse road                                                   0.412681
1984                                                                 0.947937
                                                                       ...   
zen and the art of motorcycle maintenance: an inquiry into values    0.959974
zia                                                                  0.031265
zlata's diary: a child's life in sarajevo                            0.750366
zodiac: the eco-thriller                                             0.887204
zoya                                                                 0.799390
Name: american gods, Length: 3374, dtype: float64

In [110]:
def predict_books(user_idx):
    unseen_idx=get_unseen_books(user_idx)
    l_cate,fav_idx=user_fav_book(user_idx)
    print(fav_idx)
    temp=ages_similarity_df.loc[fav_idx][unseen_idx].sort_values(ascending=False)
    temp=pd.DataFrame(temp).reset_index()   
    temp["L_Category"] = temp["book_title"][:300].apply(match_L_Category)
    predict_same_L_category=temp[temp["L_Category"]==l_cate][:5]
    predict_dif_L_category=temp[temp["L_Category"]!=l_cate][:5]
    return predict_same_L_category,predict_dif_L_category
#---------------------------------------------------------------------#
def book_link_image(user_idx):
    same,dif=predict_books(user_idx)
    same["image_link"]=same["book_title"].apply(match_image_link)
    dif["image_link"]=dif["book_title"].apply(match_image_link)
    same["book_author"]=same["book_title"].apply(match_author)
    dif["book_author"]=dif["book_title"].apply(match_author)
    return same,dif
#---------------------------------------------------------------------#
# test-code
same,dif=book_link_image(254)
display(same)
display(dif)

american gods


Unnamed: 0,book_title,american gods,L_Category,image_link,book_author
0,the queen of the damned (vampire chronicles (p...,0.997896,fiction,http://images.amazon.com/images/P/0345351525.0...,annerice
1,sophie's world: a novel about the history of p...,0.997096,fiction,http://images.amazon.com/images/P/0425152251.0...,josteingaarder
2,hidden jewel (landry),0.997011,fiction,http://images.amazon.com/images/P/0671873202.0...,vcandrews
3,if there be thorns (dollanganger),0.996774,fiction,http://images.amazon.com/images/P/0671729454.0...,vcandrews
4,ruby (landry),0.9962,fiction,http://images.amazon.com/images/P/0671759345.0...,vcandrews


Unnamed: 0,book_title,american gods,L_Category,image_link,book_author
5,when rabbit howls,0.996094,sci&tech,http://images.amazon.com/images/P/0515103292.0...,truddichase
7,naked lunch,0.996094,sci&tech,http://images.amazon.com/images/P/0802132952.0...,williamsburroughs
16,living wicca: a further guide for the solitary...,0.994855,sf,http://images.amazon.com/images/P/0875421849.0...,scottcunningham
22,the prince (penguin classics),0.994855,culture,http://images.amazon.com/images/P/0140449159.0...,niccolomachiavelli
23,babyhood,0.994855,sci&tech,http://images.amazon.com/images/P/0380728729.0...,paulreiser


#### 작가별 평점유사도

In [111]:
# user별 작가 평점으로 평균내기
bm_df = book_filter_df.groupby(['user_id', 'book_author'])['rating'].mean().reset_index()
print(bm_df.shape)
bm_df=bm_df.rename(columns={'rating':'author_mean_rating'})
bm_df

(29020, 3)


Unnamed: 0,user_id,book_author,author_mean_rating
0,243,arthurgolden,10.0
1,243,arundhatiroy,7.0
2,243,belvaplain,6.0
3,243,jackcanfield,5.0
4,243,janehamilton,7.0
...,...,...,...
29015,278633,richardpaulevans,10.0
29016,278633,sidneysheldon,9.0
29017,278633,stephenking,5.0
29018,278633,sylviaplath,8.0


In [112]:
user_author_df=bm_df.pivot_table(index="user_id",columns="book_author",values="author_mean_rating",fill_value=0)