카테고리를 대분류로 묶은 뒤, 사용자가 읽은 책에서 가장 빈도가 높은 장르를 찾아 해당 장르의 평점 상위 Top-N 책을 추천한다.


In [79]:
from platform import uname_result

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost.carry import carry
from pandas import pivot_table
from pandas.core.interchange.dataframe_protocol import DataFrame
from streamlit import columns

#### 데이터 전처리
- Kaggle 데이터셋 (`data/book_review_data.csv`)

In [80]:
# Load the raw book-review table from the local CSV file.
book_df = pd.read_csv("data/book_review_data.csv")

In [81]:
# Step 1: keep only the columns used by the rest of the notebook.
# Step 2: cast age and publication year to int (the original note describes
#         this as a float -> int conversion).
keep_columns = [
    "user_id",
    "age",
    "rating",
    "book_title",
    "book_author",
    "year_of_publication",
    "img_m",
    "Summary",
    "Category",
    "country",
    "Language",
]
book_df = book_df[keep_columns]
book_df = book_df.astype({"age": int, "year_of_publication": int})

In [82]:
# Minimum-popularity filter for books: keep only titles that appear in more
# than 30 rating rows.
counts = book_df["book_title"].value_counts()
title = counts.loc[counts.gt(30)].index
book_filter_df = book_df.loc[book_df["book_title"].isin(title)]


In [83]:
# Minimum-activity filter for users: keep only users with more than 10 rating
# rows among the books retained so far.
# NOTE(review): the previous comment said "more than 20", but the threshold in
# the code is 10 -- confirm which value is intended.
counts = book_filter_df["user_id"].value_counts()
user_id_ = counts.loc[counts.gt(10)].index
book_filter_df = book_filter_df.loc[book_filter_df["user_id"].isin(user_id_)]

In [84]:
# Remove rows with a rating of 0; the original note treats these ratings as
# meaningless.
rating_zero = book_filter_df.loc[book_filter_df["rating"].eq(0)].index
book_filter_df = book_filter_df.drop(index=rating_zero)

In [85]:
# Category clean-up.
# Keep only Category values that occur in MORE than 5 rating rows. The counts
# are taken before the '9' rows below are removed, as in the original order.
counts = book_filter_df["Category"].value_counts()
categories = counts[counts > 5].index

# Drop rows whose Category is the literal string '9'
# (presumably the dataset's marker for a missing value -- TODO confirm).
idx = book_filter_df[book_filter_df["Category"] == "9"].index
book_filter_df = book_filter_df.drop(index=idx)
book_filter_df = book_filter_df[book_filter_df["Category"].isin(categories)]

# Same treatment for rows whose Summary is the literal string '9'.
idx = book_filter_df[book_filter_df["Summary"] == "9"].index
book_filter_df = book_filter_df.drop(index=idx)

# Lower-case the text columns; the keyword lists in categorize() are lower case.
for column in ("Category", "book_title", "Summary", "book_author"):
    book_filter_df[column] = book_filter_df[column].str.lower()

# Collapse author names by stripping spaces and dots ("amy tan" -> "amytan").
# regex=False is explicit so "." is always a literal dot and never the regex
# wildcard, whatever the installed pandas version uses as its default.
book_filter_df["book_author"] = (
    book_filter_df["book_author"]
    .str.replace(" ", "", regex=False)
    .str.replace(".", "", regex=False)
)

In [86]:
# Size check after filtering: (rows, columns), then the number of distinct titles.
book_filter_df.shape
book_filter_df["book_title"].unique().size

3442

#### category 대분류 만들기

In [87]:
# Ordered (label, keywords) rules. Order matters: the first rule with a
# matching keyword wins, so "nonfiction" has to be tested before "fiction".
_CATEGORY_RULES = (
    ("nonfic", ("nonfiction",)),
    ("fiction", ("fiction", "fictitious")),
    ("culture", (
        'lovely', "behavior", "entertain", "book", "read", "choco", "relation",
        "actor", "comic", "drama", "travel", "liter", "humor", "life",
        "cooking", "art", "poetry", 'philosophy', "community", "character",
        "courtship", "geishas", "bildung", "christmas", "boy",
    )),
    ("child", ("children", "child", "adolescence", "school")),
    ("family", (
        'famil', 'aunts', 'cousin', 'fathers', 'mothers', 'parents',
        'marriage', 'brothers', 'sisters', 'divorced', 'relationship',
        'adult', 'abortion', "sex", "married", 'orphan', 'wives',
    )),
    ("religion", ('church', 'christian', 'religion', 'jerusalem', 'apologetics')),
    ("sf", ('dragons', 'aliens', 'imaginary', 'planet', 'adventure', 'fantasy', 'spirit')),
    ("animals", ('pets', 'dogs', 'animals', 'cats')),
    ("history", (
        "history", "regions", "england", "california", "boston", "brooklyn",
        "africa", "chicago", "france", "american", "cornwall", "amsterdam",
        "cape cod", "capitalists", "britain", "indian", "middle west", "town",
        "soldier", "baltimore", "amerikan", "british",
    )),
    ("sci&tech", (
        "science", "computers", "intelligence", "medical", "health", "medical",
        "engineers", "automotives", "management", "bio", "nature", "mental",
        "social", "psychology", "archit", "reference", "compulsive", "medica",
        "vet", "auto", "cancer", "wonder",
    )),
    ("mystery", (
        "mystery", "crime", "haunted", "horror", "murder", "burglar",
        "accidents", "death", "assassins", 'conspiracies',
    )),
    ("e&b", ("econo", "business", "law")),
    ("etc", ("self-help", "audio", "brewing", "kindness", "avarice", 'curious')),
)


def categorize(categories):
    """Map a raw category value to a coarse top-level label.

    Each keyword is tested with ``keyword in categories``: a substring test
    when ``categories`` is a string, an element test when it is a list. The
    rules are tried in order and the first one that matches decides the label.

    Returns:
        The label of the first matching rule, or ``None`` when no keyword
        matches.
    """
    for label, keywords in _CATEGORY_RULES:
        for word in keywords:
            if word in categories:
                return label
    return None

# Derive the coarse top-level category for every row from its raw Category value.
book_filter_df["L_Category"] = book_filter_df["Category"].map(categorize)

In [88]:
# Preview the first ten rows, including the new L_Category column.
book_filter_df.head(n=10)

Unnamed: 0,user_id,age,rating,book_title,book_author,year_of_publication,img_m,Summary,Category,country,Language,L_Category
31,11676,34,9,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],,en,fiction
32,29526,26,9,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],usa,en,fiction
34,46398,37,9,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],usa,en,fiction
36,49635,34,9,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],usa,en,fiction
42,148712,34,10,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],usa,en,fiction
54,230522,52,7,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],usa,en,fiction
56,238557,21,10,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],malaysia,en,fiction
58,245827,34,9,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],,en,fiction
62,255661,34,5,the kitchen god's wife,amytan,1991,http://images.amazon.com/images/P/0399135782.0...,a chinese immigrant who is convinced she is dy...,['fiction'],usa,en,fiction
106,7346,49,9,the testament,johngrisham,1999,http://images.amazon.com/images/P/0440234743.0...,"a suicidal billionaire, a burnt-out washington...",['fiction'],usa,en,fiction


## 사용자별 상위 카테고리 찾기

In [272]:
# Per-user category frequencies: one row per (user_id, L_Category) pair with
# the number of rating rows the user has in that category.
user_category_df = book_filter_df[["user_id", "book_title", "L_Category"]].copy()
group_by_category_df = (
    user_category_df
    .groupby("user_id")["L_Category"]
    .value_counts()
    .reset_index()
)
# Spot-check a single user.
group_by_category_df.loc[group_by_category_df["user_id"] == 278843]

Unnamed: 0,user_id,L_Category,count
10543,278843,fiction,7
10544,278843,etc,1


In [90]:
# Row count per (user, category, title) triple, flattened back into a DataFrame
# with the tally in a "count" column.
group_by_df = (
    user_category_df
    .groupby(["user_id", "L_Category", "book_title"])
    .size()
    .rename("count")
    .reset_index()
)
# Spot-check a single user.
group_by_df.loc[group_by_df["user_id"] == 7346]


Unnamed: 0,user_id,L_Category,book_title,count
1089,7346,animals,night mare #06,1
1090,7346,child,the secret diary of adrian mole aged 13 3/4,1
1091,7346,culture,adventures of tom sawyer,1
1092,7346,culture,gift from the sea,1
1093,7346,culture,it's always something,1
...,...,...,...,...
1176,7346,sci&tech,cuckoo's egg,1
1177,7346,sci&tech,drinking : a love story,1
1178,7346,sci&tech,she said yes: the unlikely martyrdom of cassie...,1
1179,7346,sci&tech,the hot zone,1


In [285]:
# Wide table: one row per user, one column per L_Category; each cell holds the
# user's count for that category (0 where the user has none).
user_category_count_df = pd.pivot_table(
    group_by_category_df,
    index="user_id",
    columns="L_Category",
    values="count",
    fill_value=0,
)

# Column label of the largest count in each row = the user's top category.
top_category = user_category_count_df.idxmax(axis=1)
print(top_category)  # Series indexed by user_id

top_category_by_user_df = top_category.rename("top_L_category").reset_index()
print(top_category_by_user_df)
print(top_category_by_user_df.loc[top_category_by_user_df["user_id"] == 1025])

# Cross-check against the raw per-category counts of the same user.
print(book_filter_df.loc[book_filter_df["user_id"] == 1025, "L_Category"].value_counts())

user_id
243       fiction
254       fiction
383       fiction
388       fiction
487       fiction
           ...   
278418    fiction
278535    fiction
278582    fiction
278633    fiction
278843    fiction
Length: 4835, dtype: object
      user_id top_L_category
0         243        fiction
1         254        fiction
2         383        fiction
3         388        fiction
4         487        fiction
...       ...            ...
4830   278418        fiction
4831   278535        fiction
4832   278582        fiction
4833   278633        fiction
4834   278843        fiction

[4835 rows x 2 columns]
    user_id top_L_category
13     1025       sci&tech
L_Category
sci&tech    2
fiction     1
culture     1
etc         1
Name: count, dtype: int64


# 상위 카테고리에서 상위 10개 책 추출

In [187]:
# Distinct (category, title) pairs.
top_book_by_category = book_filter_df.loc[:, ["L_Category", "book_title"]].drop_duplicates()

# Mean rating of every title within its L_Category.
book_ratings = book_filter_df.groupby(
    ["L_Category", "book_title"], as_index=False
)["rating"].mean()
book_ratings

Unnamed: 0,L_Category,book_title,rating
0,animals,101 dalmatians,9.500000
1,animals,a dog's life,7.666667
2,animals,all things bright and beautiful,10.000000
3,animals,animal farm,8.090909
4,animals,chicken soup for the cat and dog lover's soul ...,8.357143
...,...,...,...
3640,sf,to ride a silver broomstick: new generation wi...,8.083333
3641,sf,voyage of the dawn treader,7.500000
3642,sf,white fang,8.000000
3643,sf,wicca: a guide for the solitary practitioner,8.222222


In [188]:
# Distinct top-level categories, in order of first appearance.
pd.unique(book_filter_df["L_Category"])

array(['fiction', 'culture', 'history', 'sci&tech', 'etc', 'child', 'sf',
       'animals', 'e&b', 'family', 'religion', 'mystery', 'nonfic'],
      dtype=object)

In [189]:
# Top-10 book titles per L_Category, ranked by mean rating (highest first).


def _top_titles(category, n=10):
    """Return the titles of the ``n`` best-rated books in one L_Category.

    Reads the module-level ``book_ratings`` table (columns ``L_Category``,
    ``book_title``, ``rating``), keeps the rows of ``category``, sorts them by
    ``rating`` in descending order and returns the first ``n`` titles as a
    Series.
    """
    in_category = book_ratings[book_ratings["L_Category"] == category]
    return in_category.sort_values(by="rating", ascending=False)[:n]["book_title"]


fiction_column = _top_titles("fiction")
culture_column = _top_titles("culture")
history_column = _top_titles("history")
# Bug fix: this previously filtered on "history", so the sci&tech list was a
# copy of the history list.
sci_tech_column = _top_titles("sci&tech")
etc_column = _top_titles("etc")
child_column = _top_titles("child")
sf_column = _top_titles("sf")
animals_column = _top_titles("animals")
e_b_column = _top_titles("e&b")
family_column = _top_titles("family")
religion_column = _top_titles("religion")
mystery_column = _top_titles("mystery")
nonfic_column = _top_titles("nonfic")

# Distinct L_Category values, in order of first appearance.
book_filter_df["L_Category"].drop_duplicates().values

array(['fiction', 'culture', 'history', 'sci&tech', 'etc', 'child', 'sf',
       'animals', 'e&b', 'family', 'religion', 'mystery', 'nonfic'],
      dtype=object)

In [190]:
# Lookup table for recommendations: one column per L_Category, rows 1..10 hold
# that category's titles ranked by mean rating (best first).
# NOTE: `columns` rebinds the name imported from streamlit at the top of the
# file; the name is kept so existing cell state stays the same.
columns = book_filter_df["L_Category"].drop_duplicates().values
top_book_by_category = pd.DataFrame(index=range(1, 11), columns=columns)
for col in columns:
    book = book_ratings[book_ratings["L_Category"] == col].sort_values(
        by="rating", ascending=False
    )[:10]["book_title"]
    # Index the titles by rank (1..len) so the assignment aligns on rank. A
    # category with fewer than 10 titles then leaves its trailing ranks as NaN
    # instead of raising a length-mismatch ValueError.
    top_book_by_category[col] = pd.Series(
        book.values, index=range(1, len(book) + 1), dtype=object
    )
top_book_by_category

Unnamed: 0,fiction,culture,history,sci&tech,etc,child,sf,animals,e&b,family,religion,mystery,nonfic
1,"that was then, this is now",new vegetarian: bold and beautiful recipes for...,growing up,pilgrim at tinker creek,a woman's worth,anne's house of dreams,the hobbit,all things bright and beautiful,nickel and dimed: on (not) getting by in america,from the mixed-up files of mrs. basil e. frank...,mere christianity: a revised and enlarged edit...,sleepers,when the wind blows
2,cheaper by the dozen,animal farm,stonehenge,among schoolchildren,pearl,matilda,the celestine prophecy,"old possum's book of practical cats, illustrat...",fish! a remarkable way to boost morale and imp...,ender's game (ender wiggins saga (paperback)),the choice,remember me,wings
3,rough justice,on the road (essential.penguin s.),anne frank: the diary of a young girl,geisha : a life,the blue day book,the member of the wedding,crossing over,101 dalmatians,one l : the turbulent true story of a first ye...,scarlet letter,mere christianity,bridge to terabithia,where the sidewalk ends : poems and drawings
4,robots and empire,best friends,"band of brothers : e company, 506th regiment, ...",the heart of a woman,writing down the bones,the neverending story,hatchet,every living thing,heart of the sea (irish trilogy),island of the blue dolphins,the invitation,and then there were none,zlata's diary: a child's life in sarajevo
5,hocus pocus,scientific progress goes 'boink': a calvin an...,a time to kill,one day my soul just opened up : 40 days and 4...,random acts of kindness,jane eyre,illusions,old yeller,self matters : creating your life from the ins...,anna karenina (oprah's book club),more than a carpenter,the dead zone,"old possum's book of practical cats, illustrat..."
6,green mars,the authoritative calvin and hobbes (calvin an...,beloved: a novel (plume contemporary fiction),red azalea,the gift of fear,"then again, maybe i won't",dragons of winter night,where the red fern grows,a civil action,star flight,this present darkness,haunted,go ask alice (avon/flare book)
7,sarum: the novel of england,calvin and hobbes,seabiscuit: an american legend,kitchen table wisdom: stories that heal,my side of the mountain,maus a survivors tale: my father bleeds history,peace is every step: the path of mindfulness i...,james herriot's cat stories,how to win friends and influence people,color purple,chicken soup for the christian soul (chicken s...,special delivery,falling up
8,first wives club,king lear,the grapes of wrath,october sky: a memoir,charlie and the chocolate factory,jane eyre (penguin classics),the bourne supremacy,felidae. roman.,master of the game,anna karenina,the purpose-driven life: what on earth am i he...,small sacrifices: a true story of passion and ...,chicken soup for the kid's soul : 101 stories ...
9,the rosewood casket,the far side gallery 4,night shift,reading lolita in tehran: a memoir in books,eight weeks to optimum health: a proven progra...,the bfg,"the gunslinger (the dark tower, book 1)",chicken soup for the cat and dog lover's soul ...,the 9 steps to financial freedom,killjoy,left behind: a novel of the earth's last days ...,murder on the orient express,chicken soup for the teenage soul ii (chicken ...
10,tulip fever,the calvin and hobbes tenth anniversary book,undaunted courage: meriwether lewis thomas jef...,a man named dave: a story of triumph and forgi...,a 5th portion of chicken soup for the soul : 1...,about a boy uk,dune (dune chronicles (berkley paperback)),unadulterated cat,dead man walking: an eyewitness account of the...,housekeeping,nine parts of desire: the hidden world of isla...,every breath you take : a true story of obsess...,never cry wolf


## 최종 추천 테이블 생성(user_id로 입력)

In [287]:
# Draft of the per-user lookup, tried on a single user (user_id 1025):
#   1. find the L_Category the user rated most often;
#   2. show the top-10 books of that category.
user_1025_categories = book_filter_df.loc[book_filter_df["user_id"] == 1025, "L_Category"]
max_category = user_1025_categories.value_counts().idxmax()

top_book_by_category[max_category]

1                               pilgrim at tinker creek
2                                  among schoolchildren
3                                       geisha : a life
4                                  the heart of a woman
5     one day my soul just opened up : 40 days and 4...
6                                            red azalea
7               kitchen table wisdom: stories that heal
8                                 october sky: a memoir
9           reading lolita in tehran: a memoir in books
10    a man named dave: a story of triumph and forgi...
Name: sci&tech, dtype: object

In [292]:
def get_top_category_book(user_id):
    """Recommend the top-rated books of the user's most frequent L_Category.

    Reads two module-level tables: ``book_filter_df`` (one row per rating,
    with ``user_id`` and ``L_Category`` columns) and ``top_book_by_category``
    (one column of ranked titles per L_Category).

    Args:
        user_id: Value to match against ``book_filter_df['user_id']``.

    Returns:
        The ``top_book_by_category`` column for the category that occurs most
        often among the user's rows.

    Raises:
        ValueError: If the user has no rows with a category in
            ``book_filter_df``.
    """
    user_categories = book_filter_df.loc[
        book_filter_df["user_id"] == user_id, "L_Category"
    ]
    category_counts = user_categories.value_counts()
    # Without this guard idxmax() fails with an opaque "argmax of an empty
    # sequence" ValueError; raise the same exception type with a clear message.
    if category_counts.empty:
        raise ValueError(f"no categorised ratings found for user_id={user_id!r}")

    max_category = category_counts.idxmax()
    return top_book_by_category[max_category]


get_top_category_book(1025)

1                               pilgrim at tinker creek
2                                  among schoolchildren
3                                       geisha : a life
4                                  the heart of a woman
5     one day my soul just opened up : 40 days and 4...
6                                            red azalea
7               kitchen table wisdom: stories that heal
8                                 october sky: a memoir
9           reading lolita in tehran: a memoir in books
10    a man named dave: a story of triumph and forgi...
Name: sci&tech, dtype: object