In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the user table; the CSV is resolved relative to the notebook's working directory.
U = pd.read_csv(filepath_or_buffer = "Users.csv")

In [264]:
# Load the book (item) table; later cells look books up by its ISBN column.
I = pd.read_csv(filepath_or_buffer = "Books.csv")

In [8]:
# Load the rating table (one row per User-ID / ISBN / Book-Rating).
R = pd.read_csv(filepath_or_buffer = "Ratings.csv")

In [3]:
# Peek at the first rows of the raw user table.
U.head(n = 5)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [4]:
# Nation = last comma-separated field of Location.
# Fields are separated by ", ", so split(',') leaves a leading space on every
# field but the first; strip it so values compare equal to e.g. 'usa', not ' usa'.
U['Nation'] = U.Location.str.split(',').str[-1].str.strip()

In [5]:
# City1 = first comma-separated field of Location.
U['City1'] = U['Location'].str.split(',').str.get(0)

In [6]:
# City2 = second-to-last comma-separated field of Location (NaN when Location
# has fewer than two fields). Strip the space left behind by the ", " separator.
U['City2'] = U.Location.str.split(',').str[-2].str.strip()

In [7]:
# Check the derived Nation / City1 / City2 columns.
U.head(n = 5)

Unnamed: 0,User-ID,Location,Age,Nation,City1,City2
0,1,"nyc, new york, usa",,usa,nyc,new york
1,2,"stockton, california, usa",18.0,usa,stockton,california
2,3,"moscow, yukon territory, russia",,russia,moscow,yukon territory
3,4,"porto, v.n.gaia, portugal",17.0,portugal,porto,v.n.gaia
4,5,"farnborough, hants, united kingdom",,united kingdom,farnborough,hants


In [9]:
# Peek at the first rows of the raw rating table.
R.head(n = 5)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [220]:
# Number of rating rows per user, as a two-column frame (User-ID, User_cnt).
R_u_cnt = R.groupby('User-ID')['ISBN'].size().rename('User_cnt').reset_index()

In [226]:
# Keep only users with at least 15 rating rows.
more15_user = R_u_cnt.loc[R_u_cnt['User_cnt'] >= 15]

구매 이력이 15번 이상인 User 집단만 추출

In [230]:
# Number of rating rows per book, as a two-column frame (ISBN, ISBN_cnt).
R_i_cnt = R.groupby('ISBN')['ISBN'].size().rename('ISBN_cnt').reset_index()

In [231]:
# Keep only books with at least 15 rating rows.
more15_book = R_i_cnt.loc[R_i_cnt['ISBN_cnt'] >= 15]

구매 이력이 15번 이상인 도서(ISBN)만 추출

In [232]:
# Keep rating rows whose user AND book both passed the >= 15 thresholds.
# NOTE: this rebinds R, so re-running the count cells above afterwards
# would count the filtered rows instead of the raw ones.
R = R.loc[R['User-ID'].isin(more15_user['User-ID']) & R['ISBN'].isin(more15_book['ISBN'])]

In [233]:
# (distinct users, distinct books) remaining after the filter.
(R['User-ID'].nunique(), R['ISBN'].nunique())

(8930, 8429)

데이터 필터링 후 User는 8,930명, 도서는 8,429권이다.

In [237]:
# User x Book rating matrix (rows: User-ID, columns: ISBN, cells: Book-Rating,
# NaN where the user has no rating for the book).
# Use a real aggregation ('mean') instead of the identity lambda: the lambda
# returns the whole group rather than a scalar, which only works while every
# (User-ID, ISBN) pair occurs exactly once and breaks on duplicate rating rows.
# For unique pairs the mean equals the single rating.
R_ct = pd.crosstab(index = R['User-ID'], columns = R['ISBN'], values = R['Book-Rating'], aggfunc = 'mean')

In [238]:
# Inspect the top of the User x Book matrix.
R_ct.head(n = 5)

ISBN,0006493580,000649840X,0006512135,0006514006,0006514855,0006547834,0006550576,0006550681,0006550789,0007108265,...,8472236552,8478884459,8478884955,8495501198,8495618605,8497593588,8806143042,8817125539,8873122933,950491036X
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,,,,,,,,,,,...,,,,,,,,,,
242,,,,,,,,,,,...,,,,,,,,,,
243,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,
383,,,,,,,,,,,...,,,,,,,,,,


데이터가 매우 sparse한 것을 알 수 있다. 이를 해결하기 위해 협력 필터링 기법과 사회연결망 기법을 결합한 분석이 시도된 바 있다(Shin et al., 2012).

In [239]:
# Mark "no rating for this book" with -1 (real ratings shown above are >= 0).
# Rebind instead of mutating in place so the cell reads as a plain assignment.
R_ct = R_ct.fillna(-1)

아직 사용하거나 구매하지 않은 Item에 대해서는 -1로 결측치를 채운다. 

In [44]:
from sklearn.metrics.pairwise import cosine_similarity

In [240]:
# Pairwise cosine similarity between the rows (users) of R_ct.
R_cos_sim = cosine_similarity(X = R_ct)

In [241]:
# Expect a square (n_users, n_users) matrix.
np.shape(R_cos_sim)

(8930, 8930)

가장 기본적인 방법으로, User-Item Cross Table에서 코사인 유사도 기준 상위 5명의 User가 구매한 도서 중 해당 User가 아직 구매하지 않은 Item을 추천한다.

In [333]:
def get_recommendation_by_top_5(user_idx, top_n = 5) :
    """Recommend books held by the users most similar to one target user.

    Reads three notebook-level globals: ``R_ct`` (User x ISBN matrix in which
    -1 marks "no record"), ``R_cos_sim`` (user-user similarity matrix whose
    row order matches ``R_ct``) and ``I`` (book table with an ``ISBN`` and a
    ``Book-Title`` column).

    Parameters
    ----------
    user_idx : int
        Positional row index into ``R_ct`` / ``R_cos_sim`` (not the User-ID label).
    top_n : int, default 5
        How many of the most similar *other* users to draw candidates from.

    Returns
    -------
    pandas.DataFrame
        Rows of ``I`` (columns at positions 1-4) for every book that at least
        one of the ``top_n`` neighbours has a record for and the target user
        does not.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.

    Side effects
    ------------
    Prints the titles of the books the target user already has.
    """
    if top_n < 0 :
        raise ValueError('top_n must be non-negative')

    target_user = R_ct.iloc[user_idx]
    target_user_buyed = target_user[target_user != -1]
    user_buyed_titles = I[I.ISBN.isin(target_user_buyed.index)]['Book-Title']

    # Rank users from most to least similar and drop the target user explicitly.
    # Relying on "self is always the last argsort entry" is wrong when another
    # user ties with the target at the maximum similarity.
    self_pos = user_idx % len(R_ct)  # normalise negative positional indices
    by_similarity = np.argsort(R_cos_sim[user_idx], kind = 'stable')[::-1]
    neighbour_idx = by_similarity[by_similarity != self_pos][:top_n]
    neighbour_data = R_ct.iloc[neighbour_idx]

    # A book counts as held when any neighbour has a value other than the -1
    # placeholder; this does not depend on how many neighbours were found.
    bought_by_any = (neighbour_data != -1).any(axis = 0)
    buyed = neighbour_data.columns[bought_by_any.to_numpy()]

    rec_books_idx = buyed.difference(target_user_buyed.index)
    rec_books_titles = I[I.ISBN.isin(rec_books_idx)].iloc[:, [1,2,3,4]]
    print(f'User{R_ct.index[user_idx]}가 구매한 도서 목록은 {user_buyed_titles.values}')
    return rec_books_titles

In [334]:
# Example: recommendations for the user stored at row position 1 of R_ct.
get_recommendation_by_top_5(user_idx = 1)

User242가 구매한 도서 목록은 ['Wild Animus' 'The Martian Chronicles']


Unnamed: 0,Book-Title,Book-Author,Year-Of-Publication,Publisher
1532,Adventures of Huckleberry Finn (Dover Thrift E...,Mark Twain,1994,Dover Publications
2230,Fahrenheit 451,RAY BRADBURY,1987,Del Rey
3078,Call of the Wild,Jack London,0,Selldowns/no More Orders
9175,A Connecticut Yankee in King Arthur's Court (B...,Mark Twain,1994,Bantam Books
9415,Heaven,V.C. Andrews,1985,Pocket
10296,Don't Stand Too Close to a Naked Man,Tim Allen,1994,Hyperion Books
12137,The Adventures of Huckleberry Finn (Bantam Cla...,MARK TWAIN,1981,Bantam
12731,The Illustrated Man (Grand Master Editions),RAY BRADBURY,1983,Spectra
13095,Dandelion Wine (Grand Master Editions),RAY BRADBURY,1985,Bantam
20747,The Adventures of Tom Sawyer (Adventures of To...,MARK TWAIN,1995,Bantam
