## 판다스를 활용한 추천 시스템 만들기

### 협업 필터링  

과거 사용자들의 데이터를 기반으로 취향이 비슷한 사용자를 찾아,
그 사용자가 선호한 아이템 중 아직 접하지 않은 새로운 아이템을 추천하는 방법

1) user & item 테이블 생성 (과거 구매 테이블)  
2) 테이블을 수치화 (사면 1, 안 사면 0; 이 노트북에서는 평점 1~5를 그대로 쓰고, 평가하지 않은 영화는 0으로 채움)  
3) user간의 유사도 계산  
4) 유사도가 높은 사람 찾아 아이템 가중치 계산  
5) 아이템 추천  

In [42]:
# Load the data
import os
import pandas as pd

# NOTE: machine-specific absolute path. The later cells read movies.dat and
# ratings.dat by bare file name, so they rely on this working-directory change.
os.chdir("C:/Users/hyelim/Desktop/ml-1m")
# '::' is a multi-character separator, so the Python parsing engine is requested.
# header=None: the file has no header row; columns are named in the next cell.
user = pd.read_csv('users.dat', sep='::', header=None, engine='python')

In [43]:
# Give the positional columns (0..4) of the users table descriptive names, in place.
user.rename(
    columns=dict(enumerate(['user_id', 'Gender', 'Age', 'occupation', 'zip-code'])),
    inplace=True,
)

In [44]:
# Display the users table to check the renamed columns.
user

Unnamed: 0,user_id,Gender,Age,occupation,zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [45]:
# Load the movie catalogue and name its three '::'-separated fields in one step.
# The encoding is given explicitly so decoding does not depend on the platform
# default. Latin-1 maps every byte, so the read cannot fail on non-ASCII titles.
# NOTE(review): assumes the stock MovieLens 1M file, which is Latin-1 encoded - confirm.
movie = pd.read_csv(
    'movies.dat',
    sep='::',
    header=None,
    names=['movie_id', 'title', 'genre'],
    engine='python',
    encoding='latin-1',
)

In [46]:
# Display the movies table (movie_id, title, genre).
movie

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [47]:
# Load the ratings file; the four '::'-separated fields are named directly at read time.
rating = pd.read_csv(
    'ratings.dat',
    sep='::',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    engine='python',
)

In [48]:
# Interpret the raw timestamp values as seconds since the epoch and store them as datetimes.
rating['timestamp'] = pd.to_datetime(rating['timestamp'], unit='s')

In [49]:
# Display the ratings table with the converted timestamp column.
rating

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,2000-12-31 22:12:40
1,1,661,3,2000-12-31 22:35:09
2,1,914,3,2000-12-31 22:32:48
3,1,3408,4,2000-12-31 22:04:35
4,1,2355,5,2001-01-06 23:38:11
...,...,...,...,...
1000204,6040,1091,1,2000-04-26 02:35:41
1000205,6040,1094,5,2000-04-25 23:21:27
1000206,6040,562,5,2000-04-25 23:19:06
1000207,6040,1096,4,2000-04-26 02:20:48


In [50]:
# Join each rating with the rater's profile. With no `on=` given, the merge key is
# the column shared by both frames (user_id). The result is only displayed, not stored.
pd.merge(rating, user)

Unnamed: 0,user_id,movie_id,rating,timestamp,Gender,Age,occupation,zip-code
0,1,1193,5,2000-12-31 22:12:40,F,1,10,48067
1,1,661,3,2000-12-31 22:35:09,F,1,10,48067
2,1,914,3,2000-12-31 22:32:48,F,1,10,48067
3,1,3408,4,2000-12-31 22:04:35,F,1,10,48067
4,1,2355,5,2001-01-06 23:38:11,F,1,10,48067
...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,2000-04-26 02:35:41,M,25,6,11106
1000205,6040,1094,5,2000-04-25 23:21:27,M,25,6,11106
1000206,6040,562,5,2000-04-25 23:19:06,M,25,6,11106
1000207,6040,1096,4,2000-04-26 02:20:48,M,25,6,11106


In [51]:
# Remove the timestamp column in place; the remaining cells only use user_id, movie_id and rating.
rating.drop('timestamp', axis=1, inplace=True)

In [52]:
# Display the ratings table after dropping the timestamp column.
rating

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [55]:
# Build the user x item matrix (rows: user_id, columns: movie_id, cells: rating).
# Usage: x.pivot(index='a', columns='b', values='c')
# pivot allows only one value per (index, column) coordinate;
# pivot_table instead stores the mean when a coordinate has two values.
# The arguments are passed by keyword because DataFrame.pivot rejects positional
# arguments on pandas 2.x.
# Movies a user has not rated are filled with 0.
basic = rating.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)

In [56]:
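#rating.set_index(['user_id','movie_id']).unstack().fillna(0)
# Same values as the pivot above, but the columns become a ('rating', movie_id)
# MultiIndex; use ...['rating'].unstack() to get identical column labels.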

In [62]:
# Compute user-to-user similarity.
# corr() correlates columns pairwise, so the matrix is transposed first to put
# users on the columns; the result is indexed by user_id on both axes.
basic_corr = basic.transpose().corr()

In [63]:
# Ten highest correlations in user 231's row; the user itself comes first with 1.0.
basic_corr.loc[231].nlargest(10)

user_id
231     1.000000
6022    0.460327
3170    0.456441
5931    0.409029
4982    0.401485
5396    0.399446
5445    0.397927
236     0.393447
3048    0.393039
431     0.391935
Name: 231, dtype: float64

In [70]:
# Extract the k users most similar to a given user (5 by default)
def close_user(user_id, k=5, corr=None):
    """Return the ``k`` users whose ratings correlate most strongly with ``user_id``.

    Parameters
    ----------
    user_id
        Label of the target user in the correlation matrix.
    k : int, default 5
        Number of neighbours to return.
    corr : pandas.DataFrame, optional
        User-by-user correlation matrix. Defaults to the module-level
        ``basic_corr``.

    Returns
    -------
    pandas.Series
        Correlations indexed by user id, sorted in descending order. The
        target user and users with an undefined (NaN) correlation are excluded.

    Raises
    ------
    KeyError
        If ``user_id`` is not a row label of the correlation matrix.
    """
    if corr is None:
        corr = basic_corr
    # Remove the target user by label rather than by skipping the first sorted
    # row: another user tied at correlation 1.0 could otherwise push the target
    # out of first place and the target would be returned as its own neighbour.
    others = corr.loc[user_id].drop(labels=user_id, errors='ignore')
    return others.dropna().nlargest(k)

In [71]:
# Five most similar users to user 3 (default k=5).
close_user(3)

user_id
3000    0.341801
479     0.339164
5691    0.335842
3500    0.327703
311     0.316425
Name: 3, dtype: float64

In [72]:
# Movies rated by user 6022 (a) versus movies rated by user 231 (b).
a = set(rating.loc[rating.user_id == 6022, 'movie_id'].values)
b = set(rating.loc[rating.user_id == 231, 'movie_id'].values)
# Movies that 6022 rated and 231 has not seen.
a.difference(b)

{110,
 246,
 296,
 1079,
 1222,
 1588,
 1961,
 2028,
 2115,
 2150,
 2371,
 2474,
 2791,
 2795,
 3039,
 3208,
 3210,
 3360,
 3392}

In [88]:
# Movie recommendation function
def best_close_movie_recommendation(user_id):
    """Recommend movies the most similar user rated 5 that ``user_id`` has not rated.

    Uses the module-level ``rating`` table and ``close_user`` to find the
    single closest neighbour.

    Parameters
    ----------
    user_id
        Id of the user to produce recommendations for.

    Returns
    -------
    set
        Movie ids the closest neighbour rated 5 and ``user_id`` has no rating
        for. Empty when no neighbour is available.
    """
    # Named `neighbours`, not `user`, so the module-level `user` table is not shadowed.
    neighbours = close_user(user_id, k=1).index
    if neighbours.empty:
        # No comparable user: nothing to recommend (avoids IndexError below).
        return set()
    neighbour_id = neighbours[0]

    neighbour_top_rated = (rating['user_id'] == neighbour_id) & (rating['rating'] == 5)
    movies = set(rating.loc[neighbour_top_rated, 'movie_id'].values)
    my_movie = set(rating.loc[rating['user_id'] == user_id, 'movie_id'].values)
    return movies - my_movie

In [89]:
# Recommendations for user 3, taken from its single closest neighbour.
best_close_movie_recommendation(3)

{32, 34, 144, 1193, 1199, 1307, 1653, 2968, 3358}