### 추천 시스템 

In [1]:
import numpy as np

user1 = np.array( [2,2,1,2] )
user2 = np.array( [1,1,2,1] )
user3 = np.array( [2,2,1,0] )

rMatrix = np.vstack( (user1, user2, user3))
print(rMatrix.shape)
rMatrix

(3, 4)


array([[2, 2, 1, 2],
       [1, 1, 2, 1],
       [2, 2, 1, 0]])

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
cosineSim = cosine_similarity(rMatrix)
print(cosineSim.shape)
cosineSim

(3, 3)


array([[1.        , 0.83862787, 0.83205029],
       [0.83862787, 1.        , 0.75592895],
       [0.83205029, 0.75592895, 1.        ]])

### 아이템 기반 협업 필터링

In [3]:
rMatrix_t = np.transpose(rMatrix)
rMatrix_t

array([[2, 1, 2],
       [2, 1, 2],
       [1, 2, 1],
       [2, 1, 0]])

In [4]:
cosineSim_t = cosine_similarity(rMatrix_t)
cosineSim_t

array([[1.        , 1.        , 0.81649658, 0.74535599],
       [1.        , 1.        , 0.81649658, 0.74535599],
       [0.81649658, 0.81649658, 1.        , 0.73029674],
       [0.74535599, 0.74535599, 0.73029674, 1.        ]])

### 아이템 기반 협업 필터링 기법을 활용한 영화 추천 시스템 만들기

In [7]:
import os
import pandas as pd
os.getcwd()

'C:\\Users\\toto\\Documents\\Github\\MachineLearning_Basic_Class\\class_code'

In [9]:
columns = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv("../data/ml-100k/u.data", sep='\t', names=columns)
print(df.shape)
print(df)

(100000, 4)
       user_id  item_id  rating  timestamp
0          196      242       3  881250949
1          186      302       3  891717742
2           22      377       1  878887116
3          244       51       2  880606923
4          166      346       1  886397596
...        ...      ...     ...        ...
99995      880      476       3  880175444
99996      716      204       5  879795543
99997      276     1090       1  874795795
99998       13      225       2  882399156
99999       12      203       3  879959583

[100000 rows x 4 columns]


### 2번째 데이터 셋

* u.item 
 * item_id : 영화 정보
 * movie title : 영화 제목
 * release date : 출시일
 * video release date : 비디오 출시일
 * IMDb URL : IMDb URL 정보
 * unkonwn, ... : 기타 장르 정보

In [10]:
columns = ['item_id', 'movie title', 'release date', 'video release date', 'IMDb URL', 
           'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 
           'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 
           'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 
           'Sci-Fi', 'Thriller', 'War', 'Western']

movies = pd.read_csv("../data/ml-100k/u.item", sep="|",
                     names=columns, encoding='latin-1')

print(movies.shape)
movies.head()


(1682, 24)


Unnamed: 0,item_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
print( len( df.item_id.unique() ) ) # 영화 리뷰 정보의 영화의 개수는 1682편
print( len( movies.item_id.unique() ) ) # 영화 정보의 영화의 개수는 1682편

1682
1682


In [13]:
# df : 10만개 리뷰 정보
# movies : 영화 정보 및 장르 정보
# df  -  movie_names (item_id, movie title)

# 두개 컬럼 선택
movie_names = movies[ ['item_id', 'movie title']]

# 두개의 데이터 셋을 병합
c_movies_data = pd.merge(df, movie_names, on='item_id')
print(c_movies_data.shape)
c_movies_data.head()

(100000, 5)


Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [16]:
print( len( c_movies_data['movie title'].unique() ) ) 

1664


### 미션 : 하나의 영화를 선택하고 관련 유사한 영화 10편을 추천해 주는 시스템을 만들어라.

In [15]:
rating_c = c_movies_data.pivot_table(values='rating',
                                     index='user_id',
                                     columns='movie title',
                                     fill_value=0)

print(rating_c.shape)
rating_c

(943, 1664)


movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2,5,0,0,3,4,0,0,...,0,0,0,5,3,0,0,0,4,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,2,0,0,0,0,4,0,0,...,0,0,0,4,0,0,0,0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
940,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
941,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
942,0,0,0,0,0,0,0,3,0,3,...,0,0,0,0,0,0,0,0,0,0


### 아이템 기반 협업 필터링을 위해 행열 바꾸기

In [17]:
X = rating_c.T
print(X.shape)

(1664, 943)


### 차원 축소

In [20]:
from sklearn.decomposition import TruncatedSVD

In [21]:
SVD = TruncatedSVD(n_components=12, random_state = 5)
r_matrix = SVD.fit_transform(X) # 943명-1664영화아이템 ->1664 X 943 -> 1664 X 12
print( r_matrix.shape )

(1664, 12)


In [22]:
corr_mat = np.corrcoef(r_matrix)
print( corr_mat.shape )
corr_mat

(1664, 1664)


array([[ 1.        , -0.11573577,  0.51362284, ...,  0.38310045,
         0.20193733,  0.5065142 ],
       [-0.11573577,  1.        ,  0.05820808, ...,  0.15805829,
         0.51795357,  0.27104818],
       [ 0.51362284,  0.05820808,  1.        , ...,  0.76575655,
         0.43824619,  0.19507139],
       ...,
       [ 0.38310045,  0.15805829,  0.76575655, ...,  1.        ,
         0.18043708,  0.12115972],
       [ 0.20193733,  0.51795357,  0.43824619, ...,  0.18043708,
         1.        ,  0.20126072],
       [ 0.5065142 ,  0.27104818,  0.19507139, ...,  0.12115972,
         0.20126072,  1.        ]])

### 유사 영화 찾기

* Star Wars (1977)을 좋아합니다.

In [24]:
rating_c.columns.get_loc('Star Wars (1977)')

1398

In [25]:
col_idx = rating_c.columns.get_loc('Star Wars (1977)')
corr_spec = corr_mat[col_idx]

result = pd.DataFrame( {'corr_specific':corr_spec, 'Movies':rating_c.columns })
result

Unnamed: 0,corr_specific,Movies
0,0.357238,'Til There Was You (1997)
1,0.421507,1-900 (1994)
2,0.593815,101 Dalmatians (1996)
3,0.722361,12 Angry Men (1957)
4,0.325221,187 (1997)
...,...,...
1659,0.669308,Young Guns II (1990)
1660,0.492406,"Young Poisoner's Handbook, The (1995)"
1661,0.331338,Zeus and Roxanne (1997)
1662,0.639006,unknown


In [27]:
result.sort_values('corr_specific', ascending=False).head(10)

Unnamed: 0,corr_specific,Movies
1398,1.0,Star Wars (1977)
1234,0.988052,Return of the Jedi (1983)
1460,0.942655,Terminator 2: Judgment Day (1991)
1523,0.933978,Toy Story (1995)
1461,0.931701,"Terminator, The (1984)"
1205,0.925185,Raiders of the Lost Ark (1981)
456,0.923562,"Empire Strikes Back, The (1980)"
570,0.915965,"Fugitive, The (1993)"
414,0.914299,Die Hard (1988)
44,0.892894,Aliens (1986)


### 101 달마시안 영화 좋아하는 사람에게 15편 추천

In [28]:
col_idx = rating_c.columns.get_loc('101 Dalmatians (1996)')
corr_spec = corr_mat[col_idx]

result = pd.DataFrame( {'corr_specific':corr_spec, 'Movies':rating_c.columns })
result.sort_values('corr_specific', ascending=False).head(15)

Unnamed: 0,corr_specific,Movies
2,1.0,101 Dalmatians (1996)
693,0.944203,Homeward Bound II: Lost in San Francisco (1996)
713,0.93253,"Hunchback of Notre Dame, The (1996)"
659,0.92215,Harriet the Spy (1996)
46,0.910804,All Dogs Go to Heaven 2 (1996)
805,0.903955,Kazaam (1996)
23,0.899279,"Adventures of Pinocchio, The (1996)"
435,0.899266,Dragonheart (1996)
764,0.890192,Jack (1996)
505,0.881306,Father of the Bride Part II (1995)


### 아이템 기반 협업 필터링 방식 - 코사인 유사도를 활용해 보기

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
cosine_mat = cosine_similarity(r_matrix)
print( cosine_mat.shape )
corr_mat

(1664, 1664)


array([[ 1.        , -0.11860848,  0.48002306, ...,  0.36854216,
         0.20260033,  0.47462744],
       [-0.11860848,  1.        , -0.02453875, ...,  0.08030375,
         0.50236826,  0.31092735],
       [ 0.48002306, -0.02453875,  1.        , ...,  0.79694267,
         0.41316297,  0.04725658],
       ...,
       [ 0.36854216,  0.08030375,  0.79694267, ...,  1.        ,
         0.1801474 ,  0.00395231],
       [ 0.20260033,  0.50236826,  0.41316297, ...,  0.1801474 ,
         1.        ,  0.18219915],
       [ 0.47462744,  0.31092735,  0.04725658, ...,  0.00395231,
         0.18219915,  1.        ]])

In [33]:
col_idx = rating_c.columns.get_loc('101 Dalmatians (1996)')
cosine_spec = cosine_mat[col_idx]

result = pd.DataFrame( {'cosine_sim':cosine_spec, 'Movies':rating_c.columns })
result.sort_values('cosine_sim', ascending=False).head(15)

Unnamed: 0,cosine_sim,Movies
2,1.0,101 Dalmatians (1996)
693,0.94606,Homeward Bound II: Lost in San Francisco (1996)
713,0.943881,"Hunchback of Notre Dame, The (1996)"
46,0.921926,All Dogs Go to Heaven 2 (1996)
23,0.916551,"Adventures of Pinocchio, The (1996)"
659,0.909108,Harriet the Spy (1996)
505,0.892762,Father of the Bride Part II (1995)
1547,0.892041,Twister (1996)
764,0.887689,Jack (1996)
532,0.884186,Flipper (1996)
