## 확률적 경사 하강법 이용한 행렬 분해

L2 규제를 반영해 실제 R 행렬 값과 예측 R 행렬 값의 차이를 최소화

In [2]:
import numpy as np

#원본 행렬 R 생성, 분해 행렬 P와 Q 초기화, 잠재 요인 차원 K는 3으로 설정
R = np.array([[4, np.NaN, np.NaN, 2, np.NaN],
             [np.NaN, 5, np.NaN, 3, 1],
            [np.NaN, np.NaN, 3, 4, 4],
            [5, 2, 1, 2, np.NaN]])
num_users, num_items = R.shape
K=3

#P와 Q 행렬의 크기 지정하고 정규 분포를 가진 임의의 값으로 입력
np.random.seed(1)
P = np.random.normal(scale=1./K, size=(num_users, K))
Q = np.random.normal(scale=1./K, size=(num_items, K))

In [3]:
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    error=0
    #두개의 분해된 행렬 P와 Q.T의 내적으로 예측 행렬 R 생성
    full_pred_matrix = np.dot(P, Q.T)
    
    #실제 R 행렬에서 널이 아닌 값의 위치 인덱스 추출해 실제 R 행렬과 예측 R 행렬의 RMSE 추출
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    rmse = np.sqrt(mse)
    
    return rmse

In [4]:
#R에서 널 값을 제외한 데이터의 행 위치, 열 위치, 값을 non_zeros 리스트에 저장
non_zeros = [(i, j, R[i, j]) for i in range(num_users) for j in range(num_items) if R[i, j] > 0]

#step: SGD 반복해서 업데이트할 횟수, learning_rate: 학습률, r_lambda: L2 Regularization 계수
steps = 1000
learning_rate = 0.01
r_lambda = 0.01

#SGD 기법으로 P와 Q 매트릭스 계속 업데이트
for step in range(steps):
    for i, j, r in non_zeros:
        #실제 값과 예측 값의 차이인 오류 값 구함
        eij = r - np.dot(P[i, :], Q[j, :].T)
        #Regularization을 반영한 SGD 업데이트 공식 적용
        P[i, :] = P[i, :] + learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
        Q[j, :] = Q[j, :] + learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])
        
        rmse = get_rmse(R, P, Q, non_zeros)
        if (step % 50) == 0:
            print('### iteration step:', step, ", rmse:", rmse)

### iteration step: 0 , rmse: 3.261355059488935
### iteration step: 0 , rmse: 3.26040057174686
### iteration step: 0 , rmse: 3.253984404542389
### iteration step: 0 , rmse: 3.2521583839863624
### iteration step: 0 , rmse: 3.252335303789125
### iteration step: 0 , rmse: 3.251072196430487
### iteration step: 0 , rmse: 3.2492449982564864
### iteration step: 0 , rmse: 3.247416477570409
### iteration step: 0 , rmse: 3.241926055455223
### iteration step: 0 , rmse: 3.2400454107613084
### iteration step: 0 , rmse: 3.240166740749792
### iteration step: 0 , rmse: 3.2388050277987723
### iteration step: 50 , rmse: 0.5003190892212749
### iteration step: 50 , rmse: 0.5001616291326989
### iteration step: 50 , rmse: 0.49899601202578087
### iteration step: 50 , rmse: 0.498848345014583
### iteration step: 50 , rmse: 0.4989518925663175
### iteration step: 50 , rmse: 0.49833236830090993
### iteration step: 50 , rmse: 0.49841484893787
### iteration step: 50 , rmse: 0.49792599580240865
### iteration step: 5

### iteration step: 700 , rmse: 0.01665720134529714
### iteration step: 700 , rmse: 0.016472928381641265
### iteration step: 700 , rmse: 0.016452412570473487
### iteration step: 700 , rmse: 0.016138379086448103
### iteration step: 700 , rmse: 0.0162699937479048
### iteration step: 700 , rmse: 0.016352885085045565
### iteration step: 700 , rmse: 0.016605910068210022
### iteration step: 750 , rmse: 0.01660906046895527
### iteration step: 750 , rmse: 0.016708562969098305
### iteration step: 750 , rmse: 0.016569153528341686
### iteration step: 750 , rmse: 0.016493367054249863
### iteration step: 750 , rmse: 0.01660702796687087
### iteration step: 750 , rmse: 0.01662368102752532
### iteration step: 750 , rmse: 0.016441927271724627
### iteration step: 750 , rmse: 0.016420802465343626
### iteration step: 750 , rmse: 0.016104179990850738
### iteration step: 750 , rmse: 0.016236628551953007
### iteration step: 750 , rmse: 0.016320141009291966
### iteration step: 750 , rmse: 0.016574200475704817

In [5]:
pred_matrix = np.dot(P, Q.T)
print('예측 행렬')
print(np.round(pred_matrix, 3))

예측 행렬
[[3.991 0.897 1.306 2.002 1.663]
 [6.696 4.978 0.979 2.981 1.003]
 [6.677 0.391 2.987 3.977 3.986]
 [4.968 2.005 1.006 2.017 1.14 ]]


## 콘텐츠 기반 필터링 실습 - TMDB 5000 영화 데이터 세트

**장르 속성을 이용해 콘텐츠 기반 필터링 - 장르 칼럼 값의 유사도 비교한 뒤 그 중 높은 평점 가지는 영화 추천**

### 데이터 로딩 및 가공

In [7]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

movies = pd.read_csv('data/tmdb_5000_movies.csv')
print(movies.shape)
movies.head(3)

(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466


In [8]:
movies_df = movies[['id', 'title', 'genres', 'vote_average', 'vote_count', 'popularity', 'keywords', 'overview']]

In [9]:
pd.set_option('max_colwidth', 100)
movies_df[['genres', 'keywords']][:1]

Unnamed: 0,genres,keywords
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp..."


In [12]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            4803 non-null   int64  
 1   title         4803 non-null   object 
 2   genres        4803 non-null   object 
 3   vote_average  4803 non-null   float64
 4   vote_count    4803 non-null   int64  
 5   popularity    4803 non-null   float64
 6   keywords      4803 non-null   object 
 7   overview      4800 non-null   object 
dtypes: float64(2), int64(2), object(4)
memory usage: 300.3+ KB


In [10]:
from ast import literal_eval
movies_df['genres'] = movies_df['genres'].apply(literal_eval)
movies_df['keywords'] = movies_df['keywords'].apply(literal_eval)

In [13]:
movies_df['genres'] = movies_df['genres'].apply(lambda x : [y['name'] for y in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x : [y['name'] for y in x])
movies_df[['genres', 'keywords']][:3]

Unnamed: 0,genres,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa..."
1,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india trading company, love of one's life, traitor, ship..."
2,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi6, british secret service, united kingdom]"


### 장르 콘텐츠 유사도 측정

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

#genres 칼럼을 문자열로 변환한 뒤, CountVectorizer 이용해 피처 벡터 행렬로
#공백문자로 word 단위가 구분되는 문자열 반환
movies_df['genres_literal'] = movies_df['genres'].apply(lambda x:(' ').join(x))
count_vect = CountVectorizer(min_df=0, ngram_range=(1, 2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])
print(genre_mat.shape)

(4803, 276)


In [15]:
movies_df['genres_literal'][:3]

0    Action Adventure Fantasy Science Fiction
1                    Adventure Fantasy Action
2                      Action Adventure Crime
Name: genres_literal, dtype: object

In [16]:
#코사인 유사도 계산
from sklearn.metrics.pairwise import cosine_similarity

genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)
print(genre_sim[:1])

(4803, 4803)
[[1.         0.59628479 0.4472136  ... 0.         0.         0.        ]]


In [19]:
#유사도가 높은 순으로 정리된 genre_sim 객체의 비교 행 위치 인덱스 값
genre_sim_sorted_ind = genre_sim.argsort()[:, ::-1]
print(genre_sim_sorted_ind[:1])

[[   0 3494  813 ... 3038 3037 2401]]


### 장르 콘텐츠 필터링을 이용한 영화 추천

In [20]:
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    #인자로 입력된 movies_df에서 title 칼럼이 입력된 title_name 값 추출
    title_movie = df[df['title']==title_name]
    
    #title_name을 가진 DataFrame의 index 객체를 ndarray로 반환하고
    #sorted_ind 인자로 입력된 genre_sim_sorted_ind 객체에서 유사도 순으로 top_n개의 index 추출
    title_index = title_movie.index.values
    similar_indexes = sorted_ind[title_index, :top_n]
    
    #추출된 top_n_index 출력, 2차원임
    #DataFrame에서 index로 사용하기 위해서 1차원으로 변경
    print(similar_indexes)
    similar_indexes = similar_indexes.reshape(-1)
    
    return df.iloc[similar_indexes]

In [21]:
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather', 10)
similar_movies[['title', 'vote_average']]

[[2731 1243 3636 1946 2640 4065 1847 4217  883 3866]]


Unnamed: 0,title,vote_average
2731,The Godfather: Part II,8.3
1243,Mean Streets,7.2
3636,Light Sleeper,5.7
1946,The Bad Lieutenant: Port of Call - New Orleans,6.0
2640,Things to Do in Denver When You're Dead,6.7
4065,Mi America,0.0
1847,GoodFellas,8.2
4217,Kids,6.8
883,Catch Me If You Can,7.7
3866,City of God,8.1


In [22]:
#좀 더 많은 후보군 선정한 뒤에 영화의 평점에 따라 필터링해 최종 추천해야함
#평점은 높지만 vote_count가 적어서 왜곡된 데이터 존재
movies_df[['title', 'vote_average', 'vote_count']].sort_values('vote_average', ascending=False)[:10]

Unnamed: 0,title,vote_average,vote_count
3519,Stiff Upper Lips,10.0,1
4247,Me You and Five Bucks,10.0,2
4045,"Dancer, Texas Pop. 81",10.0,1
4662,Little Big Top,10.0,1
3992,Sardaarji,9.5,2
2386,One Man's Hero,9.3,2
2970,There Goes My Baby,8.5,2
1881,The Shawshank Redemption,8.5,8205
2796,The Prisoner of Zenda,8.4,11
3337,The Godfather,8.4,5893


In [23]:
#가중 평점 = (v/(v+m))*R + (m/(v+m))*C
#v: 'vote_count', R: 'vote_average', m: 평점 부여하기 위한 최소 투표 횟수, C: 전체 영화 평균 평점
C = movies_df['vote_average'].mean()
m = movies_df['vote_count'].quantile(0.6) #전체 투표 횟수에서 상위 60%에 해당하는 횟수를 기준으로
print('C:', round(C, 3), 'm:', round(m, 3))

C: 6.092 m: 370.2


In [24]:
percentile = 0.6
C = movies_df['vote_average'].mean()
m = movies_df['vote_count'].quantile(percentile)

def weighted_vote_average(record):
    v = record['vote_count']
    R = record['vote_average']
    
    return (v/(v+m))*R + (m/(v+m))*C

movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis=1) 

In [25]:
movies_df[['title', 'vote_average', 'vote_count', 'weighted_vote']].sort_values('weighted_vote', ascending=False)[:10]

Unnamed: 0,title,vote_average,vote_count,weighted_vote
1881,The Shawshank Redemption,8.5,8205,8.396052
3337,The Godfather,8.4,5893,8.263591
662,Fight Club,8.3,9413,8.216455
3232,Pulp Fiction,8.3,8428,8.207102
65,The Dark Knight,8.2,12002,8.13693
1818,Schindler's List,8.3,4329,8.126069
3865,Whiplash,8.3,4254,8.123248
809,Forrest Gump,8.2,7927,8.105954
2294,Spirited Away,8.3,3840,8.105867
2731,The Godfather: Part II,8.3,3338,8.079586


In [26]:
#top_n의 2배수만큼 후보로 선정하고 weight_vote 칼럼이 높은순으로 top_n만큼 추출
def find_sim_movie(df, sorted_ind, title_name, top_n=10):

    title_movie = df[df['title']==title_name]
    title_index = title_movie.index.values
    
    similar_indexes = sorted_ind[title_index, :(top_n*2)]
    similar_indexes = similar_indexes.reshape(-1)
    #기준 영화 인덱스 제외
    similar_indexes = similar_indexes[similar_indexes != title_index]
    
    # top_n의 2배에 해당하는 후보군에서 weighted_vote 높은 순으로 top_n 만큼 추출 
    return df.iloc[similar_indexes].sort_values('weighted_vote', ascending=False)[:top_n]

similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather', 10)
similar_movies[['title', 'vote_average', 'weighted_vote']]

Unnamed: 0,title,vote_average,weighted_vote
2731,The Godfather: Part II,8.3,8.079586
1847,GoodFellas,8.2,7.976937
3866,City of God,8.1,7.759693
1663,Once Upon a Time in America,8.2,7.657811
883,Catch Me If You Can,7.7,7.557097
281,American Gangster,7.4,7.141396
4041,This Is England,7.4,6.739664
1149,American Hustle,6.8,6.717525
1243,Mean Streets,7.2,6.626569
2839,Rounders,6.9,6.530427
