### 경사하강법 이용행렬분해 예제 
행렬분해과정
- 1) 𝑃, 𝑄 를임의의값들을가진행렬로초기화한다
- 2) 𝑃, 𝑄𝑇를행렬곱하여 መ 𝑅을생성한다
- 3) 𝑅과 መ 𝑅을빼손실을구한다
     - 이때손실은𝑅의값이존재하는부분들과 መ 𝑅예측값들간의차이를의미한다
- 4) 𝑃, 𝑄𝑇를손실이최소화하는방향으로업데이트하며, 수렴할때까지작업을반복한다

1) 원본행렬 생성

In [1]:
import numpy as np

# 원본 사용자 - 아이템 행렬 생성
R = np.array([[4, np.NaN, np.NaN, 2, np.NaN],
              [np.NaN, 5, np.NaN, 3, 1],
             [np.NaN, np.NaN, 3, 4, 4],
              [5, 2, 1, 2, np.NaN]])

num_users, num_items = R.shape
K=3 # 잠재요인의 개수(차원)

In [2]:
# P, Q를 임의의 값들을 가진 행렬로 초기화

np.random.seed(1)
P = np.random.normal(scale = 1./K, size = (num_users, K))
Q = np.random.normal(scale = 1./K, size = (num_items, K))

In [3]:
P

array([[ 0.54144845, -0.2039188 , -0.17605725],
       [-0.35765621,  0.28846921, -0.76717957],
       [ 0.58160392, -0.25373563,  0.10634637],
       [-0.08312346,  0.48736931, -0.68671357]])

In [4]:
Q

array([[-0.1074724 , -0.12801812,  0.37792315],
       [-0.36663042, -0.05747607, -0.29261947],
       [ 0.01407125,  0.19427174, -0.36687306],
       [ 0.38157457,  0.30053024,  0.16749811],
       [ 0.30028532, -0.22790929, -0.04096341]])

RMSE 측정 함수 생성

In [5]:
# 손실은 𝑅의 값이 존재하는 부분들과 መ 𝑅예측값들 간의 차이를 의미한다

# 실제 행렬과 예측행렬 오차를 구하는 함수 생성
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    error = 0
    
    # 2) 예측행렬 생성 - 𝑃, 𝑄𝑇를 행렬곱해서 𝑅을 생성.
    full_pred_matrix = np.dot(P, Q.T)
    
    # R에서 NaN이 아닌 값의 위치 인덱스를 추출
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]
    
    # NaN이 아닌 값들만을 R과 full_pred_matrix에서 추출
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]
    
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    rmse = np.sqrt(mse)
    
    return rmse
    

경사하강법이용 행렬분해실시

In [6]:
# 원본행렬에서 결측값이 아닌 값의 행, 열 , 실제 값을 튜플형태로 리스트안에 담기.
non_zeros = [(i, j, R[i, j]) for i in range(num_users) for j in range(num_items) if R[i, j] >0]

steps = 1000 #훈련횟수
learning_rate = 0.01
r_lambda = 0.01

for step in range(steps):
    for i, j, r in non_zeros:
        eij = r - np.dot(P[i, :], Q[j,:].T) # i번째 행, j번째 열 값의 오차.
        
        # L2정규화를 적용한 SGD기법 이용 P,Q 업데이트
        P[i,:] = P[i,:] + learning_rate * (eij * Q[j,:] - r_lambda * P[i,:])
        Q[j,:] = Q[j,:] + learning_rate * (eij * P[i,:] - r_lambda * Q[j,:])
    
    # 손실을 측정
    rmse = get_rmse(R, P, Q, non_zeros) # 매 스탭이 끝날 때마다 rmse값 산출
    
    if(step% 50) ==0:
        print('### iteration step: {} rmse: {}'.format(step, rmse))

### iteration step: 0 rmse: 3.2388050277987723
### iteration step: 50 rmse: 0.4876723101369647
### iteration step: 100 rmse: 0.15643403848192475
### iteration step: 150 rmse: 0.07455141311978038
### iteration step: 200 rmse: 0.04325226798579314
### iteration step: 250 rmse: 0.029248328780878977
### iteration step: 300 rmse: 0.022621116143829396
### iteration step: 350 rmse: 0.01949363619652524
### iteration step: 400 rmse: 0.018022719092132586
### iteration step: 450 rmse: 0.017319685953442663
### iteration step: 500 rmse: 0.016973657887570895
### iteration step: 550 rmse: 0.016796804595895595
### iteration step: 600 rmse: 0.016701322901884613
### iteration step: 650 rmse: 0.01664473691247672
### iteration step: 700 rmse: 0.016605910068210078
### iteration step: 750 rmse: 0.016574200475704973
### iteration step: 800 rmse: 0.01654431582921599
### iteration step: 850 rmse: 0.016513751774735196
### iteration step: 900 rmse: 0.016481465738194947
### iteration step: 950 rmse: 0.016447171683

In [7]:
# 경사하강법이용 행렬분해예측 결과

pred_matrix = np.dot(P, Q.T)
print('예측행렬: \n', np.round(pred_matrix, 3))

예측행렬: 
 [[3.991 0.897 1.306 2.002 1.663]
 [6.696 4.978 0.979 2.981 1.003]
 [6.677 0.391 2.987 3.977 3.986]
 [4.968 2.005 1.006 2.017 1.14 ]]


In [8]:
# 원본행렬 출력 결과
print(R)

# SGD 기법을 이용한 행렬분해를 통해 - > 손실을 최소화 하는 방향으로 학습

[[ 4. nan nan  2. nan]
 [nan  5. nan  3.  1.]
 [nan nan  3.  4.  4.]
 [ 5.  2.  1.  2. nan]]


## 콘텐츠기반필터링실습
- 영화 데이터 세트
- TMDB 5000 Movie Dataset
https://www.kaggle.com/tmdb/tmdb-movie-metadata

In [41]:
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings('ignore')

movies = pd.read_csv('./tmdb_5000_movies.csv')
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [51]:
movies_df = movies[['id','title', 'genres', 'vote_average', 'vote_count',
                 'popularity', 'keywords', 'overview']]
movies_df.head()

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,keywords,overview
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...",7.2,11800,150.437577,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp...","In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, ..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {""id"": 28, ""name"": ""Action""}]",6.9,4500,139.082615,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""name"": ""drug abuse""}, {""id"": 911, ""name"": ""exotic is...","Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of t..."
2,206647,Spectre,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 80, ""name"": ""Crime""}]",6.3,4466,107.376788,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name"": ""based on novel""}, {""id"": 4289, ""name"": ""secret...",A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. Whil...
3,49026,The Dark Knight Rises,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""name"": ""Crime""}, {""id"": 18, ""name"": ""Drama""}, {""id"": ...",7.6,9106,112.31295,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853, ""name"": ""crime fighter""}, {""id"": 949, ""name"": ""te...","Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's c..."
4,49529,John Carter,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 878, ""name"": ""Science Fic...",6.1,2124,43.926995,"[{""id"": 818, ""name"": ""based on novel""}, {""id"": 839, ""name"": ""mars""}, {""id"": 1456, ""name"": ""medal...","John Carter is a war-weary, former military captain who's inexplicably transported to the myster..."


In [52]:
pd.set_option('max_colwidth', 100)
movies_df[['genres', 'keywords']][:1]
# 리스트(list)내부에 여러 개의 딕셔너리(dict)가 있는 형태

Unnamed: 0,genres,keywords
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp..."


In [53]:
from ast import literal_eval
# 문자열을 이 문자열이 의미하는 list [dict1,dict2] 객체로 만들 수 있다.

movies_df['genres'] = movies_df['genres'].apply(literal_eval)
movies_df['keywords'] = movies_df['keywords'].apply(literal_eval)

In [58]:
movies_df['genres'] = movies_df['genres'].apply(lambda x : [ y['name'] for y in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x : [ y['name'] for y in x])
movies_df[['genres', 'keywords']][:1]

Unnamed: 0,genres,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa..."


장르 콘텐츠 유사도 측정
- Genres 칼럼을기반으로 하는 콘텐츠기반필터링
    1. 문자열로 변환된 genres 칼럼을Count 기반으로 피처벡터화 변환 (CountVectorizer이용)
    2. Genres 문자열을 피처 백터화 행렬로 변환한데이터 세트를 코사인 유사도를 통해비교
        --> 데이터 세트의 레코드별로 타레코드와 장르에서 코사인유사도 값을 가지는 객체를 생성

    3. 장르 유사도가 높은 영화중에 평점이 높은순으로 영화를 추천


In [63]:
# 1. countvectorizer로 피처 벡터 행렬 생성

from sklearn.feature_extraction.text import CountVectorizer

movies_df['genres_literal'] = movies_df['genres'].apply(lambda x: (' ').join(x))
count_vect = CountVectorizer(min_df=0, ngram_range=(1,2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])

genre_mat.shape

(4803, 276)

In [66]:
#  2. 코사인 유사도계산 ->기준 행과 비교행의 코사인 유사도를 행렬형태로 변환

from sklearn.metrics.pairwise import cosine_similarity

genre_sim = cosine_similarity(genre_mat, genre_mat)
genre_sim[:4]


array([[1.        , 0.59628479, 0.4472136 , ..., 0.        , 0.        ,
        0.        ],
       [0.59628479, 1.        , 0.4       , ..., 0.        , 0.        ,
        0.        ],
       [0.4472136 , 0.4       , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.12598816, 0.16903085, 0.3380617 , ..., 0.12598816, 0.        ,
        0.        ]])

In [67]:
# 유사도가 높은 순으로 정리된 genre_sim 객체의 인덱스값 얻기

genre_sim_sorted_ind = genre_sim.argsort()[:, ::-1]
genre_sim_sorted_ind[:1]
# 0번 레코드는 자기 자신.

array([[   0, 3494,  813, ..., 3038, 3037, 2401]], dtype=int64)

In [70]:
# 3. 장르 유사도에 따라 영화를 추천하는 함수 생성

def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    title_movie = df[df['title'] == title_name]
    
    title_index = title_movie.index.values # 인덱스 객체를 ndarray로 반환
    similar_indexes = sorted_ind[title_index, :(top_n)] #유사도 순으로 top_n개의 index 추출
     
    print(similar_indexes)
    similar_indexes = similar_indexes.reshape(-1) #top_n index는2차원 데이터여서 index로사용하기 위해1차원 array로변경
    
    return df.iloc[similar_indexes]

In [71]:
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather', 10)
similar_movies[['title', 'vote_average']]

# 어울리지않는 영화도많고, 평점이 0인 영화도 존재 -> 개선이 필요하다.

[[2731 1243 3636 1946 2640 4065 1847 4217  883 3866]]


Unnamed: 0,title,vote_average
2731,The Godfather: Part II,8.3
1243,Mean Streets,7.2
3636,Light Sleeper,5.7
1946,The Bad Lieutenant: Port of Call - New Orleans,6.0
2640,Things to Do in Denver When You're Dead,6.7
4065,Mi America,0.0
1847,GoodFellas,8.2
4217,Kids,6.8
883,Catch Me If You Can,7.7
3866,City of God,8.1


In [73]:
# 영화의 평점에 따라 필터링

movies_df[['title', 'vote_average', 'vote_count']].sort_values('vote_average', ascending=False)[:10]

# 평점에 평가횟수를 반영할 필요가 있음.

Unnamed: 0,title,vote_average,vote_count
3519,Stiff Upper Lips,10.0,1
4247,Me You and Five Bucks,10.0,2
4045,"Dancer, Texas Pop. 81",10.0,1
4662,Little Big Top,10.0,1
3992,Sardaarji,9.5,2
2386,One Man's Hero,9.3,2
2970,There Goes My Baby,8.5,2
1881,The Shawshank Redemption,8.5,8205
2796,The Prisoner of Zenda,8.4,11
3337,The Godfather,8.4,5893


In [75]:
C = movies_df['vote_average'].mean()
m = movies_df['vote_average'].quantile(0.6) # 상위60%값 추출
print('C: ',round(C, 3), 'm: ', round(m, 3))

C:  6.092 m:  6.5


In [79]:
# 기존평점을 새로운 가중평점으로 변경하는 함수 생성

percentile = 0.6
C = movies_df['vote_average'].mean()
m = movies_df['vote_average'].quantile(percentile)

def weighted_vote_average(record):
    v = record['vote_count']
    R = record['vote_average']
    
    # 영화평점사이트인 IMDB에서 제공하는 가중평점 방식
    return ( (v/(v+m))* R) + ( (m/(m+v)) * C)
    
movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis =1)
movies_df['weighted_vote'].head()

0    7.199390
1    6.898835
2    6.299698
3    7.598924
4    6.099976
Name: weighted_vote, dtype: float64

In [80]:
# 새롭게 부여된weighted_vote 평점이 높은 순으로 상위 10개의 영화 추출
movies_df[['title','vote_average','weighted_vote','vote_count']].sort_values('weighted_vote',
                                                                          ascending=False)[:10]

Unnamed: 0,title,vote_average,weighted_vote,vote_count
1881,The Shawshank Redemption,8.5,8.498094,8205
3337,The Godfather,8.4,8.397457,5893
662,Fight Club,8.3,8.298476,9413
3232,Pulp Fiction,8.3,8.298299,8428
1818,Schindler's List,8.3,8.29669,4329
3865,Whiplash,8.3,8.296632,4254
2294,Spirited Away,8.3,8.296269,3840
2731,The Godfather: Part II,8.3,8.295709,3338
65,The Dark Knight,8.2,8.198859,12002
809,Forrest Gump,8.2,8.198273,7927


In [81]:
# 새롭게 정의된 평점 기준에 따라 영화 추천

def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    title_movie = df[df['title']==title_name]
    title_index = title_movie.index.values
    
    similar_indexes = sorted_ind[title_index, :(top_n*2)] # top_n의 2배에 해당하는 장르 유사성이 높은 인덱스 추출
    similar_indexes = similar_indexes.reshape(-1)
    
    similar_indexes = similar_indexes[similar_indexes != title_index]
    
    return df.iloc[similar_indexes].sort_values('weighted_vote', ascending=False)[:top_n]


similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather', 10)
similar_movies[['title', 'vote_average', 'weighted_vote']]

Unnamed: 0,title,vote_average,weighted_vote
2731,The Godfather: Part II,8.3,8.295709
1847,GoodFellas,8.2,8.195629
1663,Once Upon a Time in America,8.2,8.187261
3866,City of God,8.1,8.092831
883,Catch Me If You Can,7.7,7.697251
281,American Gangster,7.4,7.394365
4041,This Is England,7.4,7.376994
1243,Mean Streets,7.2,7.179514
2839,Rounders,6.9,6.888214
1149,American Hustle,6.8,6.798365
