# 연관 분석

## 공용 코드

In [1]:
# 파이썬
# ≥3.5 필수
import sys
assert sys.version_info >= (3, 5)

# 공통 모듈 임포트
import numpy as np
import pandas as pd
import os

# 깔끔한 그래프 출력을 위해 %matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)


# 그림을 저장할 위치
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# 이미지를 저장할 디렉토리 생성
os.makedirs(IMAGES_PATH, exist_ok=True)

# 이미지 저장
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("그림 저장:", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

from matplotlib import font_manager, rc
import platform

path = "c:/Windows/Fonts/malgun.ttf"
if platform.system() == 'Darwin':
    rc('font', family='AppleGothic')
elif platform.system() == 'Windows':
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
    
    
mpl.rcParams['axes.unicode_minus'] = False
# Jupyter Notebook의 출력을 소수점 이하 3자리로 제한
%precision 3

# 그래픽 출력을 좀 더 고급화하기 위한 라이브러리
import seaborn as sns

# 과학 기술 통계 라이브러리
import scipy as sp
from scipy import stats

# 사이킷런 ≥0.20 필수
# 0.20 이상 버전에서 데이터 변환을 위한 Transformer 클래스가 추가됨
import sklearn
assert sklearn.__version__ >= "0.20"

# 노트북 실행 결과를 동일하게 유지하기 위해 시드 고정
# 데이터를 분할할 때 동일한 분할을 만들어 냄
np.random.seed(21)

## 컨텐츠 기반의 영화 추천 서비스

In [2]:
# 데이터 읽어오기
movies = pd.read_csv('./data/tmdb_5000_movies.csv')
print(movies.shape)
# (4803, 20) 4803 개 데이터와 20 개의 feature

(4803, 20)


In [3]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

### 필요한 피처만 추출

In [6]:
movies_df = movies[['id', 'title', 'genres', 'vote_average', 'vote_count', 
                    'popularity', 'keywords', 'overview']]
movies_df.info()
# genres 와 keywords 데이터 확인
print(movies_df[['genres', 'keywords']][:1])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            4803 non-null   int64  
 1   title         4803 non-null   object 
 2   genres        4803 non-null   object 
 3   vote_average  4803 non-null   float64
 4   vote_count    4803 non-null   int64  
 5   popularity    4803 non-null   float64
 6   keywords      4803 non-null   object 
 7   overview      4800 non-null   object 
dtypes: float64(2), int64(2), object(4)
memory usage: 300.3+ KB
                                              genres  \
0  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                            keywords  
0  [{"id": 1463, "name": "culture clash"}, {"id":...  


In [7]:
# 장르와 키워드 데이터는 문자열 자료형인데 구조는 python의 list임
# 이 때 사용하는 함수는 eval 이나 ast 패키지의 liter_eval 함수
from ast import literal_eval

movies_df['genres'] = movies_df['genres'].apply(literal_eval)
movies_df['keywords'] = movies_df['keywords'].apply(literal_eval)

# 필요로 하는 데이터는 장르와 키워드의 이름이므로
# genres 와 keywords 의 dict 에서 name 에 해당하는 값만 가져와서 list로 생성
movies_df['genres'] = movies_df['genres'].apply(lambda x : [y['name'] for y in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x : [y['name'] for y in x])

# 데이터 확인 - 배열로 생성됨
print(movies_df[['genres', 'keywords']][:1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['genres'] = movies_df['genres'].apply(literal_eval)


                                          genres  \
0  [Action, Adventure, Fantasy, Science Fiction]   

                                            keywords  
0  [culture clash, future, space war, space colon...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['keywords'] = movies_df['keywords'].apply(literal_eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['genres'] = movies_df['genres'].apply(lambda x : [y['name'] for y in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['keywords'] = movies_df['keywords'].apply(lamb

In [10]:
# 장르별 유사도 측정을 위해 피처 벡터 행렬을 생성

# 빈 공간 하나를 사이에 두는 데이터로 join
movies_df['genres_literal'] = movies_df['genres'].apply(lambda x : (' ').join(x))

# join 한 영화 장르에 대해 피처 벡터 생성
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(min_df = 0.0, ngram_range = (1, 2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])

print(genre_mat.shape) 
# (4803, 276)
# 4803개 영화에 대해 영화 장르는 276 가지

(4803, 276)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['genres_literal'] = movies_df['genres'].apply(lambda x : (' ').join(x))


In [13]:
# 코사인 유사도 측정

from sklearn.metrics.pairwise import cosine_similarity

# 장르 매트릭스끼리 유사도를 측정해서 유사도 행렬을 생성
# 자기 자신과 유사도를 측정했으므로 주 대각선은 전부 1
genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim[:3])
print(genre_sim.shape) # (4803, 4803)

[[1.    0.596 0.447 ... 0.    0.    0.   ]
 [0.596 1.    0.4   ... 0.    0.    0.   ]
 [0.447 0.4   1.    ... 0.    0.    0.   ]]
(4803, 4803)


In [18]:
# 장르의 유사도 정렬

genre_sim_sorted = genre_sim.argsort()[:, ::-1]
print(genre_sim_sorted.shape)
print(genre_sim_sorted[:3])
# 여기에서 출력되는 값은 유사도가 높은 행의 인덱스
# 자기 자신과 유사도를 측정했기 때문에 주 대각선은 자신의 인덱스

(4803, 4803)
[[   0 3494  813 ... 3038 3037 2401]
 [ 262    1  129 ... 3069 3067 2401]
 [   2 1740 1542 ... 3000 2999 2401]]


In [24]:
# 장르의 유사도를 이용한 영화 추천 함수
def find_sim_movie(df, sorted_ind, title, top_num = 10):
    # 제목을 가지고 영화 찾기
    movie_title = df[df['title'] == title]
    
    # 해당 제목의 영화 인덱스 찾기
    movie_index = movie_title.index.values
    
    # 유사도 행렬 찾아오기 - 지정 갯수만큼
    similar_index = sorted_ind[movie_index, :(top_num)]
    # 2차원을 1차원으로 변환
    similar_index = similar_index.reshape(-1)
    
    return df.iloc[similar_index]

In [29]:
# 영화 추천 테스트

# 영화 title 확인
#print(movies_df['title'])

# 영화 추천
similar_movies = find_sim_movie(movies_df, genre_sim_sorted, 'The Godfather', 15)
print(similar_movies[['title', 'genres', 'vote_average', 'popularity']])

                                               title          genres  \
2731                          The Godfather: Part II  [Drama, Crime]   
1243                                    Mean Streets  [Drama, Crime]   
3636                                   Light Sleeper  [Drama, Crime]   
1946  The Bad Lieutenant: Port of Call - New Orleans  [Drama, Crime]   
2640         Things to Do in Denver When You're Dead  [Drama, Crime]   
4065                                      Mi America  [Drama, Crime]   
1847                                      GoodFellas  [Drama, Crime]   
4217                                            Kids  [Drama, Crime]   
883                              Catch Me If You Can  [Drama, Crime]   
3866                                     City of God  [Drama, Crime]   
3112                         Blood Done Sign My Name  [Drama, Crime]   
4041                                 This Is England  [Drama, Crime]   
588                  Wall Street: Money Never Sleeps  [Drama, Cr

In [35]:
# 영화의 평점을 내림차순으로 정렬해서 15개 확인

movies_df[['title', 'vote_average', 'vote_count']].sort_values('vote_average'
                                                               , ascending = False)[:15]
# 영화 평점 투표 인원수가 적은데 평점은 높은 경우가 발생

Unnamed: 0,title,vote_average,vote_count
3519,Stiff Upper Lips,10.0,1
4247,Me You and Five Bucks,10.0,2
4045,"Dancer, Texas Pop. 81",10.0,1
4662,Little Big Top,10.0,1
3992,Sardaarji,9.5,2
2386,One Man's Hero,9.3,2
2970,There Goes My Baby,8.5,2
1881,The Shawshank Redemption,8.5,8205
2796,The Prisoner of Zenda,8.4,11
3337,The Godfather,8.4,5893


In [36]:
# 가중 평점 구하기

# 전체 영화들의 평균 평점
C = movies_df['vote_average'].mean()
# 영화 평점에 대한 최소 리뷰의 갯수
# 투표 인원수 중 상위 60% - 약 370명 이상
# 직접 설정해도 되고 여기처럼 비율로 설정해도 됨
m = movies_df['vote_count'].quantile(0.6)

print(C)
print(m)

6.092171559442016
370.1999999999998


In [40]:
# 가중 평점을 구하는 함수
def weighted_vote_average(record):
    v = float(record['vote_count'])
    R = float(record['vote_average'])
    weighted_avg = (v/(v+m)) * R + (m/(v+m)) * C
    
    return weighted_avg

In [49]:
# 가중 평점을 적용
movies_df['weighted_vote_average'] = movies_df.apply(weighted_vote_average, axis = 1)

# 가중 평점을 기준으로 상위 15개 영화 출력
movies_df[['title', 'vote_average', 'weighted_vote_average',
                'vote_count']].sort_values('weighted_vote_average', 
                                           ascending = False)[:15]
# 이전과 다르게 count 는 적으면서 평점만 높았던 영화가 사라짐

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['weighted_vote_average'] = movies_df.apply(weighted_vote_average, axis = 1)


Unnamed: 0,title,vote_average,weighted_vote_average,vote_count
1881,The Shawshank Redemption,8.5,8.396052,8205
3337,The Godfather,8.4,8.263591,5893
662,Fight Club,8.3,8.216455,9413
3232,Pulp Fiction,8.3,8.207102,8428
65,The Dark Knight,8.2,8.13693,12002
1818,Schindler's List,8.3,8.126069,4329
3865,Whiplash,8.3,8.123248,4254
809,Forrest Gump,8.2,8.105954,7927
2294,Spirited Away,8.3,8.105867,3840
2731,The Godfather: Part II,8.3,8.079586,3338


In [50]:
# 기존 방식에서 가중 평점 비교
movies_df[['title', 'vote_average', 'weighted_vote_average',
                'vote_count']].sort_values('vote_average', ascending = False)[:15]
# 투표 수가 적은 영화들은 가중 평점이 6점대로 매우 낮은 모습을 보임

Unnamed: 0,title,vote_average,weighted_vote_average,vote_count
3519,Stiff Upper Lips,10.0,6.102699,1
4247,Me You and Five Bucks,10.0,6.11317,2
4045,"Dancer, Texas Pop. 81",10.0,6.102699,1
4662,Little Big Top,10.0,6.102699,1
3992,Sardaarji,9.5,6.110483,2
2386,One Man's Hero,9.3,6.109409,2
2970,There Goes My Baby,8.5,6.10511,2
1881,The Shawshank Redemption,8.5,8.396052,8205
2796,The Prisoner of Zenda,8.4,6.158767,11
3337,The Godfather,8.4,8.263591,5893


In [53]:
# 장르의 유사도와 가중 평점을 이용한 영화 추천
# 지정 갯수의 2배만큼 가져와서 가중 평점을 기준으로 다시 추천
def find_sim_movie(df, sorted_ind, title, top_num = 10):
    movie_title = df[df['title'] == title]
    movie_index = movie_title.index.values
    
    # 유사도 행렬 찾아오기 - 지정 갯수의 2배만큼
    similar_index = sorted_ind[movie_index, :(top_num * 2)]
    similar_index = similar_index.reshape(-1)
    # 자기 자신은 제외
    similar_index = similar_index[similar_index != movie_index]
    
    return df.iloc[similar_index].sort_values('weighted_vote_average',
                                              ascending = False)[:top_num]

In [55]:
# 영화 추천 테스트
similar_movies = find_sim_movie(movies_df, genre_sim_sorted, 'The Godfather', 15)
similar_movies[['title', 'genres', 'vote_average', 'weighted_vote_average','popularity']]

Unnamed: 0,title,genres,vote_average,weighted_vote_average,popularity
1881,The Shawshank Redemption,"[Drama, Crime]",8.5,8.396052,136.747729
2731,The Godfather: Part II,"[Drama, Crime]",8.3,8.079586,105.792936
1847,GoodFellas,"[Drama, Crime]",8.2,7.976937,63.654244
3866,City of God,"[Drama, Crime]",8.1,7.759693,44.356711
1663,Once Upon a Time in America,"[Drama, Crime]",8.2,7.657811,49.336397
3887,Trainspotting,"[Drama, Crime]",7.8,7.591009,63.513324
883,Catch Me If You Can,"[Drama, Crime]",7.7,7.557097,73.944049
892,Casino,"[Drama, Crime]",7.8,7.42304,40.06688
281,American Gangster,"[Drama, Crime]",7.4,7.141396,42.361215
4041,This Is England,"[Drama, Crime]",7.4,6.739664,8.395624


## 아이템 기반 개인화된 영화 추천

In [57]:
# 데이터 가져오기

# 영화 정보
movies = pd.read_csv('./data/movielens/movies.csv')
# 평점 정보
ratings = pd.read_csv('./data/movielens/ratings.csv')

print(movies.shape)
print(ratings.shape)

(9742, 3)
(100836, 4)


In [59]:
# 데이터의 정보 확인
# 각 피처의 자료형, 결측값 여부 등을 확인

movies.info()
ratings.info()
# timestamp 가 영화 관람 시간이라면 유의미한 데이터이므로
# 연,월,일,시,계절 등으로 분리해서 활용
# 여기서는 리뷰 작성 시간이므로 유효한 데이터는 아님

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [70]:
# 일반적으로 각 피처의 unique 를 출럭
# 범주형인지 확인하고 범주형이라면 순서를 갖는지 확인해야 함
print(movies['movieId'].unique)
print(ratings['userId'].unique)

<bound method Series.unique of 0            1
1            2
2            3
3            4
4            5
         ...  
9737    193581
9738    193583
9739    193585
9740    193587
9741    193609
Name: movieId, Length: 9742, dtype: int64>
<bound method Series.unique of 0           1
1           1
2           1
3           1
4           1
         ... 
100831    610
100832    610
100833    610
100834    610
100835    610
Name: userId, Length: 100836, dtype: int64>


In [74]:
# ratings 수정
# timestamp 는 리뷰 작성 시간이므로 의미가 없어서 제거
ratings = ratings[['userId', 'movieId', 'rating']]
#ratings.info()
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [76]:
# 각 개인별(userId 별) 모든 영화에 대한 평점 테이블을 구성

ratings_mat = ratings.pivot_table('rating', index = 'userId',
                                 columns = 'movieId')
print(ratings_mat)

movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     NaN     4.0     NaN     NaN     4.0     NaN     NaN   
2           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
3           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
4           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
5           4.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
...         ...     ...     ...     ...     ...     ...     ...     ...   
606         2.5     NaN     NaN     NaN     NaN     NaN     2.5     NaN   
607         4.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
608         2.5     2.0     2.0     NaN     NaN     NaN     NaN     NaN   
609         3.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
610         5.0     NaN     NaN     NaN     NaN     5.0     NaN     NaN   

movieId  9       10     

In [77]:
# 테이블의 movieId 를 영화 제목으로 변경
# 영화 제목은 movies 에 존재함

# rating 와 movies 를 합치기 - JOIN
# on 매개변수를 사용해서 무엇을 기준으로 합칠 것인지 지정
rating_movies = pd.merge(ratings, movies, on = 'movieId')
ratings_matrix = rating_movies.pivot_table('rating', index = 'userId',
                                 columns = 'title')
print(ratings_matrix)
# 이제는 영화 id 가 아니라 영화 제목이 테이블에 나타남

title   '71 (2014)  'Hellboy': The Seeds of Creation (2004)  \
userId                                                        
1              NaN                                      NaN   
2              NaN                                      NaN   
3              NaN                                      NaN   
4              NaN                                      NaN   
5              NaN                                      NaN   
...            ...                                      ...   
606            NaN                                      NaN   
607            NaN                                      NaN   
608            NaN                                      NaN   
609            NaN                                      NaN   
610            4.0                                      NaN   

title   'Round Midnight (1986)  'Salem's Lot (2004)  \
userId                                                
1                          NaN                  NaN   
2              

In [82]:
# 테이블에 나타나는 NaN 데이터를 0으로 치환
# 데이터가 없는 경우 0이 되도록
ratings_matrix = ratings_matrix.fillna(0)
ratings_matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.5,3.5,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
# 영화간 유사도 산출 - 유사도를 계산할 항목이 인덱스가 돼야 함
rating_matrix_T = ratings_matrix.transpose()
rating_matrix_T

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,2.0
xXx: State of the Union (2005),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5
¡Three Amigos! (1986),4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
# 유사도 계산
item_sim = cosine_similarity(rating_matrix_T, rating_matrix_T)
#print(item_sim)
# 단점은 이름이 사라지고 numpy 의 ndarray 로 반환된다는 점

# DataFrame 으로 변환하고 컬럼과 인덱스에 이름 부여
item_sim_df = pd.DataFrame(item_sim, index = ratings_matrix.columns,
                         columns = ratings_matrix.columns)
#print(item_sim_df) # 확인

In [93]:
# 영화를 가지고 유사도가 가장 높은 영화를 추천
# 자기 자신과 유사도를 측정하면 1이므로 자신이 가장 높게 나옴
# 그래서 추천을 할 때는 자신을 제외하고 추천해야 함 - 0이 아닌 1번부터 
item_sim_df['Inception (2010)'].sort_values(ascending = False)[1:11]

title
Dark Knight, The (2008)          0.727263
Inglourious Basterds (2009)      0.646103
Shutter Island (2010)            0.617736
Dark Knight Rises, The (2012)    0.617504
Fight Club (1999)                0.615417
Interstellar (2014)              0.608150
Up (2009)                        0.606173
Avengers, The (2012)             0.586504
Django Unchained (2012)          0.581342
Departed, The (2006)             0.580849
Name: Inception (2010), dtype: float64

### 개인화된 평점 계산

In [94]:
def predict_rating(ratings_array, item_sim_array):
    ratings_pred = ratings_array.dot(item_sim_array) / np.array([np.abs(item_sim_array).sum(axis = 1)])
    
    return ratings_pred

In [95]:
# 예측 평점 계산

ratings_pred = predict_rating(ratings_matrix.values, item_sim_df.values)
ratings_pred_matrix = pd.DataFrame(data = ratings_pred, index = ratings_matrix.index,
                                  columns = ratings_matrix.columns)
ratings_pred_matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.070345,0.577855,0.321696,0.227055,0.206958,0.194615,0.249883,0.102542,0.157084,0.178197,...,0.113608,0.181738,0.133962,0.128574,0.006179,0.212070,0.192921,0.136024,0.292955,0.720347
2,0.018260,0.042744,0.018861,0.000000,0.000000,0.035995,0.013413,0.002314,0.032213,0.014863,...,0.015640,0.020855,0.020119,0.015745,0.049983,0.014876,0.021616,0.024528,0.017563,0.000000
3,0.011884,0.030279,0.064437,0.003762,0.003749,0.002722,0.014625,0.002085,0.005666,0.006272,...,0.006923,0.011665,0.011800,0.012225,0.000000,0.008194,0.007017,0.009229,0.010420,0.084501
4,0.049145,0.277628,0.160448,0.206892,0.309632,0.042337,0.130048,0.116442,0.099785,0.097432,...,0.051269,0.076051,0.055563,0.054137,0.008343,0.159242,0.100941,0.062253,0.146054,0.231187
5,0.007278,0.066951,0.041879,0.013880,0.024842,0.018240,0.026405,0.018673,0.021591,0.018841,...,0.009689,0.022246,0.013360,0.012378,0.000000,0.025839,0.023712,0.018012,0.028133,0.052315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.246832,1.293006,0.752661,0.935924,1.032354,0.407038,0.516819,0.594590,0.489913,0.408661,...,0.227092,0.405114,0.291452,0.276143,0.055006,0.636525,0.510522,0.346652,0.550174,0.893777
607,0.052248,0.305255,0.180669,0.218834,0.179443,0.115288,0.165817,0.075548,0.104890,0.109500,...,0.059516,0.135763,0.093843,0.086643,0.003707,0.144222,0.134705,0.107674,0.156614,0.576894
608,0.324435,1.022541,0.598467,0.425468,0.349562,0.494081,0.529903,0.227746,0.480980,0.442384,...,0.276586,0.594918,0.457094,0.444436,0.038681,0.616733,0.717768,0.538586,0.527639,0.698871
609,0.004835,0.053593,0.026251,0.000000,0.002827,0.015528,0.017849,0.007791,0.013172,0.014981,...,0.006575,0.014368,0.010334,0.007742,0.000000,0.018070,0.015600,0.013108,0.018328,0.033377


In [100]:
# 원본 데이터와 예측 평점 계산 이후 데이터의 차이 확인

from sklearn.metrics import mean_squared_error

# 예측 값과 실제 값의 차이를 구하는 함수
def get_mse(pred, actual_data):
    # 0이 아닌 데이터만 가져와서 1차원으로 변환
    pred = pred[actual_data.nonzero()].flatten()
    actual_data = actual_data[actual_data.nonzero()].flatten()
    
    return mean_squared_error(pred, actual_data)

# MSE 는 오차를 제곱한 값으로 스케일이 다름에 유의
mse_data = get_mse(ratings_pred, ratings_matrix.values)
print('MSE :', mse_data)
print('RMSE :', np.sqrt(mse_data))

MSE : 9.895354759094706
RMSE : 3.1456882806620725


In [111]:
# 유사도가 높은 영화만을 가지고 예측을 수행하는 함수
def predict_ratings_top_sim(ratings_array, item_sim_array, num = 20):
    pred = np.zeros(ratings_array.shape)
    
    for col in range(ratings_array.shape[1]):
        # 정렬한 다음 유사도가 가장 높은 자신을 제외하고 나머지 n개 index 반환
        top_n_items = [np.argsort(item_sim_array)[:, col][: -num-1 : -1]]
        
        for row in range(ratings_array.shape[0]):
            pred[row, col] = item_sim_array[col, :][top_n_items].dot(ratings_array[row, :][top_n_items].T)
            pred[row, col] /= np.sum(np.abs(item_sim_array[col, :][top_n_items]))
            
    return pred

In [113]:
# 새로운 함수를 가지고 오차 비교
ratings_pred = predict_ratings_top_sim(ratings_matrix.values, 
                                      item_sim_df.values, num = 20)

mse_data = get_mse(ratings_pred, ratings_matrix.values)
print('MSE :', mse_data)
print('RMSE :', np.sqrt(mse_data))

  pred[row, col] /= np.sum(np.abs(item_sim_array[col, :][top_n_items]))


KeyboardInterrupt: 

In [None]:
# 계산된 예측 평점 데이터를 DataFrame 으로 변환
ratings_pred_matrix = pd.DataFrame(data = ratings_pred,
                                  index = ratings_matrix.index,
                                  columns = ratings_matrix.columns)

# 유저 정보를 가지고 영화를 추천
# 사용자 id 가 숫자이므로 임의로 9를 사용
user_rating_id = ratings_matrix.loc[9,:]
user_rating_id[user_rating_id > 0].sort_values(ascending = False)[:10]
# 이 방식의 문제점은 이미 봤던 영화도 출력된다는 점


In [None]:
# 유저가 아직 보지 않은 영화 리스트를 만들어주는 함수
def get_unseen_movies(ratings_matrix, userId):
    # 유저 id를 가지고 평점 정보 가져오기
    user_rating = ratings_matrix.loc[usdrId, :]
    
    # 평점이 0을 초과하는 데이터는 이미 본 영화들임
    # 해당하는 영화들의 인덱스를 리스트로 가져옴
    already_seen_movies = user_rating[user_rating > 0].index.tolist()
    
    # 이미 본 영화 리스트에 포함되는 영화를 제외하고 리스트를 가져옴
    movies_list = ratings_matrix.columns.tolist()
    unseen_movies_list = [movie for movie in movies_list if movie not in already_seen_movies]
    
    return unseen_moveis_list

In [None]:
# 유저가 보지 않은 영화 중에서 예측 평점이 높은 영화를 추천하는 함수
def recommend_movie_by_userid(pred_df, userid, unseen_movies_list, top_n = 10):
    recommend_movies = pred_df.loc[userid, 
                                   unseen_movies_list].sort_values(ascendinmg = False)[top:n]
    
    return recommend_movies

In [None]:
# 사용자가 보지 않은 영화 확인

# 이전에 사용했던 userid 인 9를 이어서 사용
# 본 여화보다 보지 않은 영화가 훨씬 많음
unseen_moveis_list = get_unseen_movies(ratings_matrix, 9)

In [None]:
# 보지 않은 영화들 중에서 예측 평점이 높은 영화를 가져오기

recommend_movies = recommend_movie_by_userid(ratings_pred_matrix, 9,
                                            unseen_movie_list, top_n = 10)

# 보기 편하도록 DataFrame 으로 정리
recommend_movies = pd.DataFrame(data = recommend_movies.values,
                               index = recommend_movies.index,
                               columns = ['pred_score'])
print(recommend_movies)

## 행렬 분해

In [122]:
#from numpy.linalg import svd

# svd 는 2차원에 대해서만 적용
A = np.array([[3, -1], [1, 3], [1, 1]])
# 2차원 배열에 대해 svd를 적용
U, S, Vt = svd(A)

print(U)
print(S)
print(Vt)

[[-4.082e-01  8.944e-01 -1.826e-01]
 [-8.165e-01 -4.472e-01 -3.651e-01]
 [-4.082e-01 -1.943e-16  9.129e-01]]
[3.464 3.162]
[[-0.707 -0.707]
 [ 0.707 -0.707]]


In [123]:
# 분해된 행렬을 복원
print(U @ np.diag(S, 1)[:, 1:] @ Vt)

[[ 3. -1.]
 [ 1.  3.]
 [ 1.  1.]]


In [125]:
# 0을 포함하는 행렬에 대해 행렬 분해

A = np.array([[3, -1], [1, 3], [1, 0]])
U, S, Vt = svd(A)

print(U)
print(S)
print(Vt)

print(U @ np.diag(S, 1)[:, 1:] @ Vt)
# 원래 0이었던 위치가 다른 값으로 채워져 있음
# 이를 활용해서 Null 인 데이터를 채워넣는데 활용

[[-9.045e-01  3.162e-01 -2.860e-01]
 [-3.015e-01 -9.487e-01 -9.535e-02]
 [-3.015e-01 -1.082e-17  9.535e-01]]
[3.317 3.162]
[[-1. -0.]
 [-0. -1.]]
[[ 3.00e+00 -1.00e+00]
 [ 1.00e+00  3.00e+00]
 [ 1.00e+00  3.42e-17]]


## 행렬 분해를 이용한 잠재 요인 협업 추천 알고리즘 구현

### 평가 지표를 계산하는 함수 
### 원래는 meas_squared_error 를 제공하는데 rmse 를 사용하기 위해

In [126]:
from sklearn.metrics import mean_squared_error
import numpy as np

# 원래 데이터를 행렬 분해한 데이터를 복원한 데이터와 비교해서
# 차이를 구해주는 함수
# P와 Q는 행렬 분해가 된 데이터들이고
# non_zeros 는 0이 아닌 데이터의 인덱스
def get_rmse(R, P, Q, non_zeors):
    error = 0
    full_pred_matrix = np.dot(P, Q.T)
    
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeors]
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeors]
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]
    
    # 행렬 분해되어 있던 데이터를 원래 행렬로 복원
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]
    
    # 복원된 데이터와 원래의 데이터를 가지고 mse 를 계산
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    # mse 에 제곱근
    rmse = np.sqrt(mse)
    
    return rmse

In [127]:
# 행렬 분해 함수
# 실제 딥러닝 모델에서는 이미 구현이 되어 있어서 필요 x
# steps 는 반복 횟수, learning_rate 는 학습률,
# r_lambda 는 정규화에 사용할 가중치 값
def matrix_factorization(R, K, steps = 200, learning_rate = 0.01,
                        r_lambda = 0.01):
    num_users, num_items = R.shape
    
    np.random.seed(21)
    
    # item 이 영화에 해당
    P = np.random.nromal(scale = 1.0/K, size = (num_users, K))
    Q = np.random.nromal(scale = 1.0/K, size = (num_items, K))
    
    # 중간에 멈춘 갯수 파악
    break_counts = 0
    
    # 행렬에서 값이 0이 아닌 데이터의 인덱스 리스트를 생성
    # 유저와 영화를 순회하는데 그 평점이 0이 아닌 경우를 가져옴
    non_zeros = [(i, j, R[i, j]) 
                 for i in range(num_users) 
                     for j in range(num_items) if R[i, j] > 0]
    
    # steps 횟수만큼 가중치를 수정(업데인트) 하면서 학습
    # rmse 는 계속 작아지게 됨 
    # 이론상 0으로 만들 수 있지만 0이 되면 과적합
    # 원래 값인 r 에서 인덱스 i, j 를 활용해서 빼기 적용
    for step in range(stpes):
        for i, j, r in non_zeors:
            eji = r - np.dot(P[i, :], Q[j, :].T)
            # 학습률에 따라 P와 Q를 수정
            # 계산의 결과로 나온 오류를 가지고 다시 행렬을 바꾸는 가중치 업데이트
            # 이를 역전파라고 함
            P[i, :] = P[i, :] + learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
            Q[i, :] = Q[i, :] + learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])
            
        rmse = get_rmse(R, P, Q, non_zeors)
        # 10번 반복마다 rmse 를 출력해서 개선되는지 파악
        if(step % 10) == 0:
            print('Iteration step :', step, 'rmse :', rmse)
    return P, Q