In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np

### 1. 데이터 읽기
- MovieLens 100K 데이터는 3가지 파일로 구성
    1. 사용자 데이터: u.user
    2. 영화에 대한 데이터: u.item
    3. 영화 평점 데이터: u.data

In [98]:
# Load u.user (user data) into a DataFrame indexed by user_id.
import os
import pandas as pd

base_src = 'data/'
u_user_src = os.path.join(base_src, 'u.user')
u_user_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']

# Pipe-delimited file; column names are supplied explicitly via `names`.
users = pd.read_csv(
    u_user_src,
    sep='|',
    names=u_user_cols,
    encoding='latin-1',
).set_index('user_id')
users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [99]:
# Load u.item (movie information) into a DataFrame indexed by movie_id.
u_item_src = os.path.join(base_src, 'u.item')

# Descriptive fields first, followed by one column per genre.
u_item_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Pipe-delimited file; column names are supplied explicitly via `names`.
movies = pd.read_csv(
    u_item_src,
    sep='|',
    names=u_item_cols,
    encoding='latin-1',
).set_index('movie_id')

movies.head()

Unnamed: 0_level_0,title,release date,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [100]:
# Load u.data (rating data) into a DataFrame indexed by user_id.
u_data_src = os.path.join(base_src, 'u.data')
u_data_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

# Tab-delimited file; column names are supplied explicitly via `names`.
ratings = pd.read_csv(
    u_data_src,
    sep='\t',
    names=u_data_cols,
    encoding='latin-1',
).set_index('user_id')
ratings.head()

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596


### 2. 인기제품 방식
- 개별 사용자의 특성을 고려하지 않고, 단순히 가장 인기있는(best-seller) 제품을 추천하는 방법
- 개별 사용자 정보가 적고, 간단한 추천 제공 필요시 각 제품 평가의 평균을 구해 순서대로 제시

In [101]:
# Popularity-based (best-seller) recommendation function
def recom_movie(n_items, min_ratings=1):
    """Return the `n_items` movies with the highest mean rating.

    Reads the notebook-level `ratings` frame (needs `movie_id` and `rating`
    columns) and the `movies` frame (needs `movie_id`, as index or column,
    and `title`).

    Parameters
    ----------
    n_items : int
        Number of movies to return.
    min_ratings : int, default 1
        Minimum number of ratings a movie must have to be eligible. The
        default keeps every rated movie, which matches the original
        behaviour; a larger value stops movies rated only once or twice
        from dominating the ranking.

    Returns
    -------
    pandas.DataFrame
        Indexed by `movie_id`, with columns `title` and `rating` (the mean
        rating), ordered from highest to lowest mean.
    """
    # Mean rating and number of ratings per movie.
    movie_stats = ratings.groupby('movie_id')['rating'].agg(['mean', 'count'])
    eligible_mean = movie_stats.loc[movie_stats['count'] >= min_ratings, 'mean']

    # Top-n movie ids by mean rating; the Series is named `rating` for the output column.
    top_rated = eligible_mean.sort_values(ascending=False).head(n_items).rename('rating')

    # Look up titles only for the selected movies instead of merging the whole table.
    # reset_index()/set_index() makes this work whether movie_id is the index or a column.
    titles = movies.reset_index().set_index('movie_id')['title']
    recommendations = top_rated.to_frame().join(titles)[['title', 'rating']]

    return recommendations

In [102]:
# Show the 15 movies with the highest mean rating.
recom_movie(15)

Unnamed: 0_level_0,title,rating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
814,"Great Day in Harlem, A (1994)",5.0
1599,Someone Else's America (1995),5.0
1201,Marlene Dietrich: Shadow and Light (1996),5.0
1122,They Made Me a Criminal (1939),5.0
1653,Entertaining Angels: The Dorothy Day Story (1996),5.0
1293,Star Kid (1997),5.0
1500,Santa with Muscles (1996),5.0
1189,Prefontaine (1997),5.0
1536,Aiqing wansui (1994),5.0
1467,"Saint of Fort Washington, The (1993)",5.0


In [103]:
# RMSE between the actual ratings and the predicted values; used to evaluate
# the best-seller predictions over the 100,000 movie ratings.
def RMSE(pred, true):
    """Return the root-mean-square error between `pred` and `true`.

    Both arguments may be any array-like (list, tuple, NumPy array, pandas
    Series); they are converted to float arrays and compared element-wise by
    position.
    """
    # Coerce first so plain Python sequences work (list - list raises TypeError).
    pred = np.asarray(pred, dtype=float)
    true = np.asarray(true, dtype=float)
    return np.sqrt(np.mean((true - pred) ** 2))

In [104]:
# Example: one of three predictions is off by 2, so RMSE = sqrt(4 / 3).
pred, true = np.array([1, 2, 3]), np.array([1, 2, 5])

RMSE(pred, true)

1.1547005383792515

In [105]:
def RMSE(pred, true):
    """Root-mean-square error between `pred` and `true`.

    Both inputs are converted with `np.array`, so lists and pandas Series are
    accepted and compared element-wise by position.
    """
    residuals = np.array(true) - np.array(pred)
    return np.sqrt(np.mean(residuals ** 2))

# Compute the RMSE of each user's movie ratings, then average over all users.
# `ratings` is expected to be indexed by user_id at this point.
rmse = []
movie_mean = ratings.groupby(['movie_id'])['rating'].mean()

# One groupby pass over the user_id index replaces the repeated ratings.loc[user] lookups.
for _, user_ratings in ratings.groupby(level=0):
    y_true = user_ratings['rating'].to_numpy()
    # Best-seller prediction: the overall mean rating of each movie the user rated.
    y_pred = movie_mean.loc[user_ratings['movie_id']].to_numpy()
    rmse.append(RMSE(y_pred, y_true))

print('Best-Seller Method\'s RMSE: {}'.format(np.mean(rmse)))

Best-Seller Method's RMSE: 0.996007224010567


### 3. 사용자 집단별 추천
- 집단을 나누기 위한 변수 설정이 중요 (ex - 남성/여성)
- 아래에서는 남성과 여성별로 Best-Seller 방식의 추천 알고리즘 적용

In [106]:
# 0. Preprocessing: expose the ids as ordinary columns and keep only the
# fields used by the following cells.
# Every step is guarded so that re-running this cell leaves the frames
# unchanged (an unguarded reset_index() would add a stray `index` column and
# a second drop of `timestamp` would raise).
if users.index.name == 'user_id':
    users = users.reset_index()      # user_id is no longer kept as the index
if movies.index.name == 'movie_id':
    movies = movies.reset_index()
if ratings.index.name == 'user_id':
    ratings = ratings.reset_index()

ratings = ratings.drop(columns=['timestamp'], errors='ignore')  # timestamp is not used
movies = movies[['movie_id', 'title']]

In [107]:
# 1. Train/test split, stratified on user_id so that every user's ratings are
# divided in roughly the same 75/25 proportion.
from sklearn.model_selection import train_test_split

x = ratings.copy()
y = ratings['user_id']

# A fixed seed makes the split, and every RMSE computed from it, reproducible
# across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y, random_state=42
)

In [108]:
# 2. Define the RMSE function
def RMSE(pred, true):
    """Return the root-mean-square error between `pred` and `true`.

    Both arguments may be any array-like (list, NumPy array, pandas Series);
    they are converted to float arrays and compared element-wise by position,
    consistent with the converting RMSE variant defined earlier in the notebook.
    """
    pred = np.asarray(pred, dtype=float)
    true = np.asarray(true, dtype=float)
    return np.sqrt(np.mean((true - pred) ** 2))

In [109]:
# 3. Function that computes the RMSE of a given model
def score(model):
    """Evaluate `model` on the notebook-level `x_test` frame.

    `model` is called as `model(user_id, movie_id)` once per test row; the
    returned predictions are compared with `x_test['rating']` via `RMSE`.
    """
    test_users = x_test['user_id']
    test_movies = x_test['movie_id']
    predicted = np.array([model(u, m) for u, m in zip(test_users, test_movies)])
    actual = np.array(x_test['rating'])
    return RMSE(predicted, actual)

In [110]:
# 4. Accuracy of the best-seller baseline
train_mean = x_train.groupby(['movie_id'])['rating'].mean()

def best_seller(user_id, movie_id):
    """Predict a rating as the movie's mean rating in the training split.

    `user_id` is accepted only so the function matches the
    `model(user_id, movie_id)` signature expected by `score`; it is not used.
    Movies with no rating in the training split get the fallback value 3.0.
    """
    try:
        rating = train_mean[movie_id]
    except KeyError:
        # Only a missing movie falls back to 3.0; a bare `except` would also
        # have hidden unrelated errors.
        rating = 3.0
    return rating

score(best_seller)

1.0196231194399683

In [111]:
# Prepare the gender-based predictions.
# First check that the users in `ratings` are exactly the users in `users`.
a = set(ratings['user_id'])
b = set(users['user_id'])
print(a == b)

# Attach the user information to both the train and the test split.
x_train = pd.merge(x_train, users, on='user_id')
x_test = pd.merge(x_test, users, on='user_id')

# Mean training rating for every (movie_id, sex) pair.
gender_mean = x_train.groupby(['movie_id', 'sex'])['rating'].mean()

# user_id x movie_id matrix of training ratings.
rating_matrix = x_train.pivot_table(
    values='rating',
    index='user_id',
    columns='movie_id',
)

True


In [116]:
# Inspect the per-(movie_id, sex) mean ratings and the user x movie training matrix.
print(gender_mean)
rating_matrix

movie_id  sex
1         F      3.707865
          M      3.917603
2         F      3.500000
          M      3.164835
3         F      3.100000
                   ...   
1677      F      3.000000
1678      M      1.000000
1679      M      3.000000
1680      M      2.000000
1682      M      3.000000
Name: rating, Length: 3030, dtype: float64


movie_id,1,2,3,4,5,6,7,8,9,10,...,1668,1669,1672,1675,1676,1677,1678,1679,1680,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,3.0,5.0,,1.0,5.0,,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [140]:
# Gender-based recommendation
def cf_gender(user_id, movie_id, default_rating=3.0):
    """Predict a rating as the mean training rating given by the user's gender.

    Reads the notebook-level `users`, `gender_mean` and `rating_matrix` objects.

    Parameters
    ----------
    user_id : user whose `sex` value is looked up in `users`.
    movie_id : movie to predict a rating for.
    default_rating : float, default 3.0
        Value returned when no gender-specific mean is available: the movie
        is not in the training split, the user is not in `users`, or nobody
        of the user's gender rated the movie.

    Returns
    -------
    The (movie_id, gender) entry of `gender_mean`, or `default_rating`.
    """
    # Movie absent from the training split: nothing to average.
    if movie_id not in rating_matrix.columns:
        return default_rating

    # Look up the user's gender; an unknown user falls back to the default
    # instead of raising IndexError.
    user_sex = users.loc[users['user_id'] == user_id, 'sex']
    if user_sex.empty:
        return default_rating
    gender = user_sex.iloc[0]

    # The movie may not have been rated by anyone of this gender.
    try:
        return gender_mean.loc[(movie_id, gender)]
    except KeyError:
        return default_rating

def RMSE(pred, true):
    """Return the root-mean-square error between `pred` and `true`.

    Both arguments may be any array-like (list, NumPy array, pandas Series);
    they are converted to float arrays and compared element-wise by position.
    """
    pred = np.asarray(pred, dtype=float)
    true = np.asarray(true, dtype=float)
    return np.sqrt(np.mean((true - pred) ** 2))

def score(model):
    """Return the RMSE of `model` on the notebook-level `x_test` frame.

    `model` is called as `model(user_id, movie_id)` for each test row and the
    predictions are compared with `x_test['rating']` via `RMSE`.
    """
    predictions = []
    for user, movie in zip(x_test['user_id'], x_test['movie_id']):
        predictions.append(model(user, movie))
    actual = x_test['rating']

    return RMSE(np.array(predictions), actual)
    

In [141]:
# RMSE of the gender-based model on the held-out test split.
score(cf_gender)

1.0251862604437008