In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# 사용자가 평점을 부여한 영화에 한해서 예측 성능 평가를 함
def get_evaluation(prd, actual):
    # ignore nonzero
    prd = prd[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(prd, actual), r2_score(prd, actual)

In [4]:
# 아이템 기반의 협업 필터링에서 개인화된 예측 평점 도출 공식
def predict_rating(ratings_arr, item_sim_arr):
    ratings_prd = ratings_arr.dot(item_sim_arr) / np.sum(np.abs(item_sim_arr))
    return ratings_prd

In [5]:
def predict_rating_topsim(ratings_arr, item_sim_arr, n=20):
    # 예측 행렬 0으로 초기화
    prd = np.zeros(ratings_arr.shape)
    print("### predict_rating_topsim - ratings_arr ###\n", ratings_arr, "\n", ratings_arr.shape)
    print("### predict_rating_topsim - item_sim_arr ###\n", item_sim_arr, "\n", item_sim_arr.shape)

    # 사용자-아이템 평점 행렬 열 크기만큼 반복
    for col in range(ratings_arr.shape[1]):
        top_n_items = [np.argsort(item_sim_arr[:, col])[:-n-1:-1]]
        # 개인화된 평점 계산
        for row in range(ratings_arr.shape[0]):
            if row % 1000 == 0:
                print("# predict_rating_topsim:", col, "/", ratings_arr.shape[1], " | ", row, "/", ratings_arr.shape[0])
            ratings_prd = predict_rating(ratings_arr[row, :][top_n_items].T, item_sim_arr[col, :][top_n_items])
            prd[row, col] = ratings_prd
    return prd

In [6]:
def get_unseen_movies(ratings_matrix, userId):
    user_rating = ratings_matrix.loc[userId, :]
    already_seen = user_rating[user_rating > 0].index.tolist()
    movies_list = ratings_matrix.columns.tolist()
    unseen_list = [movie for movie in movies_list if movie not in already_seen]
    return unseen_list

In [7]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    recomm_movies = pred_df.loc[userId, unseen_list].sort_values(ascending=False)[:top_n]
    return recomm_movies

In [9]:
# movies 는 movieId, title, genres 를 column 으로 가지고 있음
movies = pd.read_table(
    './dataset/ml-1m/movies.dat',
    sep="::",
    engine="python",
    names=["MovieID", "Title", "Genres"],
    encoding='latin1'
)
# ratings 는 userId, movieId, rating, timestamp 를 column 으로 가지고 있음
ratings = pd.read_table(
    './dataset/ml-1m/ratings.dat',
    sep="::",
    engine="python",
    names=["UserID", "MovieID", "Rating", "Timestamp"]
)

ratings = ratings[['UserID', 'MovieID', 'Rating']]

In [10]:
from sklearn.model_selection import train_test_split

# split train dataset and test dataset
X_train, X_test, Y_train, Y_test = train_test_split(ratings[["UserID", "MovieID"]], ratings[["Rating"]], test_size=0.2, stratify=ratings[["Rating"]])

In [11]:
train_dataset = pd.concat([X_train, Y_train], axis=1)
# 영화 제목에 대한 유저들의 평가를 적어둔 행렬 반환을 위해 merge (JOIN) 작업 수행
train_ratings_movies = pd.merge(train_dataset, movies, on='MovieID')
train_ratings_matrix = train_ratings_movies.pivot_table('Rating', index='UserID', columns='Title')
train_ratings_matrix = train_ratings_matrix.fillna(0)

In [12]:
train_ratings_matrix_T = train_ratings_matrix.transpose()
train_ratings_matrix_T

UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"$1,000,000 Duck (1971)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Night Mother (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"'burbs, The (1989)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...And Justice for All (1979),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Zed & Two Noughts, A (1985)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zero Effect (1998),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zero Kelvin (Kjærlighetens kjøtere) (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zeus and Roxanne (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# 각 영화를 기준으로 유저의 평가 양상이 얼마나 유사한지 cosine similarity 에 넣어서 평가
# (n, n)의 정사각 행렬 반환 (대각 행렬은 자기 자신에 대한 유사도, 1)
item_sim = cosine_similarity(train_ratings_matrix_T, train_ratings_matrix_T)
# cosine similarity 행렬에 제목을 붙여서 반환
item_sim_df = pd.DataFrame(data=item_sim, index=train_ratings_matrix.columns, columns=train_ratings_matrix.columns)

In [14]:
item_sim_df

Title,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),1-900 (1994),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),...,"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),Zachariah (1971),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zero Kelvin (Kjærlighetens kjøtere) (1995),Zeus and Roxanne (1997),eXistenZ (1999)
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"$1,000,000 Duck (1971)",1.000000,0.102495,0.052735,0.055060,0.054666,0.000000,0.040366,0.165725,0.169036,0.070261,...,0.054385,0.095475,0.000000,0.047408,0.0,0.063413,0.031072,0.000000,0.102177,0.029189
'Night Mother (1986),0.102495,1.000000,0.102764,0.102779,0.079855,0.000000,0.074922,0.112069,0.074447,0.077877,...,0.062388,0.058572,0.024308,0.098049,0.0,0.065833,0.053859,0.000000,0.000000,0.050680
'Til There Was You (1997),0.052735,0.102764,1.000000,0.072911,0.070434,0.087950,0.097449,0.117779,0.100487,0.083466,...,0.020741,0.044321,0.000000,0.095737,0.0,0.026674,0.060839,0.000000,0.038455,0.024910
"'burbs, The (1989)",0.055060,0.102779,0.072911,1.000000,0.108895,0.000000,0.144396,0.209382,0.165292,0.143430,...,0.115219,0.188251,0.028125,0.114707,0.0,0.036895,0.124634,0.000000,0.022306,0.074850
...And Justice for All (1979),0.054666,0.079855,0.070434,0.108895,1.000000,0.000000,0.055987,0.159615,0.079182,0.139854,...,0.085714,0.106743,0.068679,0.083766,0.0,0.089679,0.067440,0.088845,0.000000,0.081581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Zed & Two Noughts, A (1985)",0.063413,0.065833,0.026674,0.036895,0.089679,0.000000,0.013088,0.047038,0.021529,0.017252,...,0.055018,0.060592,0.092890,0.168190,0.0,1.000000,0.075778,0.134585,0.000000,0.133972
Zero Effect (1998),0.031072,0.053859,0.060839,0.124634,0.067440,0.000000,0.144262,0.107982,0.088448,0.106584,...,0.142399,0.134522,0.029477,0.161008,0.0,0.075778,1.000000,0.066732,0.012139,0.181930
Zero Kelvin (Kjærlighetens kjøtere) (1995),0.000000,0.000000,0.000000,0.000000,0.088845,0.000000,0.000000,0.040063,0.000000,0.041020,...,0.052326,0.000000,0.000000,0.000000,0.0,0.134585,0.066732,1.000000,0.000000,0.052007
Zeus and Roxanne (1997),0.102177,0.000000,0.038455,0.022306,0.000000,0.000000,0.029247,0.046559,0.029560,0.047257,...,0.000000,0.039114,0.000000,0.000000,0.0,0.000000,0.012139,0.000000,1.000000,0.000000


In [15]:
import warnings
warnings.filterwarnings("ignore")

ratings_prd = predict_rating_topsim(train_ratings_matrix.values, item_sim_df.values, n=20)

### predict_rating_topsim - ratings_arr ###
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] 
 (6040, 3682)
### predict_rating_topsim - item_sim_arr ###
 [[1.         0.10249452 0.05273458 ... 0.         0.10217712 0.029189  ]
 [0.10249452 1.         0.10276375 ... 0.         0.         0.05067972]
 [0.05273458 0.10276375 1.         ... 0.         0.03845522 0.02490957]
 ...
 [0.         0.         0.         ... 1.         0.         0.05200675]
 [0.10217712 0.         0.03845522 ... 0.         1.         0.        ]
 [0.029189   0.05067972 0.02490957 ... 0.05200675 0.         1.        ]] 
 (3682, 3682)
# predict_rating_topsim: 0 / 3682  |  0 / 6040
# predict_rating_topsim: 0 / 3682  |  1000 / 6040
# predict_rating_topsim: 0 / 3682  |  2000 / 6040
# predict_rating_topsim: 0 / 3682  |  3000 / 6040
# predict_rating_topsim: 0 / 3682  |  4000 / 6040
# predict_rating_topsim: 0 / 368

In [16]:
mse, r2 = get_evaluation(ratings_prd, train_ratings_matrix.values)
print("$$ Evaluation $$\n - MSE: {}\n - R2 Score: {}".format(mse, r2))
ratings_prd_matrix = pd.DataFrame(data=ratings_prd, index=train_ratings_matrix.index, columns=train_ratings_matrix.columns)

$$ Evaluation $$
 - MSE: 4.050102924005719
 - R2 Score: -4.686387306758942


In [21]:
userNum = int(input())
user_rating_id = train_ratings_matrix.loc[userNum, :]
print("유저", userNum, "의 평가 내용:")
user_rating_id[user_rating_id > 0].sort_values(ascending=False)

유저 12 의 평가 내용:


Title
Christmas Story, A (1983)            5.0
Citizen Kane (1941)                  5.0
Godfather: Part II, The (1974)       5.0
Raiders of the Lost Ark (1981)       5.0
Taxi Driver (1976)                   5.0
2 Days in the Valley (1996)          4.0
Almost Famous (2000)                 4.0
Man Who Would Be King, The (1975)    4.0
Quatermass and the Pit (1967)        4.0
Chinatown (1974)                     3.0
Dog Day Afternoon (1975)             3.0
Full Monty, The (1997)               3.0
Larger Than Life (1996)              3.0
Scary Movie (2000)                   3.0
Father of the Bride (1950)           2.0
Dick Tracy (1990)                    1.0
Name: 12, dtype: float64

In [22]:
unseen_list = get_unseen_movies(train_ratings_matrix, 9)
recommend_movies = recomm_movie_by_userid(ratings_prd_matrix, 9, unseen_list, top_n=10)
recommend_movies = pd.DataFrame(data=recommend_movies.values, index=recommend_movies.index, columns=['prd_score'])

In [23]:
recommend_movies

Unnamed: 0_level_0,prd_score
Title,Unnamed: 1_level_1
Pulp Fiction (1994),2.948222
Apollo 13 (1995),2.947054
American History X (1998),2.701029
October Sky (1999),2.502008
Seven (Se7en) (1995),2.461026
Sling Blade (1996),2.450229
JFK (1991),2.37129
Chasing Amy (1997),2.345631
Being John Malkovich (1999),2.302325
Boogie Nights (1997),2.197925
