In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics.pairwise import manhattan_distances

In [5]:
# 사용자가 평점을 부여한 영화에 한해서 예측 성능 평가를 함
def get_evaluation(prd, actual):
    # ignore nonzero
    prd = prd[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(prd, actual), r2_score(prd, actual)

In [10]:
# 아이템 기반의 협업 필터링에서 개인화된 예측 평점 도출 공식
def predict_rating(ratings_arr, item_sim_arr):
    ratings_prd = ratings_arr.dot(item_sim_arr) / np.sum(np.abs(item_sim_arr))
    return ratings_prd

In [11]:
def predict_rating_topsim(ratings_arr, item_sim_arr, n=20):
    # 예측 행렬 0으로 초기화
    prd = np.zeros(ratings_arr.shape)
    print("### predict_rating_topsim - ratings_arr ###\n", ratings_arr, "\n", ratings_arr.shape)
    print("### predict_rating_topsim - item_sim_arr ###\n", item_sim_arr, "\n", item_sim_arr.shape)

    # 사용자-아이템 평점 행렬 열 크기만큼 반복
    for col in range(ratings_arr.shape[1]):
        top_n_items = [np.argsort(item_sim_arr[:, col])[:-n-1:-1]]
        # 개인화된 평점 계산
        for row in range(ratings_arr.shape[0]):
            if row % 1000 == 0:
                print("# predict_rating_topsim:", col, "/", ratings_arr.shape[1], " | ", row, "/", ratings_arr.shape[0])
            ratings_prd = predict_rating(ratings_arr[row, :][top_n_items].T, item_sim_arr[col, :][top_n_items])
            prd[row, col] = ratings_prd
    return prd

In [12]:
def get_unseen_movies(ratings_matrix, userId):
    user_rating = ratings_matrix.loc[userId, :]
    already_seen = user_rating[user_rating > 0].index.tolist()
    movies_list = ratings_matrix.columns.tolist()
    unseen_list = [movie for movie in movies_list if movie not in already_seen]
    return unseen_list

In [13]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    recomm_movies = pred_df.loc[userId, unseen_list].sort_values(ascending=False)[:top_n]
    return recomm_movies

In [14]:
# movies 는 movieId, title, genres 를 column 으로 가지고 있음
movies = pd.read_table(
    './dataset/ml-1m/movies.dat',
    sep="::",
    engine="python",
    names=["MovieID", "Title", "Genres"],
    encoding='latin1'
)
# ratings 는 userId, movieId, rating, timestamp 를 column 으로 가지고 있음
ratings = pd.read_table(
    './dataset/ml-1m/ratings.dat',
    sep="::",
    engine="python",
    names=["UserID", "MovieID", "Rating", "Timestamp"]
)

ratings = ratings[['UserID', 'MovieID', 'Rating']]

In [15]:
from sklearn.model_selection import train_test_split

# split train dataset and test dataset
X_train, X_test, Y_train, Y_test = train_test_split(ratings[["UserID", "MovieID"]], ratings[["Rating"]], test_size=0.2, stratify=ratings[["Rating"]])

In [16]:
train_dataset = pd.concat([X_train, Y_train], axis=1)
# 영화 제목에 대한 유저들의 평가를 적어둔 행렬 반환을 위해 merge (JOIN) 작업 수행
train_ratings_movies = pd.merge(train_dataset, movies, on='MovieID')
train_ratings_matrix = train_ratings_movies.pivot_table('Rating', index='UserID', columns='Title')
train_ratings_matrix = train_ratings_matrix.fillna(0)

In [17]:
train_ratings_matrix_T = train_ratings_matrix.transpose()
train_ratings_matrix_T

UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"$1,000,000 Duck (1971)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Night Mother (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"'burbs, The (1989)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...And Justice for All (1979),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Zed & Two Noughts, A (1985)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zero Effect (1998),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zero Kelvin (Kjærlighetens kjøtere) (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zeus and Roxanne (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
def l1_normalize(v):
    norm = np.sum(v)
    return v / norm

In [26]:
norm_l1 = l1_normalize(train_ratings_matrix_T)
max_distances = max(train_ratings_matrix_T)
item_sim = manhattan_distances(norm_l1, norm_l1)
item_sim_tfd = max_distances - item_sim

# cosine similarity 행렬에 제목을 붙여서 반환
item_sim_df = pd.DataFrame(data=item_sim_tfd, index=train_ratings_matrix.columns, columns=train_ratings_matrix.columns)

In [27]:
item_sim_df

Title,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),1-900 (1994),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),...,"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),Zachariah (1971),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zero Kelvin (Kjærlighetens kjøtere) (1995),Zeus and Roxanne (1997),eXistenZ (1999)
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"$1,000,000 Duck (1971)",6040.000000,6039.634391,6039.698154,6038.562306,6038.590299,6039.844097,6034.298171,6036.546827,6037.881983,6034.521249,...,6039.606447,6038.442250,6039.863207,6039.516080,6039.861308,6039.757112,6038.671576,6039.869209,6039.773508,6037.677916
'Night Mother (1986),6039.634391,6040.000000,6039.591329,6038.461609,6038.548248,6039.724616,6034.217613,6036.422927,6037.676573,6034.440995,...,6039.489840,6038.309127,6039.746608,6039.410736,6039.741828,6039.642531,6038.560478,6039.749728,6039.649155,6037.582102
'Til There Was You (1997),6039.698154,6039.591329,6040.000000,6038.512435,6038.543582,6039.826835,6034.355238,6036.446573,6037.745435,6034.558472,...,6039.547839,6038.373233,6039.810546,6039.472692,6039.808648,6039.701190,6038.636066,6039.816549,6039.721136,6037.641187
"'burbs, The (1989)",6038.562306,6038.461609,6038.512435,6040.000000,6037.670411,6038.640466,6033.785535,6035.775024,6036.832886,6033.776734,...,6038.441624,6037.384286,6038.662841,6038.345890,6038.657678,6038.559066,6037.578962,6038.665579,6038.568184,6036.562080
...And Justice for All (1979),6038.590299,6038.548248,6038.543582,6037.670411,6040.000000,6038.682089,6033.329722,6035.771706,6036.853621,6033.979217,...,6038.456493,6037.353491,6038.709769,6038.368224,6038.699300,6038.610785,6037.553337,6038.707201,6038.606627,6036.580139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Zed & Two Noughts, A (1985)",6039.757112,6039.642531,6039.701190,6038.559066,6038.610785,6039.847133,6034.282028,6036.420118,6037.770162,6034.495362,...,6039.610725,6038.424790,6039.869125,6039.528876,6039.864344,6040.000000,6038.682063,6039.872245,6039.771671,6037.711396
Zero Effect (1998),6038.671576,6038.560478,6038.636066,6037.578962,6037.553337,6038.757315,6033.524661,6035.472738,6036.796145,6033.587323,...,6038.584548,6037.482780,6038.781827,6038.511357,6038.774527,6038.682063,6040.000000,6038.782427,6038.681853,6036.895089
Zero Kelvin (Kjærlighetens kjøtere) (1995),6039.869209,6039.749728,6039.816549,6038.665579,6038.707201,6039.962491,6034.397386,6036.523355,6037.882582,6034.595960,...,6039.717842,6038.529834,6039.981601,6039.631610,6039.979702,6039.872245,6038.782427,6040.000000,6039.887029,6037.792345
Zeus and Roxanne (1997),6039.773508,6039.649155,6039.721136,6038.568184,6038.606627,6039.861917,6034.315046,6036.440353,6037.795346,6034.503791,...,6039.617268,6038.435234,6039.881027,6039.531036,6039.879129,6039.771671,6038.681853,6039.887029,6040.000000,6037.697495


In [28]:
import warnings
warnings.filterwarnings("ignore")

ratings_prd = predict_rating_topsim(train_ratings_matrix.values, item_sim_df.values, n=20)

### predict_rating_topsim - ratings_arr ###
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] 
 (6040, 3677)
### predict_rating_topsim - item_sim_arr ###
 [[6040.         6039.6343913  6039.69815423 ... 6039.86920893
  6039.77350767 6037.67791634]
 [6039.6343913  6040.         6039.59132859 ... 6039.74972844
  6039.64915454 6037.58210219]
 [6039.69815423 6039.59132859 6040.         ... 6039.8165486
  6039.72113595 6037.64118696]
 ...
 [6039.86920893 6039.74972844 6039.8165486  ... 6040.
  6039.8870294  6037.79234496]
 [6039.77350767 6039.64915454 6039.72113595 ... 6039.8870294
  6040.         6037.69749509]
 [6037.67791634 6037.58210219 6037.64118696 ... 6037.79234496
  6037.69749509 6040.        ]] 
 (3677, 3677)
# predict_rating_topsim: 0 / 3677  |  0 / 6040
# predict_rating_topsim: 0 / 3677  |  1000 / 6040
# predict_rating_topsim: 0 / 3677  |  2000 / 6040
# predict_rating_topsi

In [29]:
mse, r2 = get_evaluation(ratings_prd, train_ratings_matrix.values)
print("$$ Evaluation $$\n - MSE: {}\n - R2 Score: {}".format(mse, r2))
ratings_prd_matrix = pd.DataFrame(data=ratings_prd, index=train_ratings_matrix.index, columns=train_ratings_matrix.columns)

$$ Evaluation $$
 - MSE: 11.936424648403227
 - R2 Score: -115.57029984598437


In [30]:
userNum = int(input())
user_rating_id = train_ratings_matrix.loc[userNum, :]
print("유저", userNum, "의 평가 내용:")
user_rating_id[user_rating_id > 0].sort_values(ascending=False)

유저 12 의 평가 내용:


Title
Godfather, The (1972)                     5.0
Citizen Kane (1941)                       5.0
Silence of the Lambs, The (1991)          5.0
Raiders of the Lost Ark (1981)            5.0
Godfather: Part II, The (1974)            5.0
Wizard of Oz, The (1939)                  5.0
Christmas Story, A (1983)                 5.0
Almost Famous (2000)                      4.0
One Flew Over the Cuckoo's Nest (1975)    4.0
Quatermass and the Pit (1967)             4.0
2 Days in the Valley (1996)               4.0
Dog Day Afternoon (1975)                  3.0
Full Monty, The (1997)                    3.0
Graduate, The (1967)                      3.0
Larger Than Life (1996)                   3.0
Chinatown (1974)                          3.0
Scary Movie (2000)                        3.0
Boat, The (Das Boot) (1981)               3.0
Father of the Bride (1950)                2.0
Dick Tracy (1990)                         1.0
Name: 12, dtype: float64

In [31]:
unseen_list = get_unseen_movies(train_ratings_matrix, 9)
recommend_movies = recomm_movie_by_userid(ratings_prd_matrix, 9, unseen_list, top_n=10)
recommend_movies = pd.DataFrame(data=recommend_movies.values, index=recommend_movies.index, columns=['prd_score'])

In [32]:
recommend_movies

Unnamed: 0_level_0,prd_score
Title,Unnamed: 1_level_1
"Usual Suspects, The (1995)",0.449954
Pulp Fiction (1994),0.449951
"Godfather, The (1972)",0.200046
Terminator 2: Judgment Day (1991),0.199947
Forrest Gump (1994),0.149978
Star Wars: Episode IV - A New Hope (1977),0.149949
Star Wars: Episode V - The Empire Strikes Back (1980),0.149947
Gladiator (2000),0.099985
"$1,000,000 Duck (1971)",0.0
Once Were Warriors (1994),0.0
