In [1]:
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred)) ** 2))


def CF_simple(user_id, item_id, ratings_matrix, user_similarity):
    if item_id in ratings_matrix.columns:
        sim_scores = user_similarity[user_id].copy()
        ratings = ratings_matrix[item_id].copy()
        none_rating_idx = ratings[ratings.isnull()].index
        ratings = ratings.dropna()
        sim_scores = sim_scores.drop(none_rating_idx)
        if sim_scores.sum() != 0:
            mean_rating = np.dot(sim_scores, ratings) / sim_scores.sum()
        else:
            mean_rating = pd.Series(np.unique(ratings_matrix)).dropna().median()
    else:
        mean_rating = pd.Series(np.unique(ratings_matrix)).dropna().median()

    return mean_rating

In [3]:
data = pd.read_csv('./Database/ncf_data.csv').fillna(0)
train, test = train_test_split(data, test_size=0.15, random_state=42)
train.reset_index(inplace=True, drop=True)
test.reset_index(inplace=True, drop=True)
df_0 = train.copy()

In [4]:
encoder_dict = {}

for col in ['p_id', 'u_id']:
    encoder_dict[col] = LabelEncoder()
    df_0[col] = encoder_dict[col].fit_transform(df_0[col])

In [5]:
df_0 = df_0.groupby(['u_id', 'p_id'], as_index=False).mean()
df_1 = df_0[df_0['u_id'].isin(df_0['u_id'].value_counts()[df_0['u_id'].value_counts() > 1].index)]

In [6]:
x = df_1.copy()
y = df_1['u_id']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y)

ratings_matrix = x_train.pivot(index='u_id', columns='p_id', values='u_rate')
matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity,
                               index=ratings_matrix.index,
                               columns=ratings_matrix.index)

In [7]:
# 예측값 계산 (병렬 처리 적용)
id_pairs = list(zip(x_test['u_id'], x_test['p_id']))
y_pred = np.array(Parallel(n_jobs=-1)(
    delayed(CF_simple)(user, movie, ratings_matrix, user_similarity) for (user, movie) in id_pairs
))

# 실제 평점값
y_true = np.array(x_test['u_rate'])

In [8]:
print(f"RMSE: {RMSE(y_true, y_pred):.4f}")

RMSE: 0.9507
