In [4]:
# 필요한 모듈 불러오기

import os
import numpy as np
import pandas as pd

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings(action = 'ignore')

import pickle

In [5]:
# Experiment setup

## Column names for the three header-less .dat files
ratings_cols = ['user_id', 'item_id', 'rating', 'timestamp']
movie_cols = ['item_id', 'title', 'genre']
user_cols = ['user_id', 'sex', 'age', 'occupation', 'zip_code']

## Load the raw tables (all three files share the same parsing options)
dat_options = dict(sep='::', engine='python', encoding="ISO-8859-1")
ratings = pd.read_csv('/home/ryu/thesis/data/ml-1m/ratings.dat', names=ratings_cols, **dat_options)
movies = pd.read_csv('/home/ryu/thesis/data/ml-1m/movies.dat', names=movie_cols, **dat_options)
users = pd.read_csv('/home/ryu/thesis/data/ml-1m/users.dat', names=user_cols, **dat_options)

## Drop movies that never appear in the ratings table
movies_in_rating = ratings['item_id'].unique()
has_rating = movies['item_id'].isin(movies_in_rating)
movies = movies[has_rating]

## Expand the '|'-separated genre string into one indicator column per genre.
## The raw 'genre' column is kept next to the indicators (no drop is performed).
genres_df = movies['genre'].str.get_dummies(sep='|')
movies = pd.concat([movies, genres_df], axis=1)

## Keep only these user columns (zip_code is discarded)
users = users[['user_id', 'age', 'sex', 'occupation']]

# Join ratings with movie attributes, then with user attributes
data = ratings.merge(movies, how='inner', on='item_id')
data = data.merge(users, how='inner', on='user_id')
data['age'] /= 50    # rescale age by a constant factor of 50

# 75% / 25% train/test split, stratified on user_id, with a fixed seed
x = data.copy()
y = data['user_id']
ratings_train, ratings_test = train_test_split(
    x,
    stratify=y,
    test_size=0.25,
    random_state=84,
)


# Load the stored White Sheep results, indexed by threshold in the experiment cell.
# NOTE(review): the file name says "FM" while the original note said "MF" -- confirm which model.
# NOTE(review): pickle.load executes arbitrary code; only load files you produced yourself.
with open('/home/ryu/thesis/real_movielens/additional_var/White_FM/84p_White FM.pkl', 'rb') as f:
    white_results_loaded = pickle.load(f)

## Load the saved mapping from threshold to Gray Sheep user ids
with open('/home/ryu/thesis/real_movielens/additional_var/1_gsu_data/FM_84_pearson_gsu.pkl', 'rb') as f:
    gray_dict = pickle.load(f)

In [7]:
# RMSE metric
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between two array-likes.

    Both arguments are converted with ``np.array`` and subtracted
    element-wise; the result is the square root of the mean squared
    difference.
    """
    residuals = np.array(y_true) - np.array(y_pred)
    mean_squared = np.mean(residuals ** 2)
    return np.sqrt(mean_squared)


# Model: bias-adjusted "bestseller" (item-mean) baseline
def Biased_Bestseller(train_data, test_data):
    """Evaluate a bias-adjusted item-mean baseline and return its test RMSE.

    Every test rating is predicted as
    ``item_mean - global_mean + user_mean``, where all three means are
    estimated from ``train_data`` only.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Needs the columns 'user_id', 'item_id' and 'rating'.
    test_data : pandas.DataFrame
        Needs the same three columns; 'rating' is the ground truth.

    Returns
    -------
    float
        RMSE between the true and the predicted test ratings.

    Notes
    -----
    An item or a user that does not occur in ``train_data`` falls back to
    the global training mean, so a single unseen user can no longer turn
    the whole RMSE into NaN. Neither input frame is modified.
    """
    # Global mean rating, computed once and reused as the fallback value.
    global_mean = train_data['rating'].mean()

    # Per-item and per-user mean ratings from the training data.
    item_mean = train_data.groupby('item_id')['rating'].mean()
    user_mean = train_data.groupby('user_id')['rating'].mean()

    # Look the means up with Series.map and assign the fillna() result.
    # The previous `test['rating_item'].fillna(..., inplace=True)` acted on a
    # column selection, which does not write back under pandas Copy-on-Write.
    item_part = test_data['item_id'].map(item_mean).fillna(global_mean)
    user_part = test_data['user_id'].map(user_mean).fillna(global_mean)

    # Same operation order as before: (item - global) + user.
    predicted_rating = item_part - global_mean + user_part

    return RMSE(test_data['rating'], predicted_rating)

In [46]:
# Result containers, keyed by the threshold rendered as a string
bias_bestseller = {}
weighted_bias_bs = {}
gray_bias_bestseller = {}
weighted_gray_bias_bs = {}

for thresh in gray_dict.keys():

    gray_idx = gray_dict[thresh]    # Gray Sheep user ids stored for this threshold
    white_rmse = white_results_loaded[thresh]    # stored White Sheep result for the same threshold (blended as an RMSE below)
    result_key = f'{thresh}'

    banner = '**************************************************'
    print(banner)
    print(f'                {thresh}% 실험 시작                ')
    print(banner)

    # Row masks: True where the rating belongs to a Gray Sheep user
    gray_in_all = ratings['user_id'].isin(gray_idx)
    gray_in_train = ratings_train['user_id'].isin(gray_idx)
    gray_in_test = ratings_test['user_id'].isin(gray_idx)

    # Split every frame into its White Sheep / Gray Sheep part.
    # NOTE(review): white, gray, white_train and white_test are not read again in this cell.
    white, gray = ratings[~gray_in_all], ratings[gray_in_all]
    white_train, gray_train = ratings_train[~gray_in_train], ratings_train[gray_in_train]
    white_test, gray_test = ratings_test[~gray_in_test], ratings_test[gray_in_test]

    # thresh is read as a percentage; the two shares sum to 1.
    # NOTE(review): the weighted values below are a linear blend of two RMSEs,
    # not the RMSE of the pooled predictions -- confirm this is the intended metric.
    gray_share = int(thresh) * 0.01
    white_share = 1 - gray_share

    #### 2. Bestseller with all ratings ####
    # Means come from the full training set; evaluation is on Gray Sheep test rows.
    bias_bestseller_rmse = Biased_Bestseller(ratings_train, gray_test)

    print(f'{thresh}% Bestseller RMSE: {bias_bestseller_rmse}')
    bias_bestseller[result_key] = bias_bestseller_rmse

    weight_bias_avg_bs = (white_rmse * white_share) + (bias_bestseller_rmse * gray_share)
    print(f'{thresh}% Bestseller weighted RMSE: {weight_bias_avg_bs}')
    weighted_bias_bs[result_key] = weight_bias_avg_bs

    #### 3. Gray Sheep Only Bestseller ####
    # Means come from Gray Sheep training rows only; same evaluation rows.
    gsu_bias_bestseller_rmse = Biased_Bestseller(gray_train, gray_test)

    print(f'{thresh}% GSU Bestseller RMSE: {gsu_bias_bestseller_rmse}')
    gray_bias_bestseller[result_key] = gsu_bias_bestseller_rmse

    weight_avg_bias_gray_bs = (white_rmse * white_share) + (gsu_bias_bestseller_rmse * gray_share)
    print(f'{thresh}% GSU Bestseller weighted RMSE: {weight_avg_bias_gray_bs}')
    weighted_gray_bias_bs[result_key] = weight_avg_bias_gray_bs

**************************************************
                1% 실험 시작                
**************************************************
1% Bestseller RMSE: 0.8786363648802383
1% Bestseller weighted RMSE: 0.8902193174907082
1% GSU Bestseller RMSE: 0.9589236782503325
1% GSU Bestseller weighted RMSE: 0.8910221906244091
**************************************************
                2% 실험 시작                
**************************************************
2% Bestseller RMSE: 0.9100549079802958
2% Bestseller weighted RMSE: 0.8900467746330004
2% GSU Bestseller RMSE: 0.9611219672401173
2% GSU Bestseller weighted RMSE: 0.8910681158181969
**************************************************
                3% 실험 시작                
**************************************************
3% Bestseller RMSE: 0.9136236919239346
3% Bestseller weighted RMSE: 0.8907433897292104
3% GSU Bestseller RMSE: 0.9554733964883383
3% GSU Bestseller weighted RMSE: 0.8919988808661425
*************************

In [47]:
# Persist the four result dicts, one pickle file each
for out_path, result in (
    ('/home/ryu/thesis/real_movielens/additional_var/Bestseller/84p_Biased Bestseller.pkl', bias_bestseller),
    ('/home/ryu/thesis/real_movielens/additional_var/Bestseller/84p_Weighted Biased Bestseller.pkl', weighted_bias_bs),
    ('/home/ryu/thesis/real_movielens/additional_var/Bestseller/84p_Gray Biased Bestseller.pkl', gray_bias_bestseller),
    ('/home/ryu/thesis/real_movielens/additional_var/Bestseller/84p_Weighted Gray Biased Bestseller.pkl', weighted_gray_bias_bs),
):
    with open(out_path, 'wb') as f:
        pickle.dump(result, f)