In [1]:
# 필요한 모듈 불러오기

import os
import numpy as np
import pandas as pd

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings(action = 'ignore')

import pickle

In [8]:
# Experiment setup

## Workspace root. Defaults to the original hard-coded location; set the
## THESIS_ROOT environment variable to run the notebook on another machine.
THESIS_ROOT = os.environ.get('THESIS_ROOT', '/home/ryu/thesis')
AMAZON_DIR = os.path.join(THESIS_ROOT, 'real_amazon')
ADDITIONAL_VAR_DIR = os.path.join(AMAZON_DIR, 'additional_var')

## Load the ratings data
ratings = pd.read_csv(os.path.join(THESIS_ROOT, 'data', 'amazon', 'Amazon_ratings.csv'))

## Basic preprocessing: keep only users with more than 3 ratings
cnt = ratings.groupby('user_id').count()['rating']
keys = cnt[cnt>3].keys()
ratings = ratings[ratings['user_id'].isin(keys)]

## Split into train / test sets, stratified by user_id (fixed seed for reproducibility)
x = ratings.copy()
y = ratings['user_id']
ratings_train, ratings_test = train_test_split(x, test_size=0.25, stratify=y, random_state=8)

## Load the list of Black Sheep user ids
## NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(os.path.join(AMAZON_DIR, 'black_id.pkl'), 'rb') as f:
    black = pickle.load(f)

## Extract the Black Sheep users' rows from train, test and the full data
black_train = ratings_train[ratings_train['user_id'].isin(black)]
black_test = ratings_test[ratings_test['user_id'].isin(black)]
black_all = ratings[ratings['user_id'].isin(black)]

## User counts (used later as weights for the weighted average)
entire_pop = ratings.user_id.nunique()      # total number of users
# NOTE(review): len(black) assumes `black` holds unique user ids that all survive
# the >3-ratings filter above - confirm; black_all['user_id'].nunique() would be
# the count of Black Sheep users actually present in `ratings`.
black_pop = len(black)                      # number of Black Sheep users
rest_pop = entire_pop - black_pop           # total - black sheep = white & gray users

## Remove the Black Sheep users from train, test and the full data
ratings_train = ratings_train[~ratings_train['user_id'].isin(black)]
ratings_test = ratings_test[~ratings_test['user_id'].isin(black)]
ratings = ratings[~ratings['user_id'].isin(black)]


# Load the saved White Sheep results (looked up per threshold in the experiment loop)
# NOTE(review): the original comment called these "MF" results while the file is named
# "White FM" - confirm which model produced them.
with open(os.path.join(ADDITIONAL_VAR_DIR, 'White_FM', '84c_White FM.pkl'), 'rb') as f:
    white_results_loaded = pickle.load(f)

## Load the saved Gray Sheep user ids (one entry per threshold)
with open(os.path.join(ADDITIONAL_VAR_DIR, '1_gsu_data', 'FM_84_cosine_gsu.pkl'), 'rb') as f:
    gray_dict = pickle.load(f)

In [9]:
# RMSE 준비
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between true and predicted values.

    Both arguments are converted to NumPy arrays before subtracting, so
    lists and pandas Series are compared element by element in order.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    squared_error = (actual - predicted) ** 2
    return np.sqrt(np.mean(squared_error))


# 모델 준비
## 1. Bestseller
def Bestseller(train_data, test_data):
    """Evaluate a non-personalised "bestseller" baseline.

    Every test rating is predicted as the mean training rating of its item.
    Items that have no training ratings fall back to the global training mean.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training ratings; the 'item_id' and 'rating' columns are read.
    test_data : pandas.DataFrame
        Held-out ratings; the 'item_id' and 'rating' columns are read.

    Returns
    -------
    float
        RMSE between the true test ratings and the predictions.
    """
    per_item_mean = train_data.groupby('item_id')['rating'].mean()
    global_mean = train_data['rating'].mean()

    # Look up each test item's mean and fill unseen items in a single expression.
    # The previous `test['rating_mean'].fillna(..., inplace=True)` mutated a column
    # selection, which no longer updates the frame under pandas Copy-on-Write and
    # would silently leave NaN predictions. Neither input frame is modified here.
    predicted = test_data['item_id'].map(per_item_mean).fillna(global_mean)

    return RMSE(test_data['rating'], predicted)

def Biased_Bestseller(train_data, test_data):
    """Evaluate a bestseller baseline corrected by each user's average rating.

    The prediction for a test row is

        item_mean - global_mean + user_mean

    where all three means are computed on the training data. Items or users
    that do not occur in the training data fall back to the global mean, so an
    unseen user contributes no bias instead of turning the prediction into NaN.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training ratings; the 'user_id', 'item_id' and 'rating' columns are read.
    test_data : pandas.DataFrame
        Held-out ratings; the same three columns are read.

    Returns
    -------
    float
        RMSE between the true test ratings and the predictions.
    """
    # Global, per-item and per-user mean ratings from the training data
    global_mean = train_data['rating'].mean()
    item_mean = train_data.groupby('item_id')['rating'].mean()
    user_mean = train_data.groupby('user_id')['rating'].mean()

    # map + fillna builds new Series; this avoids the chained
    # `test[col].fillna(..., inplace=True)` pattern, which stops updating the
    # frame under pandas Copy-on-Write, and leaves both inputs unmodified.
    rating_item = test_data['item_id'].map(item_mean).fillna(global_mean)
    rating_user = test_data['user_id'].map(user_mean).fillna(global_mean)

    predicted_rating = rating_item - global_mean + rating_user

    return RMSE(test_data['rating'], predicted_rating)

In [10]:
# Result containers, each keyed by the gray-sheep threshold formatted as a string.
bias_bestseller = {}             # Biased_Bestseller trained on ratings_train, tested on gray users
gray_bias_bestseller = {}        # Biased_Bestseller trained on gray users only, tested on gray users
weighted_bs_black_w = {}         # weighted RMSE: all-ratings gray model, black users scored by white-trained Bestseller
weighted_bs_black_blk = {}       # weighted RMSE: all-ratings gray model, black users scored by black-trained Bestseller
weighted_graybs_black_w = {}     # weighted RMSE: gray-only model, black users scored by white-trained Bestseller
weighted_graybs_black_blk = {}   # weighted RMSE: gray-only model, black users scored by black-trained Bestseller


def _weighted_rmse(black_rmse, white_rmse, gray_rmse, n_gray):
    """Combine the three per-group RMSEs, weighting each by its share of users.

    The weights are black_pop/entire_pop for black sheep, n_gray/entire_pop for
    gray sheep and (rest_pop - n_gray)/entire_pop for white sheep, where the
    population counts are read from the notebook's module-level variables.
    """
    return (black_rmse * (black_pop/entire_pop)) + (white_rmse * ((rest_pop-n_gray)/entire_pop)) + (gray_rmse * (n_gray/entire_pop))


print('*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*')
# Bestseller trained and tested on black-sheep data only; independent of the threshold.
blk_only_bs = Bestseller(black_train, black_test)
print(f'Black Bestseller (trained with only black sheep data): {blk_only_bs}')
print('*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*')

for thresh, gray_idx in gray_dict.items():

    # gray_idx: gray-sheep user ids for this threshold.
    # white_rmse: previously saved white-sheep result for the same threshold.
    white_rmse = white_results_loaded[thresh]
    n_gray = len(gray_idx)
    key = f'{thresh}'

    print('**************************************************')
    print(f'                {thresh}% 실험 시작                ')
    print('**************************************************')


    # Split the non-black users into white and gray sheep. Only the frames that
    # the experiments below actually read are built.
    is_gray_train = ratings_train['user_id'].isin(gray_idx)
    white_train = ratings_train[~is_gray_train]
    gray_train = ratings_train[is_gray_train]
    gray_test = ratings_test[ratings_test['user_id'].isin(gray_idx)]


    #### 0. Black Sheep Bestseller (trained with White Sheep train set)
    blk_bestseller = Bestseller(white_train, black_test)

    print(f'{thresh}% Black Bestseller RMSE (trained with white_train): {blk_bestseller}')


    #### 2. Biased Bestseller trained with all (non-black) training ratings ####
    bias_bestseller_rmse = Biased_Bestseller(ratings_train, gray_test)

    print(f'{thresh}% Bestseller RMSE: {bias_bestseller_rmse}')
    bias_bestseller[key] = bias_bestseller_rmse

    # black sheep scored by the white-sheep-trained Bestseller
    weight_avg_bs = _weighted_rmse(blk_bestseller, white_rmse, bias_bestseller_rmse, n_gray)
    print(f'{thresh}% Bestseller weighted RMSE (w/ White Sheep): {weight_avg_bs}')
    weighted_bs_black_w[key] = weight_avg_bs

    # black sheep scored by the black-sheep-trained Bestseller
    blk_weight_avg_bs = _weighted_rmse(blk_only_bs, white_rmse, bias_bestseller_rmse, n_gray)
    print(f'{thresh}% Bestseller weighted RMSE (w/ Black Sheep): {blk_weight_avg_bs}')
    weighted_bs_black_blk[key] = blk_weight_avg_bs


    #### 3. Biased Bestseller trained with Gray Sheep ratings only ####
    gsu_bias_bestseller_rmse = Biased_Bestseller(gray_train, gray_test)

    print(f'{thresh}% GSU Bestseller RMSE: {gsu_bias_bestseller_rmse}')
    gray_bias_bestseller[key] = gsu_bias_bestseller_rmse

    # black sheep scored by the white-sheep-trained Bestseller
    weight_avg_bs = _weighted_rmse(blk_bestseller, white_rmse, gsu_bias_bestseller_rmse, n_gray)
    print(f'{thresh}% Bestseller weighted RMSE (w/ White Sheep): {weight_avg_bs}')
    weighted_graybs_black_w[key] = weight_avg_bs

    # black sheep scored by the black-sheep-trained Bestseller
    blk_weight_avg_bs = _weighted_rmse(blk_only_bs, white_rmse, gsu_bias_bestseller_rmse, n_gray)
    print(f'{thresh}% Bestseller weighted RMSE (w/ Black Sheep): {blk_weight_avg_bs}')
    weighted_graybs_black_blk[key] = blk_weight_avg_bs


*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*
Black Bestseller (trained with only black sheep data): 1.2876756802355618
*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*
**************************************************
                1% 실험 시작                
**************************************************
1% Black Bestseller RMSE (trained with white_train): 1.140174185307963
1% Bestseller RMSE: 0.9965217432964737
1% Bestseller weighted RMSE (w/ White Sheep): 1.0155500280074767
1% Bestseller weighted RMSE (w/ Black Sheep): 1.0811969048049859
1% GSU Bestseller RMSE: 1.0245796083897187
1% Bestseller weighted RMSE (w/ White Sheep): 1.0157068481120024
1% Bestseller weighted RMSE (w/ Black Sheep): 1.0813537249095115
**************************************************
                2% 실험 시작                
**************************************************
2% Black Bestseller RMSE (trained with white_train): 1.141573311593

In [11]:
# Persist every result dictionary produced by the experiment loop.
RESULT_DIR = '/home/ryu/thesis/real_amazon/additional_var/Biased Bestseller'

# Create the output folder if needed so the results of a long run are not lost
# to a FileNotFoundError at the very last step.
os.makedirs(RESULT_DIR, exist_ok=True)

# File names are kept exactly as before so existing readers keep working.
# NOTE(review): the "Weighted ..." file names do not mirror the variable names
# (e.g. the "Gray" files receive the white-trained variants) - confirm that the
# downstream notebooks expect this mapping.
results_to_save = {
    '84p_Biased Bestseller.pkl': bias_bestseller,
    '84p_Gray Biased Bestseller.pkl': gray_bias_bestseller,
    # weighted rmse: black with black only
    '84p_Weighted Biased Bestseller.pkl': weighted_bs_black_blk,
    '84p_Weighted Biased Bestseller_Black.pkl': weighted_graybs_black_blk,
    # weighted rmse: black with white
    '84p_Weighted Gray Biased Bestseller.pkl': weighted_bs_black_w,
    # Fixed: this file used to receive weighted_graybs_black_blk a second time,
    # so weighted_graybs_black_w was computed but never saved.
    '84p_Weighted Gray Biased Bestseller_Black.pkl': weighted_graybs_black_w,
}

for file_name, result in results_to_save.items():
    with open(os.path.join(RESULT_DIR, file_name), 'wb') as f:
        pickle.dump(result, f)