In [1]:
import os
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

import numpy as np
import pandas as pd

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr

import warnings
warnings.filterwarnings(action = 'ignore')

import pickle
import itertools

import matplotlib.pyplot as plot
import seaborn as sns

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### review embedding through sentence embedding (SBERT)

In [3]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')
model = model.to(device)

2023-12-05 15:03:24.094272: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-05 15:03:24.276839: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-05 15:03:24.338729: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-05 15:03:25.109254: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

In [2]:
# 데이터 불러오기
ratings = pd.read_csv('/home/ryu/thesis/data/amazon/Amazon_ratings.csv')
reviews = pd.read_csv('/home/ryu/thesis/data/amazon/Amazon_reviews.csv')

cnt = ratings.groupby('user_id').count()['rating']
keys = cnt[cnt>3].keys()
ratings = ratings[ratings['user_id'].isin(keys)]

ratings = ratings[['item_id', 'user_id', 'rating']]
reviews = reviews[['item_id', 'user_id', 'text']]

In [3]:
data = pd.merge(ratings, reviews, how='left', left_on=['user_id', 'item_id'], right_on=['user_id', 'item_id'])
data

Unnamed: 0,item_id,user_id,rating,text
0,A07936821FOVJO6NP4Q8,B0000A0AEM,3.0,"Very nice product. Sharp, clear optics. Seem..."
1,A07936821FOVJO6NP4Q8,B0000AI0N1,5.0,"Just what it said it would be, well made, wort..."
2,A07936821FOVJO6NP4Q8,B0002RSPE4,5.0,Bought a bunch of these a few years ago for th...
3,A07936821FOVJO6NP4Q8,B000CRFOMK,5.0,At first glance these seem silly. Who needs a...
4,A07936821FOVJO6NP4Q8,B000ID7QNI,5.0,Very sturdy well made wall mount.
...,...,...,...,...
241856,AZZTOUKVTUMVM,B003FVJYF8,4.0,It's 50' of coax. Seems to be well constructed...
241857,AZZTOUKVTUMVM,B00BUL4NLU,2.0,Bought this at the end of September 2016 for u...
241858,AZZTOUKVTUMVM,B00HWT8I24,3.0,It was a bit challenging to get this to synch ...
241859,AZZTOUKVTUMVM,B00IX9ZDKC,5.0,What can I say. It worked great and my compute...


In [4]:
data['text'] = data['text'].fillna('')

In [8]:
text = list(data['text'].values)
model.max_seq_length = 10

embeddings = model.encode(text)

In [9]:
print(embeddings[0])

[-3.24346013e-02 -9.56571754e-03  2.12417319e-02 -2.76257265e-02
 -5.12140989e-02 -7.77860358e-02  1.22302666e-01  5.86405359e-02
 -6.17323034e-02  2.80550290e-02  4.90620397e-02  2.57509574e-02
  9.32610035e-03  8.58614687e-03 -7.86167979e-02  2.91688144e-02
  2.83283349e-02 -7.43849576e-02  1.46041084e-02  7.81145005e-04
  9.40033048e-03  7.16467574e-03  4.32437751e-03  9.40016285e-03
 -3.05967480e-02  1.17141008e-02  5.56921251e-02  1.54577019e-02
  4.19114232e-02 -5.58091588e-02 -1.76646039e-02  1.18749365e-02
 -3.09459548e-02  8.87921639e-03  2.07308419e-02 -4.78523374e-02
  2.12100428e-02 -4.11934219e-02  7.81660806e-03  2.13949438e-02
 -1.23685608e-02 -2.33170353e-02 -3.11087933e-04 -3.00242915e-03
 -5.80964126e-02  5.50509570e-03  7.41629824e-02  2.67745964e-02
  3.72200459e-02 -6.32630140e-02 -6.80268332e-02 -3.77624221e-02
 -7.09628016e-02 -9.74218249e-02  3.39796208e-02  7.00396895e-02
 -8.04503113e-02 -1.65358055e-02  2.18628477e-02 -7.02033713e-02
  4.44306657e-02 -2.46475

In [18]:
embeddings.shape

(241861, 384)

In [11]:
# 임베딩 저장
with open('/home/ryu/thesis/new_amazon/sbert_emb.pickle', 'wb') as f:
    pickle.dump(embeddings, f)

In [5]:
with open('/home/ryu/thesis/new_amazon/sbert_emb.pickle', 'rb') as f:
    embeddings = pickle.load(f)

In [6]:
# 임베딩 내용 확인
emb = pd.DataFrame(embeddings)
emb

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.032435,-0.009566,0.021242,-0.027626,-0.051214,-0.077786,0.122303,0.058641,-0.061732,0.028055,...,-0.097903,-0.045616,0.089570,-0.043381,-0.045833,0.004853,0.044459,0.030812,-0.050800,0.116877
1,-0.046252,0.053345,0.023569,-0.005799,-0.001641,0.005328,0.016217,-0.065180,-0.021635,0.044293,...,0.040859,0.082676,0.012692,0.078683,-0.095079,0.007839,0.124836,-0.024412,-0.000559,0.004015
2,-0.058179,-0.028349,0.028041,-0.001780,-0.056059,-0.040762,0.035624,0.013983,-0.043915,-0.016364,...,-0.044998,-0.046685,0.003407,0.010634,0.013347,-0.002498,-0.083653,-0.098793,-0.052496,0.050222
3,0.001419,-0.005161,0.001925,-0.017756,0.003331,-0.007262,0.074666,0.010081,0.028236,0.082903,...,0.016537,-0.042396,-0.030921,0.020760,-0.070070,-0.002282,0.037053,0.009862,0.059242,0.050374
4,-0.044808,0.021749,-0.026192,-0.037803,-0.068915,-0.040948,-0.054282,0.100183,-0.018990,0.021407,...,-0.094901,-0.043734,0.030118,0.018265,0.024000,-0.043978,0.041587,0.014160,0.012837,0.041781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241856,-0.028359,0.060098,0.007660,-0.044228,-0.021465,-0.005880,-0.066491,0.068297,-0.028384,0.022716,...,0.036754,-0.022991,0.036425,-0.062824,-0.098093,-0.005447,-0.070600,0.031582,0.085118,-0.016995
241857,0.001685,0.009930,0.017188,-0.024415,0.021406,-0.028677,-0.009823,0.003675,-0.065339,0.021545,...,-0.021373,-0.032757,-0.051921,0.041006,-0.032012,-0.021292,-0.011248,-0.119632,-0.100451,0.084332
241858,-0.081250,0.072284,-0.007992,0.058251,0.044123,0.037043,0.027959,-0.051693,-0.055771,-0.011383,...,0.050207,-0.028469,-0.042262,0.019616,-0.062088,0.051815,0.021332,0.111828,-0.120888,0.056730
241859,-0.059075,0.056959,0.006054,-0.039024,-0.013510,0.021395,0.021009,0.030476,-0.099459,-0.080867,...,0.011553,-0.000094,-0.003660,-0.014688,-0.054684,0.072146,0.047209,0.017928,-0.033454,0.011537


In [7]:
# 본 데이터와 합병
data = pd.concat([data, emb], axis=1)
data

Unnamed: 0,item_id,user_id,rating,text,0,1,2,3,4,5,...,374,375,376,377,378,379,380,381,382,383
0,A07936821FOVJO6NP4Q8,B0000A0AEM,3.0,"Very nice product. Sharp, clear optics. Seem...",-0.032435,-0.009566,0.021242,-0.027626,-0.051214,-0.077786,...,-0.097903,-0.045616,0.089570,-0.043381,-0.045833,0.004853,0.044459,0.030812,-0.050800,0.116877
1,A07936821FOVJO6NP4Q8,B0000AI0N1,5.0,"Just what it said it would be, well made, wort...",-0.046252,0.053345,0.023569,-0.005799,-0.001641,0.005328,...,0.040859,0.082676,0.012692,0.078683,-0.095079,0.007839,0.124836,-0.024412,-0.000559,0.004015
2,A07936821FOVJO6NP4Q8,B0002RSPE4,5.0,Bought a bunch of these a few years ago for th...,-0.058179,-0.028349,0.028041,-0.001780,-0.056059,-0.040762,...,-0.044998,-0.046685,0.003407,0.010634,0.013347,-0.002498,-0.083653,-0.098793,-0.052496,0.050222
3,A07936821FOVJO6NP4Q8,B000CRFOMK,5.0,At first glance these seem silly. Who needs a...,0.001419,-0.005161,0.001925,-0.017756,0.003331,-0.007262,...,0.016537,-0.042396,-0.030921,0.020760,-0.070070,-0.002282,0.037053,0.009862,0.059242,0.050374
4,A07936821FOVJO6NP4Q8,B000ID7QNI,5.0,Very sturdy well made wall mount.,-0.044808,0.021749,-0.026192,-0.037803,-0.068915,-0.040948,...,-0.094901,-0.043734,0.030118,0.018265,0.024000,-0.043978,0.041587,0.014160,0.012837,0.041781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241856,AZZTOUKVTUMVM,B003FVJYF8,4.0,It's 50' of coax. Seems to be well constructed...,-0.028359,0.060098,0.007660,-0.044228,-0.021465,-0.005880,...,0.036754,-0.022991,0.036425,-0.062824,-0.098093,-0.005447,-0.070600,0.031582,0.085118,-0.016995
241857,AZZTOUKVTUMVM,B00BUL4NLU,2.0,Bought this at the end of September 2016 for u...,0.001685,0.009930,0.017188,-0.024415,0.021406,-0.028677,...,-0.021373,-0.032757,-0.051921,0.041006,-0.032012,-0.021292,-0.011248,-0.119632,-0.100451,0.084332
241858,AZZTOUKVTUMVM,B00HWT8I24,3.0,It was a bit challenging to get this to synch ...,-0.081250,0.072284,-0.007992,0.058251,0.044123,0.037043,...,0.050207,-0.028469,-0.042262,0.019616,-0.062088,0.051815,0.021332,0.111828,-0.120888,0.056730
241859,AZZTOUKVTUMVM,B00IX9ZDKC,5.0,What can I say. It worked great and my compute...,-0.059075,0.056959,0.006054,-0.039024,-0.013510,0.021395,...,0.011553,-0.000094,-0.003660,-0.014688,-0.054684,0.072146,0.047209,0.017928,-0.033454,0.011537


### 모델 준비 (인코딩)

In [8]:
# Encoding dictionaries
def create_encoding_dict(feature, start_point):
    feature_dict = {}
    for value in set(feature):
        feature_dict[value] = start_point + len(feature_dict)
    return feature_dict, start_point + len(feature_dict)

In [9]:
# 사용자, 아이템, 직업, 성별 인코딩
user_dict, start_point = create_encoding_dict(data['user_id'], 0)
item_dict, start_point = create_encoding_dict(data['item_id'], start_point)

In [10]:
# 텍스트 임베딩
text_index = start_point
start_point += 384

# 전체 특성 수 계산
num_x = start_point

# 각 특성의 개수 출력 (선택적)
print(f"Number of Users: {len(user_dict)}")
print(f"Number of Items: {len(item_dict)}")
print(f"전체 특성 수: {num_x}")

Number of Users: 17355
Number of Items: 17244
전체 특성 수: 34983


In [11]:
num_x

34983

In [12]:
x = data.copy()
y = data['user_id']
ratings_train, ratings_test = train_test_split(x, test_size=0.25, stratify=y, random_state=8)

In [13]:
# train set 평점의 평균값 -> 타겟 변수에서 빼서 평균 평점에 대한 보정 진행
w0 = np.mean(ratings_train['rating'])

In [17]:
def encode_data(input, bias, user_dict, item_dict, embeddings_start_idx):
    data = []
    target = []

    for i in range(len(input)):
        ea_case = input.iloc[i]
        x_index = []
        x_value = []

        # user id encoding
        x_index.append(user_dict[ea_case['user_id']])
        x_value.append(1.)

        # item id encoding
        x_index.append(item_dict[ea_case['item_id']])
        x_value.append(1.)
        
        # review encoding
        review_embed = ea_case[-384:]        # 해당 리뷰의 임베딩
        for j in range(384):
            x_index.append(embeddings_start_idx+j)
            x_value.append(review_embed[j])

        # target encoding
        data.append([x_index, x_value])
        target.append(ea_case['rating']-bias)

        # 진행 상황 출력
        if (i % 30000) == 0:
            print('Encoding ', i, 'cases...')
    
    return data, target

In [18]:
print('Encoding Train Set')
train_data, train_target = encode_data(ratings_train, w0, user_dict, item_dict, text_index)
print('Encoding Test Set')
test_data, test_target = encode_data(ratings_test, w0, user_dict, item_dict, text_index)

Encoding Train Set
Encoding  0 cases...
Encoding  10000 cases...
Encoding  20000 cases...
Encoding  30000 cases...
Encoding  40000 cases...
Encoding  50000 cases...
Encoding  60000 cases...
Encoding  70000 cases...
Encoding  80000 cases...
Encoding  90000 cases...
Encoding  100000 cases...
Encoding  110000 cases...
Encoding  120000 cases...
Encoding  130000 cases...
Encoding  140000 cases...
Encoding  150000 cases...
Encoding  160000 cases...
Encoding  170000 cases...
Encoding  180000 cases...
Encoding Test Set
Encoding  0 cases...
Encoding  10000 cases...
Encoding  20000 cases...
Encoding  30000 cases...
Encoding  40000 cases...
Encoding  50000 cases...
Encoding  60000 cases...


### 모델 학습

In [19]:
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

class FM():
    def __init__(self, N, K, train_x, train_y, test_x, test_y, alpha, beta, iterations=100, tolerance=0.005, l2_reg=True, verbose=True): # 초기화
        self.K = K                          # Number of latent factors
        self.N = N                          # Number of x (variables)
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.l2_reg = l2_reg
        self.tolerance = tolerance
        self.verbose = verbose

        # w와 v 초기화
        self.w = np.random.normal(scale=1./self.N, size=(self.N)) # 사이즈는 변수의 수만큼. 변수마다 bias 하나
        self.v = np.random.normal(scale=1./self.K, size=(self.N, self.K)) # 변수의 수 * K

        # Train/Test 분리
        self.train_x = train_x
        self.test_x = test_x
        self.train_y = train_y
        self.test_y = test_y

    def test(self):                                     # Training 하면서 RMSE 계산 
        # SGD를 iterations 숫자만큼 수행
        best_RMSE = float('inf') # stop 위해
        best_iteration = 0
        training_process = []
        for i in range(self.iterations): # 600번
            rmse1 = self.sgd(self.train_x, self.train_y)        # SGD & Train RMSE 계산
            rmse2 = self.test_rmse(self.test_x, self.test_y)    # Test RMSE 계산     
            training_process.append((i, rmse1, rmse2))
            if self.verbose:
                if (i+1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.6f ; Test RMSE = %.6f" % (i+1, rmse1, rmse2))
            if best_RMSE > rmse2:                       # New best record
                best_RMSE = rmse2
                best_iteration = i
            elif (rmse2 - best_RMSE) > self.tolerance:  # RMSE is increasing over tolerance
                break
        print(best_iteration, best_RMSE)
        return training_process
        
    # w, v 업데이트를 위한 Stochastic gradient descent 
    def sgd(self, x_data, y_data):
        y_pred = []
        for data, y in zip(x_data, y_data): # 100,000번. x_data, y_data가 100,000개
            x_idx = data[0] # 데이터의 첫번째 (x_index, x_value)에 대한 인덱스 받아옴
            x_0 = np.array(data[1])     # xi axis=0 [1, 2, 3] (1차원)
            x_1 = x_0.reshape(-1, 1)    # xi axis=1 [[1], [2], [3]] (2차원: V matrix와 계산 위해서)
    
            # biases
            bias_score = np.sum(self.w[x_idx] * x_0) # 여기선 x_0를 1차원으로 사용. w matrix는 1차원이기 때문
    
            # score 계산
            vx = self.v[x_idx] * (x_1)          # v matrix * x (브로드캐스팅)
            sum_vx = np.sum(vx, axis=0)         # sigma(vx): 칼럼으로 쭉 더한 것 (element K개 (=350개))
            sum_vx_2 = np.sum(vx * vx, axis=0)  # ( v matrix * x )의 제곱: element 350개
            latent_score = 0.5 * np.sum(np.square(sum_vx) - sum_vx_2)

            # 예측값 계산
            y_hat = bias_score + latent_score # bias까지 더하면 최종 예측값 (전체 평균은 전에 뺐기 때문에 따로 또 빼주지 않음)
            y_pred.append(y_hat) # y_pred 75,000개 (아까 train,test 분리함)
            error = y - y_hat # 에러 구했으니까 아래에서 업데이트 가능
            # w, v 업데이트 (week 7 수업자료에 있는 update rule)
            if self.l2_reg:     # regularization이 있는 경우
                self.w[x_idx] += error * self.alpha * (x_0 - self.beta * self.w[x_idx])
                self.v[x_idx] += error * self.alpha * ((x_1) * sum(vx) - (vx * x_1) - self.beta * self.v[x_idx])
            else:               # regularization이 없는 경우
                self.w[x_idx] += error * self.alpha * x_0
                self.v[x_idx] += error * self.alpha * ((x_1) * sum(vx) - (vx * x_1))
        return RMSE(y_data, y_pred) 

    def test_rmse(self, x_data, y_data): # test set에 대한 RMSE
        y_pred = []
        for data , y in zip(x_data, y_data):
            y_hat = self.predict(data[0], data[1])
            y_pred.append(y_hat)
        return RMSE(y_data, y_pred)

    def predict(self, idx, x):
        x_0 = np.array(x)
        x_1 = x_0.reshape(-1, 1)

        # biases
        bias_score = np.sum(self.w[idx] * x_0)

        # score 계산
        vx = self.v[idx] * (x_1)
        sum_vx = np.sum(vx, axis=0)
        sum_vx_2 = np.sum(vx * vx, axis=0)
        latent_score = 0.5 * np.sum(np.square(sum_vx) - sum_vx_2)

        # 예측값 계산
        y_hat = bias_score + latent_score
        return y_hat

    def predict_one(self, user_id, movie_id):
        x_idx = np.array([user_dict[user_id], item_dict[movie_id]])
        x_data = np.array([1, 1])
        return self.predict(x_idx, x_data) + w0

In [21]:
K = 220
fm1 = FM(num_x, K, train_data, train_target, test_data, test_target, alpha=0.0014, beta=0.003,  
         iterations=400, tolerance=0.0005, l2_reg=True, verbose=True)

result = fm1.test()

Iteration: 10 ; Train RMSE = 0.916461 ; Test RMSE = 0.945540
Iteration: 20 ; Train RMSE = 0.830261 ; Test RMSE = 0.921580
24 0.9198331413464679


In [22]:
with open('/home/ryu/thesis/new_amazon/state8/FM_model8.pkl', 'wb') as f:
    pickle.dump(fm1, f)