In [1]:
# https://www.youtube.com/watch?v=LJX5hdw-zUI&t=39s

from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
PATH = Path("ml-latest-small")
list(PATH.iterdir())

[PosixPath('ml-latest-small/links.csv'),
 PosixPath('ml-latest-small/README.txt'),
 PosixPath('ml-latest-small/movies.csv'),
 PosixPath('ml-latest-small/tags.csv'),
 PosixPath('ml-latest-small/ratings.csv')]

In [3]:
! head ml-latest-small/ratings.csv

userId,movieId,rating,timestamp
1,1,4.0,964982703
1,3,4.0,964981247
1,6,4.0,964982224
1,47,5.0,964983815
1,50,5.0,964982931
1,70,3.0,964982400
1,101,5.0,964980868
1,110,4.0,964982176
1,151,5.0,964984041


In [4]:
data = pd.read_csv(PATH/"ratings.csv")

In [5]:
data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [6]:
## 인코딩 데이터
# we encode the data to have contiguous ids for users and movies. you can think about this as a categorical encoding of out two categorical variables userid and movieid.
## 우리는 사용자와 영화에 대해 연속적인 ID를 갖도록 데이터를 인코딩합니다. 이것을 두 개의 범주형 변수 userid와 movieid의 범주형 인코딩으로 생각할 수 있습니다.
time_80 = np.quantile(data.timestamp.values,0.8)

time_80

1458635171.0

In [7]:
train = data[data["timestamp"] < time_80].copy()
val = data[data["timestamp"] >= time_80].copy()

In [8]:
print(len(train),len(val))

80668 20168


In [9]:
## 유저아이디를 유니크, 정렬해줌
train_user_ids = np.sort(np.unique(train.userId.values))
# train_user_ids

In [10]:
## 유니크 아이디 숫자
num_users = len(train_user_ids)
num_users

522

In [11]:
userid2idx = {o:i for i,o in enumerate(train_user_ids)}
# userid2idx
# 왼쪽이 실데이터의 유저아이디, 오른쪽이 라벨번호(유저)

In [12]:
## 데이터 정제
train["userId"] = train["userId"].apply(lambda x:userid2idx[x])
train

Unnamed: 0,userId,movieId,rating,timestamp
0,0,1,4.0,964982703
1,0,3,4.0,964981247
2,0,6,4.0,964982224
3,0,47,5.0,964983815
4,0,50,5.0,964982931
...,...,...,...,...
99529,521,892,3.0,847221080
99530,521,1056,3.0,847221080
99531,521,1059,3.0,847221054
99532,521,1150,4.0,847221054


In [13]:
## 데이터 정제
## -1 유저는 트래이닝 하지 않음
val["userId"] = val["userId"].apply(lambda x:userid2idx.get(x,-1))
val

Unnamed: 0,userId,movieId,rating,timestamp
1434,14,1,2.5,1510577970
1436,14,47,3.5,1510571970
1440,14,260,5.0,1510571946
1441,14,293,3.0,1510571962
1442,14,296,4.0,1510571877
...,...,...,...,...
100831,-1,166534,4.0,1493848402
100832,-1,168248,5.0,1493850091
100833,-1,168250,5.0,1494273047
100834,-1,168252,5.0,1493846352


In [14]:
val = val[val["userId"] >= 0].copy()
val

Unnamed: 0,userId,movieId,rating,timestamp
1434,14,1,2.5,1510577970
1436,14,47,3.5,1510571970
1440,14,260,5.0,1510571946
1441,14,293,3.0,1510571962
1442,14,296,4.0,1510571877
...,...,...,...,...
95960,513,170705,5.0,1521397596
95961,513,172591,4.5,1521467819
95962,513,174055,4.0,1521397739
95963,513,176371,4.0,1521397623


In [15]:
## 임베딩 무비 아이디

train_movie_ids = np.sort(np.unique(train.movieId.values))
num_items = len(train_movie_ids)
print(num_items)
train_movie_ids
## 실제 영화 종류는 7867개임

7867


array([     1,      2,      3, ..., 150548, 152711, 155168])

In [16]:
movieid2idx = {o:i for i,o in enumerate(train_movie_ids)}
train["movieId"] = train["movieId"].apply(lambda x: movieid2idx[x])
val["movieId"] = val["movieId"].apply(lambda x: movieid2idx.get(x,-1))


In [17]:
val = val[val["movieId"] >= 0 ].copy()
val

Unnamed: 0,userId,movieId,rating,timestamp
1434,14,0,2.5,1510577970
1436,14,43,3.5,1510571970
1440,14,224,5.0,1510571946
1441,14,254,3.0,1510571962
1442,14,257,4.0,1510571877
...,...,...,...,...
95945,513,7352,4.0,1521397830
95946,513,7507,4.5,1521397833
95949,513,7619,4.0,1521467672
95950,513,7653,4.5,1521398264


In [18]:
## 임베딩 레이어
import torch
import torch.nn as nn
import torch.nn.functional as F

In [19]:
embed = nn.Embedding(10,3)
print("임베딩 웨이트",embed.weight)

임베딩 웨이트 Parameter containing:
tensor([[-0.5456,  0.3833, -0.8424],
        [ 0.6062, -0.1263,  0.7278],
        [ 0.6806,  0.4229,  1.0881],
        [-1.5685,  0.4669, -1.7190],
        [ 0.5448, -0.5164,  0.3161],
        [-1.7242,  0.3768,  0.5971],
        [-1.1567,  0.1724, -0.2945],
        [-0.5785,  1.3830, -1.1507],
        [-1.7807, -0.1004, -1.8281],
        [-0.8879,  0.6091, -2.3706]], requires_grad=True)


In [20]:
class MF(nn.Module):
    def __init__(self,num_users,num_items,emb_size=512):
        super(MF,self).__init__()
        self.user_emb = nn.Embedding(num_users,emb_size)
        self.item_emb = nn.Embedding(num_items,emb_size)
        
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
    
    def forward(self,u,v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        
        return (u*v).sum(1)        
        

In [21]:
# MF 모델 디버깅
df = pd.DataFrame({"userId":[0,0,1,1,3,4],"movieId":[0,1,2,1,3,0],"rating":[4,5,3,1,3,4]})
print(df)

   userId  movieId  rating
0       0        0       4
1       0        1       5
2       1        2       3
3       1        1       1
4       3        3       3
5       4        0       4


In [22]:
num_users = 5
num_item = 4
emb_size = 3

user_emb = nn.Embedding(num_users,emb_size)
item_emb = nn.Embedding(num_items,emb_size)
users = torch.LongTensor(df.userId.values)
items = torch.LongTensor(df.movieId.values)

In [23]:
U = user_emb(users)
V = item_emb(items)

In [24]:
print(U.shape,V.shape)
print(U,"\n",V)

torch.Size([6, 3]) torch.Size([6, 3])
tensor([[-0.0454,  1.3591,  0.4848],
        [-0.0454,  1.3591,  0.4848],
        [ 1.0756,  0.6630,  1.9769],
        [ 1.0756,  0.6630,  1.9769],
        [-0.3693,  1.4941, -0.7260],
        [ 0.5897, -2.0387, -1.2035]], grad_fn=<EmbeddingBackward>) 
 tensor([[-1.5235, -0.8728, -0.5908],
        [-1.0782,  0.1864, -0.6271],
        [-0.6850,  0.2223, -2.0722],
        [-1.0782,  0.1864, -0.6271],
        [-0.2778,  0.8583, -1.1754],
        [-1.5235, -0.8728, -0.5908]], grad_fn=<EmbeddingBackward>)


In [25]:
## 단순 행렬 요소끼리의 곱 element wise multiplication
print(U*V,"\n",(U*V).shape)
print((U*V).sum(1))

tensor([[ 0.0692, -1.1862, -0.2865],
        [ 0.0489,  0.2533, -0.3041],
        [-0.7368,  0.1474, -4.0965],
        [-1.1598,  0.1236, -1.2397],
        [ 0.1026,  1.2824,  0.8533],
        [-0.8984,  1.7793,  0.7111]], grad_fn=<MulBackward0>) 
 torch.Size([6, 3])
tensor([-1.4035e+00, -1.8407e-03, -4.6859e+00, -2.2759e+00,  2.2383e+00,
         1.5920e+00], grad_fn=<SumBackward1>)


In [26]:
## 학습
num_users=  len(train.userId.unique())
num_items = len(train.movieId.unique())
print(num_users,num_items)

522 7867


In [27]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = MF(num_users,num_items,emb_size=512)
model.to(device)

MF(
  (user_emb): Embedding(522, 512)
  (item_emb): Embedding(7867, 512)
)

In [28]:
epochs = 600
lr = 0.01
wd = 0


optimizer = torch.optim.Adam(model.parameters(),lr=lr,weight_decay=wd)
# t_loss = nn.MSELoss()
# v_loss = nn.MSELoss()

for i in range(epochs):
    model.train()
    
    ## 임베딩레이어는 long -> int 형 자료형으로 넣어야함
    users = torch.LongTensor(train.userId.values).to(device)
    items = torch.LongTensor(train.movieId.values).to(device)
    ratings = torch.FloatTensor(train.rating.values).to(device)
    
    

    y_hat = model(users,items)
    y_hat = y_hat.type(torch.FloatTensor).to(device)
    t_loss = F.mse_loss(y_hat,ratings)
    
    
    
    optimizer.zero_grad()
    t_loss.backward()
    optimizer.step()
    
    
    ## 팽가
    model.eval()
    with torch.no_grad():
        users = torch.LongTensor(val.userId.values).to(device)
        items = torch.LongTensor(val.movieId.values).to(device)
        ratings = torch.FloatTensor(val.rating.values).to(device)
        y_hat = model(users,items)
        
        
        correct = (ratings == y_hat.to(device)).sum().item() 
        
        v_loss = F.mse_loss(y_hat,ratings)
    
    print(f"{i} epoch , train loss : {t_loss:.3f} \t val_loss : {v_loss:.3f} \t {(correct/len(ratings))*100}")
    
    

0 epoch , train loss : 11.234 	 val_loss : 9.562 	 0.0
1 epoch , train loss : 9.370 	 val_loss : 7.374 	 0.0
2 epoch , train loss : 7.180 	 val_loss : 5.105 	 0.0
3 epoch , train loss : 4.908 	 val_loss : 3.068 	 0.0
4 epoch , train loss : 2.868 	 val_loss : 1.641 	 0.0
5 epoch , train loss : 1.439 	 val_loss : 1.158 	 0.0
6 epoch , train loss : 0.961 	 val_loss : 1.636 	 0.0
7 epoch , train loss : 1.430 	 val_loss : 2.568 	 0.0
8 epoch , train loss : 2.240 	 val_loss : 3.221 	 0.0
9 epoch , train loss : 2.655 	 val_loss : 3.242 	 0.0
10 epoch , train loss : 2.464 	 val_loss : 2.762 	 0.0
11 epoch , train loss : 1.907 	 val_loss : 2.107 	 0.0
12 epoch , train loss : 1.311 	 val_loss : 1.546 	 0.0
13 epoch , train loss : 0.901 	 val_loss : 1.208 	 0.0
14 epoch , train loss : 0.752 	 val_loss : 1.096 	 0.0
15 epoch , train loss : 0.819 	 val_loss : 1.138 	 0.0
16 epoch , train loss : 1.001 	 val_loss : 1.244 	 0.0
17 epoch , train loss : 1.197 	 val_loss : 1.338 	 0.0
18 epoch , train lo

In [29]:
users

tensor([ 14,  14,  14,  ..., 513, 513, 513], device='cuda:0')

In [30]:
items

tensor([   0,   43,  224,  ..., 7619, 7653, 7803], device='cuda:0')

In [31]:
ratings

tensor([2.5000, 3.5000, 5.0000,  ..., 4.0000, 4.5000, 5.0000], device='cuda:0')

In [32]:
y_hat

tensor([3.4426, 3.2395, 3.9963,  ..., 4.8663, 4.3530, 4.6669], device='cuda:0')

In [33]:
print(y_hat)
y_hat_numpy = y_hat.detach().cpu().numpy()
# print(np.round(x,1))

def refine_rating(x):
    rating_list = np.array([0,0.5,1,1.5,2,2.5,3,3.5,4,4.5,5])
    result = []    
    for item in x:
        idx = (np.abs(rating_list - item)).argmin()
        result.append(rating_list[idx])   
    return result

refine_y_hat = refine_rating(y_hat_numpy)
refine_y_hat = torch.tensor(refine_y_hat)
print(refine_y_hat)


tensor([3.4426, 3.2395, 3.9963,  ..., 4.8663, 4.3530, 4.6669], device='cuda:0')
tensor([3.5000, 3.0000, 4.0000,  ..., 5.0000, 4.5000, 4.5000],
       dtype=torch.float64)


In [34]:
epochs = 600
lr = 0.01
wd = 0


optimizer = torch.optim.Adam(model.parameters(),lr=lr,weight_decay=wd)
# t_loss = nn.MSELoss()
# v_loss = nn.MSELoss()

for i in range(epochs):
    model.train()
    
    ## 임베딩레이어는 long -> int 형 자료형으로 넣어야함
    users = torch.LongTensor(train.userId.values).to(device)
    items = torch.LongTensor(train.movieId.values).to(device)
    ratings = torch.FloatTensor(train.rating.values).to(device)
    
    

    y_hat = model(users,items)
    y_hat = y_hat.type(torch.FloatTensor).to(device)
    t_loss = F.mse_loss(y_hat,ratings)
    
    
    
    optimizer.zero_grad()
    t_loss.backward()
    optimizer.step()
    
    
    ## 팽가
    model.eval()
    with torch.no_grad():
        users = torch.LongTensor(val.userId.values).to(device)
        items = torch.LongTensor(val.movieId.values).to(device)
        ratings = torch.FloatTensor(val.rating.values).to(device)
        y_hat = model(users,items)
        
        refine_y_hat = refine_rating(y_hat_numpy)
        refine_y_hat = torch.tensor(refine_y_hat)
        
        correct = (ratings == refine_y_hat.to(device)).sum().item() 
        
        v_loss = F.mse_loss(y_hat,ratings)
    
    print(f"{i} epoch , train loss : {t_loss:.3f} \t val_loss : {v_loss:.3f} \t {(correct/len(ratings))*100:.2f}%")
    
    

0 epoch , train loss : 0.000 	 val_loss : 0.889 	 24.26%
1 epoch , train loss : 0.000 	 val_loss : 1.097 	 24.26%
2 epoch , train loss : 0.109 	 val_loss : 0.893 	 24.26%
3 epoch , train loss : 0.028 	 val_loss : 0.871 	 24.26%
4 epoch , train loss : 0.073 	 val_loss : 0.881 	 24.26%
5 epoch , train loss : 0.064 	 val_loss : 0.908 	 24.26%
6 epoch , train loss : 0.030 	 val_loss : 0.934 	 24.26%
7 epoch , train loss : 0.021 	 val_loss : 0.949 	 24.26%
8 epoch , train loss : 0.033 	 val_loss : 0.942 	 24.26%
9 epoch , train loss : 0.036 	 val_loss : 0.915 	 24.26%
10 epoch , train loss : 0.025 	 val_loss : 0.891 	 24.26%
11 epoch , train loss : 0.015 	 val_loss : 0.880 	 24.26%
12 epoch , train loss : 0.014 	 val_loss : 0.878 	 24.26%
13 epoch , train loss : 0.019 	 val_loss : 0.878 	 24.26%
14 epoch , train loss : 0.019 	 val_loss : 0.881 	 24.26%
15 epoch , train loss : 0.014 	 val_loss : 0.889 	 24.26%
16 epoch , train loss : 0.009 	 val_loss : 0.899 	 24.26%
17 epoch , train loss : 

In [40]:
y1 = ratings.detach().cpu().tolist()
y2 = refine_y_hat.detach().cpu().tolist()

print(len(refine_y_hat))

count=0
for i in range(0,len(y1)):
    if y1[i] == y2[i]:
        count+=1
        # print(f"{i}번째 맞음")
        
print(count)

1311
318
