In [None]:
import numpy as np

import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import scipy.sparse as sp
from sklearn.model_selection import train_test_split

In [None]:
# Load the delivery-review CSV; the file is decoded with the cp949 codec.
# NOTE(review): '/content/...' is a hard-coded absolute path — presumably a Colab
# upload location; confirm, or move it to a configurable data directory.
df = pd.read_csv('/content/배달_Dataset_csv.csv', encoding = 'cp949')

# Replace the file's own headers with English names. Assigning seven names
# requires the frame to have exactly seven columns.
df.columns = ['User', 'Store', 'Category', 'Order_num', 'Picture', 'Rating', 'Comment']
df.head()

Unnamed: 0,User,Store,Category,Order_num,Picture,Rating,Comment
0,1,남매컵밥,한식,3,O,5,
1,1,화덕에구운족발신선생,"족발, 보쌈",1,O,4,
2,1,감성커피,디저트,6,O,5,
3,1,옥수동묵은지김치찜,한식,4,O,5,
4,1,장독대항아리보쌈,"족발, 보쌈",1,O,5,먹는데 정신팔려 사진을못남겼숩니다.이해해주세요.고기좋은건 누구나 다인정할겁니다. 보...


In [None]:
# (rows, columns) of the frame.
# NOTE(review): the recorded output reports 9 columns although only 7 names are
# assigned at load time — presumably this cell was last executed after
# prepare_dataset() had added `user_id` and `item_id`; confirm on a fresh kernel.
df.shape

(487, 9)

In [None]:
def prepare_dataset(df, random_state=None):
    """Encode users/stores as integer ids, split the interactions and sample negatives.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``User``, ``Store`` and ``Order_num``.
        The columns ``user_id`` and ``item_id`` (category codes of ``User``
        and ``Store``) are written onto this frame in place, as before.
    random_state : int or None, default None
        Forwarded to ``train_test_split`` so the split can be reproduced.
        ``None`` keeps the previous behaviour (a different split each call).
        The negative sampling in ``get_negatives`` draws from NumPy's global
        random state and is not controlled by this argument.

    Returns
    -------
    uids, iids : numpy.ndarray
        User ids and item ids of the training rows (parallel arrays).
    df_train, df_test : pandas.DataFrame
        The two splits, each with columns ``user_id``, ``item_id``, ``Order_num``.
    df_neg : pandas.DataFrame
        Whatever ``get_negatives`` returns for the test split.
    users, items : list
        Sorted unique user ids / item ids over the whole frame.
    pivot : pandas.DataFrame
        user_id x item_id table of mean ``Order_num``, with 0 where a pair is absent.
    """
    df['user_id'] = df['User'].astype("category").cat.codes
    df['item_id'] = df['Store'].astype("category").cat.codes

    # Aggregate only Order_num. Without `values=` every remaining column is
    # aggregated with the mean, which fails on the text columns in recent pandas.
    # `.fillna(0)` returns a new frame instead of mutating a selected slice.
    pivot = df.pivot_table(index='user_id', columns='item_id', values='Order_num').fillna(0)

    interactions = df[['user_id', 'item_id', 'Order_num']]
    df_train, df_test = train_test_split(interactions, random_state=random_state)

    users = list(np.sort(interactions.user_id.unique()))
    items = list(np.sort(interactions.item_id.unique()))

    uids = df_train['user_id'].to_numpy(dtype=int)
    iids = df_train['item_id'].to_numpy(dtype=int)

    df_neg = get_negatives(uids, iids, items, df_test)

    return uids, iids, df_train, df_test, df_neg, users, items, pivot
    

In [None]:
# Flatten the user x item table into a long Series indexed by (user_id, item_id).
# NOTE(review): `pivot` is only bound further down, where the result of
# prepare_dataset() is unpacked; on a top-to-bottom run this cell raises
# NameError (as its recorded output shows). Consider moving it below that call.
label = pivot.stack()
print(type(label))
label = pd.DataFrame(label)
label

NameError: ignored

In [None]:
def get_negatives(uids, iids, items, df_test, num_neg = 100) :
  """Sample negative items for every held-out (user, item) pair.

  Parameters
  ----------
  uids, iids : sequence of int
      Parallel user ids / item ids of the training interactions.
  items : sequence
      All item ids; only its length is used, and candidates are
      ``0 .. len(items) - 1``.
  df_test : pandas.DataFrame
      Held-out interactions with columns ``user_id`` and ``item_id``.
  num_neg : int, default 100
      Number of negatives drawn per test pair (with replacement).

  Returns
  -------
  pandas.DataFrame
      One row per test pair: column 0 holds the ``(user, item)`` tuple and the
      following ``num_neg`` columns hold the sampled negative item ids.

  Raises
  ------
  ValueError
      If a test user has interacted with every item, so no negative exists.
      The previous rejection loop would spin forever in that case.
  """
  test_u = df_test['user_id'].values.tolist()
  test_i = df_test['item_id'].values.tolist()
  test_ratings = list(zip(test_u, test_i))

  # Every item a user is known to have interacted with, in training OR test.
  # Excluding only the training pairs lets the held-out positive itself be
  # drawn as a "negative", which corrupts the ranking evaluation.
  seen = {}
  for u, i in zip(uids, iids) :
    seen.setdefault(int(u), set()).add(int(i))
  for u, i in test_ratings :
    seen.setdefault(int(u), set()).add(int(i))

  num_items = len(items)
  candidates_by_user = {}
  negativeList = []

  for (u, i) in test_ratings :

    if u not in candidates_by_user :
      taken = seen[int(u)]
      candidates_by_user[u] = [j for j in range(num_items) if j not in taken]
    candidates = candidates_by_user[u]

    if num_neg > 0 :
      if not candidates :
        raise ValueError(f"user {u} has interacted with all {num_items} items; no negatives can be sampled")
      # Uniform draw with replacement over the allowed items; this is the same
      # distribution rejection sampling produces, but it always terminates.
      sampled = np.random.choice(candidates, size = num_neg, replace = True).tolist()
    else :
      sampled = []

    negativeList.append([(u, i)] + sampled)

  df_neg = pd.DataFrame(negativeList)

  return df_neg

### 훈련 데이터 로더

In [None]:
# Train
def get_train_instances(uids, iids, num_neg, num_items) :
  """Expand training pairs into labelled instances with sampled negatives.

  For each (user, item) pair in ``zip(uids, iids)`` one instance with label 1
  is emitted, followed by ``num_neg`` instances with label 0 whose item is
  drawn uniformly from ``range(num_items)`` and redrawn until the
  (user, item) pair is not one of the given training pairs.

  Returns three parallel lists: users, items, labels.
  """
  observed = set(zip(uids, iids))
  user_input = []
  item_input = []
  labels = []

  def _emit(user, item, target) :
    # Keep the three output lists in lock-step.
    user_input.append(user)
    item_input.append(item)
    labels.append(target)

  for u, positive in zip(uids, iids) :
    _emit(u, positive, 1)

    for _ in range(num_neg) :
      # Rejection sampling: draw until the pair is not a known positive.
      while True :
        candidate = np.random.randint(num_items)
        if (u, candidate) not in observed :
          break
      _emit(u, candidate, 0)

  return user_input, item_input, labels

In [None]:
# Encode ids, split into train/test and draw the evaluation negatives in one call.
uids, iids, df_train, df_test, df_neg, users, items, pivot = prepare_dataset(df)
# Expand every training pair into one positive instance plus `num_neg` sampled negatives.
user_input, item_input, labels = get_train_instances(uids, iids, num_neg = 4, num_items = len(items))

In [None]:
# The three lists are consumed index-by-index as parallel sequences, so their lengths should match.
len(user_input), len(item_input), len(labels)

(1825, 1825, 1825)

In [None]:
# Number of distinct item ids and distinct user ids.
len(items), len(users)

(214, 30)

In [None]:
# Number of entries in `labels` (already shown as the third value of the length check above).
len(labels)

1825

In [None]:
class CustomDataset(Dataset) :
  """Map-style dataset of (user, item, label) training triples.

  Parameters
  ----------
  users, items, labels : sequence, optional
      Parallel sequences indexed together by ``__getitem__``. Any argument
      left as ``None`` falls back to the notebook global ``user_input`` /
      ``item_input`` / ``labels`` respectively, so ``CustomDataset()`` keeps
      working. Passing the data explicitly avoids silently picking up a global
      that a later cell has rebound.
  """

  def __init__(self, users = None, items = None, labels = None) :
    # Conditional expressions are lazy: a global is only looked up when the
    # corresponding argument was omitted.
    self.users = user_input if users is None else users
    self.items = item_input if items is None else items
    # The parameter shadows the global of the same name, hence globals().
    self.labels = globals()['labels'] if labels is None else labels

  def __len__(self) :
    """Number of instances, taken from the length of the user sequence."""
    return len(self.users)

  def __getitem__(self, idx) :
    """Return the ``(user, item, label)`` triple stored at position ``idx``."""
    user = self.users[idx]
    item = self.items[idx]
    label = self.labels[idx]

    return user, item, label

# Built without arguments: the dataset takes its sequences from the notebook globals.
dataset = CustomDataset()

# Batches of 128 drawn in a new random order on every pass; drop_last=True
# discards the final incomplete batch of each pass.
loader = DataLoader(dataset = dataset, 
                    batch_size = 128, 
                    shuffle = True, 
                    drop_last = True)

### 테스트 데이터 로더

In [None]:
def get_test_instances(num_neg, num_items, test_ratings = None) :
  """Build the 0/1 label list for the test pairs and their sampled negatives.

  Parameters
  ----------
  num_neg : int
      Negatives sampled after each test pair.
  num_items : int
      Negative item ids are drawn from ``range(num_items)``.
  test_ratings : list of (user, item), optional
      Held-out pairs. When omitted, the notebook global ``test_ratings`` is
      used, as before.

  Returns
  -------
  list of int
      For every test pair: one ``1`` followed by ``num_neg`` zeros. The
      matching user/item lists are built internally but, as before, only the
      labels are returned.

  Raises
  ------
  NameError
      If ``test_ratings`` is neither passed nor defined as a global.
  """
  if test_ratings is None :
    try :
      test_ratings = globals()['test_ratings']
    except KeyError :
      raise NameError("name 'test_ratings' is not defined") from None

  user_test, item_test, labels = [], [], []

  zipped = set(test_ratings)

  for (u, i) in test_ratings :

    user_test.append(u)
    item_test.append(i)
    labels.append(1)

    for t in range(num_neg) :

      # Redraw until the pair is not one of the held-out positives.
      j = np.random.randint(num_items)
      while (u, j) in zipped :
        j = np.random.randint(num_items)

      # These appends belong AFTER the rejection loop. Inside it they only ran
      # when a collision occurred, so most negatives were never recorded.
      user_test.append(u)
      item_test.append(j)
      labels.append(0)

  return labels

# NOTE(review): get_test_instances() is called here without test pairs, so it relies
# on the notebook global `test_ratings`, which is only assigned in a later cell —
# on a top-to-bottom run this line raises NameError.
# It also rebinds `labels`, which up to this point held the training labels.
labels = get_test_instances(num_neg = 4, num_items = len(items))

In [None]:
# Test
test_u = df_test['user_id'].values.tolist()
test_i = df_test['item_id'].values.tolist()

# Held-out (user, item) pairs, in the row order of df_test.
test_ratings = list(zip(test_u, test_i))

# Column 0 of df_neg holds the (user, item) tuple; the remaining columns are the
# sampled negative item ids. Slice instead of rebinding `df_neg`: the previous
# `df_neg = df_neg.drop(...)` removed one more column every time the cell was re-run.
test_negatives = df_neg.iloc[:, 1:].values.tolist()

def eval_rating(idx, test_ratings, test_negatives) :
  """Return the candidate items and the user for test case ``idx``.

  Parameters
  ----------
  idx : int
      Position of the test case in both ``test_ratings`` and ``test_negatives``.
  test_ratings : sequence of (user, item)
      Held-out pairs.
  test_negatives : sequence of list
      Negative item ids for each test case.

  Returns
  -------
  (items, users)
      ``items`` is a new list: the negatives of case ``idx`` followed by the
      held-out item as the last element. ``users`` is the user id of that case.
  """
  users, holdout = test_ratings[idx][0], test_ratings[idx][1]

  # Build a fresh list. Appending to test_negatives[idx] in place leaked the
  # held-out item into the caller's data and duplicated it on every repeat call.
  items = list(test_negatives[idx]) + [holdout]

  return items, users

# Collect the candidates of ALL test cases as two flat, parallel lists: every
# candidate item is paired with the user it is scored for. Previously each
# iteration overwrote `test_items` / `test_users`, leaving only the last case
# (and a scalar user id) behind.
test_users, test_items = [], []

for user_idx in range(len(test_ratings)) :

  case_items, case_user = eval_rating(user_idx, test_ratings, test_negatives)
  test_items.extend(case_items)
  test_users.extend([case_user] * len(case_items))

In [None]:
class CustomDataset(Dataset) :
  """Map-style dataset of (user, item, label) evaluation triples.

  NOTE: this re-uses the class name of the training dataset defined earlier
  and replaces it in the notebook namespace from here on.

  Parameters
  ----------
  users, items, labels : sequence, optional
      Parallel sequences indexed together by ``__getitem__``. Any argument
      left as ``None`` falls back to the notebook global ``test_users`` /
      ``test_items`` / ``labels`` respectively, so ``CustomDataset()`` keeps
      working.
  """

  def __init__(self, users = None, items = None, labels = None) :
    # Conditional expressions are lazy: a global is only looked up when the
    # corresponding argument was omitted.
    self.users = test_users if users is None else users
    self.items = test_items if items is None else items
    # The parameter shadows the global of the same name, hence globals().
    self.labels = globals()['labels'] if labels is None else labels

  def __len__(self) :
    """Number of instances, taken from the length of the user sequence."""
    return len(self.users)

  def __getitem__(self, idx) :
    """Return the ``(user, item, label)`` triple stored at position ``idx``."""
    user_test = self.users[idx]
    item_test = self.items[idx]
    label_test = self.labels[idx]

    return user_test, item_test, label_test

test_dataset = CustomDataset()

# FIXME(review): `dataset` below is the training dataset created earlier, not
# `test_dataset`, so `test_loader` serves shuffled training batches and
# `test_dataset` is never read. Before switching to `dataset = test_dataset`,
# verify that `test_users`, `test_items` and `labels` are equal-length parallel
# sequences (the dataset indexes all three with the same idx). Also review
# shuffle/drop_last/batch_size: metrics() scores each batch as one ranking list.
test_loader = DataLoader(dataset = dataset, 
                         batch_size = 128, 
                         shuffle = True, 
                         drop_last = True)

## 모델 생성

In [None]:
class GMF_and_MLP(nn.Module) :
  """Two-branch recommender that fuses a GMF branch and an MLP branch.

  * GMF branch: element-wise product of a user embedding and an item embedding.
  * MLP branch: concatenated user/item embeddings passed through three
    Linear+ReLU layers that halve the width each time (2*nf -> nf -> nf//2 -> nf//4).
  * The two branch outputs are concatenated and mapped to one score by a final
    linear layer. No sigmoid is applied, so the score is unbounded.

  Parameters
  ----------
  user_num : int
      Number of rows in the user embedding tables.
  item_num : int
      Number of rows in the item embedding tables.
  nf : int, default 10
      Embedding size of each of the four embedding tables. Must be at least 4
      so that the narrowest MLP layer (``nf // 4`` units) is not empty.

  Raises
  ------
  ValueError
      If ``nf`` is smaller than 4.
  """

  def __init__(self, user_num, item_num, nf = 10) :

    super(GMF_and_MLP, self).__init__()

    if nf < 4 :
      raise ValueError(f"nf must be >= 4 so every MLP layer has at least one unit, got {nf}")

    self.user_num = user_num
    self.item_num = item_num
    self.nf = nf

    # Separate embedding tables per branch. The creation order of all
    # parameters is kept as before so seeded initialisation is unchanged.
    self.gmf_user_embedding = nn.Embedding(user_num, nf)
    self.gmf_item_embedding = nn.Embedding(item_num, nf)

    self.mlp_user_embedding = nn.Embedding(user_num, nf)
    self.mlp_item_embedding = nn.Embedding(item_num, nf)

    self.dropout = nn.Dropout(0.2)
    input_size = nf * 2

    self.layer1 = nn.Sequential(
        nn.Linear(input_size, input_size // 2),
        nn.ReLU(),
        nn.Dropout(0.2)
    )

    input_size1 = input_size // 2

    self.layer2 = nn.Sequential(
        nn.Linear(input_size1, input_size1 // 2),
        nn.ReLU(),
        nn.Dropout(0.2)
    )

    input_size2 = input_size1 // 2

    self.layer3 = nn.Sequential(
        nn.Linear(input_size2, input_size2 // 2),
        nn.ReLU()
    )

    input_size3 = input_size2 // 2

    # The fusion layer sees the GMF vector (nf) and the MLP output (input_size3).
    self.fc_layer = nn.Sequential(
        nn.Linear(nf + input_size3, 1)
    )

    # Only layer3 and fc_layer are re-initialised with Xavier-uniform weights;
    # layer1/layer2 keep PyTorch's default initialisation.
    # NOTE(review): confirm whether skipping layer1/layer2 is intentional.
    for m in self.layer3 :
      if isinstance(m, nn.Linear) :
        nn.init.xavier_uniform_(m.weight)

    for m in self.fc_layer :
      if isinstance(m, nn.Linear) :
        nn.init.xavier_uniform_(m.weight)

  def forward(self, user, item):
    """Score (user, item) index tensors; returns a flattened 1-D tensor of scores."""

    gmf_user_embedding = self.gmf_user_embedding(user)
    gmf_item_embedding = self.gmf_item_embedding(item)
    output_gmf = (gmf_user_embedding * gmf_item_embedding)

    mlp_user_embedding = self.mlp_user_embedding(user)
    mlp_item_embedding = self.mlp_item_embedding(item)

    mlp_concat = torch.cat((mlp_user_embedding, mlp_item_embedding), -1)

    out_mlp = self.dropout(mlp_concat)
    out_mlp = self.layer1(out_mlp)
    out_mlp = self.layer2(out_mlp)
    out_mlp = self.layer3(out_mlp)

    # Kept as an attribute for compatibility: the fused vector of the most
    # recent forward pass stays reachable as `model.merged`.
    self.merged = torch.cat((out_mlp, output_gmf), -1)

    out = self.fc_layer(self.merged)
    out = out.view(-1)

    return out

## 모델 적용

In [None]:
# Embedding table sizes: one row per distinct user id / item id.
user_num = len(users)
item_num = len(items)

model = GMF_and_MLP(user_num = user_num, item_num = item_num)

## 훈련

In [None]:
# Optimisation settings: Adam learning rate and number of passes over `loader`.
lr = 0.01
epochs = 20

# Squared-error loss against the 0/1 labels. The model's forward() returns the
# output of its final linear layer without a sigmoid, so scores are unbounded.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr = lr)

In [None]:
# NOTE(review): this import sits mid-notebook; the other imports live in the first cell.
from datetime import datetime
start_time = datetime.now()

# One pass over `loader` per epoch; reports the mean of the per-batch losses.
for epoch in range(1, epochs + 1) :
  train_loss = 0

  for user, item, label in loader :
    # Cast the labels to float so they match the float scores passed to the loss.
    label = label.float()

    # Standard step: clear old gradients, forward, loss, backward, update.
    optimizer.zero_grad()
    output = model(user, item)
    loss = criterion(output, label)
    loss.backward()
    optimizer.step()

    # .item() extracts the Python number so no autograd graph is kept alive.
    train_loss += loss.item()

  # len(loader) is the number of batches, so this is the average batch loss.
  train_loss /= len(loader)
  print(f"\n[Epoch : {epoch}], \tTrain Loss : {train_loss:.4f}")

end_time = datetime.now()
print('\nelapsed time', end_time - start_time)


[Epoch : 1], 	Train Loss : 1.1876

[Epoch : 2], 	Train Loss : 0.5467

[Epoch : 3], 	Train Loss : 0.3134

[Epoch : 4], 	Train Loss : 0.2207

[Epoch : 5], 	Train Loss : 0.1864

[Epoch : 6], 	Train Loss : 0.1697

[Epoch : 7], 	Train Loss : 0.1616

[Epoch : 8], 	Train Loss : 0.1589

[Epoch : 9], 	Train Loss : 0.1557

[Epoch : 10], 	Train Loss : 0.1552

[Epoch : 11], 	Train Loss : 0.1516

[Epoch : 12], 	Train Loss : 0.1513

[Epoch : 13], 	Train Loss : 0.1487

[Epoch : 14], 	Train Loss : 0.1453

[Epoch : 15], 	Train Loss : 0.1452

[Epoch : 16], 	Train Loss : 0.1417

[Epoch : 17], 	Train Loss : 0.1402

[Epoch : 18], 	Train Loss : 0.1374

[Epoch : 19], 	Train Loss : 0.1339

[Epoch : 20], 	Train Loss : 0.1325

elapsed time 0:00:00.673356


## 평가

In [None]:
def hit(gt_item, pred_items):
    """Hit indicator: 1 when ``gt_item`` occurs in ``pred_items``, otherwise 0."""
    return 1 if gt_item in pred_items else 0


def ndcg(gt_item, pred_items):
    """Discounted gain of ``gt_item`` within the ranked list ``pred_items``.

    Returns ``1 / log2(rank + 2)`` where ``rank`` is the 0-based position of
    the first occurrence of ``gt_item``; returns 0 when it is absent.
    """
    if gt_item not in pred_items:
        return 0

    rank = pred_items.index(gt_item)
    discount = np.log2(rank + 2)
    return np.reciprocal(discount)


def metrics(model, test_loader, top_k):
    """Compute the mean hit ratio and mean NDCG over the batches of ``test_loader``.

    Each batch is scored as ONE ranking list: the ``top_k`` highest-scoring
    items of the batch are the recommendations, and the first item of the
    batch (``item[0]``) is taken as the ground-truth item.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(user, item)``; must return one score per row.
    test_loader : iterable
        Yields ``(user, item, label)`` batches; the label is ignored.
    top_k : int
        Length of the recommendation list. Capped at the batch size so a
        short batch no longer makes ``torch.topk`` raise.

    Returns
    -------
    (float, float)
        ``(mean HR, mean NDCG)`` over all batches.
    """
    HR, NDCG = [], []

    # Evaluate deterministically: switch dropout off and skip gradient
    # tracking, then restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for user, item, _ in test_loader:

                predictions = model(user, item)
                k = min(top_k, predictions.numel())
                _, indices = torch.topk(predictions, k)
                recommends = torch.take(item, indices).numpy().tolist()
                gt_item = item[0].item()

                HR.append(hit(gt_item, recommends))
                NDCG.append(ndcg(gt_item, recommends))
    finally:
        model.train(was_training)

    return np.mean(HR), np.mean(NDCG)

In [None]:
# Mean hit ratio and mean NDCG over the batches of `test_loader`, using the 10 highest scores per batch.
metrics(model, test_loader, 10)

(0.07142857142857142, 0.021502142547427227)

## 데이터 적용

In [None]:
# Score candidate items for one user.
# NOTE: this cell rebinds `user_input`, `CustomDataset`, `dataset` and `loader`,
# which earlier cells used for training; re-run those cells before training again.
user_id = 0

# `iids` has one entry per training row, so the same item id occurs many times.
# np.unique keeps each candidate once; otherwise one item can fill several
# slots of the top-k list.
user_candidate_item = np.unique(iids).astype('int64').reshape(-1, 1)
user_input = np.full(len(user_candidate_item), user_id, dtype='int64').reshape(-1, 1)

class CustomDataset(Dataset) :
  """Map-style dataset of (user, item) pairs for inference (no labels).

  ``users`` / ``items`` default to the notebook globals ``user_input`` and
  ``user_candidate_item`` so that ``CustomDataset()`` keeps working.
  """

  def __init__(self, users = None, items = None) :
    self.users = user_input if users is None else users
    self.items = user_candidate_item if items is None else items

  def __len__(self) :
    """Number of (user, item) pairs."""
    return len(self.users)

  def __getitem__(self, idx) :
    """Return the ``(user, item)`` pair at position ``idx``."""
    user = self.users[idx]
    item = self.items[idx]

    return user, item

dataset = CustomDataset()

# One batch holding every candidate, so a top-k taken over a batch is a top-k
# over all candidates rather than over an arbitrary chunk of 128.
loader = DataLoader(dataset = dataset,
                    batch_size = max(1, len(dataset)))

In [None]:
def recommend(model, test_loader, top_k):
    """Return up to ``top_k`` distinct item ids with the highest scores.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(user, item)``; must return one score per row.
    test_loader : iterable
        Yields ``(user, item)`` batches of index tensors. This argument is now
        actually used; previously the global ``loader`` was read instead.
    top_k : int
        Maximum number of items to return.

    Returns
    -------
    list of int
        Item ids ordered from highest to lowest score, without repeats.
        Empty when ``test_loader`` yields nothing or ``top_k <= 0``.
    """
    score_chunks, item_chunks = [], []

    # Dropout off and no gradient tracking while scoring; restore the mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for user, item in test_loader:
                score_chunks.append(model(user, item).reshape(-1))
                item_chunks.append(item.reshape(-1))
    finally:
        model.train(was_training)

    if not score_chunks:
        return []

    # Rank over ALL batches; previously only the last batch's top-k survived.
    scores = torch.cat(score_chunks)
    items = torch.cat(item_chunks)
    ranked_items = items[torch.argsort(scores, descending=True)].tolist()

    recommends, seen = [], set()
    for item_id in ranked_items:
        if len(recommends) >= top_k:
            break
        if item_id in seen:
            # The same candidate may appear more than once in the loader.
            continue
        seen.add(item_id)
        recommends.append(item_id)

    return recommends

# Call recommend(), not metrics(): metrics() unpacks (user, item, label) batches
# and returns an (HR, NDCG) pair, whereas the cells below expect a list of item ids.
recommends = recommend(model, loader, 10)
recommends

[24, 178, 167, 156, 166, 178, 195, 65, 11, 14]

In [None]:
# `recommends` holds item ids, which prepare_dataset() creates as the category
# codes of df['Store']. A code is an index into the category list, not a row
# label of `df`, so `df['Store'][idx]` returned the store of row `idx` instead
# of the recommended store.
store_names = df['Store'].astype("category").cat.categories

for idx in recommends :

  store = [store_names[idx]]

  print(store)

['롯데리아']
['후라이드참잘하는집']
['지코바숯불치킨']
['이마미야 요코하마 이에케이 라멘']
['불향 제육']
['후라이드참잘하는집']
['순살만공격']
['분식쌀롱']
['삼첩분식']
['피자나라치킨공주']
