In [1]:
import pickle
import d3rlpy
import pandas as pd
import numpy as np
# from sklearn.metric import mean_absolute_error

In [2]:
df = pd.read_parquet('gs://leo_tapas/primary/full_20240115.parquet')

## Metric 1: 이벤트를 주면 읽고, 아니면 안 읽는다 (Hard Condition)
## Metric 2: 이벤트를 안 줘도 읽고, 안 주면 이벤트 캐쉬만 아낀다 (Naive Condition)

In [3]:
unique_series = np.unique(df.series_id)

In [6]:
train_series = unique_series[:200]
valid_series = unique_series[200:250]
test_series = unique_series[250:300]

In [7]:
sorted_df = df.sort_values("series_id")

In [10]:
train_df = df.loc[(df.series_id >= np.min(train_series)) & (df.series_id <= np.max(train_series))]
valid_df = df.loc[(df.series_id >= np.min(valid_series)) & (df.series_id <= np.max(valid_series))]
test_df = df.loc[(df.series_id >= np.min(test_series)) & (df.series_id <= np.max(test_series))]

## User Embedding MF

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


class MF(nn.Module):
    def __init__(self, num_users, emb_size=16):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)

    def forward(self, user, item_emb):
        u = self.user_emb(user)
        return (u * item_emb).sum(1)

In [12]:
train_df

Unnamed: 0,user_id,series_id,main_genre,meta_tag,sales,cost,profit,onehot,pca
0,6059,260434,29,"Romance,Fantasy,Romance Fantasy,Romance Comedy...",0.0,300,-300.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.7331182860243817, -0.8613053367988697, -0.1..."
1,6059,260455,29,"Romance,Drama,Historical Fantasy,Romance Comed...",0.0,300,-300.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.3151955733899762, -0.7281312396229039, 0.16..."
2,6442,234811,3,,0.0,300,-300.0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.4194401856699383, -0.6399378741580425, -0...."
3,6646,262666,29,"Romance,Fantasy,Romance Fantasy,Fantasy,Romanc...",15120.0,300,14820.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.1174302162223018, -0.7307430254823483, 0.10..."
4,10489,234933,25,,5040.0,200,4840.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.4240087662669842, -0.6428052730862609, -0...."
...,...,...,...,...,...,...,...,...,...
3850737,14654537,237382,29,"Romance,Fantasy,High Fantasy,Sword and Sorcery...",0.0,500,-500.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.4954338757972274, 0.3361357444477873, -0.94..."
3850750,14683069,237382,29,"Romance,Fantasy,High Fantasy,Sword and Sorcery...",0.0,500,-500.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.4954338757972274, 0.3361357444477873, -0.94..."
3850792,14756878,237382,29,"Romance,Fantasy,High Fantasy,Sword and Sorcery...",0.0,500,-500.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.4954338757972274, 0.3361357444477873, -0.94..."
3850816,14792332,237382,29,"Romance,Fantasy,High Fantasy,Sword and Sorcery...",4875.0,500,4375.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.4954338757972274, 0.3361357444477873, -0.94..."


In [13]:
unique_user_id = np.unique(train_df.user_id.values)

In [24]:
mapper = {u:i for i, u in enumerate(unique_user_id)}

In [26]:
def mapping(x):
    try:
        return mapper[x]
    except KeyError:
        return np.nan

In [28]:
train_df['user_id_index'] = train_df['user_id'].apply(lambda x: mapping(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['user_id_index'] = train_df['user_id'].apply(lambda x: mapping(x)).values


In [34]:
valid_df['user_id_index'] = valid_df['user_id'].apply(lambda x: mapping(x))
valid_df = valid_df.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df['user_id_index'] = valid_df['user_id'].apply(lambda x: mapping(x))


In [35]:
test_df['user_id_index'] = test_df['user_id'].apply(lambda x: mapping(x))
test_df = test_df.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['user_id_index'] = test_df['user_id'].apply(lambda x: mapping(x))


In [37]:
model = MF(len(unique_user_id), emb_size=256)
model = model.cuda()
optimizer = torch.optim.Adam(model.parameters())
model.train()

MF(
  (user_emb): Embedding(144881, 256)
)

In [38]:
class SimpleDataset(Dataset):
    def __init__(self, df):
        self.item_embeddings = np.stack(df.pca.values)
        self.user_indices = df.user_id_index.values
        self.profit = df.profit.values/1000
        
    def __len__(self):
        return len(self.profit)
    
    def __getitem__(self, idx):
        ie = torch.tensor(self.item_embeddings[idx])
        ui = torch.tensor(self.user_indices[idx])
        pf = torch.tensor(self.profit[idx])
        return ie, ui, pf

In [39]:
trainds = SimpleDataset(train_df)
validds = SimpleDataset(valid_df)
testds = SimpleDataset(test_df)

In [44]:
traindl = DataLoader(trainds, batch_size=50000, shuffle=True)
validdl = DataLoader(validds, batch_size=10000, shuffle=False)
testdl = DataLoader(testds, batch_size=10000, shuffle=False)

In [45]:
epoch = 100

In [46]:
from tqdm import tqdm

In [None]:
pbar = tqdm(range(epoch))

for _ in pbar:

    for ie, ui, pf in traindl:
        optimizer.zero_grad()
        y_hat = model(ui.cuda(), ie.cuda())
        loss = F.mse_loss(y_hat,pf.cuda())
        loss.backward()
        optimizer.step()
    
    valid_hard = 0
    valid_naive = 0
    for ie, ui, pf in validdl:
        y_hat = model(ui.cuda().long(), ie.cuda())
        np_y_hat = y_hat.cpu().detach().numpy()
        np_pf = pf.detach().numpy()
        valid_hard += np.sum(np_pf[np.where(np_y_hat>0)])
        valid_naive += np.sum(np_pf[np.where((np_y_hat>0)|(np_pf>0))])
        
    test_hard = 0
    test_naive = 0
    for ie, ui, pf in testdl:
        y_hat = model(ui.cuda().long(), ie.cuda())
        np_y_hat = y_hat.cpu().detach().numpy()
        np_pf = pf.detach().numpy()
        test_hard += np.sum(np_pf[np.where(np_y_hat>0)])
        test_naive += np.sum(np_pf[np.where((np_y_hat>0)|(np_pf>0))])        

    pbar.set_postfix(avg=str(int(np.mean([test_hard,test_naive]))), hard=str(int(test_hard)), naive=str(int(test_naive))) 
        

100%|██████████| 100/100 [3:12:56<00:00, 115.76s/it, avg=481366, hard=332626, naive=630107] 


In [48]:
torch.save(model.state_dict(), "MF20240118.pth")

In [49]:
uri = 'gs://leo_tapas/primary/train_20240118.parquet'
train_df.to_parquet(uri)
uri = 'gs://leo_tapas/primary/valid_20240118.parquet'
valid_df.to_parquet(uri)
uri = 'gs://leo_tapas/primary/test_20240118.parquet'
test_df.to_parquet(uri)
