In [100]:
import pickle
import pandas as pd
import numpy as np
from lightfm import LightFM
from scipy.sparse import coo_matrix, csr_matrix

In [2]:
traindf = pd.read_parquet('gs://leo_tapas/primary/train_20240118.parquet')
validdf = pd.read_parquet('gs://leo_tapas/primary/valid_20240118.parquet')
testdf = pd.read_parquet('gs://leo_tapas/primary/test_20240118.parquet')

In [55]:
item_embeddings = pd.concat([
    traindf.drop_duplicates('series_id')[['series_id','onehot']],
    validdf.drop_duplicates('series_id')[['series_id','onehot']],
    testdf.drop_duplicates('series_id')[['series_id','onehot']],
]).sort_values("series_id")

In [56]:
item_mapper = pd.Series(np.arange(len(item_embeddings)), index=item_embeddings.series_id)

In [57]:
item_embeddings = np.stack(item_embeddings.onehot.values)

In [58]:
traindf['series_id_index'] = item_mapper[traindf.series_id.values].values

In [89]:
row = traindf.user_id_index.values
col = traindf.series_id_index.values
data = np.sign(traindf.profit.values)

In [90]:
data

array([-1., -1., -1., ..., -1.,  1., -1.])

In [109]:
interactions = coo_matrix((data, (row,col)), dtype='int')
item_features = csr_matrix(item_embeddings)

In [110]:
model = LightFM()

In [None]:
model.fit(interactions=interactions, item_features=item_features, epochs=1000, num_threads=8, verbose=True)

Epoch: 100%|██████████| 1000/1000 [2:22:27<00:00,  8.55s/it] 


<lightfm.lightfm.LightFM at 0x7fdb28f197b0>

In [112]:
with open('lightfm_onehot_20240118.pickle', 'wb') as fle:
    pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)

In [107]:
model.predict(user_ids=validdf.user_id_index.values, item_ids=validdf.series_id_index.values)

AttributeError: 'DataFrame' object has no attribute 'series_id_index'

In [38]:
class SimpleDataset(Dataset):
    def __init__(self, df):
        self.item_embeddings = np.stack(df.pca.values)
        self.user_indices = df.user_id_index.values
        self.profit = df.profit.values/1000
        
    def __len__(self):
        return len(self.profit)
    
    def __getitem__(self, idx):
        ie = torch.tensor(self.item_embeddings[idx])
        ui = torch.tensor(self.user_indices[idx])
        pf = torch.tensor(self.profit[idx])
        return ie, ui, pf

In [39]:
trainds = SimpleDataset(train_df)
validds = SimpleDataset(valid_df)
testds = SimpleDataset(test_df)

In [44]:
traindl = DataLoader(trainds, batch_size=50000, shuffle=True)
validdl = DataLoader(validds, batch_size=10000, shuffle=False)
testdl = DataLoader(testds, batch_size=10000, shuffle=False)

In [45]:
epoch = 100

In [46]:
from tqdm import tqdm

In [None]:
pbar = tqdm(range(epoch))

for _ in pbar:

    for ie, ui, pf in traindl:
        optimizer.zero_grad()
        y_hat = model(ui.cuda(), ie.cuda())
        loss = F.mse_loss(y_hat,pf.cuda())
        loss.backward()
        optimizer.step()
    
    valid_hard = 0
    valid_naive = 0
    for ie, ui, pf in validdl:
        y_hat = model(ui.cuda().long(), ie.cuda())
        np_y_hat = y_hat.cpu().detach().numpy()
        np_pf = pf.detach().numpy()
        valid_hard += np.sum(np_pf[np.where(np_y_hat>0)])
        valid_naive += np.sum(np_pf[np.where((np_y_hat>0)|(np_pf>0))])
        
    test_hard = 0
    test_naive = 0
    for ie, ui, pf in testdl:
        y_hat = model(ui.cuda().long(), ie.cuda())
        np_y_hat = y_hat.cpu().detach().numpy()
        np_pf = pf.detach().numpy()
        test_hard += np.sum(np_pf[np.where(np_y_hat>0)])
        test_naive += np.sum(np_pf[np.where((np_y_hat>0)|(np_pf>0))])        

    pbar.set_postfix(avg=str(int(np.mean([test_hard,test_naive]))), hard=str(int(test_hard)), naive=str(int(test_naive))) 
        

100%|██████████| 100/100 [3:12:56<00:00, 115.76s/it, avg=481366, hard=332626, naive=630107] 


In [48]:
torch.save(model.state_dict(), "MF20240118.pth")

In [49]:
uri = 'gs://leo_tapas/primary/train_20240118.parquet'
train_df.to_parquet(uri)
uri = 'gs://leo_tapas/primary/valid_20240118.parquet'
valid_df.to_parquet(uri)
uri = 'gs://leo_tapas/primary/test_20240118.parquet'
test_df.to_parquet(uri)
