In [1]:
 import sys
sys.path.append('../')
import os
import logging
from fuxictr import datasets
from datetime import datetime
from fuxictr.utils import load_config, set_logger, print_to_json
from fuxictr.features import FeatureMap
from fuxictr.pytorch.torch_utils import seed_everything
from fuxictr.pytorch.dataloaders import H5DataLoader
from model_zoo import DeepFM
import pickle
# ^^suppress output

# Load params from config files
config_dir = './config/full_h5_config'
experiment_id = 'full_h5_initial'  # the recorded run of this config read ../data/full_h5/*.h5
params = load_config(config_dir, experiment_id)

# set up logger and random seed
set_logger(params)
logging.info("Params: " + print_to_json(params))
seed_everything(seed=params['seed'])

# Load feature_map from json
data_dir = os.path.join(params['data_root'], params['dataset_id'])
feature_map_json = os.path.join(data_dir, "feature_map.json")
feature_map = FeatureMap(params['dataset_id'], data_dir)
# Results are deliberately not bound to `_`: IPython reserves that name for the
# last cell output, and nothing here would be displayed anyway (`load` is not the
# cell's final expression and `logging.info` returns None).
feature_map.load(feature_map_json, params)
logging.info("Feature specs: " + print_to_json(feature_map.features))


In [7]:
# Build the train and validation batch iterators from the h5 files named in params.
train_gen, valid_gen = H5DataLoader(
    feature_map,
    stage='train',
    train_data=params['train_data'],
    valid_data=params['valid_data'],
    batch_size=params['batch_size'],
    shuffle=params['shuffle'],
).make_iterator()


2023-05-07 19:16:57,136 P22431 INFO Loading data...
2023-05-07 19:16:57,141 P22431 INFO Loading data from h5: ../data/full_h5/smadex_train.h5
2023-05-07 19:20:20,177 P22431 INFO Train samples: total/307843, blocks/1
2023-05-07 19:20:20,192 P22431 INFO Loading data from h5: ../data/full_h5/smadex_val.h5
2023-05-07 19:23:13,358 P22431 INFO Validation samples: total/241205, blocks/1
2023-05-07 19:23:13,383 P22431 INFO Loading train and validation data done.


In [11]:
# Round-trip the training iterator through a pickle file: write it out,
# drop the in-memory copy, then read it back from disk.
with open('train_gen.pkl', 'wb') as pkl_out:
    pickle.dump(train_gen, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)

del train_gen

with open('train_gen.pkl', 'rb') as pkl_in:
    train_gen = pickle.load(pkl_in)


In [10]:
# Round-trip the validation iterator through a pickle file: write it out,
# drop the in-memory copy, then read it back from disk.
with open('valid.pkl', 'wb') as pkl_out:
    pickle.dump(valid_gen, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)

del valid_gen

with open('valid.pkl', 'rb') as pkl_in:
    valid_gen = pickle.load(pkl_in)

In [2]:
# Restore both iterators from their pickle files instead of re-reading the h5 data.
# pickle.load can execute arbitrary code, so only point this at files you wrote yourself.
with open('train_gen.pkl', 'rb') as train_fh:
    train_gen = pickle.load(train_fh)

with open('valid.pkl', 'rb') as valid_fh:
    valid_gen = pickle.load(valid_fh)

In [3]:
  # Model initialization and fitting
# Build DeepFM from the feature map and config, then train it with the
# validation iterator passed alongside the training iterator.
model = DeepFM(feature_map, **params)
model.fit(
    train_gen,
    validation_data=valid_gen,
    epochs=params['epochs'],
)
# NOTE(review): this banner is logged, but no evaluation call follows it in this cell.
logging.info('***** Validation evaluation *****')

100%|██████████| 1885/1885 [01:51<00:00, 16.95it/s]
100%|██████████| 2406/2406 [20:19<00:00,  1.97it/s]


In [4]:
# Persist the fitted model to disk, then drop the in-memory reference.
with open('model.pkl', 'wb') as model_out:
    pickle.dump(model, model_out, protocol=pickle.HIGHEST_PROTOCOL)

del model

In [7]:
# Restore the model from its pickle file.
# pickle.load can execute arbitrary code, so only point this at files you wrote yourself.
with open('model.pkl', 'rb') as model_in:
    model = pickle.load(model_in)

In [8]:
# Score the validation iterator with the model.
# NOTE(review): presumably yields one predicted probability per validation sample,
# in the same order the iterator is later walked to collect labels — confirm.
y_pred = model.predict(valid_gen)


100%|██████████| 1885/1885 [01:50<00:00, 17.07it/s]


array([6.13082252e-09, 2.73190963e-08, 2.38500100e-08, ...,
       5.25770139e-09, 2.38784121e-08, 6.35501820e-08])

In [21]:
import numpy as np
# Walk the validation iterator once and flatten every batch's labels into a
# single float64 array.
y_true = np.array(
    [label
     for batch_data in valid_gen
     for label in model.get_labels(batch_data).data.cpu().numpy().reshape(-1)],
    np.float64,
)
y_valid = y_true

# Per-sample weights: 200 for every label equal to 0, 1 for everything else.
valid_weights = [200 if label == 0 else 1 for label in y_valid]

def real_prob(k,p):
    """Rescale probability ``p`` by the factor ``k`` as ``1 / (1 - k + k/p)``.

    Returns 0 when ``p`` is 0, so the division by ``p`` is never attempted.
    NOTE(review): presumably this undoes a k-fold class re-weighting/sampling
    of the negatives — confirm against how the dataset was built.
    """
    if p != 0:
        return 1/(1-k+(k/p))
    return 0
from sklearn.metrics import log_loss
# Baseline: the same constant prediction for every sample, derived from the
# mean validation label rescaled by real_prob with k=200.
baseline_prob = real_prob(200,np.mean(y_valid))
baseline_ll = log_loss(y_valid, [baseline_prob]*len(y_valid), sample_weight=valid_weights)
# Weighted log-loss of the model's predictions under the same sample weights.
model_ll = log_loss(y_valid, y_pred, sample_weight=valid_weights)


In [22]:
# Fraction of the baseline log-loss removed by the model: 1.0 means the model's
# loss is zero, 0.0 means it is no better than the constant baseline.
relative_gain = 1 - model_ll/baseline_ll
print('NLL: ', relative_gain)

NLL:  0.996097080829555
