In [None]:
!pip install n2
!pip install buffalo
!pip install --upgrade tables
from buffalo.algo.als import ALS
from buffalo.misc import aux, log
from buffalo.algo.options import ALSOption
import buffalo.data
from buffalo.data.mm import MatrixMarketOptions

import numpy as np
import pandas as pd
import helper as hp
from scipy.io import mmwrite
from scipy.io import mmread
from scipy.sparse import csr_matrix
import json

In [None]:
# Locations of the contest files (train / question / answer) on the mounted Drive.
train_path, qst_path, ans_path = (
    '/content/drive/My Drive/coc_contest/train.json',
    '/content/drive/My Drive/coc_contest/question.json',
    '/content/drive/My Drive/coc_contest/answer.json',
)

# The helper returns four objects; `valid` and `all_users` are the ones used further down.
(train, valid, answer, all_users) = hp.load_data2(train_path, qst_path, ans_path)

In [None]:
# Build the user/book interaction table that hp.df_to_matrix consumes in the next cell
# (it is indexed there by the 'user_id' and 'book_id' columns).
# NOTE(review): the exact schema is defined in helper.get_userbook_map — confirm there.
ratings = hp.get_userbook_map(all_users)

In [None]:
# Convert the interaction table into a sparse user x book matrix plus the
# id <-> index lookup tables for both axes.
(
    user_items,
    uid_to_idx,
    idx_to_uid,
    mid_to_idx,
    idx_to_mid,
) = hp.df_to_matrix(ratings, 'user_id', 'book_id')

In [None]:
# Sanity check: display the sparse matrix summary (shape, dtype, number of stored entries).
user_items

<2000x1699 sparse matrix of type '<class 'numpy.float64'>'
	with 68247 stored elements in Compressed Sparse Row format>

In [None]:
# Persist the interaction matrix in Matrix Market format; this path is later
# handed to buffalo through data_opt.input.main.
mmwrite('./main.mtx', user_items)

In [None]:
# Flatten the index -> id lookups into plain lists of user ids and book ids.
# NOTE(review): presumably position k corresponds to matrix row/column k — confirm in helper.df_to_matrix.
uid = [*idx_to_uid.values()]
iid = [*idx_to_mid.values()]

In [None]:
# Write the user ids and the book ids to ./uid and ./iid, one id per line.
# These files are later referenced by data_opt.input.uid / data_opt.input.iid.
for id_path, id_values in (("./uid", uid), ("./iid", iid)):
    with open(id_path, "w") as f:
        f.writelines(str(val) + "\n" for val in id_values)

In [None]:
# Start from buffalo's default ALS options and override only what this run changes.
opt = ALSOption().get_default_option()

# Model hyperparameters for this run.
opt.d = 11
opt.alpha = 3
opt.reg_u = 0.192
opt.reg_i = 0.56638
opt.num_workers = 4

# Validation settings: metrics are computed on the held-out split with top-10 lists.
opt.evaluation_on_learning = True
opt.evaluation_period = 10
opt.validation = aux.Option({'topk': 10})

# Model persistence.
opt.save_best = True
opt.model_path = './buffalo_mf.model'

In [None]:
# Describe the Matrix Market dataset to buffalo, starting from its default options.
data_opt = MatrixMarketOptions().get_default_option()

# Hold out 10% of the interactions for validation, capped at 10000 samples.
data_opt.data.validation.p = 0.1
data_opt.data.validation.max_samples = 10000

# Files produced by the earlier cells: the matrix and the two id lists.
data_opt.input.main = './main.mtx'
data_opt.input.uid = './uid'
data_opt.input.iid = './iid'

In [None]:
# Load the dataset described by data_opt and build buffalo's working database from
# the Matrix Market file (the log below shows the row-wise and column-wise
# compressed triplets being created).
data = buffalo.data.load(data_opt)
data.create()

[INFO    ] 2020-08-22 13:08:21 [mm.py:192] Create the database from matrix market file.
[INFO    ] 2020-08-22 13:08:21 [mm.py:205] Creating working data...
[PROGRESS] 100.00% 1.4/1.4secs 1,445,662.36it/s
[INFO    ] 2020-08-22 13:08:24 [mm.py:210] Building data part...
[INFO    ] 2020-08-22 13:08:24 [base.py:402] Building compressed triplets for rowwise...
[INFO    ] 2020-08-22 13:08:24 [base.py:403] Preprocessing...
[INFO    ] 2020-08-22 13:08:24 [base.py:406] In-memory Compressing ...
[INFO    ] 2020-08-22 13:08:24 [base.py:289] Load triplet files. Total job files: 5
[INFO    ] 2020-08-22 13:08:24 [base.py:436] Finished
[INFO    ] 2020-08-22 13:08:24 [base.py:402] Building compressed triplets for colwise...
[INFO    ] 2020-08-22 13:08:24 [base.py:403] Preprocessing...
[INFO    ] 2020-08-22 13:08:24 [base.py:406] In-memory Compressing ...
[INFO    ] 2020-08-22 13:08:24 [base.py:289] Load triplet files. Total job files: 5
[INFO    ] 2020-08-22 13:08:24 [base.py:436] Finished
[INFO    ] 

In [None]:
# Train ALS on the dataset object created in the previous cell.
model = ALS(opt, data=data)
# Alternative kept for reference: hand ALS the data options instead of a ready
# dataset (presumably it then loads the data itself — confirm in buffalo docs).
#model = ALS(opt, data_opt=data_opt)
model.initialize()
# train() returns the metrics dict shown below (train_loss, val_ndcg, val_map, ...).
val_res = model.train()
val_res

[INFO    ] 2020-08-22 13:08:27 [als.py:57] ALS({
  "evaluation_on_learning": true,
  "compute_loss_on_training": true,
  "early_stopping_rounds": 0,
  "save_best": true,
  "evaluation_period": 10,
  "save_period": 10,
  "random_seed": 0,
  "validation": {
    "topk": 10
  },
  "adaptive_reg": false,
  "save_factors": false,
  "accelerator": false,
  "d": 11,
  "num_iters": 10,
  "num_workers": 4,
  "hyper_threads": 256,
  "num_cg_max_iters": 3,
  "reg_u": 0.192,
  "reg_i": 0.56638,
  "alpha": 3,
  "optimizer": "manual_cg",
  "cg_tolerance": 1e-10,
  "eps": 1e-10,
  "model_path": "./buffalo_mf.model",
  "data_opt": {}
})
[INFO    ] 2020-08-22 13:08:27 [als.py:59] MatrixMarket Header(2000, 1699, 61423) Validation(6824 samples)
[INFO    ] 2020-08-22 13:08:27 [buffered_data.py:71] Set data buffer size as 67108864(minimum required batch size is 435).
[INFO    ] 2020-08-22 13:08:27 [als.py:187] Iteration 1: RMSE 0.252 Elapsed 0.017 secs
[INFO    ] 2020-08-22 13:08:27 [als.py:187] Iteration 2

{'train_loss': 0.19876532106499217,
 'val_accuracy': 0.13730390981050614,
 'val_auc': 0.5659161848347642,
 'val_error': 0.6048220755993132,
 'val_map': 0.0566941451260461,
 'val_ndcg': 0.10423410779241579,
 'val_rmse': 0.6298395084065116}

In [None]:
def select_candi(df, topk=100):
  """Recommend `topk` unseen books for every user in `df`.

  For each row, the books the user has already seen are removed from the global
  item list `iid`, and the global `model` is asked for its top-k recommendations
  restricted to the remaining pool.

  Args:
    df: DataFrame with a 'user_id' column and a 'books' column, where 'books'
      holds an iterable of the book ids the user has already seen.
    topk: number of recommendations requested per user (defaults to 100).

  Returns:
    DataFrame with columns ['user_id', 'books'], one row per user that could be
    served. Users for which building the pool or querying the model raises
    TypeError/ValueError (e.g. a missing, non-iterable 'books' value) are left
    out, and a warning reports how many were skipped.
  """
  import warnings  # local import keeps this cell independent of the import cell

  records = []
  skipped = 0

  # Iterate positionally: `df['user_id'][i]` is label-based and fails on a
  # DataFrame whose index is not 0..n-1.
  for user_id, seen in zip(df['user_id'], df['books']):
    try:
      # Set-based filtering is linear and tolerates duplicated or unknown seen
      # books; list.remove() raised ValueError for those and dropped the user.
      seen_books = set(seen)
      pool = [book for book in iid if book not in seen_books]
      recomm_books = model.topk_recommendation(user_id, topk=topk, pool=pool)
    except (TypeError, ValueError):
      # Best effort: skip this user but keep count so the loss is visible.
      skipped += 1
      continue

    records.append({'user_id': user_id, 'books': recomm_books})

  if skipped:
    warnings.warn(f"select_candi: skipped {skipped} of {len(df)} users")

  # Build the frame once (DataFrame.append was removed in pandas 2.0); fixing the
  # columns keeps them available even when no user could be served.
  return pd.DataFrame(records, columns=['user_id', 'books'])

In [None]:
# Generate candidates for the validation users; reset_index() re-numbers the rows
# and keeps the previous index as an extra 'index' column.
result = select_candi(valid).reset_index()

In [None]:
class Evaluator:
    """Scores per-user book recommendations against a ground-truth JSON file.

    The metric is nDCG over the first 100 recommendations of each user, using
    natural-log discounts (1 / ln(rank + 2)), averaged over all recommended users.
    `hit_cnt` holds the number of recommended books found in the ground truth
    during the most recent evaluation.
    """

    # Number of recommendations each user must have and that are scored.
    _TOPK = 100

    def __init__(self):
        # Ideal DCG for 0.._TOPK relevant items, computed once up front.
        self._idcgs = [self._idcg(i) for i in range(self._TOPK + 1)]
        self.hit_cnt = 0

    def load_json(self, fname):
        """Return the parsed content of the JSON file `fname` (read as UTF-8)."""
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(fname, encoding="utf-8") as f:
            return json.load(f)

    def _idcg(self, l):
        """Return the ideal DCG when the first `l` ranks are all hits."""
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def _ndcg(self, gt, rec):
        """Return the nDCG of the ranked list `rec` against the relevant ids `gt`.

        Every hit also increments `self.hit_cnt`. An empty `gt` scores 0.0
        instead of dividing by zero.
        """
        if len(gt) == 0:
            return 0.0

        relevant = set(gt)  # constant-time membership instead of a list scan
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in relevant:
                dcg += 1.0 / np.log(i + 2)
                self.hit_cnt += 1

        # At most _TOPK items are ever scored, so the ideal ranking cannot hold
        # more than _TOPK hits; capping also avoids indexing past the table.
        ideal_hits = min(len(gt), len(self._idcgs) - 1)
        return dcg / self._idcgs[ideal_hits]

    def _eval(self, gt_fname, rec_fname):
        """Return the mean nDCG of the recommendations.

        Args:
            gt_fname: path to a JSON list of objects with "user_id" and "books".
            rec_fname: despite its name, the recommendations themselves — a
                DataFrame with 'user_id' and 'books' columns.

        Raises:
            Exception: if the recommendations are empty, contain a user that is
                missing from the ground truth, or any user does not have exactly
                100 distinct books.
        """
        gt_playlists = self.load_json(gt_fname)
        gt_dict = {g["user_id"]: g for g in gt_playlists}
        rec_playlists = rec_fname

        # Start every evaluation from zero so repeated calls do not accumulate hits.
        self.hit_cnt = 0

        if len(rec_playlists) == 0:
            # "The recommendation result is empty."
            raise Exception("추천 결과가 비어 있습니다.")

        gt_ids = set(gt_dict)
        rec_ids = set(rec_playlists['user_id'])
        unknown_ids = rec_ids - gt_ids
        if unknown_ids:
            # "The recommendations contain user_ids missing from the answers: ..."
            raise Exception(
                f"정답에 없는 user_id가 추천 결과에 포함되어 있습니다: {list(unknown_ids)[:5]}"
            )

        for books in rec_playlists['books']:
            unique_count = len(set(books))
            if unique_count != self._TOPK:
                if unique_count != len(books):
                    raise Exception("중복된 책 추천은 허용되지 않습니다.")
                # No duplicates, so the list simply has the wrong length:
                # "The number of recommended books is not correct."
                raise Exception("추천 책 결과의 개수가 맞지 않습니다.")

        music_ndcg = 0.0
        for user_id, books in zip(rec_playlists['user_id'], rec_playlists['books']):
            gt = gt_dict[user_id]
            music_ndcg += self._ndcg(gt["books"], books[:self._TOPK])

        return music_ndcg / len(rec_playlists)

    def evaluate(self, gt_fname, rec_fname):
        """Print the hit count and nDCG; on any failure print the error instead."""
        try:
            music_ndcg = self._eval(gt_fname, rec_fname)
            print(f"Hit Count: {self.hit_cnt}")
            print(f"nDCG: {music_ndcg:.6}")
        except Exception as e:
            print(e)

In [None]:
# Score the generated candidates against the contest answer file.
# Note: the name `eval` shadows the Python builtin within this notebook.
eval = Evaluator()
eval.evaluate(
    gt_fname='/content/drive/My Drive/coc_contest/answer.json',
    rec_fname=result,
)

Hit Count: 7911
nDCG: 0.461417


In [None]:
# Hyperparameter tuning starts here.
# Search configuration: optimise val_ndcg over 100 trials, beginning with the
# current parameters, across the ranges listed under 'space'.
model.opt.optimize = aux.Option({
    'loss': 'val_ndcg',
    'max_trials': 100,
    'deployment': True,
    'start_with_default_parameters': True,
    'space': {
        'd': ['randint', ['d', 10, 128]],
        'reg_u': ['uniform', ['reg_u', 0.1, 1.0]],
        'reg_i': ['uniform', ['reg_i', 0.1, 1.0]],
        'alpha': ['randint', ['alpha', 1, 10]],
    },
})

In [None]:
log.set_log_level(log.INFO)
model.opt.model_path = './als.optimize.bin'

# The four tuned hyperparameters, in the order they are reported.
tuned = ('alpha', 'd', 'reg_u', 'reg_i')

# Parameters before the search.
print(json.dumps({name: getattr(model.opt, name) for name in tuned}, indent=2))

model.optimize()
optimization_res = model.get_optimization_data()
best_parameters = optimization_res['best_parameters']

# Metrics of the best trial, followed by the parameters that produced them.
print(json.dumps(optimization_res['best'], indent=2))
print(json.dumps({name: best_parameters[name] for name in tuned}, indent=2))

[optimizing... ] 0.00% 0.0/0.0secs 0.00it/s

{
  "alpha": 8,
  "d": 20,
  "reg_u": 0.1,
  "reg_i": 0.1
}


[INFO    ] 2020-08-22 12:57:48 [optimize.py:46] Starting with default parameter result: {'train_loss': 0.2212548154740863, 'val_ndcg': 0.09433559092761766, 'val_map': 0.04960845051823691, 'val_accuracy': 0.12407562892442987, 'val_auc': 0.5592812720585036, 'val_rmse': 0.43066229257398136, 'val_error': 0.4000703167227061, 'loss': -0.09433559092761766, 'status': 'ok'}
[INFO    ] 2020-08-22 12:57:48 [optimize.py:49] Saving model... to ./als.optimize.bin
[optimizing... ] 5.00% 5.5/109.7secs 0.91it/s[INFO    ] 2020-08-22 12:57:53 [optimize.py:73] Found new best parameters: {'alpha': 7, 'd': 17, 'reg_i': 0.7377830658891181, 'reg_u': 0.8833644901289732} @ iter 6
[INFO    ] 2020-08-22 12:57:53 [optimize.py:79] Saving model... to ./als.optimize.bin
[optimizing... ] 7.00% 7.5/107.3secs 0.93it/s[INFO    ] 2020-08-22 12:57:55 [optimize.py:73] Found new best parameters: {'alpha': 4, 'd': 11, 'reg_i': 0.19706435221403173, 'reg_u': 0.13713795249152017} @ iter 8
[INFO    ] 2020-08-22 12:57:55 [optimize

{
  "train_loss": 0.15900076481077818,
  "val_ndcg": 0.10471890828969825,
  "val_map": 0.054688712965393155,
  "val_accuracy": 0.13989485199078514,
  "val_auc": 0.5671964303653766,
  "val_rmse": 0.7539389711029305,
  "val_error": 0.7385173760712636,
  "loss": -0.10471890828969825,
  "status": "ok"
}
{
  "alpha": 1,
  "d": 11,
  "reg_u": 0.17301816622086028,
  "reg_i": 0.8622016557347852
}
