In [24]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.utils import shuffle
import numpy as np

In [26]:
# Load the space-separated (uid, sid, cnt) triples. Everything is read as text
# so the ids are kept exactly as they appear in the file.
tr = pd.read_csv('./parsed/listen_count.txt', sep=' ', header=None, dtype=str)
tr.columns = ['uid', 'sid', 'cnt']
# The count column was read as text; make it numeric here (float32, the dtype
# the interaction matrix is built with) instead of leaving it as strings.
tr['cnt'] = tr['cnt'].astype(np.float32)

In [27]:
# Give every distinct user id / item id a dense 0-based index (in the order
# `unique()` returns them) and keep both lookup directions.
idx2uid = dict(enumerate(tr.uid.unique()))
idx2sid = dict(enumerate(tr.sid.unique()))
uid2idx = {raw_id: pos for (pos, raw_id) in idx2uid.items()}
sid2idx = {raw_id: pos for (pos, raw_id) in idx2sid.items()}

In [28]:
# Number of distinct users and distinct items seen in `tr`.
n_user = len(uid2idx)
n_item = len(sid2idx)

In [29]:
# Store the dense index of each row's user and item next to the raw ids.
# Using __getitem__ keeps the KeyError on an id that has no index.
tr['uidx'] = tr.uid.map(uid2idx.__getitem__)
tr['sidx'] = tr.sid.map(sid2idx.__getitem__)

In [59]:
# Dump the uid column, one id per line.
# NOTE(review): this writes tr.uid row by row, so an id is repeated once per
# row of `tr` rather than once per user — confirm that is what is wanted.
with open('./user_id.txt', 'w') as uid_file:
    uid_file.write('\n'.join(tr.uid.tolist()) + '\n')

In [30]:
# For each uidx in `val`, gather its sidx values into a list; reset_index()
# turns the grouped result back into a frame with `uidx` and `sidx` columns.
# NOTE(review): `val` is not defined in any cell visible in this notebook, so
# this line raises NameError on a fresh kernel unless `val` is created
# elsewhere — TODO confirm where it comes from, or drop this cell.
# `vali_list` is also not read by any later cell in view.
vali_list = val.groupby('uidx').sidx.apply(list).reset_index()

In [31]:
# Sparse (n_user x n_item) matrix holding `cnt` at position (uidx, sidx).
X = csr_matrix(
    (tr.cnt, (tr.uidx, tr.sidx)),
    dtype=np.float32,
    shape=(n_user, n_item),
)

In [32]:
# Dampen the stored values in place: each stored value v becomes 1 + log(1 + v).
# The cell overwrites X.data, so running it a second time transforms the
# already-transformed values again.
X.data[:] = np.log(X.data + 1.0) + 1.0

In [33]:
def gen_top_reco(X, lamb=10.0, k=100):
    """Rank items for every user with a closed-form item-item linear model.

    The item-item weights are obtained from the inverse of the regularised
    Gram matrix ``X.T @ X + lamb * I``; user scores are ``X @ B``.

    Args:
        X: sparse user-by-item interaction matrix (rows are users, columns
            are items). It must support ``.T``, ``@`` and ``.astype``.
        lamb: value added to the diagonal of the Gram matrix before inversion.
        k: number of item indices to return per user. Defaults to 100, the
            value that used to be hard-coded.

    Returns:
        Integer array of shape ``(n_users, min(k, n_items))``. Row ``u`` lists
        item column indices in order of decreasing score for user ``u``.

    Raises:
        ValueError: if ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative, got %r" % (k,))

    # Dense item-item Gram matrix; this is n_items x n_items in memory.
    G = (X.T @ X).toarray()
    G[np.diag_indices(G.shape[0])] += lamb
    P = np.linalg.inv(G)

    # Column j of P is divided by -P[j, j]. The diagonal of B therefore ends
    # up as -1 instead of 0; that only shifts scores of items the user already
    # has a non-zero entry for, and those are pushed down by the penalty below.
    B = P / -np.diag(P)

    scores = X @ B
    # Subtract a large constant wherever X has a stored non-zero value so that
    # items a user already interacted with sort after the unseen ones.
    seen_penalty = X.astype(bool).astype(int) * 10000
    scores = np.asarray(scores - seen_penalty)

    # Full descending sort per row, then keep the first k columns.
    return (-scores).argsort(-1)[:, :k]

In [44]:
# Compute the recommendation index matrix, passing 30.0 as `lamb` instead of
# the function's default of 10.0.
top_reco = gen_top_reco(X, 30.0)

In [45]:
# One output line per row of top_reco: the raw user id followed by the raw ids
# of the recommended items, all separated by single spaces.
ret = [
    "%s " % idx2uid[user_idx]
    + ' '.join(str(idx2sid[item_idx]) for item_idx in item_idxs)
    for user_idx, item_idxs in enumerate(top_reco)
]

In [46]:
# Persist the recommendation lines, one per line of the output file.
with open('./parsed/rec_result.txt', 'w') as out_file:
    out_file.writelines(line + '\n' for line in ret)

In [48]:
def load_res(fname):
    """Read a whitespace-separated "<uid> <sid> <sid> ..." file into a dict.

    Args:
        fname: path of the text file to read.

    Returns:
        dict mapping the first token of each line to the list of remaining
        tokens on that line (possibly empty). If the same first token occurs
        on several lines, the last line wins.
    """
    ret = {}
    with open(fname, 'r') as f:
        for line in f:
            fields = line.split()
            # Blank / whitespace-only lines have no first token; skip them
            # instead of failing with IndexError (e.g. on a trailing empty line).
            if not fields:
                continue
            ret[fields[0]] = fields[1:]
    return ret

In [49]:
# Read back the recommendation lists that were written to disk.
recs = load_res(fname='./parsed/rec_result.txt')

In [50]:
# Load the held-out test lists, parsed with the same line format as the
# recommendations.
gt = load_res(fname='./parsed/TEST_DATA.txt')

In [51]:
import math
def ndcg(recs, gt):
    """Mean binary-relevance NDCG over the users that can be scored.

    Args:
        recs: dict mapping user id -> ranked list of recommended item ids.
        gt: dict mapping user id -> list of relevant (ground-truth) item ids.

    Returns:
        Average of DCG / IDCG over users in ``gt`` that have a non-empty
        recommendation list and a non-empty ground-truth list. Returns 0.0
        when no user qualifies.

    Notes:
        Users missing from ``recs`` are skipped, not scored as 0.
        The ideal DCG is built from ``len(gt[u])`` positions regardless of
        how long the recommendation list is.
    """
    total, n_scored = 0.0, 0
    for u, vs in gt.items():
        rec = recs.get(u, [])
        # Nothing to score without recommendations; with no relevant items
        # the ideal DCG is 0 and the ratio is undefined, so skip those too.
        if not rec or not vs:
            continue

        # Set gives O(1) membership checks; len(vs) is still used for the
        # ideal DCG so the value matches the list-based computation.
        relevant = set(vs)
        idcg = sum([1.0 / math.log(i + 2, 2) for i in range(len(vs))])

        dcg = 0.0
        for rank, r in enumerate(rec, start=1):
            if r in relevant:
                dcg += 1.0 / math.log(rank + 1, 2)

        total += dcg / idcg
        n_scored += 1

    # Avoid dividing by zero when no user could be scored.
    if n_scored == 0:
        return 0.0
    return total / n_scored


In [52]:
# Score the saved recommendations against the held-out lists; the value is
# shown as the cell output.
ndcg(recs=recs, gt=gt)

0.2559553152855044