# Matrix Factorization 라이브러리인 Implicit을 이용한 Jukebox 풀이

In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np
from implicit.als import AlternatingLeastSquares as ALS

In [2]:
# Load the space-separated listen-count log. Every column is read as text so
# that user and song ids keep their exact on-disk spelling.
tr = pd.read_csv('./parsed/listen_count.txt', sep=' ', header=None, dtype=str)
tr.columns = ['uid', 'sid', 'cnt']
# The counts were read as text; convert them to numbers explicitly so that
# later arithmetic does not rely on implicit string-to-float coercion.
tr['cnt'] = tr['cnt'].astype(np.float32)

## user/song id ↔︎ index mapper 생성

In [3]:
# Give every distinct user / song id a dense 0-based index, following the
# order in which `unique()` returns the ids.
uid2idx = {uid: pos for pos, uid in enumerate(tr['uid'].unique())}
sid2idx = {sid: pos for pos, sid in enumerate(tr['sid'].unique())}
# Reverse lookups (index -> id). Dicts iterate in insertion order, so the
# n-th key of each forward map is exactly the id that was assigned index n.
idx2uid = dict(enumerate(uid2idx))
idx2sid = dict(enumerate(sid2idx))

In [4]:
# Number of distinct users and distinct songs seen in the log.
n_users = len(uid2idx)
n_items = len(sid2idx)

In [5]:
# Translate the raw ids of every row into their dense indices.
tr['uidx'] = tr['uid'].map(uid2idx.__getitem__)
tr['sidx'] = tr['sid'].map(sid2idx.__getitem__)

In [6]:
# Dump the user ids, one per line, in index order (iterating the dict yields
# its keys in insertion order).
with open('./user_id.txt', 'w') as f:
    f.write('\n'.join(uid2idx) + '\n')

In [7]:
# Sparse user x song matrix of listen counts, stored as float32.
X = csr_matrix(
    (tr['cnt'], (tr['uidx'], tr['sidx'])),
    dtype=np.float32,
    shape=(n_users, n_items),
)
# Dampen heavy listeners in place: every stored count c becomes 1 + log(1 + c).
X.data[:] = np.log(X.data + 1.0) + 1.0

In [8]:
# Train an implicit-feedback ALS factorization on the weighted matrix.
# The first positional argument of the constructor is spelled out by name.
model = ALS(factors=64, regularization=8.0, alpha=16.0)
model.fit(X)

  0%|          | 0/15 [00:00<?, ?it/s]

In [9]:
# Ask for N=100 items for every user index 0..n_users-1 in a single call and
# unpack the returned pair into recommended item indices and their scores.
top_reco, scores = model.recommend(
    np.arange(n_users),
    X,
    N=100,
)

## 추천 생성 후 저장

In [10]:
# One output line per user: "<uid> <sid> <sid> ...", with the dense indices
# translated back to the original ids.
ret = [
    "%s " % idx2uid[row] + ' '.join(str(idx2sid[col]) for col in rec_row)
    for row, rec_row in enumerate(top_reco)
]
with open('./parsed/rec_result.txt', 'w') as f:
    f.writelines(line + '\n' for line in ret)

In [11]:
## Read the saved recommendation results and the test data back as dictionaries.
def load_res(fname):
    """Parse a whitespace-separated ``<uid> <sid> <sid> ...`` text file.

    Args:
        fname: Path of the text file to read.

    Returns:
        A dict mapping the first token of each line (the user id) to the list
        of the remaining tokens on that line, all kept as strings. A user id
        that appears on several lines keeps the tokens of its last line.
        Blank or whitespace-only lines are skipped.
    """
    ret = {}
    with open(fname, 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # A blank line has no user id; indexing it would raise
                # IndexError, so ignore it instead.
                continue
            uid, *sids = tokens
            ret[uid] = sids
    return ret

# Load both files with the same parser: first the recommendations that were
# saved to disk, then the test data they are compared against.
recs, gt = (
    load_res(path)
    for path in ('./parsed/rec_result.txt', './parsed/TEST_DATA.txt')
)

In [12]:
import math
def ndcg(recs, gt):
    """Return the mean NDCG of the recommendations over the users in ``gt``.

    For each user, every recommended item that appears in the user's
    ground-truth list contributes ``1 / log2(rank + 1)`` to the DCG (rank is
    1-based). The DCG is divided by the ideal DCG of a list holding
    ``len(ground truth)`` hits in the top positions.

    Args:
        recs: dict mapping user id -> ordered list of recommended item ids.
        gt: dict mapping user id -> list of ground-truth item ids.

    Returns:
        The average per-user NDCG as a float. Users with no recommendations
        or with an empty ground-truth list cannot be scored and are left out
        of the average; if no user can be scored the result is 0.0.
    """
    total, n_scored = 0.0, 0
    for u, vs in gt.items():
        rec = recs.get(u, [])
        if not rec or not vs:
            # No recommendations, or nothing to hit (ideal DCG would be 0).
            continue

        # Set membership is O(1); the ideal DCG still uses len(vs) so that
        # scores are unchanged for existing inputs.
        relevant = set(vs)
        idcg = sum(1.0 / math.log(i + 2, 2) for i in range(len(vs)))
        dcg = sum(
            1.0 / math.log(rank + 1, 2)
            for rank, r in enumerate(rec, start=1)
            if r in relevant
        )
        total += dcg / idcg
        n_scored += 1
    # Guard against an empty evaluation set instead of dividing by zero.
    return total / n_scored if n_scored else 0.0


In [13]:
# Score the loaded recommendations against the loaded test data; as the last
# expression of the cell, the returned value is what the notebook displays.
ndcg(recs, gt)

0.230165318680204