# ⛳ **Word2Vec을 이용한 추천 [Reference](https://arena.kakao.com/forum/topics/232)**

```
Music nDCG: 0.143337
Tag nDCG: 0.419873
Score: 0.184817
```

> 모든 item, id는 str으로 넣자.  

1. corpus = songs + tags로 두고 word2vec을 학습시킨다. (item2vec)
  * window size는 전체를 고려하도록 크게 잡는다.
2. `playlist embedding = 속한 song, tag embedding의 합`으로 embedding시킬 수 있다. 모두 계산하여 `WordEmbeddingsKeyedVectors`에 add해준다.

* 노래만 점수 낮은 이유 => int형 str형 섞여있었음 omg
* worker를 여러개로 하면 돌릴때마다 다르다. (worker가 thread려나?)

# 🧰 **Utils**

In [204]:
PYTHONHASHSEED = 0

In [None]:
import warnings

warnings.filterwarnings(action='ignore') 

In [1]:
import os,io
import json
import time
import pickle

from collections import Counter
from itertools import chain

from tqdm import tqdm
import numpy as np

from gensim.models import Word2Vec
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors

dir = '/content/drive/MyDrive/Melon-PL-Continuation/11월 수정중   카카오 아레나 (멜론 추천)/1206 word2vec'

In [2]:
def write_json(data, fname):
    def _conv(o):
        if isinstance(o, (np.int64, np.int32)):
            return int(o)
        raise TypeError

    with io.open(fname, "w", encoding="utf-8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)

def load_json(fname):
    with open(fname, encoding="utf-8") as f:
        json_obj = json.load(f)

    return json_obj

In [None]:
class ArenaEvaluator:
    def _idcg(self, l):
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        self._idcgs = [self._idcg(i) for i in range(101)]

    def _ndcg(self, gt, rec):
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in gt:
                dcg += 1.0 / np.log(i + 2)

        return dcg / self._idcgs[len(gt)]

    def _load_json(self,fname):
        with open(fname, encoding="utf-8") as f:
            json_obj = json.load(f)

        return json_obj

    def _eval(self, gt_fname, rec_fname):
        gt_playlists = self._load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = self._load_json(rec_fname)

        gt_ids = set([g["id"] for g in gt_playlists])
        rec_ids = set([r["id"] for r in rec_playlists])

        if gt_ids != rec_ids:
            raise Exception("결과의 플레이리스트 수가 올바르지 않습니다.")

        rec_song_counts = [len(p["songs"]) for p in rec_playlists]
        rec_tag_counts = [len(p["tags"]) for p in rec_playlists]

        if set(rec_song_counts) != set([100]):
            raise Exception("추천 곡 결과의 개수가 맞지 않습니다.")

        if set(rec_tag_counts) != set([10]):
            raise Exception("추천 태그 결과의 개수가 맞지 않습니다.")

        rec_unique_song_counts = [len(set(p["songs"])) for p in rec_playlists]
        rec_unique_tag_counts = [len(set(p["tags"])) for p in rec_playlists]

        if set(rec_unique_song_counts) != set([100]):
            raise Exception("한 플레이리스트에 중복된 곡 추천은 허용되지 않습니다.")

        if set(rec_unique_tag_counts) != set([10]):
            raise Exception("한 플레이리스트에 중복된 태그 추천은 허용되지 않습니다.")

        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])

        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score

    def evaluate(self, gt_fname, rec_fname):
        try:
            music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
            print(f"Music nDCG: {music_ndcg:.6}")
            print(f"Tag nDCG: {tag_ndcg:.6}")
            print(f"Score: {score:.6}")
        except Exception as e:
            print(e)

# 📚 **Base Results 생성하기**
baseline  결과를 저장해놓고 이용할 정보가 없는 경우 끌어와 쓴다.

In [60]:
class GenreExpPopular:

    def remove_seen(self, seen, recommend):
        res = []
        for item in recommend:
            if item not in seen:
                res.append(item)
                if len(res) == 100:
                    break
        return res

    def _song_mp_per_genre(self, song_meta, global_mp):

        # 1. train data를 기반으로, 장르별 노래 dict를 생성한다.
        res = {} # {'댄스' : [0번째 노래, 40번째 노래, ...], '팝' : [1번째 노래, 2번째 노래 , ...],...}

        for song_id, song_info in song_meta.items(): # id_to_song info인 dict에서 하나씩 꺼내서
            for genre in song_info['song_gn_gnr_basket']: # 해당 노래의 대분류를 하나씩 꺼내서
                res.setdefault(genre, []).append(song_id) # 기본값 주고 dict만들기

        # 2. 장르별로 노래들의 count의 상위만 추출해서 dict를 수정한다
        for genre, song_id_list in res.items(): # 대분류와 곡 list에서 하나씩 꺼내서
            frequency = Counter({song_id: global_mp.get(int(song_id), 0) for song_id in song_id_list})  #
            res[genre] = [k for k, v in frequency.most_common(200)]

        return res

    def most_popular(self,playlists, feature, topk_count):
        c = Counter()

        for plylist in playlists:
            c.update(plylist[feature])

        topk = c.most_common(topk_count)
        return c, [k for k, v in topk]

    def _generate_answers(self, song_meta_json, train, questions):
        song_meta = {int(song["id"]): song for song in song_meta_json} # id_to_노래정보 dictionary
        song_mp_counter, song_mp = self.most_popular(train, "songs", 200) # 가장 많이 등장한 노래 일단 200개 추출
        tag_mp_counter, tag_mp = self.most_popular(train, "tags", 100) # 가장 많이 등장한 태그 일단 100개 추출
        song_mp_per_genre = self._song_mp_per_genre(song_meta, song_mp_counter)

        answers = []
        for q in tqdm(questions):

            # step 1 : 가장 많이 등장한 장르 상위 k개 + softmax
            genre_counter = Counter()

            for song_id in q["songs"]: # 현재 query의 등장 장르 수 계산
                for genre in song_meta[song_id]["song_gn_gnr_basket"]:
                    genre_counter.update({genre: 1})

            most_popular_topk = genre_counter.most_common(10) # [ (k,v), (k,v) ,... ]

            if len(most_popular_topk) != 0:
                genres,scores = zip(*most_popular_topk)
                exp_scores = np.exp(scores)
                scores = exp_scores/exp_scores.sum()*100

                query_songs,cur_songs = q['songs'],[]

                for genre, score in zip(genres,scores):
                    recommend = self.remove_seen(query_songs, song_mp_per_genre[genre])[:int(score)+1]
                    cur_songs.extend(recommend)
                    query_songs.extend(recommend)

                cur_songs = cur_songs[:100]

            else:
                cur_songs = self.remove_seen(q["songs"], song_mp)[:100]

            answers.append({
                "id": q["id"],
                "songs": cur_songs,
                "tags": self.remove_seen(q["tags"], tag_mp)[:10]
            })

        return answers

    def run(self,result_fname, test = True):

        if test:
            train_fname = os.path.join(dir, 'train.json')
            question_fname = os.path.join(dir, 'val_questions.json')
        else:
            pass

        song_meta_fname ='/content/drive/My Drive/Melon-PL-Continuation/0802/train_split/song_meta.json'

        print("Loading song meta...")
        song_meta_json = load_json(song_meta_fname)

        print("Loading train file...")
        train_data = load_json(train_fname)

        print("Loading question file...")
        questions = load_json(question_fname)

        print("Writing answers...")
        self.answers = self._generate_answers(song_meta_json, train_data, questions)
        write_json(self.answers, os.path.join(dir,result_fname))
        print("DONE")

In [61]:
basemodel = GenreExpPopular()
basemodel.run('base_results_gep.json', test = True)

Loading song meta...
Loading train file...
Loading question file...
Writing answers...


100%|██████████| 23015/23015 [00:18<00:00, 1231.14it/s]


DONE


In [62]:
evaluator.evaluate(dir + "/val_answers.json",dir + "/base_results_gep.json")

Music nDCG: 0.0417793
Tag nDCG: 0.162134
Score: 0.0598324


# 🏃‍♀ **Train, Validation data 준비하기**

In [3]:
def get_data(test = True):
    if test:
        train_path = os.path.join(dir, 'train.json')
        val_path = os.path.join(dir, 'val_questions.json')
    else:
        pass

    base_path = os.path.join(dir, 'base_results_gep.json')

    train = load_json(train_path)
    val = load_json(val_path)
    base_res = load_json(base_path)

    return train, val, base_res

In [4]:
train, val, base_res = get_data(test = True)

# 🎶 **Playlist2Vec 정의**
* popular로만 전체를 채우게 되는 경우가 존재한다.
  * train에 아예 없거나 min_counts에 의해 몇개 노래/태그가 제거되고
  * val ply를 등록할 때, 원래 query song이 없거나 걸러진 노래/태그만 존재하여 등록이 되지 않고
  * sim ply를 찾지 못한다.

In [5]:
class Playlist2Vec:
    def __init__(self,train, val, results):
        self.train = train
        self.val = val
        self.results = {}
        for ply in results:
            self.results[ply['id']] = {'songs':list(map(str,ply['songs'])),'tags':ply['tags']}
        self.data = self.train + self.val

        print('*** Build Vocab ***')
        self.build_vocab()

    def build_vocab(self):
        self.id_to_songs = {}
        self.id_to_tags = {}
        self.corpus = []

        for ply in self.data:
            self.id_to_songs[str(ply['id'])] = [*map(str,ply['songs'])]
            self.id_to_tags[str(ply['id'])] = [*map(str,ply['tags'])]

            items = self.id_to_songs[str(ply['id'])] + self.id_to_tags[str(ply['id'])]
            if len(items) > 1:
                self.corpus.append(items)

        self.songs = set(chain.from_iterable(self.id_to_songs.values())) 
        self.tags = set(chain.from_iterable(self.id_to_tags.values())) 

        print("> Corpus :", len(self.corpus))
        print(f'> Songs + Tags = {len(self.songs)} + {len(self.tags)} = {len(self.songs) + len(self.tags)}')
        print("> Playlist Id Type :", type(list(self.id_to_songs.keys())[0]),type(list(self.id_to_tags.keys())[0]))

    def register_w2v(self, w2v_model):
        self.w2v_model = w2v_model
        self.p2v_model = WordEmbeddingsKeyedVectors(self.w2v_model.trainables.layer1_size)

    def train_w2v(self, min_count = 3, size = 128, window = 210, negative = 5, sg = 1, hs = 0, workers = 1):
        # workers = 1 ; for consistency
        self.w2v_model = Word2Vec(sentences = self.corpus, min_count= min_count , size = size , window = window, negative = negative , sg = sg, hs = hs, workers = workers)
        self.p2v_model = WordEmbeddingsKeyedVectors(self.w2v_model.trainables.layer1_size)

    def build_p2v(self):
        start = time.time()
        pids = []
        playlist_embedding = []

        # for pid in tqdm(self.id_to_songs.keys()):
        for pid in self.id_to_songs.keys():
            if len(self.id_to_songs[pid]) < 1:
                continue

            ply_embedding = 0

            for item in self.id_to_songs[pid] + self.id_to_tags[pid]:
                if self.w2v_model.wv.vocab.get(str(item)) is None:
                    # train data에 없었거나, w2v 학습 과정에서 min_counts에 걸러진 경우
                    continue

                ply_embedding += self.w2v_model.wv.get_vector(str(item))

            if type(ply_embedding) != int: # 한 번이라도 update 되었다면
                pids.append(str(pid)) # ! string !
                playlist_embedding.append(ply_embedding)

        self.p2v_model.add(pids,playlist_embedding)

        print(f'> running time : {time.time()-start:.3f}')
        print(f'> Register (ply update) : {len(pids)} / {len(self.id_to_songs)}')
        val_ids = set([str(p["id"]) for p in self.val])
        print(f'> Only {len( val_ids - set(pids) )} of validation set ( total : {len(val_ids)} ) can not find similar playlist in train set.')

    def remove_seen(self,seen, recommend, attr):
        res = []
        limit = [100,10][attr == 'tags']

        for item in recommend:
            if item not in seen and item not in res:
                res.append(item)
                if len(res) == limit:
                    break
        return res

    def build_answers(self):
        self.answers = []
        use_popular = 0
        lack_info = Counter()

        # for ply in tqdm(self.val):
        for ply in self.val:
            if self.p2v_model.vocab.get(str(ply['id'])) is not None:

                ply_candidates = self.p2v_model.most_similar(str(ply['id']), topn = 200)
                song_candidates = []
                tag_candidates = []

                for cid , _ in ply_candidates:
                    song_candidates.extend(self.id_to_songs[str(cid)])
                    tag_candidates.extend(self.id_to_tags[str(cid)])

                song_most_common = [song for song,_ in Counter(song_candidates).most_common()]
                tag_most_common = [tag for tag,_ in Counter(tag_candidates).most_common()]

                if len(song_most_common) < 100:
                    lack_info[len(song_most_common)] += 1
            else:
                use_popular += 1
                song_most_common = []
                tag_most_common = []

            song_rec = self.remove_seen(ply['songs'], song_most_common + self.results[ply['id']]['songs'], 'songs')
            tag_rec = self.remove_seen(ply['tags'], tag_most_common + self.results[ply['id']]['tags'], 'tags')

            assert sum([type(s) != str for s in song_rec]) == 0
            assert len(set(song_rec)) == 100
            assert len(set(tag_rec)) == 10

            self.answers.append({
                'id' : ply['id'],
                'songs' : list(map(int,song_rec)),
                'tags' : tag_rec
            })

        print('> use_all_popular :', use_popular)
        print('> lack_info :', sum(lack_info.values()))
        print('            :',lack_info)

## **Word2Vec 학습**
word : song, tag 정보

In [8]:
start = time.time()
w2v_model = Word2Vec(sentences = model.corpus, min_count= 3 , size = 128 , window = 210, negative = 5 , sg = 1, hs = 0, workers = 1)
print(f'> running time : {time.time()-start:.3f}')

> running time : 2803.283


In [None]:
# 학습 결과 살짝 확인해보기

print("힙합과 랩의 cosine 유사도는? :",model.w2v_model.wv.similarity('랩','힙합'))

힙합과 랩의 cosine 유사도는? : 0.9693299


In [10]:
# with open(os.path.join(dir,'w2v_128.pkl'), 'wb') as f:
#     pickle.dump(w2v_model,f)

## **Playlist2Vec build하기**
* w2v를 따로 학습시키지 않을거라면 `self.train_w2v()` 이용

In [12]:
model = Playlist2Vec(train, val, base_res)

*** Build Vocab ***
> Corpus : 112010
> Songs + Tags = 521583 + 23421 = 545004
> Playlist Id Type : <class 'str'> <class 'str'>


In [16]:
# 학습후 저장해놓은 w2v 모델 등록

with open(os.path.join(dir,'w2v_128.pkl'), 'rb') as f:
    w2v_model = pickle.load(f)

model.register_w2v(w2v_model)

In [17]:
model.build_p2v()

> running time : 13.401
> Register (ply update) : 91930 / 96639
> Only 4697 of validation set ( total : 23015 ) can not find similar playlist in train set.


In [18]:
model.build_answers()

  if np.issubdtype(vec.dtype, np.int):


> use_all_popular : 4697
> lack_info : 64
            : Counter({10: 34, 11: 30})


# 🧐 **[실험] Hidden size와 acc**
Hidden size가 커져도 acc에는 유의한 차이가 없다고 판단됨! 128로 하자.

In [263]:
model2 = Playlist2Vec(train, val, base_res)

for size in [128,256,512]:

    start = time.time()
    w2v_model2 = Word2Vec(sentences = model2.corpus, min_count= 3 , size = size , window = 210, negative = 5 , sg = 1, hs = 0, workers = 1)
    print(f'> running time : {time.time()-start:.3f}')
    print("힙합과 랩의 cosine 유사도는? :",w2v_model2.wv.similarity('랩','힙합'))

    model2.register_w2v(w2v_model2)
    model2.build_p2v()
    model2.build_answers()

    write_json(model2.answers , f"results_tmp_{size}.json")
    evaluator.evaluate(dir + "/val_answers.json",f"results_tmp_{size}.json")

*** Build Vocab ***
> Corpus : 112010
> Songs + Tags = 521583 + 23421 = 545004
> Playlist Id Type : <class 'str'> <class 'str'>
> running time : 2943.168
힙합과 랩의 cosine 유사도는? : 0.96212125
> running time : 12.201
> Register (ply update) : 91930 / 96639
> Only 4697 of validation set ( total : 23015 ) can not find similar playlist in train set.


  if np.issubdtype(vec.dtype, np.int):


> use_all_popular : 4697
> lack_info : 64
            : Counter({10: 35, 11: 29})
Music nDCG: 0.143271
Tag nDCG: 0.420335
Score: 0.18483
> running time : 3761.576
힙합과 랩의 cosine 유사도는? : 0.87137836
> running time : 13.108
> Register (ply update) : 91930 / 96639
> Only 4697 of validation set ( total : 23015 ) can not find similar playlist in train set.
> use_all_popular : 4697
> lack_info : 64
            : Counter({10: 34, 11: 29, 20: 1})
Music nDCG: 0.142963
Tag nDCG: 0.421411
Score: 0.18473
> running time : 4965.761
힙합과 랩의 cosine 유사도는? : 0.82157165
> running time : 13.440
> Register (ply update) : 91930 / 96639
> Only 4697 of validation set ( total : 23015 ) can not find similar playlist in train set.
> use_all_popular : 4697
> lack_info : 64
            : Counter({10: 32, 11: 28, 20: 4})
Music nDCG: 0.142701
Tag nDCG: 0.421699
Score: 0.18455


# 💯 **점수 확인**

In [21]:
evaluator = ArenaEvaluator()

In [None]:
write_json(model.answers , "results_mine.json")
evaluator.evaluate(dir + "/val_answers.json", "results_mine.json")

Music nDCG: 0.143337
Tag nDCG: 0.419873
Score: 0.184817
