# ⛳ **Word2Vec을 이용한 추천 [Reference](https://arena.kakao.com/forum/topics/232)**

```
Music nDCG: 0.125168
Tag nDCG: 0.405524
Score: 0.167222
```

> 모든 item, id는 str으로 넣자.  

1. corpus = songs + tags로 두고 word2vec을 학습시킨다. (item2vec)
  * window size는 전체를 고려하도록 크게 잡는다.
2. `playlist embedding = 속한 song, tag embedding의 합`으로 embedding시킬 수 있다. 모두 계산하여 `WordEmbeddingsKeyedVectors`에 add해준다.

* 노래만 점수 낮은 이유 => int형 str형 섞여있었음 omg
* worker를 여러개로 하면 돌릴때마다 다르다. (worker가 thread려나?)

# **Utils**

In [None]:
# PYTHONHASHSEED = 0

In [293]:
import warnings
warnings.filterwarnings(action='ignore') # default : 다시 나오게

In [1]:
import os,io
import json
import time
import re

from collections import Counter
from itertools import chain

from tqdm import tqdm
import numpy as np

from gensim.models import Word2Vec
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors

dir = '/content/drive/MyDrive/Melon-PL-Continuation/11월 수정중   카카오 아레나 (멜론 추천)/1206 word2vec'

In [2]:
def write_json(data, fname):
    def _conv(o):
        if isinstance(o, (np.int64, np.int32)):
            return int(o)
        raise TypeError

    with io.open(fname, "w", encoding="utf-8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)

def load_json(fname):
    with open(fname, encoding="utf-8") as f:
        json_obj = json.load(f)

    return json_obj

# **Base Results 생성하기**
baseline  결과를 저장해놓고 이용할 정보가 없는 경우 끌어와 쓴다.

In [3]:
class GenreExpPopular:

    def remove_seen(self, seen, recommend):
        res = []
        for item in recommend:
            if item not in seen:
                res.append(item)
                if len(res) == 100:
                    break
        return res

    def _song_mp_per_genre(self, song_meta, global_mp):

        # 1. train data를 기반으로, 장르별 노래 dict를 생성한다.
        res = {} # {'댄스' : [0번째 노래, 40번째 노래, ...], '팝' : [1번째 노래, 2번째 노래 , ...],...}

        for song_id, song_info in song_meta.items(): # id_to_song info인 dict에서 하나씩 꺼내서
            for genre in song_info['song_gn_gnr_basket']: # 해당 노래의 대분류를 하나씩 꺼내서
                res.setdefault(genre, []).append(song_id) # 기본값 주고 dict만들기

        # 2. 장르별로 노래들의 count의 상위만 추출해서 dict를 수정한다
        for genre, song_id_list in res.items(): # 대분류와 곡 list에서 하나씩 꺼내서
            frequency = Counter({song_id: global_mp.get(int(song_id), 0) for song_id in song_id_list})  #
            res[genre] = [k for k, v in frequency.most_common(200)]

        return res

    def most_popular(self,playlists, feature, topk_count):
        c = Counter()

        for plylist in playlists:
            c.update(plylist[feature])

        topk = c.most_common(topk_count)
        return c, [k for k, v in topk]

    def _generate_answers(self, song_meta_json, train, questions):
        song_meta = {int(song["id"]): song for song in song_meta_json} # id_to_노래정보 dictionary
        song_mp_counter, song_mp = self.most_popular(train, "songs", 200) # 가장 많이 등장한 노래 일단 200개 추출
        tag_mp_counter, tag_mp = self.most_popular(train, "tags", 100) # 가장 많이 등장한 태그 일단 100개 추출
        song_mp_per_genre = self._song_mp_per_genre(song_meta, song_mp_counter)

        answers = []
        for q in tqdm(questions):

            # step 1 : 가장 많이 등장한 장르 상위 k개 + softmax
            genre_counter = Counter()

            for song_id in q["songs"]: # 현재 query의 등장 장르 수 계산
                for genre in song_meta[song_id]["song_gn_gnr_basket"]:
                    genre_counter.update({genre: 1})

            most_popular_topk = genre_counter.most_common(10) # [ (k,v), (k,v) ,... ]

            if len(most_popular_topk) != 0:
                genres,scores = zip(*most_popular_topk)
                exp_scores = np.exp(scores)
                scores = exp_scores/exp_scores.sum()*100

                query_songs,cur_songs = q['songs'],[]

                for genre, score in zip(genres,scores):
                    recommend = self.remove_seen(query_songs, song_mp_per_genre[genre])[:int(score)+1]
                    cur_songs.extend(recommend)
                    query_songs.extend(recommend)

                cur_songs = cur_songs[:100]

            else:
                cur_songs = self.remove_seen(q["songs"], song_mp)[:100]

            answers.append({
                "id": q["id"],
                "songs": cur_songs,
                "tags": self.remove_seen(q["tags"], tag_mp)[:10]
            })

        return answers

    def run(self,result_fname, test = True):

        if test:
            train_fname = os.path.join(dir, 'train.json')
            question_fname = os.path.join(dir, 'val_questions.json')
        else:
            pass

        song_meta_fname ='/content/drive/My Drive/Melon-PL-Continuation/0802/train_split/song_meta.json'

        print("Loading song meta...")
        song_meta_json = load_json(song_meta_fname)

        print("Loading train file...")
        train_data = load_json(train_fname)

        print("Loading question file...")
        questions = load_json(question_fname)

        print("Writing answers...")
        self.answers = self._generate_answers(song_meta_json, train_data, questions)
        write_json(self.answers, os.path.join(dir,result_fname))
        print("DONE")

In [None]:
basemodel = GenreExpPopular()
basemodel.run('base_results_gep.json', test = True)

Loading song meta...
Loading train file...
Loading question file...
Writing answers...


100%|██████████| 23015/23015 [00:18<00:00, 1231.14it/s]


DONE


In [None]:
evaluator.evaluate(dir + "/val_answers.json",dir + "/base_results_gep.json")

Music nDCG: 0.0417793
Tag nDCG: 0.162134
Score: 0.0598324


# **Train, Validation data 준비하기**

In [8]:
def get_data(test = True):
    if test:
        train_path = os.path.join(dir, 'train.json')
        val_path = os.path.join(dir, 'val_questions.json')
    else:
        pass

    base_path = os.path.join(dir, 'base_results_gep.json')

    train = load_json(train_path)
    val = load_json(val_path)
    base_res = load_json(base_path)

    return train, val, base_res

In [9]:
train, val, base_res = get_data(test = True)

# 🎶 **Playlist2Vec 정의**
* w2v은 따로 학습시켜놓는다.
* 다음 절차로 인해 popular로만 전체를 채우게 된다.
  * train에 아예 없거나 min_counts에 의해 몇개 노래/태그가 제거되고
  * val ply를 등록할 때, 원래 query song이 없거나 걸러진 노래/태그만 존재하여 등록이 되지 않고
  * sim ply를 찾지 못한다.

In [10]:
class Playlist2Vec:
    def __init__(self,train, val, results):
        self.train = train
        self.val = val
        self.results = {}
        for ply in results:
            self.results[ply['id']] = {'songs':list(map(str,ply['songs'])),'tags':ply['tags']}
        self.data = self.train + self.val

        print('*** Build Vocab ***')
        self.build_vocab()

    def build_vocab(self):
        self.id_to_songs = {}
        self.id_to_tags = {}
        self.corpus = []

        for ply in self.data:
            self.id_to_songs[str(ply['id'])] = [*map(str,ply['songs'])]
            self.id_to_tags[str(ply['id'])] = [*map(str,ply['tags'])]

            items = self.id_to_songs[str(ply['id'])] + self.id_to_tags[str(ply['id'])]
            if len(items) > 1:
                self.corpus.append(items)

        self.songs = set(chain.from_iterable(self.id_to_songs.values())) 
        self.tags = set(chain.from_iterable(self.id_to_tags.values())) 

        print("> Corpus :", len(self.corpus))
        print(f'> Songs + Tags = {len(self.songs)} + {len(self.tags)} = {len(self.songs) + len(self.tags)}')
        print("> Playlist Id Type :", type(list(self.id_to_songs.keys())[0]),type(list(self.id_to_tags.keys())[0]))

    def register_w2v(self, w2v_model):
        self.w2v_model = w2v_model
        self.p2v_model = WordEmbeddingsKeyedVectors(self.w2v_model.trainables.layer1_size)

    def train_w2v(self, min_count = 3, size = 128, window = 210, negative = 5, sg = 1, hs = 0, workers = 1):
        # workers = 1 ; for consistency
        self.w2v_model = Word2Vec(sentences = self.corpus, min_count= min_count , size = size , window = window, negative = negative , sg = sg, hs = hs, workers = workers)
        self.p2v_model = WordEmbeddingsKeyedVectors(self.w2v_model.trainables.layer1_size)

    def build_p2v(self):
        start = time.time()
        pids = []
        playlist_embedding = []

        # for pid in tqdm(self.id_to_songs.keys()):
        for pid in self.id_to_songs.keys():
            if len(self.id_to_songs[pid]) < 1:
                continue

            ply_embedding = 0

            for item in self.id_to_songs[pid] + self.id_to_tags[pid]:
                if self.w2v_model.wv.vocab.get(str(item)) is None:
                    # train data에 없었거나, w2v 학습 과정에서 min_counts에 걸러진 경우
                    continue

                ply_embedding += self.w2v_model.wv.get_vector(str(item))

            if type(ply_embedding) != int: # 한 번이라도 update 되었다면
                pids.append(str(pid)) # ! string !
                playlist_embedding.append(ply_embedding)

        self.p2v_model.add(pids,playlist_embedding)

        print(f'> running time : {time.time()-start:.3f}')
        print(f'> Register (ply update) : {len(pids)} / {len(self.id_to_songs)}')
        val_ids = set([str(p["id"]) for p in self.val])
        print(f'> Only {len( val_ids - set(pids) )} of validation set ( total : {len(val_ids)} ) can not find similar playlist in train set.')

    def remove_seen(self,seen, recommend, attr):
        res = []
        limit = [100,10][attr == 'tags']

        for item in recommend:
            if item not in seen and item not in res:
                res.append(item)
                if len(res) == limit:
                    break
        return res

    def build_answers(self):
        self.answers = []
        use_popular = 0
        lack_info = Counter()

        # for ply in tqdm(self.val):
        for ply in self.val:
            if self.p2v_model.vocab.get(str(ply['id'])) is not None:

                ply_candidates = self.p2v_model.most_similar(str(ply['id']), topn = 200)
                song_candidates = []
                tag_candidates = []

                for cid , _ in ply_candidates:
                    song_candidates.extend(self.id_to_songs[str(cid)])
                    tag_candidates.extend(self.id_to_tags[str(cid)])

                song_most_common = [song for song,_ in Counter(song_candidates).most_common()]
                tag_most_common = [tag for tag,_ in Counter(tag_candidates).most_common()]

                if len(song_most_common) < 100:
                    lack_info[len(song_most_common)] += 1
            else:
                use_popular += 1
                song_most_common = []
                tag_most_common = []

            song_rec = self.remove_seen(ply['songs'], song_most_common + self.results[ply['id']]['songs'], 'songs')
            tag_rec = self.remove_seen(ply['tags'], tag_most_common + self.results[ply['id']]['tags'], 'tags')

            assert sum([type(s) != str for s in song_rec]) == 0
            assert len(set(song_rec)) == 100
            assert len(set(tag_rec)) == 10

            self.answers.append({
                'id' : ply['id'],
                'songs' : list(map(int,song_rec)),
                'tags' : tag_rec
            })

        print('> use_all_popular :', use_popular)
        print('> lack_info :', sum(lack_info.values()))
        print('            :',lack_info)

In [11]:
model = Playlist2Vec(train, val, base_res)

*** Build Vocab ***
> Corpus : 112010
> Songs + Tags = 521583 + 23421 = 545004
> Playlist Id Type : <class 'str'> <class 'str'>


In [12]:
# model.register_w2v(w2v_model)
model.train_w2v(workers = 10)

## **songtag2vec 성능 체크**

In [13]:
# 학습 결과 살짝 확인해보기

v1 = model.w2v_model.wv.get_vector('힙합')
v2 = model.w2v_model.wv.get_vector('랩')
print("힙합과 랩의 cosine 유사도는? :",v1.dot(v2)/np.linalg.norm(v1)/np.linalg.norm(v2))

힙합과 랩의 cosine 유사도는? : 0.96159214


In [14]:
v1 = model.w2v_model.wv.get_vector('525514')
v2 = model.w2v_model.wv.get_vector('129701')
print("ply 0에 들어있던 두 노래의 유사도는? :",v1.dot(v2)/np.linalg.norm(v1)/np.linalg.norm(v2))

ply 0에 들어있던 두 노래의 유사도는? : 0.75479937


In [15]:
model.build_p2v()

> running time : 11.875
> Register (ply update) : 91930 / 96639
> Only 4697 of validation set ( total : 23015 ) can not find similar playlist in train set.


In [16]:
model.build_answers()

  if np.issubdtype(vec.dtype, np.int):


> use_all_popular : 4697
> lack_info : 64
            : Counter({10: 35, 11: 29})


In [20]:
write_json(model.answers , "results_mine.json")
evaluator.evaluate(dir + "/val_answers.json","results_mine.json")

Music nDCG: 0.143441
Tag nDCG: 0.419879
Score: 0.184907


# **method 살펴보기**
### 1. model.w2v_model.wv.**similar_by_word**('랩')
```python
[('힙합', 0.9615921378135681),
 ('HipHop', 0.7985354661941528),
 ('Rap', 0.7893173694610596),
 ('국내힙합', 0.7886797189712524)]
```
### 2. **most_similar** : sum(positive) - sum(negative)
p = ['아빠','여성'] , n = ['남성'] => 결과 :['엄마']
```python
[('우울', 0.8385971188545227),
 ('외로움', 0.8382763862609863),
 ('쓸쓸함', 0.8005260229110718),
 ('외로운', 0.8002836108207703)]
```

### 3. model.w2v_model.wv.**similarity**('랩','힙합')
```python
0.96xxx
```

### 4. model.w2v_model.wv.**similar_by_vector**
```python
v1 = model.w2v_model.wv.get_vector('랩')
v2 = model.w2v_model.wv.get_vector('힙합')

model.w2v_model.wv.similar_by_vector(v1+v2)
```
```python
[('랩', 0.9919853806495667),
 ('힙합', 0.988567054271698),
 ('국내힙합', 0.8164964318275452),
 ('HipHop', 0.8128796815872192),
 ('Rap', 0.800581157207489),
 ('국힙', 0.7753417491912842)]
```

In [52]:
tags = [k for k in model.w2v_model.wv.vocab.keys() if k.isalpha()]
print(len(tags))
print(tags[:30])

6746
['불쾌지수', '친구들과', '눈꽃', '추천합니다', '사랑스러운노래', '맥주펍', '피쳐링미침', 'CF음악', '월요병저리가', 'NAS', '록스피릿', '컬트무비', '힐림', '모아나', '나윤권', 'SHINEE', '두근거리는', '찰떡궁합', '부스터', '뮤지컬음악', '카페플레이리스트', 'ROMANTIC', '반려묘', '찰리XCX', '여름노래모음', '써머', '광주', '댄스힙합', '집에서즐기는', '슈크박스']


In [59]:
model.w2v_model.wv.most_similar(['쓸쓸'])

  if np.issubdtype(vec.dtype, np.int):


[('우울', 0.8182780742645264),
 ('외로움', 0.7527623772621155),
 ('눈물', 0.7503553032875061),
 ('헤어짐', 0.7471939921379089),
 ('쓸쓸한', 0.7417927384376526),
 ('쓸쓸함', 0.7417812347412109),
 ('센치', 0.7397032976150513),
 ('혼자', 0.7139500975608826),
 ('차분', 0.7002543210983276),
 ('그리움', 0.698684811592102)]

In [57]:
model.w2v_model.wv.most_similar(['쓸쓸','쓸쓸한'])

  if np.issubdtype(vec.dtype, np.int):


[('우울', 0.8385971188545227),
 ('외로움', 0.8382763862609863),
 ('쓸쓸함', 0.8005260229110718),
 ('외로운', 0.8002836108207703),
 ('헤어짐', 0.7965164184570312),
 ('혼자', 0.7933205366134644),
 ('눈물', 0.7925050258636475),
 ('센치', 0.7924936413764954),
 ('그리움', 0.7756912708282471),
 ('슬픈', 0.7600284218788147)]

In [58]:
model.w2v_model.wv.most_similar(['쓸쓸','쓸쓸한','쓸쓸함'])

  if np.issubdtype(vec.dtype, np.int):


[('외로움', 0.8746573328971863),
 ('우울', 0.8421176671981812),
 ('헤어짐', 0.8126764297485352),
 ('눈물', 0.8091583251953125),
 ('그리움', 0.8079662322998047),
 ('센치', 0.7993037104606628),
 ('외로운', 0.7963045239448547),
 ('혼자', 0.7916884422302246),
 ('우울한', 0.7630760669708252),
 ('슬픈', 0.7467443943023682)]

In [43]:
model.w2v_model.wv.most_similar(['비오는날','외로움'])

  if np.issubdtype(vec.dtype, np.int):


[('센치', 0.8679744005203247),
 ('비', 0.8412796258926392),
 ('우울', 0.8176730871200562),
 ('쓸쓸한', 0.7884265184402466),
 ('쓸쓸함', 0.7786596417427063),
 ('혼자', 0.7754300832748413),
 ('흐린날', 0.7744088172912598),
 ('쓸쓸', 0.7739285230636597),
 ('눈물', 0.759524941444397),
 ('헤어짐', 0.7594777345657349)]

In [45]:
model.w2v_model.wv.most_similar(['비'])

  if np.issubdtype(vec.dtype, np.int):


[('비오는날', 0.8555936813354492),
 ('장마', 0.8403699398040771),
 ('비올때', 0.8138465881347656),
 ('rain', 0.8061612844467163),
 ('우산', 0.7818849086761475),
 ('센치', 0.7765259742736816),
 ('흐린날', 0.768670916557312),
 ('빗소리', 0.7590976357460022),
 ('rainyday', 0.7486249208450317),
 ('비오는', 0.7432389259338379)]

### **뽑은 태그를 어떻게 넣을 것인가?**
1. 뽑은 그대로 모두 넣자. => `['비', '비오는', '비오는날', '듣고', '듣고싶은', '쓸쓸', '쓸쓸한', '발라드']`

2. 최대로 매칭시켜 넣자. `['비오는날','듣고싶은','쓸쓸한', '발라드']`

In [None]:
model.w2v_model.wv.most_similar(extracted)

  if np.issubdtype(vec.dtype, np.int):


[('비내리는날음악', 0.8404432535171509),
 ('비내리는날노래', 0.8255271315574646),
 ('비오는날음악', 0.8196240067481995),
 ('비가오는날', 0.8098689317703247),
 ('비_오는_날', 0.8053878545761108),
 ('비내릴때', 0.801423966884613),
 ('비도오고그래서', 0.7990456223487854),
 ('비올때', 0.798870325088501),
 ('rain', 0.7935051321983337),
 ('rainyday', 0.7922885417938232)]

In [None]:
model.w2v_model.wv.most_similar(['비', '비오는', '비오는날', '듣고', '쓸쓸', '쓸쓸한', '발라드'])

  if np.issubdtype(vec.dtype, np.int):


[('센치', 0.8220512866973877),
 ('우울', 0.8118997812271118),
 ('비내리는날음악', 0.8067814707756042),
 ('비올때', 0.7983118295669556),
 ('비내리는날노래', 0.7932559251785278),
 ('비내릴때', 0.7845121026039124),
 ('눈물', 0.7821574211120605),
 ('rainyday', 0.7817319631576538),
 ('헤어짐', 0.7795099020004272),
 ('센치함', 0.7786605358123779)]

In [None]:
model.w2v_model.wv.most_similar(['비', '비오는', '비오는날', '쓸쓸', '쓸쓸한', '발라드'])

  if np.issubdtype(vec.dtype, np.int):


[('센치', 0.816281795501709),
 ('우울', 0.8121845722198486),
 ('비내리는날음악', 0.7984887361526489),
 ('눈물', 0.7947648763656616),
 ('헤어짐', 0.7944736480712891),
 ('비올때', 0.790336549282074),
 ('rain', 0.7844325304031372),
 ('비내리는날노래', 0.7784298658370972),
 ('비내릴때', 0.7754480838775635),
 ('흐린날', 0.7752397656440735)]

In [None]:
model.w2v_model.wv.most_similar(['듣고싶은' ,'쓸쓸', '쓸쓸한', '발라드'])

  if np.issubdtype(vec.dtype, np.int):


[('눈물', 0.7937376499176025),
 ('그리움', 0.7879326343536377),
 ('헤어짐', 0.7823807001113892),
 ('외로운', 0.7649414539337158),
 ('슬픈', 0.7643162608146667),
 ('우울할', 0.7586601376533508),
 ('비내리는날음악', 0.7579798698425293),
 ('이별하다', 0.7559695243835449),
 ('이별후', 0.750156044960022),
 ('절절한', 0.7416282296180725)]

In [None]:
model.w2v_model.wv.most_similar(['여름'])

  if np.issubdtype(vec.dtype, np.int):


[('더위', 0.7738407254219055),
 ('휴가', 0.7679039239883423),
 ('시원한', 0.7590504884719849),
 ('여름노래', 0.7480428218841553),
 ('바캉스', 0.7433536052703857),
 ('해변', 0.7349833846092224),
 ('무더위', 0.7162644863128662),
 ('바다', 0.7126994132995605),
 ('시원', 0.7119068503379822),
 ('청량', 0.6945145130157471)]

In [None]:
model.w2v_model.wv.most_similar(['여름','듣고싶은'])

  if np.issubdtype(vec.dtype, np.int):


[('여름송', 0.7075084447860718),
 ('614183', 0.6763818264007568),
 ('470608', 0.6753440499305725),
 ('427195', 0.6748813390731812),
 ('327931', 0.6566426753997803),
 ('11200', 0.6534104347229004),
 ('시원한', 0.6529701352119446),
 ('205426', 0.6521424055099487),
 ('더워', 0.6474078297615051),
 ('여름노래모음', 0.6442521214485168)]

# **Trie**

In [269]:
class Node:
    def __init__(self, value):
        self.value = value
        self.children = {}
        self.is_terminal = False

class Trie:
    def __init__(self, items):
        self.head = Node(None)

        print("********* DB 구성중입니다 *********")
        for item in items:
            self.insert(item)
        print("************ 입력 완료 ************")

    def insert(self, query):
        curr_node = self.head

        for q in query:
            if curr_node.children.get(q) is None:
                curr_node.children[q] = Node(q)
            curr_node = curr_node.children[q]
        curr_node.is_terminal = query

    def extract(self, query, biggest_token = False):
        query += '*' # padding
        curr_node = self.head
        extracted_tags = []

        i = 0
        while i < len(query):
            curr_node = curr_node.children.get(query[i])

            if curr_node is None:
                if biggest_token and prev_node.is_terminal:
                    extracted_tags.append(prev_node.is_terminal)
                curr_node = self.head
                prev_node = self.head

                if self.head.children.get(query[i]) is None:
                    # query[i]로 시작하는 단어가 애초에 없으면
                    i += 1
            else:
                if curr_node.is_terminal:
                    if not biggest_token:
                        extracted_tags.append(curr_node.is_terminal)
                    prev_node = curr_node

                i += 1
 
        return list(set(extracted_tags))

In [None]:
trie = Trie(model.w2v_model.wv.vocab.keys())

********* DB 구성중입니다 *********
************ 입력 완료 ************


In [None]:
query = '비오는 날에 듣고 싶은 꿱 비오는날 쓸쓸한 발라드'
query = query.replace(' ','')
query

'비오는날에듣고싶은꿱비오는날쓸쓸한발라드'

In [None]:
extracted_all = trie.extract_all(query, False)
extracted_max = trie.extract_all(query, True)
print(extracted_all)
print(extracted_max)

['비오는날', '듣고싶은', '쓸쓸한', '듣고', '발라드', '쓸쓸', '비오는', '비']
['비오는날', '듣고싶은', '발라드', '쓸쓸한']


In [362]:
trie.extract_all('아이유 노래만 모아모아')

['아이유', '노래']

In [355]:
trie.extract_all('레드벨벳 노래만 모아모아')

['레드벨벳', '노래']

# **TitleBasedRecommender**

* 노래, 듣고싶은, 어울리는 등 정보가 없는 tag는 어떻게 거르지?

In [None]:
import pandas as pd

song_meta = pd.read_json('/content/drive/MyDrive/Melon-PL-Continuation/0802/train_split/song_meta.json', typ = 'frame')

In [None]:
model.w2v_model.wv.simi

In [547]:
from itertools import combinations
from collections import defaultdict

class TitleBasedRecommender:
    def __init__(self, w2v_model):
        self.w2v_model = w2v_model
        self.trie = Trie(self.w2v_model.wv.vocab.keys())

    def filter_tags(self, tags):
        if len(tags) <= 2:
            return tags

        pairs = []

        for tag1, tag2 in combinations(tags,2):
            sim = self.w2v_model.wv.similarity(tag1, tag2)
            if sim < 0.1:
                continue
            pairs.append((sim, (tag1, tag2)))

        print('> sims :',pairs)

        # 0.75 quantile
        M = max(sim for sim, _ in pairs)
        m = min(sim for sim, _ in pairs)
        threshold = m + 0.3*(M-m)

        print('> threshold :',threshold)

        # vote = defaultdict(int)
        res = []
        for sim, (tag1, tag2) in pairs:
            if sim < threshold:
                continue
            res.append(tag1)
            res.append(tag2)
        #     vote[tag1] += 1
        #     vote[tag2] += 1

        # print('> vote:',vote)
        # minimum_vote = 2 if len(vote) > 2 else 1

        # return [tag for tag, count in vote.items() if count >= minimum_vote]
        return list(set(res))

    def title_based_recommend(self, title, topk , biggest_token = True):
        title = "".join(re.findall('\w', title))
        print('> raw title :', title)
        extracted_tags = self.trie.extract(title,biggest_token)
        print('> Extracted tags :', extracted_tags)
        extracted_tags = self.filter_tags(extracted_tags)
        print("> Filtered tags :",extracted_tags)

        candidates = model.w2v_model.wv.most_similar(extracted_tags,topn = topk*20)
        print('> Similar items :',[(item, round(sim,3)) for item,sim in candidates[:10]])
        songs = [int(item) for item,sim in candidates if item.isdigit()]
        if len(songs) < topk:
            print(f'[Warning] Less than {topk}')
        
        songs = songs[:topk]

        display(song_meta.iloc[songs])

        return songs[:topk]


In [548]:
rec = TitleBasedRecommender(model.w2v_model)

********* DB 구성중입니다 *********
************ 입력 완료 ************


## **추출한 태그 중에서 유용한 태그만 쓰자**
1. tag가 2개 이하면 그대로 쓴다.
* 레드벨벳+노래 / 아이유+노래 => 부적절한 결과가 나옴. 두개일때도 거를 필요가 있다.
* 듣기좋은, 좋은, 좋다 등 역시 걸러내야할듯.
2. combination을 이용해 모든 pair의 similarity를 계산한다.
* 0.1 미만인 경우 제외한다.
3. 0.3 qunatile 이상인 단어만 선택한다.
4. 선택된 단어가 2개 이하인 경우 return한다. 만약 3개 이상인 경우 자기자신과 유사도가 높은 태그가 2개 이상인 경우만 골라 return한다.

## **Vote 적용**
버리고 싶은게 여러개가 생길 때 걸러주려고 만든건데 어짜피 버리고 싶은 애들은 voting 결과도 높은듯..ㅠㅠ

In [489]:
# '듣기좋은'이 제거된다.

rec.filter_tags(['비오는날','듣기좋은','발라드'])

> sims : [(0.27830976, ('비오는날', '듣기좋은')), (0.35439962, ('비오는날', '발라드')), (0.26695967, ('듣기좋은', '발라드'))]
> threshold : 0.29319165349006654
> vote: defaultdict(<class 'int'>, {'비오는날': 1, '발라드': 1})


['비오는날', '발라드']

**듣기좋은을 버리려다가 vote 때문에 다 버렸다 ㅠㅠ**

In [511]:
rec.filter_tags(['비오는날','듣기좋은','발라드','쓸쓸한'])

> sims : [(0.27830976, ('비오는날', '듣기좋은')), (0.35439962, ('비오는날', '발라드')), (0.5973489, ('비오는날', '쓸쓸한')), (0.26695967, ('듣기좋은', '발라드')), (0.46125117, ('듣기좋은', '쓸쓸한')), (0.51886594, ('발라드', '쓸쓸한'))]
> threshold : 0.3660764455795288
> vote: defaultdict(<class 'int'>, {'비오는날': 1, '쓸쓸한': 3, '듣기좋은': 1, '발라드': 1})


['쓸쓸한']

**어울리는의 vote가 가장 높다.**

In [None]:
# 모두 조건을 만족하므로 그대로 return한다.

rec.filter_tags(['비오는날','어울리는','감성','발라드','듣기좋은'])

> sims : [(0.20181474, ('비오는날', '어울리는')), (0.5378175, ('비오는날', '감성')), (0.35439962, ('비오는날', '발라드')), (0.27830976, ('비오는날', '듣기좋은')), (0.34657863, ('어울리는', '감성')), (0.37699884, ('어울리는', '발라드')), (0.4067713, ('어울리는', '듣기좋은')), (0.45224562, ('감성', '발라드')), (0.5519384, ('감성', '듣기좋은')), (0.26695967, ('발라드', '듣기좋은'))]
> threshold : 0.30685184299945834
> vote: defaultdict(<class 'int'>, {'비오는날': 2, '감성': 4, '발라드': 3, '어울리는': 3, '듣기좋은': 2})


['비오는날', '감성', '발라드', '어울리는', '듣기좋은']

In [490]:
# '아이' tag가 제거된다.

rec.filter_tags(['아이','아이유','노래','듣기좋은'])

> sims : [(0.14882997, ('아이', '아이유')), (0.20757104, ('아이', '듣기좋은')), (0.33139578, ('아이유', '노래')), (0.28083703, ('아이유', '듣기좋은')), (0.39229074, ('노래', '듣기좋은'))]
> threshold : 0.22186819911003114
> vote: defaultdict(<class 'int'>, {'아이유': 2, '노래': 2, '듣기좋은': 2})


['아이유', '노래', '듣기좋은']

In [510]:
# 차라리 그룹을 나누는게 낫나?
print(rec.filter_tags(['알앤비','크러쉬','아이유','걸그룹'])) # => 힙합,크러쉬 / 아이유,걸그룹으로 나누어 추천할 수 있도록?
print()
print(rec.filter_tags(['알앤비','크러쉬','아이유'])) # 크러쉬랑 힙합은 낮구나 생각보다?
print()
print(rec.filter_tags(['힙합','크러쉬','아이유'])) # 힙합,크러쉬가 더 적절하지 않나? 힙합,아이유를 sim < 0.1로 제거해버려서 이런듯 ㅠㅠ

> sims : [(0.31463957, ('알앤비', '크러쉬')), (0.28994042, ('알앤비', '걸그룹')), (0.30506712, ('크러쉬', '아이유')), (0.24227764, ('크러쉬', '걸그룹')), (0.32081372, ('아이유', '걸그룹'))]
> threshold : 0.265838460624218
> vote: defaultdict(<class 'int'>, {'알앤비': 2, '크러쉬': 2, '걸그룹': 2, '아이유': 2})
['알앤비', '크러쉬', '걸그룹', '아이유']

> sims : [(0.31463957, ('알앤비', '크러쉬')), (0.30506712, ('크러쉬', '아이유'))]
> threshold : 0.30793885588645936
> vote: defaultdict(<class 'int'>, {'알앤비': 1, '크러쉬': 1})
['알앤비', '크러쉬']

> sims : [(0.2191701, ('힙합', '크러쉬')), (0.30506712, ('크러쉬', '아이유'))]
> threshold : 0.2449392020702362
> vote: defaultdict(<class 'int'>, {'크러쉬': 1, '아이유': 1})
['크러쉬', '아이유']


## **Vote 안할래!**

In [549]:
# '듣기좋은'이 제거된다.

rec.filter_tags(['비오는날','듣기좋은','발라드'])

> sims : [(0.27830976, ('비오는날', '듣기좋은')), (0.35439962, ('비오는날', '발라드')), (0.26695967, ('듣기좋은', '발라드'))]
> threshold : 0.29319165349006654


['비오는날', '발라드']

In [550]:
rec.filter_tags(['비오는날','듣기좋은','발라드','쓸쓸한'])

> sims : [(0.27830976, ('비오는날', '듣기좋은')), (0.35439962, ('비오는날', '발라드')), (0.5973489, ('비오는날', '쓸쓸한')), (0.26695967, ('듣기좋은', '발라드')), (0.46125117, ('듣기좋은', '쓸쓸한')), (0.51886594, ('발라드', '쓸쓸한'))]
> threshold : 0.3660764455795288


['듣기좋은', '비오는날', '발라드', '쓸쓸한']

In [551]:
# 모두 조건을 만족하므로 그대로 return한다.

rec.filter_tags(['비오는날','어울리는','감성','발라드','듣기좋은'])

> sims : [(0.20181474, ('비오는날', '어울리는')), (0.5378175, ('비오는날', '감성')), (0.35439962, ('비오는날', '발라드')), (0.27830976, ('비오는날', '듣기좋은')), (0.34657863, ('어울리는', '감성')), (0.37699884, ('어울리는', '발라드')), (0.4067713, ('어울리는', '듣기좋은')), (0.45224562, ('감성', '발라드')), (0.5519384, ('감성', '듣기좋은')), (0.26695967, ('발라드', '듣기좋은'))]
> threshold : 0.30685184299945834


['비오는날', '감성', '듣기좋은', '어울리는', '발라드']

In [552]:
# '아이' tag가 제거된다.

rec.filter_tags(['아이','아이유','노래','듣기좋은'])

> sims : [(0.14882997, ('아이', '아이유')), (0.20757104, ('아이', '듣기좋은')), (0.33139578, ('아이유', '노래')), (0.28083703, ('아이유', '듣기좋은')), (0.39229074, ('노래', '듣기좋은'))]
> threshold : 0.22186819911003114


['아이유', '노래', '듣기좋은']

In [553]:
# 여기선 아이가 제거 안되지만 ... biggest = True로 할거니까 일단 pass하자

rec.filter_tags(['아이','아이유','노래','듣기좋은'])

> sims : [(0.14882997, ('아이', '아이유')), (0.20757104, ('아이', '듣기좋은')), (0.33139578, ('아이유', '노래')), (0.28083703, ('아이유', '듣기좋은')), (0.39229074, ('노래', '듣기좋은'))]
> threshold : 0.22186819911003114


['아이유', '노래', '듣기좋은']

In [554]:
# 차라리 그룹을 나누는게 낫나?
print(rec.filter_tags(['알앤비','크러쉬','아이유','걸그룹'])) # => 힙합,크러쉬 / 아이유,걸그룹으로 나누어 추천할 수 있도록?
print()
print(rec.filter_tags(['알앤비','크러쉬','아이유'])) # 크러쉬랑 힙합은 낮구나 생각보다?
print()
print(rec.filter_tags(['힙합','크러쉬','아이유'])) # 힙합,크러쉬가 더 적절하지 않나? 힙합,아이유를 sim < 0.1로 제거해버려서 이런듯 ㅠㅠ

> sims : [(0.31463957, ('알앤비', '크러쉬')), (0.28994042, ('알앤비', '걸그룹')), (0.30506712, ('크러쉬', '아이유')), (0.24227764, ('크러쉬', '걸그룹')), (0.32081372, ('아이유', '걸그룹'))]
> threshold : 0.265838460624218
['알앤비', '크러쉬', '걸그룹', '아이유']

> sims : [(0.31463957, ('알앤비', '크러쉬')), (0.30506712, ('크러쉬', '아이유'))]
> threshold : 0.30793885588645936
['알앤비', '크러쉬']

> sims : [(0.2191701, ('힙합', '크러쉬')), (0.30506712, ('크러쉬', '아이유'))]
> threshold : 0.2449392020702362
['아이유', '크러쉬']


## **결과 체쿠체쿠**

In [492]:
# 모든 태그를 이용해서

songs = rec.title_based_recommend('비오는 날에 듣고 싶은 ~~~! 쓸쓸한 발라드',10)

> raw title : 비오는날에듣고싶은쓸쓸한발라드
> Extracted tags : ['비오는날', '듣고싶은', '발라드', '쓸쓸한']
> sims : [(0.27691364, ('비오는날', '듣고싶은')), (0.35439962, ('비오는날', '발라드')), (0.5973489, ('비오는날', '쓸쓸한')), (0.25197995, ('듣고싶은', '발라드')), (0.24387904, ('듣고싶은', '쓸쓸한')), (0.51886594, ('발라드', '쓸쓸한'))]
> threshold : 0.3499200075864792
> vote: defaultdict(<class 'int'>, {'비오는날': 2, '발라드': 2, '쓸쓸한': 2})
> Filtered tags : ['비오는날', '발라드', '쓸쓸한']
> Similar items : [('눈물', 0.782), ('헤어짐', 0.778), ('그리움', 0.769), ('외로운', 0.767), ('비', 0.764), ('쓸쓸', 0.757), ('감성발라드', 0.757), ('이별', 0.754), ('비내리는날음악', 0.752), ('우울', 0.75)]


Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
440014,"[GN0501, GN0101, GN0506, GN0509]",20181204,틈,10228573,[790810],틈 (Vocal by J은),"[GN0500, GN0100]",[platz (플랏츠)],440014
120381,"[GN0501, GN0304, GN0505, GN0301]",20170428,City Of Mine,10058391,[426500],너의 그 말 (Feat. 김태현),"[GN0500, GN0300]",[블랙 스트라이크],120381
469475,"[GN0501, GN0101, GN0506, GN0509]",20160422,담담하게,2680897,"[967065, 967067]",담담하게,"[GN0500, GN0100]","[임슬기, 최강현]",469475
552037,[GN1801],20180807,비를 좋아하는 그대에게,10192167,[1816598],비를 좋아하는 그대에게,[GN1800],[김윤아],552037
352079,[GN0101],20191216,이제야 알았어,10364593,[2735253],이제야 알았어 (Feat. 유성현),[GN0100],[유나잇],352079
459600,"[GN0105, GN0101]",20170505,한동근 1ST ALBUM `Your Diary`,10059997,[711476],지겹다,[GN0100],[한동근],459600
478358,[GN0101],20170919,END,10097192,[1910712],END,[GN0100],[Moonde],478358
172479,"[GN0501, GN0101, GN0506, GN0509]",20170307,Happy Together,10043583,[1382184],Happy Together,"[GN0500, GN0100]",[인환],172479
264312,[GN0101],20180710,’Melody for you’,10183023,[2018151],MOM (겨울나무),[GN0100],[홍아],264312
465818,"[GN0401, GN0402]",20190111,alone,10241580,[2559506],alone,[GN0400],[1F (퍼스트플로어)],465818


**정보가 없는 tag에 의해 결과가 부적절하게 바뀐다.**
* 레드벨벳 + 노래 => 왜 세븐틴이?

In [496]:
songs = rec.title_based_recommend('레드벨벳만 모아모아',15, True)

> raw title : 레드벨벳만모아모아
> Extracted tags : ['레드벨벳']
> Filtered tags : ['레드벨벳']
> Similar items : [('20854', 0.855), ('516240', 0.853), ('591528', 0.849), ('247529', 0.838), ('389292', 0.838), ('212807', 0.835), ('610809', 0.834), ('685425', 0.834), ('483052', 0.833), ('56170', 0.833)]


Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
20854,"[GN2503, GN2501, GN2504, GN0301]",20171117,Perfect Velvet - The 2nd Album,10112486,[780066],Attaboy,"[GN2500, GN0300]",[Red Velvet (레드벨벳)],20854
516240,"[GN1912, GN1902, GN1901]",20190424,Power Up (Japanese Ver.),10277331,[780066],Power Up (Japanese Ver.),[GN1900],[Red Velvet (레드벨벳)],516240
591528,"[GN1912, GN1902, GN1901]",20180704,#Cookie Jar,10181640,[780066],Russian Roulette,[GN1900],[Red Velvet (레드벨벳)],591528
247529,"[GN1912, GN1902, GN1901]",20180704,#Cookie Jar,10181640,[780066],Red Flavor,[GN1900],[Red Velvet (레드벨벳)],247529
389292,"[GN1912, GN1902, GN1901]",20180704,#Cookie Jar,10181640,[780066],Dumb Dumb,[GN1900],[Red Velvet (레드벨벳)],389292
212807,"[GN2503, GN0205, GN2501, GN2506, GN0201]",20180806,Summer Magic - Summer Mini Album,10191694,[780066],Mosquito,"[GN2500, GN0200]",[Red Velvet (레드벨벳)],212807
610809,"[GN1912, GN1902, GN1901]",20190530,SAPPY,10291753,[780066],Power Up (Japanese Ver.),[GN1900],[Red Velvet (레드벨벳)],610809
685425,"[GN2503, GN0205, GN2501, GN2506, GN0201]",20191223,‘The ReVe Festival’ Finale,10368053,[780066],음파음파 (Umpah Umpah),"[GN2500, GN0200]",[Red Velvet (레드벨벳)],685425
483052,"[GN0401, GN2503, GN0403, GN2501]",20160317,The Velvet - The 2nd Mini Album,2673322,[780066],7월 7일 (One Of These Nights) (Joe Millionaire V...,"[GN0400, GN2500]",[Red Velvet (레드벨벳)],483052
56170,"[GN0401, GN2503, GN0403, GN2501]",20160317,The Velvet - The 2nd Mini Album,2673322,[780066],7월 7일 (One Of These Nights) (De-Capo Ver.),"[GN0400, GN2500]",[Red Velvet (레드벨벳)],56170


**아이유는 특이한 케이스지 않을까..**

In [559]:
songs = rec.title_based_recommend('아이유',15)

> raw title : 아이유
> Extracted tags : ['아이유']
> Filtered tags : ['아이유']
> Similar items : [('IU', 0.788), ('아이유콘서트', 0.765), ('이지금', 0.748), ('아이유노래모음', 0.744), ('유애나', 0.743), ('LovePoem', 0.74), ('팔레트', 0.74), ('아이유히트곡', 0.737), ('dlwlrma', 0.736), ('갓이유', 0.735)]


Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
646278,[GN1701],20131124,Nigerian Beats and Drums,3123703,[712938],The End,[GN1700],[Africana],646278
630534,[GN0301],20090702,Suga Luv,598564,[299169],Suga Luv (Feat. 아이유) (Inst.),[GN0300],[비즈니즈],630534
166009,"[GN0105, GN0101]",20121221,내 생애 마지막 오디션 - 제3라운드 최후의 듀엣대결 1탄,2170893,"[713967, 713963]","잔소리 (원곡가수 아이유, 임슬옹)",[GN0100],"[이도진, 김연준]",166009
582913,[GN0201],20131008,Modern Times,2208448,[261143],기다려,[GN0200],[아이유],582913
332962,[GN0101],20101117,피아노 가요 연주곡 13,1078052,[465221],"그대네요 (성시경, 아이유)",[GN0100],[뮤직 쿠키],332962
352289,[GN0101],20090618,음악여행 라라라 Live Vol.5,588952,"[197928, 261143, 236880]",All You Need Is Love,[GN0100],"[짙은, 아이유, 란 (RAN)]",352289
575265,"[GN0801, GN0802]",20120806,"송창식 (가위, 바위, 보)",2171065,[833],가위 바위 보,[GN0800],[송창식],575265
68324,[GN0201],20091112,IU...IM,718505,[261143],아침 눈물 (Inst.),[GN0200],[아이유],68324
522124,[GN2001],20110829,Tassili,2008791,[551807],Takkest Tamidaret,[GN2000],[Tinariwen],522124
429741,"[GN0601, GN0606]",20191101,Love poem,10346650,[261143],시간의 바깥,[GN0600],[아이유],429741


**아이유 + 좋다 => 엉뚱한 vector가 되면서 많이 다른 결과가 된다.**

In [560]:
songs = rec.title_based_recommend('아이유 참 좋다',15, True)

> raw title : 아이유참좋다
> Extracted tags : ['좋다', '아이유']
> Filtered tags : ['좋다', '아이유']
> Similar items : [('LovePoem', 0.72), ('아이유셋리스트', 0.717), ('아이유히트곡', 0.688), ('아이유콘서트', 0.683), ('갓이유', 0.676), ('이지금', 0.665), ('dlwlrma', 0.664), ('광주', 0.66), ('밤편지', 0.658), ('팔레트', 0.654)]


Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
360045,"[GN0501, GN0601, GN0503, GN0606, GN0509]",20170716,Home to Stay (효리네 민박 타이틀),10079737,[727771],Home to Stay,"[GN0500, GN0600]",[쏠라티],360045
646833,"[GN0805, GN0501, GN0502, GN0801, GN0509]",20150331,다시 시작,2311660,[717128],달자 (Feat. 림 Of 쏠라티),"[GN0500, GN0800]",[플레이모드],646833
575265,"[GN0801, GN0802]",20120806,"송창식 (가위, 바위, 보)",2171065,[833],가위 바위 보,[GN0800],[송창식],575265
332962,[GN0101],20101117,피아노 가요 연주곡 13,1078052,[465221],"그대네요 (성시경, 아이유)",[GN0100],[뮤직 쿠키],332962
8706,"[GN0805, GN0801]",20161222,제 27회 유재하 음악경연대회,10025324,[1273983],나무에 걸린 물고기,[GN0800],[장희원팀],8706
646278,[GN1701],20131124,Nigerian Beats and Drums,3123703,[712938],The End,[GN1700],[Africana],646278
272355,"[GN0805, GN0501, GN0502, GN0801, GN0509]",20160726,게다가 주말,2699764,[944301],게다가 주말,"[GN0500, GN0800]",[천석만],272355
639160,"[GN0105, GN0101]",20170922,꽃갈피 둘,10096855,[261143],비밀의 화원,[GN0100],[아이유],639160
119850,"[GN1806, GN1801]",20131216,눈 오는 날 골목길 피아노 (Snow),2221597,[727946],군고구마와 군밤,[GN1800],[Littlepiano],119850
522124,[GN2001],20110829,Tassili,2008791,[551807],Takkest Tamidaret,[GN2000],[Tinariwen],522124


**크러쉬 정보가 전혀 없다 ..**
벡터를 더해버렸으니 그럴 수 있지만 .... 그래두 ....

In [564]:
songs = rec.title_based_recommend('크러쉬가 부르는 잔잔 발라드',10)

> raw title : 크러쉬가부르는잔잔발라드
> Extracted tags : ['발라드', '잔잔', '크러쉬']
> sims : [(0.23998842, ('발라드', '잔잔')), (0.18575108, ('발라드', '크러쉬')), (0.20158195, ('잔잔', '크러쉬'))]
> threshold : 0.20202228128910066
> Filtered tags : ['발라드', '잔잔']
> Similar items : [('씁쓸', 0.723), ('어쿠스틱음악', 0.706), ('47356', 0.704), ('밤에듣기좋은곡', 0.698), ('인디발라드', 0.697), ('172479', 0.697), ('낙옆', 0.697), ('670465', 0.696), ('보고싶음', 0.69), ('610513', 0.69)]


Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
47356,"[GN0509, GN0105, GN0101, GN0506, GN0501]",20171114,그런 날,10110979,[672314],그런 날,"[GN0500, GN0100]",[신기남],47356
172479,"[GN0501, GN0101, GN0506, GN0509]",20170307,Happy Together,10043583,[1382184],Happy Together,"[GN0500, GN0100]",[인환],172479
670465,"[GN0105, GN0101]",20171103,가을 햇살속에서,10108422,[764300],가을 햇살속에서,[GN0100],[더 라임],670465
610513,"[GN0105, GN0101]",20170602,잊지 못해,10067953,[1758498],잊지 못해,[GN0100],[더블오케이],610513
556041,"[GN0401, GN0403]",20180112,Drama,10128138,[1703493],Drama (Feat. Min:D),[GN0400],[노이케이 (Noi.K)],556041
181249,"[GN0509, GN0105, GN0101, GN0506, GN0501]",20190421,독백,10276274,[2112076],희망고문 (Vocal by 잎샘),"[GN0500, GN0100]",[필름아일랜드],181249
463400,"[GN0501, GN0101, GN0506, GN0509]",20180326,널,10150803,[905133],널 (Feat. 이한솔),"[GN0500, GN0100]",[조우리 (샘샘트리오)],463400
591427,"[GN0501, GN0101, GN0506, GN0509]",20171011,내 방 침대위엔,10101127,[1625333],내 방 침대 위엔,"[GN0500, GN0100]",[코튼페이퍼],591427
478358,[GN0101],20170919,END,10097192,[1910712],END,[GN0100],[Moonde],478358
459600,"[GN0105, GN0101]",20170505,한동근 1ST ALBUM `Your Diary`,10059997,[711476],지겹다,[GN0100],[한동근],459600


**크러쉬가 혼자 있다면 괜찮은 결과**

In [565]:
songs = rec.title_based_recommend('크러쉬',10)

> raw title : 크러쉬
> Extracted tags : ['크러쉬']
> Filtered tags : ['크러쉬']
> Similar items : [('딘', 0.752), ('밀릭', 0.724), ('fanxychild', 0.723), ('팬시차일드', 0.722), ('페노메코', 0.71), ('DEAN', 0.709), ('Crush', 0.704), ('231100', 0.68), ('470732', 0.655), ('225139', 0.647)]


Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
231100,[GN0401],20191205,From Midnight To Sunrise,10361387,[674710],Wonderlust (Feat. Band Wonderlust),[GN0400],[Crush],231100
470732,"[GN0401, GN0402]",20191205,From Midnight To Sunrise,10361387,[674710],Cloth,[GN0400],[Crush],470732
225139,"[GN0401, GN0403]",20160617,후미등 (Taillight),2691778,[861257],후미등 (Taillight),[GN0400],[충완],225139
508973,"[GN0303, GN0301]",20181220,Garden,10235475,[787227],COOL (Feat. Tobi Lou),[GN0300],[페노메코 (PENOMECO)],508973
67386,"[GN0303, GN0301]",20170916,쇼미더머니 6 Special,10096841,[340669],"bestdriverZ (Feat. Zion.T, DEAN)",[GN0300],[행주],67386
561157,"[GN0401, GN0402]",20180713,wonderlost,10184916,[674710],"RYO (Feat. CIFIKA, Byung Un of Balming Tiger)",[GN0400],[Crush],561157
145755,"[GN0501, GN0304, GN0505, GN0301]",20171020,1AM (Have A Good Night) (Prod.by Wildwhip),10104013,[873668],1AM (Have A Good Night) (Prod.by Wildwhip),"[GN0500, GN0300]",[Lil Prince],145755
127406,"[GN0401, GN0403, GN0402]",20161216,On And On,10023711,[735515],Lust (Feat. ELO),[GN0400],[Hoody (후디)],127406
454522,"[GN0401, GN0403, GN0402]",20160324,130 mood : TRBL,2674623,[880630],bonnie & clyde,[GN0400],[DEAN],454522
487705,"[GN0401, GN0403, GN0402]",20160324,130 mood : TRBL,2674623,[880630],i love it (Feat. Dok2),[GN0400],[DEAN],487705


# **점수 체크**

In [18]:
import numpy as np

class ArenaEvaluator:
    def _idcg(self, l):
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        self._idcgs = [self._idcg(i) for i in range(101)]

    def _ndcg(self, gt, rec):
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in gt:
                dcg += 1.0 / np.log(i + 2)

        return dcg / self._idcgs[len(gt)]

    def _load_json(self,fname):
        with open(fname, encoding="utf-8") as f:
            json_obj = json.load(f)

        return json_obj

    def _eval(self, gt_fname, rec_fname):
        gt_playlists = self._load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = self._load_json(rec_fname)

        gt_ids = set([g["id"] for g in gt_playlists])
        rec_ids = set([r["id"] for r in rec_playlists])

        if gt_ids != rec_ids:
            raise Exception("결과의 플레이리스트 수가 올바르지 않습니다.")

        rec_song_counts = [len(p["songs"]) for p in rec_playlists]
        rec_tag_counts = [len(p["tags"]) for p in rec_playlists]

        if set(rec_song_counts) != set([100]):
            raise Exception("추천 곡 결과의 개수가 맞지 않습니다.")

        if set(rec_tag_counts) != set([10]):
            raise Exception("추천 태그 결과의 개수가 맞지 않습니다.")

        rec_unique_song_counts = [len(set(p["songs"])) for p in rec_playlists]
        rec_unique_tag_counts = [len(set(p["tags"])) for p in rec_playlists]

        if set(rec_unique_song_counts) != set([100]):
            raise Exception("한 플레이리스트에 중복된 곡 추천은 허용되지 않습니다.")

        if set(rec_unique_tag_counts) != set([10]):
            raise Exception("한 플레이리스트에 중복된 태그 추천은 허용되지 않습니다.")

        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])

        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score

    def evaluate(self, gt_fname, rec_fname):
        try:
            music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
            print(f"Music nDCG: {music_ndcg:.6}")
            print(f"Tag nDCG: {tag_ndcg:.6}")
            print(f"Score: {score:.6}")
        except Exception as e:
            print(e)

In [19]:
evaluator = ArenaEvaluator()