In [1]:
import numpy as np
import pandas as pd
import pickle
import os
import time
import tqdm
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

# os.chdir('./melon-playlist-continuation-master/')
import arena_util
import evaluate

In [2]:
train = pd.read_json('./arena_data/orig/train.json')
val = pd.read_json('./arena_data/questions/val.json')
song_meta = pd.read_json('./res/song_meta.json')
answer = pd.read_json('./arena_data/answers/val.json')

In [3]:
# make song_coo.txt
maxrow = train['id'].max()
num_song = len(song_meta)
N = sum(train['songs'].apply(len)) + sum(val['songs'].apply(len))

f = open('./song_coo.txt', 'w')
f.write("%d %d %d\n" % (maxrow+1, num_song, N))
for i, q in train.iterrows():
    for song in q['songs']:
        f.write("%d %d %d %d\n" % (q['id'], song, 1, 0))
f.close()

f = open("./song_coo.txt", 'a')
for i, q in val.iterrows():
    for song in q['songs']:
        f.write("%d %d %d %d\n" % (q['id'], song, 1, 1))
f.close()

In [4]:
# make csr_matrix
rows = []
cols = []
scores = []
mask = []

with open("./song_coo.txt", 'r') as f:
    num_row, num_col, N = map(int, f.readline().split())
    for _ in range(N):
      r, c, s, m = map(int, f.readline().split())
      if m == 1: # 0 : train / 1 : val, test
          rows.append(r)
          cols.append(c)
          scores.append(float(s))
    
ratings = csr_matrix((np.array(scores, dtype=np.float32), 
                    (np.array(rows), np.array(cols))), 
                    shape = (num_row, num_col))

ratings.data = np.ones(len(ratings.data))

In [5]:
# make als_model

als_model_50 = AlternatingLeastSquares(factors = 50)
als_model_80 = AlternatingLeastSquares(factors = 80)
als_model_100 = AlternatingLeastSquares(factors = 100)



In [6]:
# make model_50, 80, 100
als_model_50.fit(ratings.T)
als_model_80.fit(ratings.T)
als_model_100.fit(ratings.T)

with open('./als_model_50.pkl', 'wb') as f :
    pickle.dump(als_model_50, f)
with open('./als_model_80.pkl', 'wb') as f :
    pickle.dump(als_model_80, f)
with open('./als_model_100.pkl', 'wb') as f :
    pickle.dump(als_model_100, f)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [7]:
# make song_rec
song_rec_50 = {}
song_rec_80 = {}
song_rec_100 = {}

for i in val['id'] :
    plst_rec_song = als_model_50.recommend(i, ratings, 100)
    song_rec_50[i] = plst_rec_song
for i in val['id'] :
    plst_rec_song = als_model_80.recommend(i, ratings, 100)
    song_rec_80[i] = plst_rec_song
for i in val['id'] :
    plst_rec_song = als_model_100.recommend(i, ratings, 100)
    song_rec_100[i] = plst_rec_song

In [8]:
val.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,"[휴식, 카페, 분위기]",130541,"여유와 커피 한 모금, 분위기 있는 카페로 순간이동시키는 음악",[],3,2017-11-01 10:05:49.000
1,[상큼함],78849,차분한 화요일을 만들어줄 플레이리스트 :),"[602814, 482241, 430106, 285104, 570294, 67091...",4,2017-08-08 11:02:30.000
2,[매장음악],29457,트렌디한 샵에서 흘러나오던 그 POP,[],528,2016-01-18 12:32:46.000
3,[잔잔한],15576,감성적인 노래 모음,"[707724, 574455, 670194, 187200, 331016, 14042...",6,2016-09-27 23:12:12.000
4,[],37588,스트레스 강제진압 53,"[421614, 695187, 588277, 246291, 7175, 78505, ...",0,2019-03-27 15:28:48.000


In [9]:
print('50 :', song_rec_50[80563][0])
print('80 :', song_rec_80[80563][0])
print('100 :', song_rec_100[80563][0])

50 : (701557, 0.13896546)
80 : (133143, 0.25187457)
100 : (133143, 0.21207003)


In [10]:
answer.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,"[집중, 피아노, 일상]",130541,"여유와 커피 한 모금, 분위기 있는 카페로 순간이동시키는 음악","[11924, 407825, 523291, 591403, 140867, 507936...",3,2017-11-01 10:05:49.000
1,"[열대야, 차분함]",78849,차분한 화요일을 만들어줄 플레이리스트 :),"[521899, 116573, 82844, 315578, 159636, 448742...",4,2017-08-08 11:02:30.000
2,[클럽],29457,트렌디한 샵에서 흘러나오던 그 POP,"[234295, 694350, 561270, 634998, 403020, 71842...",528,2016-01-18 12:32:46.000
3,"[추억, 회상]",15576,감성적인 노래 모음,"[627363, 567383, 27469, 64052, 481669, 132247,...",6,2016-09-27 23:12:12.000
4,[인디],37588,스트레스 강제진압 53,"[374286, 679886, 277177, 635851, 88753, 302833...",0,2019-03-27 15:28:48.000


In [11]:
for i, s in song_rec_50[80563] :
    print(i, end=' ')

701557 418935 164508 461476 300087 256956 440160 362966 196849 333595 464051 130110 457124 133143 493762 466529 385234 249229 112399 643628 625875 236393 589983 573179 482512 351342 274504 515964 277034 95059 617900 200877 379112 455945 601037 417368 118049 207232 475319 446812 65053 376435 213435 677187 178044 19364 553437 567991 53369 346969 26083 326204 204547 581105 671154 451489 335520 459165 165281 78180 695032 311 493804 425945 2188 396233 512098 495821 695494 5049 621388 524956 409058 6546 55791 592826 498452 114387 330553 232045 335756 699518 358186 544743 656512 521330 590012 243850 590379 348200 43471 696317 96310 387859 496394 461680 650494 443902 337267 175059 

In [12]:
for i in answer[answer['id'] == 80563]['songs'] :
    print(i)

[196972, 263417, 311, 439691, 666852, 381083, 536464, 217707, 499747, 149980, 545585, 371635, 729, 116573, 643070, 249990, 346913, 53369, 237407, 4828, 398955, 596663, 20900, 701557, 164508, 19234, 103588, 236767, 198610, 339594, 409563, 637303, 310687, 1814, 171498, 151136, 356211, 267559, 672550, 592295, 465901, 280187, 17502, 58836, 207558, 661896, 197168, 506359, 673110, 458216, 566257]


In [13]:
for i, s in song_rec_50[80563] :
    if i in answer[answer['id'] == 80563]['songs'] :
        print(i)

In [14]:
# make tag2id, id2tag dict
tag_set = set()

for i, q in train.iterrows():
  for s in q['tags']:
    tag_set.add(s)
for i, q in val.iterrows():
  for s in q['tags']:
    tag_set.add(s)

tag2id = {x : i for i, x in enumerate(list(tag_set))}
id2tag = {i : x for i, x in enumerate(list(tag_set))}

In [15]:
# make tag_coo.txt
maxrow = train['id'].max()
num_tag = len(tag_set)
N = sum(train['tags'].apply(len)) + sum(val['tags'].apply(len))

f = open('./tag_coo.txt', 'w')
f.write("%d %d %d\n" % (maxrow+1, num_tag, N))
for i, q in train.iterrows():
    for tag in q['tags']:
        f.write("%d %d %d %d\n" % (q['id'], tag2id[tag], 1, 0))
f.close()

f = open("./tag_coo.txt", 'a')
for i, q in val.iterrows():
    for tag in q['tags']:
        f.write("%d %d %d %d\n" % (q['id'], tag2id[tag], 1, 1))
f.close()

In [16]:
# make csr_matrix
rows = []
cols = []
scores = []
mask = []

with open("./tag_coo.txt", 'r') as f:
    num_row, num_col, N = map(int, f.readline().split())
    for _ in range(N):
      r, c, s, m = map(int, f.readline().split())
      if m == 1: # 0 : train / 1 : val, test
          rows.append(r)
          cols.append(c)
          scores.append(float(s))
    
tag_ratings = csr_matrix((np.array(scores, dtype=np.float32), 
                    (np.array(rows), np.array(cols))), 
                    shape = (num_row, num_col))

tag_ratings.data = np.ones(len(tag_ratings.data))

In [17]:
# make als_model

als_model_50_tag = AlternatingLeastSquares(factors = 50)
als_model_80_tag = AlternatingLeastSquares(factors = 80)
als_model_100_tag = AlternatingLeastSquares(factors = 100)

In [18]:
# make model_50, 80, 100
als_model_50_tag.fit(tag_ratings.T)
als_model_80_tag.fit(tag_ratings.T)
als_model_100_tag.fit(tag_ratings.T)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [19]:
# make tag_rec
tag_rec_50 = {}
tag_rec_80 = {}
tag_rec_100 = {}

for i in val['id'] :
    plst_rec_tag = als_model_50_tag.recommend(i, tag_ratings, 10)
    tag_rec_50[i] = plst_rec_tag
for i in val['id'] :
    plst_rec_tag = als_model_80_tag.recommend(i, tag_ratings, 10)
    tag_rec_80[i] = plst_rec_tag
for i in val['id'] :
    plst_rec_tag = als_model_100_tag.recommend(i, tag_ratings, 10)
    tag_rec_100[i] = plst_rec_tag

In [20]:
for i, s in tag_rec_50[80563] :
    print(id2tag[i], end=' ')

커피 감각적인 느낌있는 여유 여행산책 주말 취향저격 방콕 재즈힙합 오후 

In [21]:
for i in answer[answer['id'] == 80563]['tags'] :
    print(i)

['힐링', '까페']


In [22]:
# make final_rec dict
final_rec_lst_50 = []
final_rec_lst_80 = []
final_rec_lst_100 = []

for key in song_rec_50.keys() :
    final_rec = {}
    final_rec['id'] = key
    final_rec['songs'] = [int(i) for i, s in song_rec_50[key]]
    final_rec['tags'] = [id2tag[int(i)] for i, s in tag_rec_50[key]]
    final_rec_lst_50.append(final_rec)

for key in song_rec_80.keys() :
    final_rec = {}
    final_rec['id'] = key
    final_rec['songs'] = [int(i) for i, s in song_rec_80[key]]
    final_rec['tags'] = [id2tag[int(i)] for i, s in tag_rec_80[key]]
    final_rec_lst_80.append(final_rec)

for key in song_rec_100.keys() :
    final_rec = {}
    final_rec['id'] = key
    final_rec['songs'] = [int(i) for i, s in song_rec_100[key]]
    final_rec['tags'] = [id2tag[int(i)] for i, s in tag_rec_100[key]]
    final_rec_lst_100.append(final_rec)

In [23]:
# make final_rec json
arena_util.write_json(final_rec_lst_50, 'rec50.json')
arena_util.write_json(final_rec_lst_80, 'rec80.json')
arena_util.write_json(final_rec_lst_100, 'rec100.json')

In [24]:
evaluater = evaluate.ArenaEvaluator()
evaluater.evaluate('./arena_data/answers/val.json', './arena_data/rec50.json')

Music nDCG: 0.0796247
Tag nDCG: 0.0321261
Score: 0.0724999


In [25]:
evaluater = evaluate.ArenaEvaluator()
evaluater.evaluate('./arena_data/answers/val.json', './arena_data/rec80.json')

Music nDCG: 0.0852301
Tag nDCG: 0.0267591
Score: 0.0764594


In [26]:
evaluater = evaluate.ArenaEvaluator()
evaluater.evaluate('./arena_data/answers/val.json', './arena_data/rec100.json')

Music nDCG: 0.0867067
Tag nDCG: 0.0221192
Score: 0.0770186


In [27]:
# song_rec = 100 / tag_rec = 50
final_rec_lst_com = []

for key in song_rec_100.keys() :
    final_rec = {}
    final_rec['id'] = key
    final_rec['songs'] = [int(i) for i, s in song_rec_100[key]]
    final_rec['tags'] = [id2tag[int(i)] for i, s in tag_rec_50[key]]
    final_rec_lst_com.append(final_rec)

arena_util.write_json(final_rec_lst_com, 'rec_com.json')

evaluater = evaluate.ArenaEvaluator()
evaluater.evaluate('./arena_data/answers/val.json', './arena_data/rec_com.json')

Music nDCG: 0.0867067
Tag nDCG: 0.0321261
Score: 0.0785196


In [28]:
# song_rec_200

als_model_200 = AlternatingLeastSquares(factors = 200)
als_model_200.fit(ratings.T)
with open('./als_model_200.pkl', 'wb') as f :
    pickle.dump(als_model_200, f)

song_rec_200 = {}
for i in val['id'] :
    plst_rec_song = als_model_200.recommend(i, ratings, 100)
    song_rec_200[i] = plst_rec_song

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [30]:
# song_rec = 200 / tag_rec = 50
final_rec_lst_com = []

for key in song_rec_200.keys() :
    final_rec = {}
    final_rec['id'] = key
    final_rec['songs'] = [int(i) for i, s in song_rec_200[key]]
    final_rec['tags'] = [id2tag[int(i)] for i, s in tag_rec_50[key]]
    final_rec_lst_com.append(final_rec)

arena_util.write_json(final_rec_lst_com, 'rec_200_50.json')

evaluater = evaluate.ArenaEvaluator()
evaluater.evaluate('./arena_data/answers/val.json', './arena_data/rec_200_50.json')

Music nDCG: 0.0917026
Tag nDCG: 0.0321261
Score: 0.0827661


In [29]:
# song_rec_300

als_model_300 = AlternatingLeastSquares(factors = 300)
als_model_300.fit(ratings.T)
with open('./als_model_300.pkl', 'wb') as f :
    pickle.dump(als_model_300, f)

song_rec_300 = {}
for i in val['id'] :
    plst_rec_song = als_model_300.recommend(i, ratings, 100)
    song_rec_300[i] = plst_rec_song

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [31]:
# song_rec = 300 / tag_rec = 50
final_rec_lst_com = []

for key in song_rec_300.keys() :
    final_rec = {}
    final_rec['id'] = key
    final_rec['songs'] = [int(i) for i, s in song_rec_300[key]]
    final_rec['tags'] = [id2tag[int(i)] for i, s in tag_rec_50[key]]
    final_rec_lst_com.append(final_rec)

arena_util.write_json(final_rec_lst_com, 'rec_300_50.json')

evaluater = evaluate.ArenaEvaluator()
evaluater.evaluate('./arena_data/answers/val.json', './arena_data/rec_300_50.json')

Music nDCG: 0.0940249
Tag nDCG: 0.0321261
Score: 0.0847401


In [34]:
# song_rec_400
als_model_400 = AlternatingLeastSquares(factors = 400)
als_model_400.fit(ratings.T)
with open('./als_model_400.pkl', 'wb') as f :
    pickle.dump(als_model_400, f)

song_rec_400 = {}
for i in val['id'] :
    plst_rec_song = als_model_400.recommend(i, ratings, 100)
    song_rec_400[i] = plst_rec_song

# song_rec = 400 / tag_rec = 50
final_rec_lst_com = []

for key in song_rec_400.keys() :
    final_rec = {}
    final_rec['id'] = key
    final_rec['songs'] = [int(i) for i, s in song_rec_400[key]]
    final_rec['tags'] = [id2tag[int(i)] for i, s in tag_rec_50[key]]
    final_rec_lst_com.append(final_rec)

arena_util.write_json(final_rec_lst_com, 'rec_400_50.json')

evaluater = evaluate.ArenaEvaluator()
evaluater.evaluate('./arena_data/answers/val.json', './arena_data/rec_400_50.json')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))


Music nDCG: 0.0952848
Tag nDCG: 0.0321261
Score: 0.085811


In [35]:
# song_rec_500
als_model_500 = AlternatingLeastSquares(factors = 500)
als_model_500.fit(ratings.T)
with open('./als_model_500.pkl', 'wb') as f :
    pickle.dump(als_model_500, f)

song_rec_500 = {}
for i in val['id'] :
    plst_rec_song = als_model_500.recommend(i, ratings, 100)
    song_rec_500[i] = plst_rec_song

# song_rec = 500 / tag_rec = 50
final_rec_lst_com = []

for key in song_rec_500.keys() :
    final_rec = {}
    final_rec['id'] = key
    final_rec['songs'] = [int(i) for i, s in song_rec_500[key]]
    final_rec['tags'] = [id2tag[int(i)] for i, s in tag_rec_50[key]]
    final_rec_lst_com.append(final_rec)

arena_util.write_json(final_rec_lst_com, 'rec_500_50.json')

evaluater = evaluate.ArenaEvaluator()
evaluater.evaluate('./arena_data/answers/val.json', './arena_data/rec_500_50.json')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))


Music nDCG: 0.0958654
Tag nDCG: 0.0321261
Score: 0.0863045


In [38]:
# song_rec_600
als_model_600 = AlternatingLeastSquares(factors = 600)
als_model_600.fit(ratings.T)
with open('./als_model_600.pkl', 'wb') as f :
    pickle.dump(als_model_600, f)

song_rec_600 = {}
for i in val['id'] :
    plst_rec_song = als_model_600.recommend(i, ratings, 100)
    song_rec_600[i] = plst_rec_song

# song_rec = 600 / tag_rec = 50
final_rec_lst_com = []

for key in song_rec_600.keys() :
    final_rec = {}
    final_rec['id'] = key
    final_rec['songs'] = [int(i) for i, s in song_rec_600[key]]
    final_rec['tags'] = [id2tag[int(i)] for i, s in tag_rec_50[key]]
    final_rec_lst_com.append(final_rec)

arena_util.write_json(final_rec_lst_com, 'rec_600_50.json')

evaluater = evaluate.ArenaEvaluator()
evaluater.evaluate('./arena_data/answers/val.json', './arena_data/rec_600_50.json')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))


Music nDCG: 0.0959749
Tag nDCG: 0.0321261
Score: 0.0863976


In [39]:
# song_rec_700
als_model_700 = AlternatingLeastSquares(factors = 700)
als_model_700.fit(ratings.T)
with open('./als_model_700.pkl', 'wb') as f :
    pickle.dump(als_model_700, f)

song_rec_700 = {}
for i in val['id'] :
    plst_rec_song = als_model_700.recommend(i, ratings, 100)
    song_rec_700[i] = plst_rec_song

# song_rec = 700 / tag_rec = 50
final_rec_lst_com = []

for key in song_rec_700.keys() :
    final_rec = {}
    final_rec['id'] = key
    final_rec['songs'] = [int(i) for i, s in song_rec_700[key]]
    final_rec['tags'] = [id2tag[int(i)] for i, s in tag_rec_50[key]]
    final_rec_lst_com.append(final_rec)

arena_util.write_json(final_rec_lst_com, 'rec_700_50.json')

evaluater = evaluate.ArenaEvaluator()
evaluater.evaluate('./arena_data/answers/val.json', './arena_data/rec_700_50.json')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))


Music nDCG: 0.0958212
Tag nDCG: 0.0321261
Score: 0.0862669


In [36]:
# song_rec_800
als_model_800 = AlternatingLeastSquares(factors = 800)
als_model_800.fit(ratings.T)
with open('./als_model_800.pkl', 'wb') as f :
    pickle.dump(als_model_800, f)

song_rec_800 = {}
for i in val['id'] :
    plst_rec_song = als_model_800.recommend(i, ratings, 100)
    song_rec_800[i] = plst_rec_song

# song_rec = 800 / tag_rec = 50
final_rec_lst_com = []

for key in song_rec_800.keys() :
    final_rec = {}
    final_rec['id'] = key
    final_rec['songs'] = [int(i) for i, s in song_rec_800[key]]
    final_rec['tags'] = [id2tag[int(i)] for i, s in tag_rec_50[key]]
    final_rec_lst_com.append(final_rec)

arena_util.write_json(final_rec_lst_com, 'rec_800_50.json')

evaluater = evaluate.ArenaEvaluator()
evaluater.evaluate('./arena_data/answers/val.json', './arena_data/rec_800_50.json')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))


Music nDCG: 0.0955438
Tag nDCG: 0.0321261
Score: 0.0860312
