In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter
import scipy.sparse as spr
import json
from sklearn.metrics.pairwise import cosine_similarity
import datetime
import pickle

import warnings

warnings.filterwarnings( 'ignore' )

pd.set_option( 'display.max.row' , 200 )


genre = pd.read_json('data/genre_gn_all.json',typ = 'series' )
train = pd.read_json( 'data/train.json'  , encoding = 'utf-8' )
val = pd.read_json( 'data/val.json'  , encoding = 'utf-8' )
song_meta = pd.read_json( 'data/song_meta.json')


# 최종
test = pd.read_json('data/test.json')

**분야별 집계함수 만들기**

In [2]:
def get_cluster_matrix( df  , cls_data , by = 'song' ):
    """
    # df = dataframe
            play list 
    # cls_dat = dataframe
            clustered data
    """
    
    df = df
    cls_data = cls_data

    n_cluster = len(cls_data.label.unique()) 
    n_plylst = len( df )
    cls_data = cls_data[ cls_data.columns[:2] ]

    cl_val = dict( zip( cls_data.iloc[:,0] , cls_data.iloc[:,1]))
        
    if by == 'song':
        df = df[['id' , 'songs']]
        cl_ply_values = [ dict(Counter([ cl_val.get(str(v)) for v in vs])) for vs in df.songs ]
        
    elif by == 'word':
        df = df[['id' , 'complex_col']]
        cl_ply_values = [ dict(Counter([ cl_val.get(str(v)) for v in vs if cl_val.get(str(v)) != None])) for vs in df.complex_col ]
    
    elif by == 'singer':
        df = df[['id' , 'complex_col']]
        cl_ply_values = [ dict(Counter([ cl_val.get(str(v)) for v in vs if cl_val.get(str(v)) != None])) for vs in df.complex_col ]
        
    elif by == 'tag':
        df = df[['id' , 'tags']].reset_index( drop = True )
        cl_ply_values = [ dict(Counter([ cl_val.get(str(v)) for v in vs if cl_val.get(str(v)) != None])) for vs in df.tags ]

    elif by == 'genre':
        df = df[['id' , 'gnr']]
        cl_ply_values = [ dict(Counter([ cl_val.get(str(v)) for v in vs if cl_val.get(str(v)) != None])) for vs in df.gnr ]

    elif by == 'album':## add
        df = df[['id' , 'albums']]
        cl_ply_values = [ dict(Counter([ cl_val.get(str(v)) for v in vs if cl_val.get(str(v)) != None])) for vs in df.albums ]

        
    col_dot = [list(d.keys()) for d in cl_ply_values] 
    row = np.repeat( range( n_plylst )  , [ len( c ) for c in col_dot ] ) 
    col = [ c for cols in col_dot for c in cols ]
    data = [ d for k in cl_ply_values for d in list( k.values() ) ]
    matrix_cluster = spr.csr_matrix( ( data , ( row , col )) , shape = ( n_plylst , n_cluster ) )
    
    return pd.concat( [ df.id ,pd.DataFrame(matrix_cluster.toarray() , columns = [f'{by}_cl{i}' for i in range( n_cluster ) ] )]  ,axis = 1)
    

<h3></h3>
<h3></h3>

### 후보군 선별하기

**사전에 날짜 거르기** 

후보군을 뽑을때, 플레이 리스트보다 날짜가 나중이라면, 그 노래를 제외한다.

- 노래와 날짜가 매핑된 변수를 생성하고 ( get_song_date_dict ) -->  song_date!! 변수
- 후보군을 뽑는과정 ( get_candidate ) 에서 걸러준다.

In [3]:
def get_song_date_dict( song_meta ): # song_meta를 받아서 날짜와 매핑한다.
    # 간혹 20050200 같은 놈들은 20050101
    # 0 인놈들은 1990 으로 변경
    a = []
    for t in song_meta.issue_date:
        try:
            a.append(datetime.datetime.strptime( str(t) , '%Y%m%d' ).date())
        except:
            try:
                a.append(datetime.datetime.strptime( str(t)[:4] , '%Y' ).date())
            except:
                a.append(datetime.datetime.strptime( '1990' , '%Y' ).date())
    return dict(zip(song_meta.id , a))

song_date = get_song_date_dict(song_meta) ### 



<h3></h3>
<h3></h3>

**유사한 플레이리스트를 찾는 함수와, 플레이리스트가 담고 있는 노래, 태그들을 모아 주는 함수**

In [4]:
def get_sim_ply(val_ply , train_ply , top_ply = 30): # 유사도 높은 playlist뽑기
    """
    val_ply : df
        예측할 playlist ( 노래기반 val_song , word 기반 val_word , 둘다 val_ply )
    train_ply : df
        비교대상 playlist ( 노래기반 train_song , word 기반 train_word , 둘다 train_ply )
    top_ply : int
        상위 노래 갯수
    """
    
    n_train = len(train_ply)
    n_val = len( val_ply )
    train_key = dict(zip( range(n_train) , train_ply.id  ))
    val_key = dict(zip( range(n_val) , val_ply.id  ))

    sim_matrix = cosine_similarity(val_ply.set_index('id') , train_ply.set_index('id'))

    sim_ply_dict = {}
    for i , val_ in tqdm(enumerate(sim_matrix)):
        sim_ply_dict[val_key[i]] = [train_key[p] for p in np.argsort(-val_)[:top_ply]]

    return sim_ply_dict

<h3></h3>
<h3></h3>

**나름의 분산처리**

In [5]:
def dist_get_sim_ply( start,inter , val_ply  , train_ply   , tn = 30 ):
    res = {}
    max_n = len(val_ply)
    for i in range( start , max_n ,  inter ):
        try:
            tmp = get_sim_ply( val_ply = val_ply.iloc[i : i + inter] ,train_ply = train_ply , top_ply = tn )
            res.update( tmp )
            print(f'{i+inter} 만큼 했다')
        except:
            print( f'ERROR { i } 에서 Save')
            return res
    return res

유틸

In [6]:
def save_to_pickle( ob , f_path ):
    with open( f_path , 'wb') as f:
        pickle.dump( ob , f)

<h3></h3>
<h3></h3>

**song matrix만들기**

In [7]:
song_vec = pd.read_pickle( 'data/song_clustering/clust_song200.pickle') # train + val + test 기반 임베딩
song_vec = song_vec[['song_id' , 'label'] + list(song_vec.columns[3:12])]

train_song = get_cluster_matrix( train , song_vec , by ='song') # song ( train )
val_song = get_cluster_matrix( val , song_vec , by ='song') # song ( val )

# 최종!
# test_song = get_cluster_matrix( test , song_vec  , by = 'song' ) 

In [None]:
sim_ply = dist_get_sim_ply( 0 ,  3000 , val_ply = val_song , train_ply = train_song , tn = 7 )
save_to_pickle( sim_ply , 'candi/res_ver34_song.pickle')

<h3></h3>
<h3></h3>

**word matrix만들기**
- tag + title + genre !!

In [8]:
# 단어들로 이루어진 ply df ( train + val )
ply_words = pd.read_pickle( 'data/transfromation/tag_gnr_title_df.pickle')

#임베딩된 개별 단어 df
word_vec = pd.read_pickle( 'data/word_clustering/clu_tag_gnr_title_emb_100' )


total_word = get_cluster_matrix( ply_words , word_vec , by = 'word')

val_word = total_word.loc[ total_word.id.isin( list(val.id.values) ) ] # word ( val )
train_word = total_word.loc[ total_word.id.isin( list(train.id.values ) ) ] # word ( 

# 최종!
# test_word = total_word.loc[ total_word.id.isin( list(test.id.values ) ) ]
# del total_word #( 메모리 최적화를 위해 )

In [None]:
sim_ply = dist_get_sim_ply( 0 ,  3000 , val_ply = val_word , train_ply = train_word , tn = 7 )
save_to_pickle( sim_ply , 'candi/res_ver34_word.pickle')

<h3></h3>
<h3></h3>

**singer matrix만들기**


In [9]:
# 가수와 제목으로 이루어진 ply df ( train + val + test )
ply_singer = pd.read_pickle( 'data/transfromation/title_singer_df.pickle')

#임베딩된 개별 단어,가수
singer_vec = pd.read_pickle( 'data/word_clustering/clu_singer_emb_100')


total_singer = get_cluster_matrix( ply_singer , singer_vec , by = 'singer')

val_singer = total_singer.loc[ total_singer.id.isin( list(val.id.values) ) ] # singer 
train_singer = total_singer.loc[ total_singer.id.isin( list(train.id.values) ) ] # sin

#최종!
# test_singer = total_singer.loc[ total_singer.id.isin( list(test.id.values) ) ] 
# del total_word # 메모리 최적화

In [None]:
sim_ply = dist_get_sim_ply( 0 ,  3000 , val_ply = val_singer , train_ply = train_singer , tn = 7 )
save_to_pickle( sim_ply , 'candi/res_ver34_singer.pickle')

<h3></h3>
<h3></h3>

**tag matrix만들기**

In [10]:
# 테그로만 이루어진 ply df
tag_df = pd.read_pickle('data/transfromation/tag_df.pickle') # tag_df 에 train , val , test 모두 합친거 넣기

# 임베딩된 태그들
tag_vec = pd.read_pickle('data/word_clustering/clustering_tag_emb_30.pickle') # train , val , test 모두 합쳐서 embeding


In [11]:
total_tag = get_cluster_matrix( tag_df , tag_vec , by = 'tag')

val_tag = total_tag.loc[ total_tag.id.isin( list(val.id.values) )]
train_tag = total_tag.loc[ total_tag.id.isin( list(train.id.values) )]

#최종
# test_tag = total_tag.loc[ total_tag.id.isin( list(test.id.values) )]

In [None]:
sim_ply = dist_get_sim_ply( 0 ,  3000 , val_ply = val_tag , train_ply = train_tag , tn = 7 )
save_to_pickle( sim_ply , 'candi/res_ver34_tag.pickle')

<h3></h3>
<h3></h3>


**genre matrix 만들기**

In [12]:
# 노래와 장르 dictionary 화
def song_to_gnr ( df ):
    global song_meta
    d_g = song_meta.song_gn_dtl_gnr_basket.values
    g = song_meta.song_gn_gnr_basket.values

    song_gnr = dict(zip(song_meta.id , [ d_g[i] + g[i]  for i in range(len(d_g))]) )
    gnrs = [[ [g for s in sgs for g in song_gnr[s]] ] for sgs in df.songs.values]
    return pd.DataFrame( gnrs , columns = ['gnr'] , index = df.id)

In [13]:
# 장르와 노래 맵핑
def get_genre_matrix( df ):
    global genre
    gnr_new = pd.DataFrame(list(zip(genre.index , range(len(genre.index)))) , columns = ['id' , 'label'])
    gnrs = song_to_gnr( df )

    res = get_cluster_matrix( gnrs.reset_index() , gnr_new , by = 'genre')
    res.columns = ['id'] + list(genre.values)
    return res

In [14]:
# 데이터 병합과정
g = pd.concat( [train , val] )
g = pd.concat( [g , test ] )


total_gnr = get_genre_matrix(g)

val_gnr = total_gnr.loc[ total_gnr.id.isin( list(val.id.values) ) ] # singer 
train_gnr = total_gnr.loc[ total_gnr.id.isin( list(train.id.values) ) ] # sin

#최종!
# train_gnr = total_gnr.loc[ total_gnr.id.isin( list(train.id.values) ) ] # sin
# del train_gnr # 메모리 최적화

In [None]:
sim_ply = dist_get_sim_ply( 0 ,  3000 , val_ply = val_gnr , train_ply = train_gnr , tn = 7 )
save_to_pickle( sim_ply , 'candi/res_ver34_gnr.pickle')

<h3></h3>
<h3></h3>

**album matrix 만들기**

In [15]:
album_df = pd.read_pickle('data/transfromation/album_df.pickle')
album_vec = pd.read_pickle('data/song_clustering/clust_album_emb100.pickle')


total_album = get_cluster_matrix( album_df , album_vec , by = 'album')
train_album = total_album.loc[ total_album.id.isin( train.id.values ) ]
val_album = total_album.loc[ total_album.id.isin( val.id.values ) ]

# 최종 
# test_album = album_df.loc[ album_df.id.isin( train.id.values ) ]

In [None]:
sim_ply = dist_get_sim_ply( 0 , 3000 , val_ply = val_album , train_ply = train_album )
save_to_pickle( sim_ply , 'candi/res_ver34_album.pickle')

<h3></h3>
<h3></h3>

### 임의 추가적 비교 기준

1. song + word + singer + gnr + tag


In [16]:
train_ply = pd.merge( train_song , train_word , on = 'id' , how = 'inner')
train_ply = pd.merge( train_ply , train_singer , on = 'id' , how = 'inner')
train_ply = pd.merge( train_ply , train_gnr , on = 'id' , how = 'inner')
train_ply = pd.merge( train_ply , train_tag , on = 'id' , how = 'inner')
train_ply = pd.merge( train_ply , train_album , on = 'id' , how = 'inner')

In [17]:
val_ply = pd.merge( val_song , val_word , on = 'id' , how = 'inner')
val_ply = pd.merge( val_ply , val_singer , on = 'id' , how = 'inner')
val_ply = pd.merge( val_ply , val_gnr , on = 'id' , how = 'inner')
val_ply = pd.merge( val_ply , val_tag , on = 'id' , how = 'inner')
val_ply = pd.merge( val_ply , val_album , on = 'id' , how = 'inner')

In [None]:
sim_ply = dist_get_sim_ply( 0 ,  3000 , val_ply = val_ply , train_ply = train_ply , tn = 7)
save_to_pickle( sim_ply , 'candi/res_ver34_ply.pickle')

In [18]:
train_SAG = pd.merge( train_singer , train_album , on = 'id' , how = 'inner' )
train_SAG = pd.merge( train_SAG , train_gnr , on = 'id' , how = 'inner' )

val_SAG = pd.merge( val_singer , val_album , on = 'id' , how = 'inner' )
val_SAG = pd.merge( val_SAG , val_gnr , on = 'id' , how = 'inner' )

sim_ply = dist_get_sim_ply( 0 ,  3000 , val_ply = val_SAG , train_ply = train_SAG , tn = 7)
save_to_pickle( sim_ply , 'candi/res_ver37_SAG.pickle')

3000it [00:28, 105.92it/s]


3000 만큼 했다


3000it [00:28, 105.28it/s]


6000 만큼 했다


3000it [00:28, 103.56it/s]


9000 만큼 했다


3000it [00:28, 105.66it/s]


12000 만큼 했다


3000it [00:28, 103.80it/s]


15000 만큼 했다


3000it [00:27, 107.45it/s]


18000 만큼 했다


3000it [00:27, 109.21it/s]


21000 만큼 했다


2015it [00:19, 105.62it/s]


24000 만큼 했다


In [20]:
train_SWG = pd.merge( train_song , train_word , on = 'id' , how = 'inner' )
train_SWG = pd.merge( train_SWG , train_gnr , on = 'id' , how = 'inner' )

val_SWG = pd.merge( val_song , val_word , on = 'id' , how = 'inner' )
val_SWG = pd.merge( val_SWG , val_gnr , on = 'id' , how = 'inner' )

sim_ply = dist_get_sim_ply( 0 ,  3000 , val_ply = val_SWG , train_ply = train_SWG , tn = 7)
save_to_pickle( sim_ply , 'candi/res_ver37_SWG.pickle')

3000it [00:26, 111.66it/s]


3000 만큼 했다


3000it [00:27, 108.23it/s]


6000 만큼 했다


3000it [00:27, 108.41it/s]


9000 만큼 했다


3000it [00:27, 110.65it/s]


12000 만큼 했다


3000it [00:27, 107.94it/s]


15000 만큼 했다


3000it [00:27, 108.54it/s]


18000 만큼 했다


3000it [00:27, 110.28it/s]


21000 만큼 했다


2015it [00:17, 111.99it/s]


24000 만큼 했다


In [21]:
train_SWAG = pd.merge( train_SWG , train_album , on = 'id' , how = 'inner')
val_SWAG = pd.merge( val_SWG , val_album , on = 'id' , how = 'inner')

sim_ply = dist_get_sim_ply( 0 ,  3000 , val_ply = val_SWAG , train_ply = train_SWAG , tn = 7)
save_to_pickle( sim_ply , 'candi/res_ver39_SWAG.pickle')

3000it [00:27, 110.69it/s]


3000 만큼 했다


3000it [00:27, 110.57it/s]


6000 만큼 했다


3000it [00:27, 107.41it/s]


9000 만큼 했다


3000it [00:28, 105.00it/s]


12000 만큼 했다


3000it [00:26, 113.94it/s]


15000 만큼 했다


3000it [00:26, 112.95it/s]


18000 만큼 했다


3000it [00:27, 110.67it/s]


21000 만큼 했다


2015it [00:19, 105.45it/s]


24000 만큼 했다


In [22]:
train_TSA = pd.merge( train_tag , train_singer , on = 'id' , how = 'inner' )
train_TSA = pd.merge( train_TSA , train_album , on = 'id' , how = 'inner' )

val_TSA = pd.merge( val_tag , val_singer , on = 'id' , how = 'inner' )
val_TSA = pd.merge( val_TSA , val_album , on = 'id' , how = 'inner' )

sim_ply = dist_get_sim_ply( 0 ,  3000 , val_ply = val_TSA , train_ply = train_TSA , tn = 7)
save_to_pickle( sim_ply , 'candi/res_ver40_TSA.pickle')


3000it [00:26, 113.27it/s]


3000 만큼 했다


3000it [00:26, 111.63it/s]


6000 만큼 했다


3000it [00:26, 112.75it/s]


9000 만큼 했다


3000it [00:26, 111.22it/s]


12000 만큼 했다


3000it [00:27, 108.59it/s]


15000 만큼 했다


3000it [00:26, 112.17it/s]


18000 만큼 했다


3000it [00:26, 112.74it/s]


21000 만큼 했다


2015it [00:17, 112.32it/s]


24000 만큼 했다


In [23]:
train_WAT = pd.merge( train_tag , train_singer , on = 'id' , how = 'inner' )
train_WAT = pd.merge( train_WAT , train_word , on = 'id' , how = 'inner' )

val_WAT = pd.merge( val_tag , val_singer , on = 'id' , how = 'inner' )
val_WAT = pd.merge( val_WAT , val_word , on = 'id' , how = 'inner' )

sim_ply = dist_get_sim_ply( 0 ,  3000 , val_ply = val_WAT , train_ply = train_WAT , tn = 7)
save_to_pickle( sim_ply , 'candi/res_ver40_WAT.pickle')



3000it [00:27, 108.74it/s]


3000 만큼 했다


3000it [00:27, 110.55it/s]


6000 만큼 했다


3000it [00:27, 110.60it/s]


9000 만큼 했다


3000it [00:28, 104.25it/s]


12000 만큼 했다


3000it [00:27, 107.64it/s]


15000 만큼 했다


3000it [00:28, 107.04it/s]


18000 만큼 했다


3000it [00:27, 108.64it/s]


21000 만큼 했다


2015it [00:18, 108.23it/s]


24000 만큼 했다


In [24]:
train_SWAT = pd.merge( train_WAT , train_song , on = 'id' , how = 'inner' )
val_SWAT = pd.merge( val_WAT , val_song , on = 'id' , how = 'inner' )

sim_ply = dist_get_sim_ply( 0 ,  3000 , val_ply = val_SWAT , train_ply = train_SWAT , tn = 7)
save_to_pickle( sim_ply , 'candi/res_ver40_SWAT.pickle')

3000it [00:28, 106.77it/s]


3000 만큼 했다


3000it [00:30, 99.21it/s] 


6000 만큼 했다


3000it [00:29, 102.88it/s]


9000 만큼 했다


3000it [00:28, 105.52it/s]


12000 만큼 했다


3000it [00:28, 106.41it/s]


15000 만큼 했다


3000it [00:28, 104.77it/s]


18000 만큼 했다


3000it [00:28, 107.08it/s]


21000 만큼 했다


2015it [00:18, 107.86it/s]


24000 만큼 했다
