In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import json, os, io, sys, gc, time, copy, random, warnings
warnings.filterwarnings("ignore")
import distutils.dir_util
import numpy as np
import pandas as pd
from collections import Counter
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

def load_json(fname):
    """Read the UTF-8 encoded JSON file at `fname` and return the parsed object."""
    with open(fname, encoding="utf-8") as handle:
        return json.loads(handle.read())

def write_json(data, fname):
    """Serialize `data` as UTF-8 JSON into the file `fname`.

    Missing parent directories of `fname` are created first. Non-ASCII text is
    written verbatim (``ensure_ascii=False``). NumPy integers, floats, booleans
    and arrays inside `data` are converted to their native Python equivalents.

    Raises:
        TypeError: if `data` contains any other object json cannot serialize.
    """
    def _conv(o):
        # json only calls this hook for objects it cannot serialize natively.
        if isinstance(o, np.integer):
            return int(o)
        if isinstance(o, np.floating):
            return float(o)
        if isinstance(o, np.bool_):
            return bool(o)
        if isinstance(o, np.ndarray):
            return o.tolist()
        raise TypeError(
            "Object of type {} is not JSON serializable".format(type(o).__name__)
        )

    # os.makedirs replaces distutils.dir_util.mkpath: distutils is removed in
    # Python 3.12, and mkpath caches created paths so it will not recreate a
    # directory that was deleted after an earlier call.
    parent = os.path.dirname(fname)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(fname, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, default=_conv)
        
def check_target_type(df):
    """Partition the row labels of `df` by whether 'songs' and 'tags' are empty.

    Each row is classified by the truthiness of its 'songs' and 'tags' values
    (an empty list counts as "missing"). The four group sizes are printed.

    Args:
        df: DataFrame with 'songs' and 'tags' columns.

    Returns:
        Tuple ``(no_tag_idx, no_song_idx, yes_all_idx, no_both_idx)`` of lists
        of index labels, in row order:
          * no_tag_idx  - songs present, tags missing
          * no_song_idx - songs missing, tags present
          * yes_all_idx - both present
          * no_both_idx - both missing
    """
    no_song_idx = []
    no_tag_idx = []
    no_both_idx = []
    yes_all_idx = []
    # One pass over the two columns: avoids a label-slice lookup per test and
    # classifies every row by its own values even when index labels repeat.
    for label, songs, tags in zip(df.index, df['songs'], df['tags']):
        has_songs = bool(songs)
        has_tags = bool(tags)
        if has_songs and not has_tags:
            # v1: songs present, tags missing
            no_tag_idx.append(label)
        elif has_tags and not has_songs:
            # v2: songs missing, tags present
            no_song_idx.append(label)
        elif has_songs and has_tags:
            # v3: songs present, tags present
            yes_all_idx.append(label)
        else:
            # v4: songs missing, tags missing
            no_both_idx.append(label)
    print("노래 O 태그 X : {}개".format(len(no_tag_idx)))
    print("노래 X 태그 O : {}개".format(len(no_song_idx)))
    print("노래 O 태그 O : {}개".format(len(yes_all_idx)))
    print("노래 X 태그 X : {}개".format(len(no_both_idx)))
    return no_tag_idx, no_song_idx, yes_all_idx, no_both_idx

def most_popular(playlists, col, topk_count):
    """Count every item in the collections stored under `playlists[col]`.

    Returns a tuple of (the full Counter, the `topk_count` most frequent items
    in descending order of frequency).
    """
    counts = Counter()
    for items in playlists[col]:
        counts.update(items)
    top_items = [item for item, _freq in counts.most_common(topk_count)]
    return counts, top_items

def remove_seen(seen, l):
    """Return the elements of `l`, in order, that do not occur in `seen`.

    Used to drop songs/tags a playlist already contains from a candidate list.
    """
    excluded = set(seen)
    return list(filter(lambda item: item not in excluded, l))

In [None]:
# data_path = '../../../1_melon_playlist/data/'   # local data directory
data_path = '/kaggle/input/kakao-arena-melon/'
data_path_new = '/kaggle/input/kakao-save-data/'

# Playlist splits are read as DataFrames; the song metadata stays a plain JSON object.
train, val, test = (
    pd.read_json(data_path + split_file)
    for split_file in ('train.json', 'val.json', 'test.json')
)
song_meta = load_json(data_path + 'song_meta.json')
# genre_gn_all = pd.read_json(data_path+'genre_gn_all.json', typ = 'series')

### Load tag_song_dict / song_tag_dict, which were produced by make_file.ipynb
tag_song_dict = load_json(data_path_new + 'tag_song_dict_no0.json')
song_tag_dict = load_json(data_path_new + 'song_tag_dict.json')

In [None]:
# Label the training rows by playlist id (the 'id' column itself is kept), then preview.
train = train.set_index('id', drop=False)
train.head()

In [None]:
# Map each training row label to that row's 'songs' value.
# Pairing the index with the column directly avoids building a full row Series
# per playlist via `train.loc[i]`, and always stores the row's own value even
# if a label were repeated.
plylst_song_dict = dict(tqdm(zip(train.index, train['songs']), total=len(train)))

In [None]:
def flatten(df_col):
    """Concatenate the nested lists held in a pandas column into one flat list."""
    merged = []
    for inner in df_col.values.tolist():
        merged.extend(inner)
    return merged

In [None]:
# onehot_dict = {}
# for i in train.index:
#     onehot_dict[i] = [1 if song in train.loc[i]['songs'] else 0 for song in tqdm(song_list)]

# Count how many times each song id occurs across the train, val and test playlists.
song_counter = Counter()
for split_frame in (train, val, test):
    song_counter.update(flatten(split_frame['songs']))
# song_counter.most_common(20)
# song_counter[144663]
len(song_counter)

In [None]:
# Box plot of the per-song occurrence counts.
fig, ax = plt.subplots()
ax.boxplot(list(song_counter.values()))
plt.show()

In [None]:
# Keep only the songs whose occurrence count is strictly above the 75th percentile.
thres = np.quantile(list(song_counter.values()), 0.75)
song_list = [song_id for song_id, n_seen in song_counter.items() if n_seen > thres]
len(song_list)  ### the number of songs drops from 649091 to 156348

In [None]:
def make_before_pivot(data, allowed_songs=None):
    """Build a long-format (playlist id, song id, point) frame from playlists.

    Args:
        data: DataFrame with an 'id' column and a 'songs' column whose values
            are iterables of song ids.
        allowed_songs: optional iterable of song ids to keep. Defaults to the
            notebook-level `song_list`.

    Returns:
        DataFrame with columns ['id', 'song', 'point'], one row per
        (playlist, kept song) in input order; 'point' is always 1. The columns
        are present even when no song is kept.
    """
    if allowed_songs is None:
        allowed_songs = song_list
    # A set makes each membership test O(1); scanning the list for every song
    # of every playlist is prohibitively slow at this data size.
    allowed = set(allowed_songs)
    total = len(data)
    result = []
    for cnt, (plylst_id, songs) in enumerate(zip(data['id'], data['songs'])):
        if cnt % 10000 == 0:
            print('{} / {}'.format(cnt, total))
        result.extend(
            {'id': plylst_id, 'song': song, 'point': 1}
            for song in songs
            if song in allowed
        )
    return pd.DataFrame(result, columns=['id', 'song', 'point'])

# Build the long-format (id, song, point) frame for each split, in train/val/test order,
# then stack them into a single frame.
before_pivot_train, before_pivot_val, before_pivot_test = (
    make_before_pivot(split_frame) for split_frame in (train, val, test)
)

before_tmp = pd.concat([before_pivot_train, before_pivot_val, before_pivot_test])

In [None]:
def make_cosine_pred(before_tmp, n_neighbors=200, top_k=200, popularity_power=0.4):
    """Rank candidate songs per playlist with playlist-to-playlist cosine similarity.

    The long-format frame is pivoted into a playlist x song 0/1 matrix. For each
    playlist, the most similar playlists (non-zero cosine similarity) vote for
    their songs with their similarity as weight; each song's score is divided by
    ``(song_counter[song] - 1) ** popularity_power + 1`` and by the number of
    neighbours. Songs already in the playlist are pushed to the bottom.

    NOTE(review): the original loop passed the whole similarity matrix to
    `sparse_argsort`, referenced an undefined `song`, and never used the
    neighbours' rows, so it could not run. The scoring above is the reviewer's
    reading of the intended formula - confirm.

    Args:
        before_tmp: DataFrame with 'id', 'song' and 'point' columns.
        n_neighbors: neighbours considered per playlist. One extra slot is taken
            because the playlist itself is normally its own closest match.
        top_k: number of ranked songs kept per playlist.
        popularity_power: exponent of the song-popularity damping term.

    Returns:
        dict mapping playlist id -> list of up to `top_k` song ids, best first.
    """
    def sparse_argsort(arr):
        # Positions of the non-zero entries of a 1-D array, in ascending value order.
        indices = np.nonzero(arr)[0]
        return indices[np.argsort(arr[indices])]

    # pivot() raises on a repeated (id, song) pair, so collapse repeats first.
    pairs = before_tmp.drop_duplicates(subset=['id', 'song'])
    R_df = pairs.pivot(index='id', columns='song', values='point').fillna(0)
    R = R_df.values
    # NOTE(review): both R and the similarity matrix are dense; memory grows
    # with playlists x songs and playlists x playlists respectively.
    cosine_array = cosine_similarity(R_df, R_df)

    # Per-song popularity damping, computed once. max(..., 0) keeps the base
    # non-negative for a song missing from song_counter (Counter returns 0).
    song_penalty = np.array(
        [max(song_counter[song] - 1, 0) ** popularity_power + 1 for song in R_df.columns],
        dtype=float,
    )

    predicted_array = np.zeros(shape=R.shape)
    for i in tqdm(range(len(cosine_array))):
        neighbours = sparse_argsort(cosine_array[i])[-(n_neighbors + 1):][::-1]
        if len(neighbours) == 0:
            continue
        # Similarity-weighted vote of the neighbours for every song at once.
        weighted_sum = cosine_array[i, neighbours] @ R[neighbours]
        predicted_array[i] = weighted_sum / song_penalty / len(neighbours)

    # The large negative offset sinks songs the playlist already contains.
    iu_predicted = R * (-99999) + predicted_array

    ##### Build the dict of top-ranked songs per playlist id
    print("make dic data")
    cf_dic = {}
    for i in range(len(iu_predicted)):
        ranked = iu_predicted[i].argsort()[-top_k:][::-1]
        cf_dic[R_df.index[i]] = R_df.columns[ranked].tolist()
    return cf_dic

In [None]:
# Build the per-playlist candidate song lists. The helper is defined as
# `make_cosine_pred`; the previous name `make_cosine_predict` does not exist.
cf_dic = make_cosine_pred(before_tmp)

In [None]:
# Split the validation playlists by which fields are present, and keep the
# ones that have songs but no tags.
no_tag, no_song, yes_index, no_both = check_target_type(val)
val1 = val.loc[val.index.isin(no_tag)]

In [None]:
# Overall popularity rankings, used to pad a prediction that comes up short.
# Assumes `train` has the same 'songs' / 'tags' columns as `val` - TODO confirm.
popular_songs_all = most_popular(train, 'songs', 300)[1]
popular_tags_all = most_popular(train, 'tags', 100)[1]


def _fallback_tags(row):
    """Return the tag list used to pad a playlist's predicted tags.

    Neither `popular_year` nor a 'year' column is created anywhere in this
    notebook, so the per-year table is used only when an earlier cell provided
    both; otherwise the overall most popular training tags are returned.
    """
    year_table = globals().get('popular_year')
    if year_table is not None and 'year' in row.index:
        try:
            return list(year_table[row['year']]['tags'])
        except KeyError:
            pass
    return popular_tags_all


val1_predict = []
for _label, row in tqdm(val1.iterrows(), total=len(val1)):
    seen_songs = row['songs']

    ##### fill songs
    # A playlist with no song in `song_list` has no entry in cf_dic, so fall
    # back to an empty candidate list instead of raising KeyError.
    cur_song = remove_seen(seen_songs, list(cf_dic.get(row['id'], [])))[:100]
    if len(cur_song) < 100:
        padding = remove_seen(list(seen_songs) + cur_song, popular_songs_all)
        cur_song = (cur_song + padding)[:100]

    ##### fill tags
    # Count the tags attached to the playlist's songs (song_tag_dict keys are strings).
    tag_counter = Counter()
    for song in seen_songs:
        for tag in song_tag_dict.get(str(song), []):
            tag_counter[tag] += 1
    cur_tag = [tag for tag, _cnt in tag_counter.most_common(10)]
    if len(cur_tag) < 10:
        cur_tag = (cur_tag + remove_seen(cur_tag, _fallback_tags(row)))[:10]

    ### append to val1_predict (the submission output)
    val1_predict.append({
        "id" : row['id'],
        "songs": cur_song,
        "tags": cur_tag,
    })

In [None]:
# /kaggle/input is read-only, so save the predictions under /kaggle/working/,
# which is preserved as notebook output.
write_json(val1_predict, '/kaggle/working/val1_weighted_cosine_sim.json')