## Set Up

In [None]:
import pandas as pd
import json as js 
import numpy as np
import re
import requests 
from bs4 import BeautifulSoup
import time

## Load Datasets

In [None]:
# Root folder of the project (the path looks like a Colab Google Drive mount);
# every data path in this notebook is built by appending to this string.
curr_path = '/content/drive/MyDrive/Self-study/ML/melon/'

# Song metadata is read from JSON, the table of chart dates from CSV; both live under melon_data/.
song_meta = pd.read_json(f'{curr_path}melon_data/song_meta.json')
date_df = pd.read_csv(f'{curr_path}melon_data/date.csv')

## Get `is_top` 

In [None]:
#get `issue_year`
# The year is the first four characters of the date's string form;
# an `issue_date` of 0 is mapped to NaN instead.
issue_year_list = [
    np.nan if date == 0 else int(str(date)[:4])
    for date in song_meta['issue_date']
]

song_meta['issue_year'] = issue_year_list

In [None]:
#remove song with various artists
# Keep only rows whose `artist_id_basket` holds fewer than two ids (zero or one artist).
# A boolean mask replaces the manual position counter and avoids rebinding the
# builtin `id` as a loop variable.
single_artist_mask = song_meta['artist_id_basket'].map(len) < 2

# .copy() makes the filtered frame independent of the frame it was sliced from,
# so adding or renaming columns on it later does not raise SettingWithCopyWarning.
song_meta = song_meta[single_artist_mask].copy()

In [None]:
#remove [] in `artist_id_basket`, `artist_name_basket`, and `song_gn_gnr_basket`

def _first_or_nan(basket):
    """Return the first element of `basket`, or NaN when it is empty or not indexable."""
    try:
        return basket[0]
    except (IndexError, TypeError):
        # IndexError: empty basket; TypeError: value that cannot be indexed (e.g. NaN/None).
        return np.nan


# The baskets are addressed by column name rather than by integer position inside
# each row: integer keys on a label-indexed Series are deprecated, and the former
# bare `except` would have silently turned such a failure into all-NaN columns.
# Building the three columns with `assign` returns a new frame, so nothing is
# written into a possibly sliced `song_meta`.
song_meta = (
    song_meta
    .assign(
        artist_id = [_first_or_nan(basket) for basket in song_meta['artist_id_basket']],
        song_gn_gnr = [_first_or_nan(basket) for basket in song_meta['song_gn_gnr_basket']],
        artist_name = [_first_or_nan(basket) for basket in song_meta['artist_name_basket']],
    )
    # The list-valued source columns are no longer needed once unwrapped.
    .drop(columns = ['artist_id_basket', 'artist_name_basket', 'song_gn_gnr_basket'])
)

In [None]:
# read all charts file
# The value at column position 5 of `date_df` is what each chart CSV is named after.
# `.iloc` selects that column by position explicitly; indexing a row Series with a
# bare integer (`row_series[5]`) relies on a deprecated positional fallback.
# The last entry is left out, as before — presumably there is no chart file for it
# (TODO confirm against the contents of melon_data/chart/).
startDay_list = date_df.iloc[:-1, 5].tolist()

file_path_list = [curr_path + 'melon_data/chart/{startDay}.csv'.format(startDay = day) for day in startDay_list]

In [None]:
#concatenate all charts
# Empty frame that lists the expected chart columns; putting it first in the
# concatenation keeps these columns at the front of the result, in this order.
chart_columns = ['rank', 'song_name', 'song_id', 'artist_name', 'artist_id', 'alb_name', 'alb_id']
empty_chart_df = pd.DataFrame({column: [] for column in chart_columns})

# DataFrame.append was removed in pandas 2.0, and appending inside a loop re-copies
# the accumulated frame on every iteration. Read every file once, concatenate once.
weekly_chart_dfs = [pd.read_csv(file_path) for file_path in file_path_list]
all_charts_df = pd.concat([empty_chart_df, *weekly_chart_dfs], ignore_index = True)

In [None]:
#one row per distinct `song_id`: the first chart appearance of each song is kept,
#which gives the list of unique songs that were ever on the chart
repeated_song = all_charts_df.duplicated(subset = ['song_id'], keep = 'first')
unique_songs_df = all_charts_df[~repeated_song]
unique_songs_df

Unnamed: 0.1,rank,song_name,song_id,artist_name,artist_id,alb_name,alb_id,Unnamed: 0
0,1.0,당신의 밤 (Feat. 오혁),30179089.0,황광희 X 개코,1285544.0,무한도전 위대한 유산,10027428.0,0.0
1,2.0,에라 모르겠다,30147445.0,BIGBANG (빅뱅),198094.0,MADE,10022709.0,1.0
2,3.0,Beautiful,30157753.0,Crush,674710.0,도깨비 OST Part 4,10024106.0,2.0
3,4.0,좋다고 말해,30163110.0,볼빨간사춘기,792022.0,Full Album RED PLANET 'Hidden Track',10024816.0,3.0
4,5.0,Stay With Me,30132687.0,찬열 (CHANYEOL),672857.0,도깨비 OST Part.1,10020654.0,4.0
...,...,...,...,...,...,...,...,...
27992,96.0,인생찬가,35008534.0,임영웅,994944.0,IM HERO,10923444.0,95.0
27996,100.0,사랑역,35008529.0,임영웅,994944.0,IM HERO,10923444.0,99.0
28063,67.0,팡파레,35145136.0,다비치,236815.0,Season Note,10955743.0,66.0
28080,84.0,미친 것처럼,35126568.0,V.O.S,108794.0,아픔을 말하는,10954134.0,83.0


In [None]:
#merge the song metadata with the unique songs dataset.
#NaN on the right (chart columns) means that the song has never been on the top weekly chart.
#NaN on the left (metadata columns) means that the charted song has no entry in the song metadata.

# Align the metadata column names with the chart column names so they can serve as merge keys.
# The renamed frame is assigned back instead of using inplace=True: an in-place rename on a
# frame that was produced by slicing can emit pandas' SettingWithCopyWarning.
song_meta = song_meta.rename(columns = {'album_id': 'alb_id', 'album_name': 'alb_name'})

# Outer merge keeps songs that appear on only one side; the merge result is a new frame
# stored in main_df.
main_df = song_meta.merge(unique_songs_df, how = 'outer', on = ['song_name', 'artist_id', 'alb_id'])

In [None]:
# Release-year distribution of the songs that have a chart rank (i.e. were on the chart).
has_rank = ~np.isnan(main_df['rank'])
main_df.loc[has_rank, 'issue_year'].value_counts()

2017.0    383
2019.0    353
2018.0    328
2016.0     94
2020.0     83
2014.0      9
2015.0      5
2011.0      5
2013.0      5
2012.0      4
2005.0      2
2007.0      1
2004.0      1
2010.0      1
1995.0      1
1998.0      1
2008.0      1
2006.0      1
Name: issue_year, dtype: int64

After merging the song metadata with the charts from 2017 to 5/2022, it turns out that the songs that reached the top were mostly released in 2016-2020. This makes sense because the charts cover 2017-5/2022. If songs released outside 2016-2020 were also taken into consideration, there would be a serious problem of class imbalance and irrelevant data, since it is really uncommon for a song released in, say, 2002 to make it onto the 2017 charts ⇒ trade-off: remove every song not released in 2016-2020.


There are many songs that made it to the weekly top 100 but don't have metadata. However, we can't extract metadata for them separately, because doing so would create a bias with respect to the songs that couldn't make it to the weekly top 100. Therefore, we have to eliminate those songs.

In [None]:
#create `is_top`

# A song counts as "top" when the merge attached a chart rank to it.
on_chart = ~np.isnan(main_df['rank'])

main_df = (
    main_df
    .assign(is_top = on_chart)
    # chart-side duplicates of the metadata columns and the stray CSV index column
    .drop(columns = ['rank', 'artist_name_y', 'alb_name_y', 'Unnamed: 0'])
    # rows without an issue date are discarded
    .dropna(subset = ['issue_date'])
    .rename(columns = {'alb_name_x':'alb_name', 'artist_name_x':'artist_name'})
)

# Keep only songs issued from 2016 through 2020 (both ends included).
main_df = main_df[main_df['issue_year'].between(2016, 2020)]
main_df

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,alb_name,alb_id,song_name,id,issue_year,artist_id,song_gn_gnr,artist_name,song_id,is_top
2,[GN0901],20180518.0,Hit,4698747.0,Solsbury Hill (Remastered 2002),2.0,2018.0,3361.0,GN0900,Peter Gabriel,,False
7,"[GN0105, GN0101]",20170320.0,Pastel Reflection,10047088.0,"사랑, 그대라는 멜로디",9.0,2017.0,753752.0,GN0100,진호,,False
8,[GN1201],20170407.0,Luv.Loops,10053652.0,Hi (Heyoo),10.0,2017.0,1625859.0,GN1200,Miraa.,,False
11,"[GN1701, GN1703]",20161117.0,Groove It,10015809.0,Voyage,13.0,2016.0,1221039.0,GN1700,JBeeS Jazz Bigband,,False
14,[GN0901],20191023.0,Earth Glow,10341972.0,Can&#39;t Stand Still,16.0,2019.0,896417.0,GN0900,Ruelle,,False
...,...,...,...,...,...,...,...,...,...,...,...,...
658427,"[GN1301, GN1302]",20181018.0,Colors Compilation,2692383.0,Everybody (Just Bounce),707976.0,2018.0,967093.0,GN1300,Vbnd,,False
658428,"[GN0908, GN0901]",20191025.0,My Blood,10343254.0,My Blood,707978.0,2019.0,100377.0,GN0900,Westlife,,False
658430,"[GN0401, GN0403]",20171202.0,Mirrorball,10116357.0,Mirrorball (Feat. YunB),707980.0,2017.0,924416.0,GN0400,SUMIN (수민),,False
658431,"[GN2207, GN1501, GN1506, GN1509]",20160601.0,생일왕국의 프린세스 프링 OST1 : 프린세스 프링의 초대,2688257.0,생일축하노래,707981.0,2016.0,889414.0,GN1500,ButterFly,,False


In [None]:
# Persist the labelled dataset as CSV text; the target file name has no extension,
# and the frame's index is written as the first column (pandas default).
main_df_path = f'{curr_path}melon_data/main_df'
main_df.to_csv(main_df_path)