## Set up

In [None]:
import pandas as pd
import numpy as np
import json as js
import re
from bs4 import BeautifulSoup
from time import sleep
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

## Get the list of weeks to scrape

In [None]:
# Project root on the mounted Google Drive; every input/output path below is
# built from it. NOTE(review): absolute Colab path — adjust when running elsewhere.
curr_path = '/content/drive/MyDrive/Self-study/ML/melon/'

# Table of chart weeks; the scraping loop reads its first seven columns by
# position (age, year, mon, day, classCd, startDay, endDay).
date_df = pd.read_csv(f'{curr_path}melon_data/date.csv')

Calculate the sample size needed for a 99% confidence level with a 5% margin of error using Cochran's formula, $n = \frac{z^2 p q}{e^2}$. There are 922 weeks of charts; the population is the songs that appeared on the weekly top chart starting from 22/11/2004.

https://www.statisticshowto.com/probability-and-statistics/find-sample-size/

In [None]:
# Cochran's formula: n = z^2 * p * q / e^2
z, e = 2.576, 0.05   # z-score used for the 99% confidence level, 5% margin of error
p = q = 0.5          # proportion estimate and its complement
n = z**2 * p * q / e**2

In [None]:
# Display the sample size computed by Cochran's formula in the previous cell.
n

663.5776

The formula gives n ≈ 663.58, so we need at least 664 unique songs. To collect more than that, we scrape 25 randomly sampled weeks.

In [1]:
# Sample the 25 weeks to scrape from a fresh read of the week table instead of
# from `date_df` itself. The original overwrote its own input, so re-running the
# cell (or raising `n`) sampled from the already-sampled 25 rows. The fixed
# random_state keeps the selection reproducible.
date_df = pd.read_csv(curr_path + 'melon_data/date.csv').sample(n=25, random_state=1)

NameError: ignored

## Scrape Melon chart data

- highest chart ranking
- add metadata:
  - artist gender
  - playlist tag
  - weighted likes, shares, counts, etc. of the songs?
  - artist like counts
  - artist debut date
  - band or not?

Github link for melon api: https://github.com/ko28/melon-api

- Some issue dates are missing, mostly for old songs.
- Some songIds are missing.
- Only get songs from 2000 to 2022, because older songs may not be good training examples for new songs.
- Domestic comprehensive for 2000s

- class = 'rank'
- class="ellipsis rank01" => song title
- class="ellipsis rank02" => artist name (goArtistDetail -> id)
- data-song-no => songId
- class="ellipsis rank03" => album name (goAlbumDetail -> id)

(also have info about music video)

In [None]:
def get_song_id(temp_html):
    """Extract the Melon song id from one chart row.

    Reads the ``href`` of the first link inside the row's
    ``<div class="ellipsis rank01">`` and pulls the id out of a
    ``javascript:melon.play.playSong('19080101','<id>');`` call.

    Args:
        temp_html: Parsed chart row exposing BeautifulSoup's ``find`` API.

    Returns:
        The song id as a string of digits, or ``np.nan`` when the div, the
        link, its ``href`` or a matching id is missing.
    """
    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences (``\(``, ``\d``). Dots are escaped so they match literally,
    # and ``\d+`` avoids returning an empty id.
    pattern = r"javascript:melon\.play\.playSong\('19080101','(\d+)'\);"
    try:
        target = temp_html.find('div', {'class': 'ellipsis rank01'}).a['href']
        found = re.search(pattern, target)
    except (AttributeError, KeyError, TypeError):
        # Missing div / link / href. Only these lookup failures are treated as
        # "no id"; KeyboardInterrupt and other errors are no longer swallowed.
        return np.nan
    return found.group(1) if found else np.nan

def get_artist_id(temp_html):
    """Extract the Melon artist id from one chart row.

    Reads the ``href`` of the first link inside the row's
    ``<div class="ellipsis rank02">`` and pulls the id out of a
    ``javascript:melon.link.goArtistDetail('<id>');`` call.

    Args:
        temp_html: Parsed chart row exposing BeautifulSoup's ``find`` API.

    Returns:
        The artist id as a string of digits, or ``np.nan`` when the div, the
        link, its ``href`` or a matching id is missing.
    """
    # Dots are escaped so they match literally; ``\d+`` avoids an empty id.
    pattern = r"javascript:melon\.link\.goArtistDetail\('(\d+)'\);"
    try:
        target = temp_html.find('div', {'class': 'ellipsis rank02'}).a['href']
        found = re.search(pattern, target)
    except (AttributeError, KeyError, TypeError):
        # Missing div / link / href. Only these lookup failures are treated as
        # "no id"; KeyboardInterrupt and other errors are no longer swallowed.
        return np.nan
    return found.group(1) if found else np.nan

def get_alb_id(temp_html):
    """Extract the Melon album id from one chart row.

    Reads the ``href`` of the first link inside the row's
    ``<div class="ellipsis rank03">`` and pulls the id out of a
    ``javascript:melon.link.goAlbumDetail('<id>');`` call.

    Args:
        temp_html: Parsed chart row exposing BeautifulSoup's ``find`` API.

    Returns:
        The album id as a string of digits, or ``np.nan`` when the div, the
        link, its ``href`` or a matching id is missing.
    """
    # Dots are escaped so they match literally; ``\d+`` avoids an empty id.
    pattern = r"javascript:melon\.link\.goAlbumDetail\('(\d+)'\);"
    try:
        target = temp_html.find('div', {'class': 'ellipsis rank03'}).a['href']
        found = re.search(pattern, target)
    except (AttributeError, KeyError, TypeError):
        # Missing div / link / href. Only these lookup failures are treated as
        # "no id"; KeyboardInterrupt and other errors are no longer swallowed.
        return np.nan
    return found.group(1) if found else np.nan

In [None]:
# Resolve a chromedriver binary through webdriver_manager, then start Chrome.
# NOTE(review): the driver path is passed positionally; newer Selenium releases
# may expect a Service object instead — confirm against the installed version.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(chromedriver_path)

# Open the Melon home page once before requesting chart pages.
driver.get("https://www.melon.com/")

In [None]:
# URL template of the weekly chart search page; the placeholders are filled
# with the arguments of `save_chart`.
main_url = 'https://www.melon.com/chart/search/list.htm?chartType=WE&age={age}&year={year}&mon={mon}&day={day}&classCd={classCd}&startDay={startDay}&endDay={endDay}&moved=Y'


def _anchor_text(row, css_class, strip=False):
    """Return the text of the first link inside the row's div with class `css_class`.

    Returns ``np.nan`` when the div or the link is missing. When `strip` is
    true, surrounding whitespace is removed from the text.
    """
    cell = row.find('div', {'class': css_class})
    link = cell.a if cell is not None else None
    if link is None:
        return np.nan
    return link.text.strip() if strip else link.text


def _rank_text(row):
    """Return the text of the row's rank span, or ``np.nan`` if there is none.

    A span with class 'rank top' is preferred, then one with class 'rank'.
    """
    for css_class in ('rank top', 'rank'):
        span = row.find('span', {'class': css_class})
        if span is not None:
            return span.text
    # Previously a row without any rank span raised AttributeError and aborted
    # the whole chart; it is now recorded as missing like the other fields.
    return np.nan


def save_chart(age, year, mon, day, classCd, startDay, endDay):
    """Scrape one weekly chart page and save it as a CSV file.

    Loads the page built from `main_url` in the module-level Selenium `driver`,
    parses every ``<tr class="lst50">`` and ``<tr class="lst100">`` row of the
    first ``<tbody>`` and writes one CSV row per chart entry to
    ``<curr_path>melon_data/chart/<startDay>.csv``.

    Args:
        age, year, mon, day, classCd, startDay, endDay: Values substituted into
            the query parameters of the same names in `main_url`. `startDay`
            also names the output file.

    Raises:
        ValueError: If the loaded page contains no ``<tbody>`` (previously an
            opaque ``AttributeError`` on ``None``).
    """
    sub_url = main_url.format(age=age, year=year, mon=mon, day=day,
                              classCd=classCd, startDay=startDay, endDay=endDay)
    driver.get(sub_url)
    bs = BeautifulSoup(driver.page_source, 'html.parser')

    tbody = bs.tbody
    if tbody is None:
        raise ValueError('No chart table (<tbody>) found at ' + sub_url)

    # Rows classed 'lst50' come first, followed by rows classed 'lst100'.
    rows = tbody.find_all('tr', {'class': 'lst50'}) + tbody.find_all('tr', {'class': 'lst100'})

    # One record per chart row; fields that cannot be found are stored as NaN.
    chart_list = [
        {
            'rank': _rank_text(row),
            'song_name': _anchor_text(row, 'wrap_song_info'),
            'song_id': get_song_id(row),
            'artist_name': _anchor_text(row, 'ellipsis rank02', strip=True),
            'artist_id': get_artist_id(row),
            'alb_name': _anchor_text(row, 'ellipsis rank03', strip=True),
            'alb_id': get_alb_id(row),
        }
        for row in rows
    ]

    file_path = (curr_path + 'melon_data/chart/{startDay}.csv').format(startDay=startDay)
    chart_df = pd.DataFrame(chart_list)
    chart_df.to_csv(file_path)

In [None]:
# Scrape and save the chart of every selected week.
# `itertuples` yields plain tuples, so values are taken by position without the
# deprecated integer-key lookup on a label-indexed Series, and each column
# keeps its own dtype instead of being coerced into one per-row Series dtype.
for row_values in date_df.itertuples(index=False, name=None):
    # The first seven columns are, in order, the arguments of `save_chart`.
    age, year, mon, day, classCd, startDay, endDay = row_values[:7]
    save_chart(age, year, mon, day, classCd, startDay, endDay)
    sleep(2)  # wait two seconds before loading the next chart page