In [15]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import os
from time import sleep
from tqdm.auto import tqdm
import random
import pandas as pd

In [16]:
def TOP_MUSIC_URL(year_param, page_param, pop):
  '''
  Description
  -----------
  Build the melon.com yearly chart URL for one year, page offset and genre.

  Parameters
  ----------
  year_param : chart year, inserted as the `chartDate` query value
  page_param : value inserted as `params[idx]` in the URL fragment
  pop : genre code inserted as the `chartGenre` query value

  Returns
  -------
  str : the formatted URL
  '''
  # `%5B` / `%5D` are the percent-encoded "[" and "]" of `params[idx]`.
  return f"https://www.melon.com/chart/age/index.htm?chartType=YE&chartGenre={pop}&chartDate={year_param}#params%5Bidx%5D={page_param}"

def random_sleep(t_range=(2, 5)):
  '''
  Description
  -----------
  Pause execution for a random duration drawn uniformly from `t_range`.

  Parameters
  ----------
  t_range : two bounds, in seconds, passed to `random.uniform`

  Returns
  -------
  None
  '''
  sleep(random.uniform(*t_range))

def get_music_info(tr_web_element, year=None)->dict:
  '''
  Description
  -----------
  Extract the cover image URL, song title and artist from one song row
  of the melon chart table and return them as a dict.

  Parameters
  ----------
  tr_web_element : webdriver element for a single song row of the melon chart
  year : year whose top-music chart contained the song; stored unchanged

  Returns
  -------
  song info (dict) : {
    "image_url" : `src` attribute of the row's image,
    "artist" : text of the artist link,
    "title" : `title` attribute of the song link,
    "top_year" : the `year` argument
  }
  '''
  def locate(css_selector):
    # Look up a single descendant of the row by CSS selector.
    return tr_web_element.find_element(By.CSS_SELECTOR, css_selector)

  cover_src = locate("td > .wrap > a > img").get_attribute("src")
  song_title = locate(".t_left > .wrap > .wrap_song_info a").get_attribute("title")
  artist_name = locate(".t_left > .wrap > .wrap_song_info .ellipsis.rank02 > a").text
  # Key order is kept as-is because it becomes the CSV column order downstream.
  return dict(image_url=cover_src, artist=artist_name, title=song_title, top_year=year)

In [10]:
# Chart years to scrape: 2000 through 2023 (the range end is exclusive)
period = range(2000, 2024)

options = Options()
options.add_argument("--headless")  # run Chrome in headless mode
driver = webdriver.Chrome(options=options)

### ________________________ ###

# Lists holding every parsed song, one per chart genre
music_data_kpop = list()  # domestic chart (KPOP)
music_data_pop = list()   # overseas chart (POP)

# (genre code used in the URL, accumulator list, output CSV).
# The CSV names are the ones the merge cell at the end of the notebook reads;
# the POP result used to be written to "test.csv", which nothing reads.
scrape_targets = [
  ("KPOP", music_data_kpop, "melon_top_music_kpop.csv"),
  ("POP", music_data_pop, "melon_top_music_pop.csv"),
]

# try/finally so the browser is shut down even when a page fails to load or parse
try:
  driver.get('https://www.melon.com/')

  for genre, music_data, csv_path in scrape_targets:
    # Visit a different URL for every (year, page offset) pair
    for year_param in tqdm(period, desc=genre):
      for page_param in [1, 51]:
        random_sleep()
        url2parsing = TOP_MUSIC_URL(year_param, page_param, genre)
        driver.get(url2parsing)
        random_sleep()
        # Collect the song rows: offset 1 keeps the first 50 rows, offset 51 the rest
        table_rows = driver.find_elements(By.CSS_SELECTOR, '#frm > table > tbody > tr')
        music_web_elements = table_rows[:50] if page_param == 1 else table_rows[50:]

        random_sleep()
        music_info = [get_music_info(web_ele, year_param) for web_ele in music_web_elements]
        music_data.extend(music_info)

    # Save this genre as CSV, dropping exact duplicate rows
    pd.DataFrame(music_data).drop_duplicates().to_csv(csv_path, index=False)
finally:
  driver.quit()

  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

In [17]:
# Chart years to scrape: 1980 through 1999 (the range end is exclusive)
period = range(1980, 2000)

options = Options()
#options.add_argument("--headless")  # headless mode (disabled here, so a browser window opens)
driver = webdriver.Chrome(options=options)

### ________________________ ###

# Lists holding every parsed song, one per chart genre
music_data_kpop = list()  # domestic chart (KPOP)
music_data_pop = list()   # overseas chart (POP)

# (genre code used in the URL, accumulator list, output CSV)
scrape_targets = [
  ("KPOP", music_data_kpop, "kpop_test.csv"),
  ("POP", music_data_pop, "pop_test.csv"),
]

# try/finally so the browser is shut down even when the run is interrupted
try:
  driver.get('https://www.melon.com/')

  for genre, music_data, csv_path in scrape_targets:
    # Visit a different URL for every (year, page offset) pair
    for year_param in tqdm(period, desc=genre):
      for page_param in [1, 51]:
        random_sleep()
        url2parsing = TOP_MUSIC_URL(year_param, page_param, genre)
        try:
          driver.get(url2parsing)
          random_sleep()
          # Collect the song rows: offset 1 keeps the first 50 rows, offset 51 the rest
          table_rows = driver.find_elements(By.CSS_SELECTOR, '#frm > table > tbody > tr')
          music_web_elements = table_rows[:50] if page_param == 1 else table_rows[50:]

          random_sleep()
          music_info = [get_music_info(web_ele, year_param) for web_ele in music_web_elements]
          # Extend only after the whole page parsed, so a failing page adds nothing
          music_data.extend(music_info)
        except Exception as err:
          # Best-effort scrape: skip a page that fails, but report it instead of
          # hiding it. `Exception` (not a bare except) lets Ctrl-C stop the run.
          # NOTE(review): presumably older charts lack some rows/fields - confirm.
          print(f"[skip] {genre} {year_param} (idx={page_param}): {err!r}")

    # Save this genre as CSV, dropping exact duplicate rows
    pd.DataFrame(music_data).drop_duplicates().to_csv(csv_path, index=False)
finally:
  driver.quit()

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

In [36]:
# Load each scraped chart and tag it with its genre label ("kpop" / "pop").
# "class" is a Python keyword, so the column name is passed through ** unpacking.
y2000_kpop = pd.read_csv('melon_top_music_kpop.csv').assign(**{"class": "kpop"})
y2000_pop = pd.read_csv('melon_top_music_pop.csv').assign(**{"class": "pop"})
y1980_kpop = pd.read_csv('kpop_test.csv').assign(**{"class": "kpop"})
y1980_pop = pd.read_csv('pop_test.csv').assign(**{"class": "pop"})

# Stack the four frames row-wise (1980s files first) and write the merged metadata
top_music_meta = pd.concat([y1980_kpop, y1980_pop, y2000_kpop, y2000_pop], axis=0)
top_music_meta.to_csv("top_music_meta.csv", index=False)