# Gaon Chart
### 2021 Album Chart Web Scraping
### Data from: *http://gaonchart.co.kr/*

---

### Importing libraries


In [1]:
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd

### Pulling data using BeautifulSoup

In [2]:
# Monthly album chart page for the first month of 2021
# (targetTime=01, hitYear=2021, termGbn=month).
URL = 'http://www.gaonchart.co.kr/main/section/chart/album.gaon?nationGbn=T&serviceGbn=&targetTime=01&hitYear=2021&termGbn=month'

# Browser-like headers sent verbatim with the request.
# NOTE(review): advertising "br" presumably relies on a brotli decoder being
# installed alongside requests/urllib3 -- confirm in the target environment.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

# requests waits indefinitely unless a timeout is given, so bound the wait;
# then fail loudly on a 4xx/5xx status so an error page is never parsed as
# if it were chart data.
page = requests.get(URL, headers=headers, timeout=30)
page.raise_for_status()

# Parse the raw bytes, then re-parse the prettified markup. The second pass is
# kept as-is because later cells read text via get_text().strip() from this
# re-parsed tree.
soup = BeautifulSoup(page.content, "html.parser")
soup = BeautifulSoup(soup.prettify(), "html.parser")
soup

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="http://gaonchart.co.kr" property="og:url"/>
<meta content="국내 대표 음악 차트 가온차트!" property="og:title"/>
<meta content="대한민국 유료 음악 회원 2,000만명의 선택으로 만들어지는 국내 최초 공인 음악차트 '가온차트' " property="og:description"/>
<meta content="http://gaonchart.co.kr/common/img/gaon_meta_main_img.jpg" property="og:image"/>
<meta content="website" property="og:type"/>
<title>
   gaon chart
  </title>
<link href="/css/gaon.css?ver=20180419" rel="stylesheet" type="text/css"/>
<link href="/se/css/smart_editor2.css" rel="stylesheet" type="text/css"/>
<link href="/se/css/smart_editor2_in.css" rel="stylesheet" type="text/css"/>
<link href="/se/css/smart_editor2_items.css" rel="stylesheet" type="text/css"/>
<link href="/se/css/smart_editor2_out.css" rel="stylesheet" typ

### Storing data in dictionary

In [17]:
items = soup.find_all('tr')[1:101] # all albums and album information

# The chart period belongs to the page, not to a row, so read it once
# instead of repeating the lookup for every album.
date = soup.find('div', class_="fl").get_text().strip()
chart_month = int(date.split(" ")[1][:2])

for item in items:
    album = {}
    album["album_rank"] = int(item.find("td", class_="ranking").get_text().strip())

    # Rank movement: the <span> class gives the direction; "up"/"down" cells
    # also carry the number of places moved.
    change = item.find("td", class_="change")
    try:
        album["rank_change"] = change.span['class'][0]
        if album["rank_change"] == "up":
            album["rank_difference"] = int(change.get_text().strip())
        elif album["rank_change"] == "down":
            album["rank_difference"] = int(change.get_text().strip())*-1
        else:
            album["rank_difference"] = None
    except (TypeError, KeyError, IndexError, ValueError):
        # Only the expected parsing failures are handled (no <span>, no class
        # attribute, empty class list, non-numeric text); anything else,
        # including KeyboardInterrupt, now propagates instead of being hidden.
        album["rank_change"] = change.get_text().strip()
        album["rank_difference"] = None

    album["month"] = chart_month

    subject = item.find("td", class_="subject")
    album["title"] = subject.p['title']
    album["artist"] = subject.find("p", class_="singer").get_text().strip()

    # The count cell reads "<new> / <cumulative>"; split it once.
    sales = item.find("td", class_="count").p.get_text().strip().split(" / ")
    album["new_sales"] = sales[0]
    album["cumulative_sales"] = sales[1]
    album["production"] = item.find("td", class_="production").p.get_text().strip()
    print(album)

{'album_rank': 1, 'rank_change': 'new', 'rank_difference': None, 'month': 1, 'title': 'THE FIRST STEP : TREASURE EFFECT', 'artist': 'TREASURE (트레저)', 'new_sales': '266,894', 'cumulative_sales': '266,894', 'production': 'YG PLUS'}
{'album_rank': 2, 'rank_change': 'down', 'rank_difference': -1, 'month': 1, 'title': 'NCT RESONANCE Pt. 2 - The 2nd Album', 'artist': 'NCT', 'new_sales': '244,629', 'cumulative_sales': '244,629', 'production': 'Dreamus'}
{'album_rank': 3, 'rank_change': 'new', 'rank_difference': None, 'month': 1, 'title': 'I burn', 'artist': '(여자)아이들', 'new_sales': '159,268', 'cumulative_sales': '159,268', 'production': 'Kakao Entertainment'}
{'album_rank': 4, 'rank_change': 'new', 'rank_difference': None, 'month': 1, 'title': 'NOIR - The 2nd Mini Album', 'artist': '유노윤호 (U-KNOW)', 'new_sales': '138,236', 'cumulative_sales': '138,236', 'production': 'Dreamus'}
{'album_rank': 5, 'rank_change': 'new', 'rank_difference': None, 'month': 1, 'title': 'HIDEOUT: BE OUR VOICE - SEASON 

### Creating a function to run steps above and append data to CSV file

In [21]:
# Takes URL and file_name as parameters

def append_data(url, file_name):
    """Scrape one monthly album chart page and append its rows to a CSV file.

    Each appended row holds, in order: month, album_rank, rank_change,
    rank_difference, title, artist, new_sales, cumulative_sales, production.

    Args:
        url: Address of the monthly chart page to download.
        file_name: Path of the CSV file to append to (opened in append mode,
            UTF-8 encoded).

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.

    All rows are parsed before the file is touched, so a page that fails to
    parse part-way through leaves the CSV unchanged. A page without any chart
    rows appends nothing.
    """
    # NOTE(review): advertising "br" presumably relies on a brotli decoder
    # being installed alongside requests/urllib3 -- confirm.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }

    # Bound the wait (requests has no default timeout) and refuse to parse an
    # HTTP error page as chart data.
    page = requests.get(url, headers=headers, timeout=30)
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")
    soup = BeautifulSoup(soup.prettify(), "html.parser")

    items = soup.find_all('tr')[1:101] # all albums and album information
    if not items:
        return

    # month: a page-level value, so it is read once rather than per row
    date = soup.find('div', class_="fl").get_text().strip()
    month = int(date.split(" ")[1][:2])

    rows = []
    for item in items:
        # album ranking
        album_rank = int(item.find("td", class_="ranking").get_text().strip())

        # ranking differences and change directions
        rank_change, rank_difference = _parse_rank_change(item.find("td", class_="change"))

        # album info
        subject = item.find("td", class_="subject")
        title = subject.p['title']
        artist = subject.find("p", class_="singer").get_text().strip()

        # the count cell reads "<new> / <cumulative>"
        sales = item.find("td", class_="count").p.get_text().strip().split(" / ")
        production = item.find("td", class_="production").p.get_text().strip()

        rows.append([
            month,
            album_rank,
            rank_change,
            rank_difference,
            title,
            artist,
            sales[0],
            sales[1],
            production,
        ])

    # Open the CSV once per page instead of once per row.
    with open(file_name, 'a', newline='', encoding='UTF8') as f:
        csv.writer(f).writerows(rows)


def _parse_rank_change(change_cell):
    """Return (rank_change, rank_difference) for one "change" table cell."""
    try:
        direction = change_cell.span['class'][0]
        if direction == "up":
            return direction, int(change_cell.get_text().strip())
        if direction == "down":
            return direction, int(change_cell.get_text().strip())*-1
        return direction, None
    except (TypeError, KeyError, IndexError, ValueError):
        # No <span>, no class attribute, empty class list, or non-numeric
        # text: fall back to the cell's own text. Unexpected errors propagate.
        return change_cell.get_text().strip(), None
    

### Create CSV file with header and update

In [30]:
# Start the raw monthly CSV from scratch: mode 'w' truncates any existing
# file, then the single header line is written.

header = [
    'month',
    'album_rank',
    'rank_change',
    'rank_difference',
    'title',
    'artist',
    'new_sales',
    'cumulative_sales',
    'production',
]

with open('GaonAlbumChart2021Raw.csv', 'w', newline='', encoding='UTF8') as f:
    csv.writer(f).writerow(header)

In [32]:
# Append the chart of every month of 2021, January through December.
# Each run appends again, so re-create the CSV before re-running this cell.

months = ["{:02d}".format(number) for number in range(1, 13)]

for month in months:
    url = "http://gaonchart.co.kr/main/section/chart/album.gaon?nationGbn=T&serviceGbn=&targetTime={}&hitYear=2021&termGbn=month".format(month)
    append_data(url, 'GaonAlbumChart2021Raw.csv')

In [33]:
# Using pandas to make sure all data was appended correctly:
# load the raw CSV back into a DataFrame for a visual check.

df = pd.read_csv('GaonAlbumChart2021Raw.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,month,album_rank,rank_change,rank_difference,title,artist,new_sales,cumulative_sales,production
0,1,1,new,,THE FIRST STEP : TREASURE EFFECT,TREASURE (트레저),266894,266894,YG PLUS
1,1,2,down,-1.0,NCT RESONANCE Pt. 2 - The 2nd Album,NCT,244629,244629,Dreamus
2,1,3,new,,I burn,(여자)아이들,159268,159268,Kakao Entertainment
3,1,4,new,,NOIR - The 2nd Mini Album,유노윤호 (U-KNOW),138236,138236,Dreamus
4,1,5,new,,HIDEOUT: BE OUR VOICE - SEASON 3.,CRAVITY,112301,112301,Kakao Entertainment
...,...,...,...,...,...,...,...,...,...
1195,12,96,up,49.0,Merry & Happy,TWICE,3022,15763,Dreamus
1196,12,97,up,24.0,LALISA,리사 (LISA),2919,708475,YG PLUS
1197,12,98,up,21.0,VENI VIDI VICI,트라이비(TRI.BE),2730,13345,Universal Music
1198,12,99,up,32.0,Love poem,아이유 (IU),2685,31277,Kakao Entertainment


### Modify function to take URL for annual album chart

In [20]:
# Takes URL and file_name as parameters

def append_data_annual(url, file_name):
    """Scrape one annual album chart page and append its rows to a CSV file.

    Each appended row holds, in order: album_rank, title, artist,
    total_sales, production.

    Args:
        url: Address of the annual chart page to download.
        file_name: Path of the CSV file to append to (opened in append mode,
            UTF-8 encoded).

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.

    All rows are parsed before the file is touched, so a page that fails to
    parse part-way through leaves the CSV unchanged. A page without any chart
    rows appends nothing.
    """
    # NOTE(review): advertising "br" presumably relies on a brotli decoder
    # being installed alongside requests/urllib3 -- confirm.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }

    # Bound the wait (requests has no default timeout) and refuse to parse an
    # HTTP error page as chart data.
    page = requests.get(url, headers=headers, timeout=30)
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")
    soup = BeautifulSoup(soup.prettify(), "html.parser")

    items = soup.find_all('tr')[1:101] # all albums and album information

    rows = []
    for item in items:
        # rank
        album_rank = int(item.find("td", class_="ranking").get_text().strip())

        # album info
        subject = item.find("td", class_="subject")
        title = subject.p['title']
        artist = subject.find("p", class_="singer").get_text().strip()
        production = item.find("td", class_="production").p.get_text().strip()

        # sales
        total_sales = item.find("td", class_="count").p.get_text().strip()

        rows.append([album_rank, title, artist, total_sales, production])

    # Open the CSV once per page instead of once per row; skip the open
    # entirely when the page produced no rows.
    if rows:
        with open(file_name, 'a', newline='', encoding='UTF8') as f:
            csv.writer(f).writerows(rows)
    

In [16]:
# Start the annual CSV from scratch: mode 'w' truncates any existing file,
# then the single header line is written.

header_annual = [
    'album_rank',
    'title',
    'artist',
    'total_sales',
    'production',
]

with open('GaonAlbumChartAnnual2021.csv', 'w', newline='', encoding='UTF8') as f:
    csv.writer(f).writerow(header_annual)

In [21]:
# Appending the 2021 annual chart (termGbn=year) to the annual CSV

url_annual = (
    "http://gaonchart.co.kr/main/section/chart/album.gaon"
    "?nationGbn=T&serviceGbn=&termGbn=year&hitYear=2021&year_time=3"
)

append_data_annual(url_annual, 'GaonAlbumChartAnnual2021.csv')

{'album_rank': 1, 'title': 'Butter', 'artist': '방탄소년단', 'production': 'YG PLUS', 'total_sales': '2,999,407'}
{'album_rank': 2, 'title': 'Sticker - The 3rd Album', 'artist': 'NCT 127', 'production': 'Dreamus', 'total_sales': '2,427,559'}
{'album_rank': 3, 'title': '맛 (Hot Sauce) - The 1st Album', 'artist': 'NCT DREAM', 'production': 'Dreamus', 'total_sales': '2,097,185'}
{'album_rank': 4, 'title': '9th Mini Album `Attacca`', 'artist': '세븐틴', 'production': 'YG PLUS', 'total_sales': '2,059,073'}
{'album_rank': 5, 'title': 'Universe - The 3rd Album', 'artist': 'NCT', 'production': 'Dreamus', 'total_sales': '1,630,715'}
{'album_rank': 6, 'title': '8th Mini Album `Your Choice`', 'artist': '세븐틴', 'production': 'YG PLUS', 'total_sales': '1,462,405'}
{'album_rank': 7, 'title': 'DON`T FIGHT THE FEELING - Special Album', 'artist': 'EXO', 'production': 'Dreamus', 'total_sales': '1,326,189'}
{'album_rank': 8, 'title': 'NOEASY', 'artist': 'Stray Kids (스트레이 키즈)', 'production': 'Dreamus', 'total_sales