## Get festival dataset

* Get COUNTDOWN JAPAN and ROCK IN JAPAN FESTIVAL line-up data from the Japanese Wikipedia
  * [COUNT DOWN JAPAN](https://ja.wikipedia.org/wiki/COUNTDOWN_JAPAN)
  * [ROCK IN JAPAN](https://ja.wikipedia.org/wiki/ROCK_IN_JAPAN_FESTIVAL)


## Preparation

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib as plt

In [2]:
%matplotlib inline

In [34]:
def set_path():
    """Put the working directory's parent on ``sys.path`` and return it.

    The path is built as ``<realpath of ".">/../`` and is not normalized.
    It is appended to ``sys.path`` only when an identical entry is not
    already present.

    Returns:
        str: the (un-normalized) parent directory path.
    """
    parent_dir = os.path.join(os.path.realpath("."), "../")
    if parent_dir in sys.path:
        # Already registered; nothing to add.
        return parent_dir
    sys.path.append(parent_dir)
    return parent_dir

# Parent of the working directory; set_path() also puts it on sys.path.
ROOT_DIR = set_path()
# Scraped CSV files are written to <ROOT_DIR>/data/raw.
DATA_DIR = os.path.join(ROOT_DIR, "data", "raw")
# makedirs creates any missing parent and does nothing when the directory
# already exists, so the cell can be re-run safely.
os.makedirs(DATA_DIR, exist_ok=True)

## Scrape festival pages

In [4]:
# Japanese Wikipedia article URLs for the two festivals that are downloaded below.
COUNTDOWN_JAPAN = "https://ja.wikipedia.org/wiki/COUNTDOWN_JAPAN"
ROCK_IN_JAPAN = "https://ja.wikipedia.org/wiki/ROCK_IN_JAPAN_FESTIVAL"

In [38]:
import requests
from bs4 import BeautifulSoup


# Reset first so that a failed download cannot leave a page from an earlier
# run of this cell behind.
COUNTDOWN_JAPAN_ROOT = None
ROCK_IN_JAPAN_ROOT = None


def _fetch_page(url, timeout=30):
    """Download ``url`` and return it parsed with the "html.parser" backend.

    Args:
        url: address of the page to download.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook.

    Returns:
        BeautifulSoup: the parsed document.

    Raises:
        requests.HTTPError: when the server answers with an error status.
            Failing here avoids storing a missing or stale page under the
            wrong name.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")


COUNTDOWN_JAPAN_ROOT = _fetch_page(COUNTDOWN_JAPAN)
ROCK_IN_JAPAN_ROOT = _fetch_page(ROCK_IN_JAPAN)

### COUNT DOWN JAPAN

In [45]:
def get_countdown_japan():
    """Collect COUNTDOWN JAPAN line-up records from the parsed Wikipedia page.

    Reads the module-level ``COUNTDOWN_JAPAN_ROOT`` soup. Every ``h3`` whose
    ``mw-headline`` text is a slash-separated pair of numbers such as "17/18"
    is treated as one edition, and the first ``wikitable`` found after that
    heading is read row by row. The first row that contains ``th`` cells
    supplies the column (stage) names; each later row contributes one record
    per ``li`` item found in its ``td`` cells.

    Returns:
        list of dict: records with the keys
            ``year``        "20" + the first number of the heading,
            ``day``         row header text with "月" turned into "/" and
                            "日" removed ("" when the row has no ``th``),
            ``day_index``   index of the row inside the table,
            ``stage``       column header matching the cell,
            ``artist``      stripped text of the list item,
            ``detail_link`` ``href`` of the item's first link, or "".
    """
    def to_short_date(label):
        # "12月29日" -> "12/29"
        return label.replace("月", "/").replace("日", "")

    # Extract sections
    editions = []
    for heading in COUNTDOWN_JAPAN_ROOT.find_all("h3"):
        headline = heading.find("span", {"class": "mw-headline"})
        text = "" if headline is None else headline.get_text().strip()
        # Edition headings are a slash separated pair of numbers like "17/18".
        parts = text.split("/")
        if len(parts) == 2 and all(p.isdigit() for p in parts):
            editions.append((text, heading))

    # Extract table
    artists = []
    for label, heading in editions:
        # NOTE(review): find_next searches the rest of the document, so a
        # section without its own table would pick up a later section's table.
        table = heading.find_next("table", {"class": "wikitable"})
        if table is None:
            # Same guard as get_rockin_japan: nothing to read for this edition.
            continue

        year = "20" + label.split("/")[0]
        header = []
        for i, row in enumerate(table.find_all("tr")):
            if not header:
                # Still looking for the header row; this row is not data.
                header = [th.get_text().strip() for th in row.find_all("th")]
                continue

            # The leading th (or None) keeps the td cells aligned with header.
            cells = [row.find("th")] + row.find_all("td")
            day = ""
            for stage, cell in zip(header, cells):
                if cell is None:
                    # Row without a th: keep day as "" instead of crashing.
                    continue
                if cell.name == "th":
                    day = to_short_date(cell.get_text().strip())
                    continue
                for item in cell.find_all("li"):
                    link = item.find("a")
                    artists.append({
                        "year": year,
                        "day": day,
                        "day_index": i,
                        "stage": stage,
                        "artist": item.get_text().strip(),
                        "detail_link": "" if link is None else link.get("href"),
                    })

    return artists


# One row per scraped record (list of dicts -> DataFrame).
df_cdj = pd.DataFrame(get_countdown_japan())

In [46]:
# Show the first five rows to check the scraped columns.
df_cdj.head(5)

Unnamed: 0,artist,day,day_index,detail_link,stage,year
0,B-DASH,12/29,1,/wiki/B-DASH,EARTH STAGE,2003
1,ASIAN KUNG-FU GENERATION,12/29,1,/wiki/ASIAN_KUNG-FU_GENERATION,EARTH STAGE,2003
2,斉藤和義,12/29,1,/wiki/%E6%96%89%E8%97%A4%E5%92%8C%E7%BE%A9,EARTH STAGE,2003
3,一青窈,12/29,1,/wiki/%E4%B8%80%E9%9D%92%E7%AA%88,EARTH STAGE,2003
4,GRAPEVINE,12/29,1,/wiki/GRAPEVINE,EARTH STAGE,2003


In [47]:
# Save as UTF-8 CSV named cdj.csv under DATA_DIR, without the index column.
df_cdj.to_csv(os.path.join(DATA_DIR, "cdj.csv"), index=False, encoding="utf-8")

### ROCK IN JAPAN

In [54]:
def get_rockin_japan():
    """Collect ROCK IN JAPAN line-up records from the parsed Wikipedia page.

    Reads the module-level ``ROCK_IN_JAPAN_ROOT`` soup. Every ``h3`` whose
    ``mw-headline`` text is a number followed by "年" (e.g. "2018年") is
    treated as one edition, and the first ``wikitable`` found after that
    heading is read row by row. The first row that contains ``th`` cells
    supplies the column (stage) names; each later row contributes one record
    per ``li`` item found in its ``td`` cells.

    Returns:
        list of dict: records with the keys
            ``year``        heading text without the trailing "年",
            ``day``         date taken from the row header (see
                            ``date_parser``), "" when the row has no ``th``,
            ``day_index``   index of the row inside the table,
            ``stage``       column header matching the cell, with "出演者"
                            replaced by "STAGE",
            ``artist``      stripped text of the list item,
            ``detail_link`` ``href`` of the item's first link, or "".
    """
    def date_parser(d):
        # Full-width parentheses become separators and the second token is
        # used as the date. A label with no second token is used as-is
        # instead of raising IndexError.
        tokens = d.replace("（", " ").replace("）", " ").split(" ")
        day = tokens[1] if len(tokens) > 1 else tokens[0]
        return day.replace("月", "/").replace("日", "")

    # Extract sections
    editions = []
    for heading in ROCK_IN_JAPAN_ROOT.find_all("h3"):
        headline = heading.find("span", {"class": "mw-headline"})
        text = "" if headline is None else headline.get_text().strip()
        # Everything before the final "年" must be digits. The slice is [:-1];
        # [:-2] left the last character before "年" unchecked.
        if text.endswith("年") and text[:-1].isdigit():
            editions.append((text[:-1], heading))

    # Extract table
    artists = []
    for year, heading in editions:
        # NOTE(review): find_next searches the rest of the document, so a
        # section without its own table would pick up a later section's table.
        table = heading.find_next("table", {"class": "wikitable"})
        if table is None:
            continue

        header = []
        for i, row in enumerate(table.find_all("tr")):
            if not header:
                # Still looking for the header row; this row is not data.
                header = [th.get_text().strip() for th in row.find_all("th")]
                continue

            # The leading th (or None) keeps the td cells aligned with header.
            cells = [row.find("th")] + row.find_all("td")
            day = ""
            for column, cell in zip(header, cells):
                if cell is None:
                    # Row without a th: keep day as "" instead of crashing.
                    continue
                if cell.name == "th":
                    day = date_parser(cell.get_text().strip())
                    continue
                stage = column if column != "出演者" else "STAGE"
                for item in cell.find_all("li"):
                    link = item.find("a")
                    artists.append({
                        "year": year,
                        "day": day,
                        "day_index": i,
                        "stage": stage,
                        "artist": item.get_text().strip(),
                        "detail_link": "" if link is None else link.get("href"),
                    })

    return artists

# One row per scraped record (list of dicts -> DataFrame).
df_rinj = pd.DataFrame(get_rockin_japan())

In [55]:
# Show the last five rows to check the scraped columns.
df_rinj.tail(5)

Unnamed: 0,artist,day,day_index,detail_link,stage,year
2560,魔法少女になり隊,8/12,4,/wiki/%E9%AD%94%E6%B3%95%E5%B0%91%E5%A5%B3%E3%...,HILLSIDE STAGE,2018
2561,ベリーグッドマン,8/12,4,/wiki/%E3%83%99%E3%83%AA%E3%83%BC%E3%82%B0%E3%...,HILLSIDE STAGE,2018
2562,cinema staff,8/12,4,/wiki/Cinema_staff,HILLSIDE STAGE,2018
2563,宇宙まお,8/12,4,/wiki/%E5%AE%87%E5%AE%99%E3%81%BE%E3%81%8A,HILLSIDE STAGE,2018
2564,LONGMAN,8/12,4,/wiki/LONGMAN,HILLSIDE STAGE,2018


In [56]:
# Save as UTF-8 CSV named rinj.csv under DATA_DIR, without the index column.
df_rinj.to_csv(os.path.join(DATA_DIR, "rinj.csv"), index=False, encoding="utf-8")