# Initialization

In [62]:
import re
from os import listdir
from os import makedirs
from os.path import *
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from tqdm import tqdm

## Initialize selenium

In [122]:
# Start Firefox without a visible window. The `-headless` argument is used
# instead of `options.headless = True` because that setter was deprecated in
# Selenium 4.8 and later removed; the argument form works on old and new
# releases alike.
# NOTE: the browser process stays alive until `driver.quit()` is called, so
# call it once all scraping is finished.
options = Options()
options.add_argument('-headless')
driver = webdriver.Firefox(options=options)

# Scrape song info

## Download main page's html

In [5]:
# Song index page; the links to the individual song pages are extracted from it.
main_page_url = 'https://ll-fans.jp/data/song'

In [8]:
# Load the index page in the Selenium-driven browser.
driver.get(main_page_url)

In [9]:
# Cache the page source on disk so parsing can be redone without another
# request. The encoding is explicit because the page contains Japanese text
# and the platform default encoding is not UTF-8 everywhere.
with open('main.html', mode='wt', encoding='utf-8') as f:
    f.write(driver.page_source)

## Parse main page to download each song's html

In [10]:
# Read the cached index page back. UTF-8 is requested explicitly so the
# Japanese text is decoded the same way on every platform instead of relying
# on the locale's default encoding.
with open('main.html', mode='r', encoding='utf-8') as f:
    main_source = f.read()

In [12]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
# 'html.parser' ships with Python and needs no extra dependency.
soup = BeautifulSoup(main_source, 'html.parser')

In [16]:
# Every anchor whose href contains '/data/song/'. The pattern is a raw string
# without the needless '\/' escapes, which are invalid string escapes and
# trigger a warning on recent Python versions; it matches the same text.
all_songs_a = soup.find_all('a', href=re.compile(r'/data/song/'))

In [21]:
# Prefix each anchor's site-relative href with the host to get an absolute URL.
all_songs_link = ['https://ll-fans.jp' + anchor['href'] for anchor in all_songs_a]

In [29]:
# Download every song page into songs_html/<song id>.html.
makedirs('songs_html', exist_ok=True)  # the output directory does not exist on a fresh checkout
for link in tqdm(all_songs_link):
    id_match = re.search(r'(\d+)$', link)
    if id_match is None:
        # No trailing numeric id, so there is no file name to save under.
        continue
    out_filename = id_match.group(1) + '.html'
    # Fetch before opening the file: if the request fails, no empty or
    # truncated HTML file is left behind for the parsing step to trip over.
    driver.get(link)
    page_html = driver.page_source
    # Explicit UTF-8 because the pages contain Japanese text.
    with open(join('songs_html', out_filename), mode='wt', encoding='utf-8') as f:
        f.write(page_html)
    sleep(1)  # pause between requests to go easy on the site

100%|█████████████████████████████████████████| 406/406 [18:32<00:00,  2.74s/it]


## Parse each page, get information

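Columns:
- `id`: numeric id of the song, taken from the name of its HTML file.
- `name`: name of the song in Japanese.
- `date`: release date in Japanese format (e.g. `2010年8月25日`).
- `artists`
- `lyricists`
- `composers`
- `arrangers`
- `discs`: set of ids of the discs that contain the song.
- `events`: list of performances, `[{'event': ..., 'concert': ..., 'performance': ...}, ...]`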

In [203]:
# Names of the downloaded song pages: every entry of songs_html ending in 'html'.
html_files = [entry for entry in listdir('songs_html') if entry.endswith('html')]

In [204]:
def select_or_none(soup, css_selector, prop='text'):
    """
    Return one property of the first element matching a CSS selector.

    Parameters
    ----------
    soup :
        Object exposing ``select(css_selector)`` that returns a list of
        elements (a BeautifulSoup document or tag).
    css_selector : str
        CSS selector passed to ``soup.select``.
    prop : str, default 'text'
        ``'text'`` returns the element's text; any other value is looked up
        as an attribute of the element.

    Returns
    -------
    str or None
        The requested value converted with ``str``. ``None`` when no element
        matches, or when the matched element does not have the requested
        attribute (previously this case raised ``KeyError``).
    """
    results = soup.select(css_selector)
    if not results:
        return None
    first = results[0]
    if prop == 'text':
        return str(first.text)
    # .get() instead of indexing so a missing attribute honours the
    # "or none" contract instead of raising.
    value = first.get(prop)
    return None if value is None else str(value)

In [245]:
# Build one dict per downloaded song page; the list becomes the song DataFrame.
#
# The selectors are positional (nth-child) and tied to the page layout that was
# scraped. They are loop-invariant, so they are defined once here.
song_name_sel = 'h1[class*="llstyle12"]'
song_date_sel = '[class*="MuiGrid-grid-md"]:nth-child(1) > dl:nth-child(1) > dd:nth-child(2)'
artist_sel = '[class*="MuiGrid-grid-md"]:nth-child(1) > dl:nth-child(1) > dd:nth-child(4) > p > a'
lyr_sel = '[class*="MuiGrid-grid-md"]:nth-child(1) > dl:nth-child(1) > dd:nth-child(6)'
com_sel = 'dd.llstyle18:nth-child(8)'
arr_sel = 'dd.llstyle17:nth-child(10)'
discs_sel = 'table.MuiTable-root:nth-child(2) > tbody:nth-child(1) > tr'
events_sel = 'div.llstyle8:nth-child(6) > div:nth-child(2) > div:nth-child(1) > div:nth-child(1) > dl:nth-child(1) > dd:nth-child(8) > div:nth-child(1) > table:nth-child(1) > tbody:nth-child(1) > tr'

# Raw strings: the previous '\/' and '\&' escapes were invalid string escapes
# (warning on recent Python). Compiled once instead of once per table row.
file_id_pattern = re.compile(r'(\d+)\.html')
disc_id_pattern = re.compile(r'(\d+)$')
event_href_pattern = re.compile(r'/data/event/(\d*)\?concert=(\d*)&performance=(\d*)')

info_df_dict = []
for filename in tqdm(html_files):
    file_id_match = file_id_pattern.search(filename)
    if file_id_match is None:
        # Not a '<id>.html' file (html_files only filters on the suffix).
        continue

    song_info = dict()
    song_info['id'] = file_id_match.group(1)

    # Explicit UTF-8 (Japanese text) and explicit parser (reproducible tree).
    with open(join('songs_html', filename), encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # Scalar fields; each is None when its selector matches nothing.
    song_info['name'] = select_or_none(soup, song_name_sel)
    song_info['date'] = select_or_none(soup, song_date_sel)
    song_info['artists'] = select_or_none(soup, artist_sel)
    song_info['lyricists'] = select_or_none(soup, lyr_sel)
    song_info['composers'] = select_or_none(soup, com_sel)
    song_info['arrangers'] = select_or_none(soup, arr_sel)

    # Get disc ids: the trailing number of the link in each table row.
    song_info['discs'] = set()
    for entry in soup.select(discs_sel):
        entry_a = entry.select_one('tr > td > a')
        if entry_a is None:
            # Row without a link: nothing to extract.
            continue
        disc_match = disc_id_pattern.search(entry_a.get('href', ''))
        if disc_match is None:
            continue
        song_info['discs'].add(int(disc_match.group(1)))

    # Get events: event / concert / performance ids parsed from each row's link.
    song_info['events'] = list()
    for entry in soup.select(events_sel):
        entry_a = entry.select_one('tr > td > p > a')
        if entry_a is None:
            continue
        event_match = event_href_pattern.search(entry_a.get('href', ''))
        if event_match is None:
            # Link does not have the expected event URL shape.
            continue
        song_info['events'].append({
            'event': event_match.group(1),
            'concert': event_match.group(2),
            'performance': event_match.group(3)
        })

    info_df_dict.append(song_info)

100%|█████████████████████████████████████████| 406/406 [00:12<00:00, 31.97it/s]


In [246]:
# One row per song; the columns come from the keys of the song_info dicts.
songs_df = pd.DataFrame(info_df_dict)

In [247]:
# Display the frame to eyeball the scraped values.
songs_df

Unnamed: 0,id,name,date,artists,lyricists,composers,arrangers,discs,events
0,1,僕らのLIVE 君とのLIFE,2010年8月25日,μ's,畑 亜貴,山田高弘,高田 暁,"{1, 166, 23}","[{'event': '38', 'concert': '63', 'performance..."
1,2,友情ノーチェンジ,2010年8月25日,μ's,畑 亜貴,Tron-LM,Tron-LM,"{1, 166, 23}","[{'event': '107', 'concert': '145', 'performan..."
2,3,Snow halation,2010年12月22日,μ's,畑 亜貴,山田高弘,中西亮輔,"{2, 166, 23}","[{'event': '38', 'concert': '63', 'performance..."
3,4,baby maybe 恋のボタン,2010年12月22日,μ's,畑 亜貴,山口朗彦,山口朗彦,"{2, 166, 23}","[{'event': '107', 'concert': '145', 'performan..."
4,5,Love marginal,2011年5月25日,Printemps,畑 亜貴,藤末 樹,松坂康司、藤末 樹,"{3, 166, 23}","[{'event': '98', 'concert': '134', 'performanc..."
...,...,...,...,...,...,...,...,...,...
401,403,HAPPY TO DO WA！,2022年2月25日,唐 可可、平安名すみれ、葉月 恋,宮嶋淳子,イワツボコーダイ、吹野クワガタ,吹野クワガタ,{240},[]
402,404,Stella!,2022年2月25日,澁谷かのん、嵐 千砂都、平安名すみれ,宮嶋淳子,久保田真悟（Jazzin'park）、栗原 暁（Jazzin'park）,久保田真悟（Jazzin'park）,{241},[]
403,405,変わらないすべて,2022年2月25日,澁谷かのん、嵐 千砂都,宮嶋淳子,川崎里実,出羽良彰,{242},[]
404,410,トゥ トゥ トゥ！,2022年2月25日,Liella!,宮嶋淳子,高木誠司,高木誠司,{238},[]


In [248]:
# Save the song table; index=False leaves the row index out of the file.
songs_df.to_csv('songs.csv', index=False)

# Scrape artist info

## Download main page's html

In [120]:
# Artist index page. NOTE: this rebinds main_page_url, which held the song
# index URL earlier in the notebook.
main_page_url = 'https://ll-fans.jp/data/artist'

In [123]:
# Load the artist index page in the Selenium-driven browser.
driver.get(main_page_url)

In [124]:
# Cache the artist index page on disk. The encoding is explicit because the
# page contains Japanese text and the platform default is not UTF-8 everywhere.
with open('main_artist.html', mode='wt', encoding='utf-8') as f:
    f.write(driver.page_source)

## Parse main page to download each artist's html

In [125]:
# Read the cached artist index back, decoding as UTF-8 explicitly rather than
# with the locale's default encoding.
with open('main_artist.html', mode='r', encoding='utf-8') as f:
    main_source = f.read()

In [126]:
# Name the parser explicitly so the parse tree does not depend on which
# optional parser happens to be installed; 'html.parser' ships with Python.
soup = BeautifulSoup(main_source, 'html.parser')

In [128]:
# Collect the anchors that point at artist pages and turn their hrefs into
# absolute URLs. The anchors get their own name so this step can be re-run:
# rebinding all_artist_links to a transformation of itself broke on a second
# execution, because the list then held strings instead of tags.
# The pattern is a raw string without the invalid '\/' escapes.
artist_anchors = soup.find_all('a', href=re.compile(r'/data/artist/'))
all_artist_links = ['https://ll-fans.jp' + anchor['href'] for anchor in artist_anchors]

In [131]:
# Download every artist page into artists_html/<artist id>.html.
makedirs('artists_html', exist_ok=True)  # the output directory does not exist on a fresh checkout
for link in tqdm(all_artist_links):
    id_match = re.search(r'(\d+)$', link)
    if id_match is None:
        # No trailing numeric id, so there is no file name to save under.
        continue
    out_filename = id_match.group(1) + '.html'
    # Fetch before opening the file: if the request fails, no empty or
    # truncated HTML file is left behind for the parsing step to trip over.
    driver.get(link)
    page_html = driver.page_source
    # Explicit UTF-8 because the pages contain Japanese text.
    with open(join('artists_html', out_filename), mode='wt', encoding='utf-8') as f:
        f.write(page_html)
    sleep(1)  # pause between requests to go easy on the site

100%|█████████████████████████████████████████| 108/108 [04:11<00:00,  2.33s/it]


## Parse each artist's html to generate artist df

In [132]:
# Names of the downloaded artist pages: every entry of artists_html ending in 'html'.
html_files = [entry for entry in listdir('artists_html') if entry.endswith('html')]

In [150]:
# Build one dict per downloaded artist page; the list becomes the artist DataFrame.
#
# Loop-invariant selectors and pattern, defined once. The selectors are tied to
# the page layout that was scraped.
name_sel = 'h1.MuiTypography-root'
members_sel = 'div.MuiPaper-root:nth-child(5) > table:nth-child(1) > tbody:nth-child(1)'
file_id_pattern = re.compile(r'(\d+)\.html')  # raw string: '\d' / '\.' are regex escapes, not string escapes

artist_df_dict = []
for filename in tqdm(html_files):
    file_id_match = file_id_pattern.search(filename)
    if file_id_match is None:
        # Not a '<id>.html' file (html_files only filters on the suffix).
        continue

    artist_info = dict()
    artist_info['id'] = file_id_match.group(1)

    # Explicit UTF-8 (Japanese text) and explicit parser (reproducible tree).
    with open(join('artists_html', filename), encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # Get artist name; None instead of an IndexError when the heading is absent.
    name_element = soup.select_one(name_sel)
    artist_info['name'] = name_element.text if name_element is not None else None

    # Get members: one <p> per member row, joined with '|'. A page without the
    # members table yields an empty string rather than aborting the whole run.
    members_table = soup.select_one(members_sel)
    if members_table is not None:
        member_entries = members_table.select('tbody > tr > td > p')
    else:
        member_entries = []
    artist_info['members'] = '|'.join(entry.text for entry in member_entries)

    artist_df_dict.append(artist_info)

100%|█████████████████████████████████████████| 108/108 [00:01<00:00, 56.76it/s]


In [151]:
# One row per artist; the columns come from the keys of the artist_info dicts.
artist_df = pd.DataFrame(artist_df_dict)
# Display the frame to eyeball the scraped values.
artist_df

Unnamed: 0,id,name,members
0,32,A-RISE,綺羅ツバサ （CV: 櫻川めぐ）|統堂英玲奈 （CV: 松永真穂）|優木あんじゅ （CV: ...
1,35,AZALEA,松浦果南 （CV: 諏訪ななか）|黒澤ダイヤ （CV: 小宮有紗）|国木田花丸 （CV: 高...
2,33,Aqours,高海千歌 （CV: 伊波杏樹）|桜内梨子 （CV: 逢田梨香子）|松浦果南 （CV: 諏訪な...
3,101,Aqours・虹ヶ咲学園スクールアイドル同好会・Liella!,高海千歌 （CV: 伊波杏樹）|桜内梨子 （CV: 逢田梨香子）|松浦果南 （CV: 諏訪な...
4,62,A・ZU・NA,上原歩夢 （CV: 大西亜玖璃）|桜坂しずく （CV: 前田佳織里）|優木せつ菜 （CV: ...
...,...,...,...
103,55,高海千歌、桜内梨子、渡辺 曜、津島善子、国木田花丸、黒澤ルビィ,高海千歌 （CV: 伊波杏樹）|桜内梨子 （CV: 逢田梨香子）|渡辺 曜 （CV: 斉藤朱...
104,90,高海千歌、黒澤ダイヤ、津島善子,高海千歌 （CV: 伊波杏樹）|黒澤ダイヤ （CV: 小宮有紗）|津島善子 （CV: 小林愛香）
105,42,黒澤ダイヤ,黒澤ダイヤ （CV: 小宮有紗）
106,49,黒澤ダイヤ、黒澤ルビィ,黒澤ダイヤ （CV: 小宮有紗）|黒澤ルビィ （CV: 降幡 愛）


In [152]:
# Save the artist table; index=False leaves the row index out of the file.
artist_df.to_csv('artists.csv', index=False)