Strategy: first crawl the _links_ to every seiyuu (voice actor) page from each Love Live! (LL) anime on MyAnimeList.  
Then, for each seiyuu, crawl all of their voice roles and staff roles.

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
from selenium import webdriver

## Crawl all seiyuu links from each LL anime

In [2]:
# Anime pages to crawl, as (group name, MyAnimeList anime URL) pairs.
# Written as a mapping for readability, then flattened to a list of tuples.
ll_anime_links = list({
    'muse': 'https://myanimelist.net/anime/15051/Love_Live_School_Idol_Project',
    'aqours': 'https://myanimelist.net/anime/32526/Love_Live_Sunshine',
    'nijigasaki': 'https://myanimelist.net/anime/40879/Love_Live_Nijigasaki_Gakuen_School_Idol_Doukoukai',
}.items())

In [3]:
def get_seiyuus_from_anime(url: str) -> list:
    """
    Get all seiyuus from an anime.
    
    Param: MAL anime url.
    Return: A list consisting of all seiyuus.
    Each seiyuu is a dictionary, keys: ('name', 'url')
    Raises: requests.HTTPError if the page answers with an error status,
    requests.Timeout if the server does not answer in time.
    """
    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an empty list.
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(response.content, 'html.parser')
    
    seiyuu_list = []
    for cell in soup.find_all('td', {'class': 'va-t ar pl4 pr4'}):
        link = cell.find('a')
        # Skip cells that carry no usable link rather than crashing on them.
        if link is None or not link.get('href'):
            continue
        seiyuu_list.append({
            'name': link.text,
            'url': link['href'],
        })
    return seiyuu_list

In [4]:
def _parse_anime_link(row) -> tuple:
    """Return (anime title, anime id) read from a role row's title link."""
    title_link = row.find('a', 'js-people-title')
    # Raw string: \d must reach the regex engine as a digit class instead of
    # being treated as an (invalid) string escape sequence.
    anime_id = re.search(r'anime/(\d+)/', title_link['href']).group(1)
    return title_link.text, anime_id


def get_seiyuu_info(url: str, selenium_driver) -> list:
    """
    Get all roles of a seiyuu, including both voicing roles and staff roles.
    
    Params:
        - url: seiyuu page url, loaded through the driver.
        - selenium_driver: driver object providing .get(url) and .page_source.
    
    Returns a list of dicts with keys: 
        - anime
        - type [supporting/main/staff]
        - character (NaN for staff roles)
        - anime_id (digits captured from the anime url, as a string)
    """
    selenium_driver.get(url)
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(selenium_driver.page_source, 'html.parser')
    
    roles = []
    
    for row in soup.find_all('tr', {'class': 'js-people-character'}):
        # Each row consists of 4 cols: anime img, anime details, character details, character img
        anime_name, anime_id = _parse_anime_link(row)
        
        char_details_col = row.find_all('td')[2]
        # Each char detail col has 3 divs: character name, type (main/sup), fav num
        char_details_divs = char_details_col.find_all('div')
        roles.append({
            'anime': anime_name,
            'type': char_details_divs[1].text.strip(),
            'character': char_details_divs[0].find('a').text.strip(),
            'anime_id': anime_id
        })
        
    for row in soup.find_all('tr', {'class': 'js-people-staff'}):
        anime_name, anime_id = _parse_anime_link(row)
        
        roles.append({
            'anime': anime_name,
            'type': 'Staff',
            # Staff rows have no character attached.
            'character': np.nan,
            'anime_id': anime_id
        })
    return roles

In [5]:
# Crawl the roles of every seiyuu of every LL anime.
# One DataFrame per seiyuu is collected and concatenated once at the end,
# instead of re-copying the growing result on every iteration.
seiyuu_frames = []
firefox = webdriver.Firefox()
try:
    for group, link in ll_anime_links:
        # Get all seiyuus from each LL anime
        seiyuu_list = get_seiyuus_from_anime(link)
        for seiyuu in seiyuu_list:
            print(seiyuu['name'], end='... ')
            seiyuu_df = pd.DataFrame(get_seiyuu_info(seiyuu['url'], firefox))
            print('Got ', len(seiyuu_df))
            seiyuu_df['name'] = seiyuu['name']
            seiyuu_df['group'] = group
            seiyuu_frames.append(seiyuu_df)
finally:
    # Always shut the browser down, even if a page fails to parse,
    # so no Firefox process is left running.
    firefox.quit()

# pd.concat raises on an empty list, so fall back to an empty frame.
seiyuus_df = pd.concat(seiyuu_frames) if seiyuu_frames else pd.DataFrame()

Pile... Got  32
Tokui, Sora... Got  110
Kusuda, Aina... Got  32
Uchida, Aya... Got  109
Mimori, Suzuko... Got  217
Nitta, Emi... Got  50
Iida, Riho... Got  40
Nanjou, Yoshino... Got  134
Kubo, Yurika... Got  111
Sakuragawa, Megu... Got  21
Kobayashi, Aika... Got  17
Saitou, Shuka... Got  16
Takatsuki, Kanako... Got  14
Aida, Rikako... Got  24
Suzuki, Aina... Got  43
Furihata, Ai... Got  13
Komiya, Arisa... Got  9
Suwa, Nanaka... Got  11
Inami, Anju... Got  13
Satou, Hinata... Got  10
Tanaka, Chiemi... Got  5
Kusunoki, Tomori... Got  31
Kitou, Akari... Got  72
Sagara, Mayu... Got  7
Oonishi, Aguri... Got  7
Maeda, Kaori... Got  21
Murakami, Natsumi... Got  8
Kubota, Miyu... Got  25
Yano, Hinaki... Got  6
Sashide, Maria... Got  6


## Save result

In [6]:
# Write the crawled roles to disk; the DataFrame index is left out of the file.
seiyuus_df.to_csv(path_or_buf='seiyuu.csv', index=False)