In [18]:
import requests
import re
import pandas as pd
import concurrent.futures as cf
from bs4 import BeautifulSoup

HOLOLIVE_TALENT_MAIN: str = "https://hololive.hololivepro.com/en/talents"
OUTPUT_CSV: str = "./talent_info.csv"

In [19]:
def getSoup(url: str) -> BeautifulSoup:
    r = requests.get(url)

    return BeautifulSoup(r.content, "html.parser")

In [20]:
soup = getSoup(HOLOLIVE_TALENT_MAIN)
talent_list = soup.find("ul", class_="talent_list clearfix")
talent_list_item = talent_list.find_all("li")
talent_list_item

[<li>
 <a href="https://hololive.hololivepro.com/en/talents/tokino-sora/">
 <figure>
 <img alt="" class="attachment-large size-large wp-post-image" decoding="async" fetchpriority="high" height="340" sizes="(max-width: 340px) 100vw, 340px" src="https://hololive.hololivepro.com/wp-content/uploads/2021/05/tokino_sora_thumb.png" srcset="https://hololive.hololivepro.com/wp-content/uploads/2021/05/tokino_sora_thumb.png 340w, https://hololive.hololivepro.com/wp-content/uploads/2021/05/tokino_sora_thumb-300x300.png 300w, https://hololive.hololivepro.com/wp-content/uploads/2021/05/tokino_sora_thumb-150x150.png 150w" width="340"/></figure>
 <h3>
 Tokino Sora<span>ときのそら</span>
 </h3>
 </a>
 </li>,
 <li>
 <a href="https://hololive.hololivepro.com/en/talents/roboco-san/">
 <figure>
 <img alt="" class="attachment-large size-large wp-post-image" decoding="async" height="340" sizes="(max-width: 340px) 100vw, 340px" src="https://hololive.hololivepro.com/wp-content/uploads/2020/06/roboco-san_thumb.png" 

In [21]:
talent_link = [link.find("a").get("href") for link in talent_list_item]
talent_link

['https://hololive.hololivepro.com/en/talents/tokino-sora/',
 'https://hololive.hololivepro.com/en/talents/roboco-san/',
 'https://hololive.hololivepro.com/en/talents/aki-rosenthal/',
 'https://hololive.hololivepro.com/en/talents/akai-haato/',
 'https://hololive.hololivepro.com/en/talents/shirakami-fubuki/',
 'https://hololive.hololivepro.com/en/talents/natsuiro-matsuri/',
 'https://hololive.hololivepro.com/en/talents/minato-aqua/',
 'https://hololive.hololivepro.com/en/talents/murasaki-shion/',
 'https://hololive.hololivepro.com/en/talents/nakiri-ayame/',
 'https://hololive.hololivepro.com/en/talents/yuzuki-choco/',
 'https://hololive.hololivepro.com/en/talents/oozora-subaru/',
 'https://hololive.hololivepro.com/en/talents/azki/',
 'https://hololive.hololivepro.com/en/talents/ookami-mio/',
 'https://hololive.hololivepro.com/en/talents/sakuramiko/',
 'https://hololive.hololivepro.com/en/talents/nekomata-okayu/',
 'https://hololive.hololivepro.com/en/talents/inugami-korone/',
 'https://

In [22]:
soraSoup: BeautifulSoup = getSoup(talent_link[0])
soraSoup


<!DOCTYPE html>

<html lang="en">
<head>
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async="" src="https://www.googletagmanager.com/gtag/js?id=G-ZY0CGN7WFD"></script>
<script>
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());
  gtag('config', 'G-ZY0CGN7WFD');
</script>
<meta charset="utf-8"/>
<link href="https://hololive.hololivepro.com/en/talents/tokino-sora/" rel="canonical"/>
<meta content="width=device-width,initial-scale=1.0" name="viewport"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<!--en-->
<meta content="" name="description"/>
<!--en-->
<!--en-->
<title>Tokino Sora | TALENT | hololive official website</title>
<!--en-->
<!-- Facebook OGP -->
<!--en-->
<meta content="Tokino Sora | TALENT | hololive official website" property="og:title"/>
<!--en-->
<meta content="https://hololive.hololivepro.com/en/talents/tokino-sora/" property="og:url"/>
<!--en-->
<meta content="" property="og:de

In [23]:
def getName(soup: BeautifulSoup) -> dict:
    result: dict = {}
    talent_article = soup.find("article", class_="in_talent single")
    talent_name = talent_article.find("div", class_="talent_top").find("h1")

    # For some reason, the simple talent_name.get_text(strip=True) returns both the English and Japanese names concatenated
    # r"^[A-Za-z\s]+&" did not work either
    talent_name_en = [re.sub(r"^Alum", "", char) for char in talent_name.get_text(strip=True) if "A" <= char <= "Z" or "a" <= char <= "z" or char == " "]

    result["English Name"] = "".join(talent_name_en)
    result["Japanese Name"] = talent_name.find("span").get_text(strip=True)

    return result

getName(soraSoup)

{'English Name': 'Tokino Sora', 'Japanese Name': 'ときのそら'}

In [24]:
def getLinks(soup: BeautifulSoup) -> dict:
    result: dict = {}
    talent_article = soup.find("article", class_="in_talent single")
    
    external_links_list = talent_article.find("ul", class_="t_sns clearfix")
    external_links_list_item = [link.find("a").get("href") for link in external_links_list.find_all("li")]

    result["Youtube"] = external_links_list_item[0].split('?', 1)[0]
    result["Twitter/X"] = external_links_list_item[1]

    return result

getLinks(soraSoup)

{'Youtube': 'https://www.youtube.com/channel/UCp6993wxpyDPHUpavwDFqgg',
 'Twitter/X': 'https://twitter.com/tokino_sora'}

In [25]:
def getData(soup: BeautifulSoup) -> dict:
    result: dict = {}
    talent_article = soup.find("article", class_="in_talent single")

    talent_data_box = talent_article.find("div", class_="talent_data").find("div", class_="table_box")
    talent_data_list = talent_data_box.find_all("dl")
    
    for dl in talent_data_list:
        dt = dl.find("dt").get_text(strip=True)
        dd = dl.find("dd").get_text(strip=True)
        result[dt] = dd

        if dl.find("a") is not None:
            link = dl.find("a").get("href")
            result[dt + " Link"] = link

    return result

getData(soraSoup)

{'Birthday': 'May 15',
 'Debut Stream': 'September 7, 2017',
 'Height': '160 cm',
 'Unit': 'hololive Generation 0',
 'Illustrator': 'Ordan',
 'Illustrator Link': 'https://twitter.com/ordan',
 'Dream': 'To perform a solo concert in Yokohama',
 'Fan Name': 'Sora-tomo (Sora’s Pals)',
 'Hashtags': 'Stream Tags: #ときのそら生放送#ときのそら実況するってよFan Art: #そらArt',
 'Catchphrases': '“Sora-tomo,” or “Sora’s Pals,” a name given to her fans.“Nun-nun,” a sound she makes when agreeing and nodding along or to pump herself up.Appears frequently in stream chat along with the emoji(๑╹ᆺ╹)',
 'Regular/Specialty Streams': 'Singing streams and chill gaming',
 'Hobbies': 'Karaoke, collecting temple stamps, rhythm games',
 'Likes': 'Sweets, Chinese food, cats, Vocaloid',
 'Special Skills': 'Clearing horror games without batting an eye'}

In [26]:
def getInfo(url: str) -> dict:
    soup = getSoup(url)

    name = getName(soup)
    links = getLinks(soup)
    data = getData(soup)

    return name | links | data

getInfo(talent_link[0])

{'English Name': 'Tokino Sora',
 'Japanese Name': 'ときのそら',
 'Youtube': 'https://www.youtube.com/channel/UCp6993wxpyDPHUpavwDFqgg',
 'Twitter/X': 'https://twitter.com/tokino_sora',
 'Birthday': 'May 15',
 'Debut Stream': 'September 7, 2017',
 'Height': '160 cm',
 'Unit': 'hololive Generation 0',
 'Illustrator': 'Ordan',
 'Illustrator Link': 'https://twitter.com/ordan',
 'Dream': 'To perform a solo concert in Yokohama',
 'Fan Name': 'Sora-tomo (Sora’s Pals)',
 'Hashtags': 'Stream Tags: #ときのそら生放送#ときのそら実況するってよFan Art: #そらArt',
 'Catchphrases': '“Sora-tomo,” or “Sora’s Pals,” a name given to her fans.“Nun-nun,” a sound she makes when agreeing and nodding along or to pump herself up.Appears frequently in stream chat along with the emoji(๑╹ᆺ╹)',
 'Regular/Specialty Streams': 'Singing streams and chill gaming',
 'Hobbies': 'Karaoke, collecting temple stamps, rhythm games',
 'Likes': 'Sweets, Chinese food, cats, Vocaloid',
 'Special Skills': 'Clearing horror games without batting an eye'}

In [27]:
talent_info = []
with cf.ThreadPoolExecutor(max_workers=64) as executor:
    futures = [executor.submit(getInfo, url) for url in talent_link]

    for future in cf.as_completed(futures):
        try:
            talent_info.append(future.result())
        except Exception as e:
            print("%r: ERROR -> %s", (future, e))

talent_info

[{'English Name': 'Robocosan',
  'Japanese Name': 'ロボ子さん',
  'Youtube': 'https://www.youtube.com/channel/UCDqI2jOz0weumE8s7paEk6g',
  'Twitter/X': 'https://twitter.com/robocosan',
  'Birthday': 'May 23',
  'Debut': 'March 9, 2018',
  'Height': '154 cm',
  'Unit': 'hololive Generation 0',
  '3D Modeler': 'Kuromaru9',
  '3D Modeler Link': 'https://twitter.com/kuromaru9',
  'Fan Name': '“Roboser,” from “Robot Circle,” or “Robotics Club”',
  'Hashtags': 'Stream Tag: #ロボ子生放送Fan Art: #ロボ子Art'},
 {'English Name': 'Natsuiro Matsuri',
  'Japanese Name': '夏色まつり',
  'Youtube': 'https://youtube.com/channel/UCQ0UDLQCjY0rmuxCDE38FGg',
  'Twitter/X': 'https://twitter.com/natsuiromatsuri',
  'Birthday': 'July 22',
  'Debut Stream': 'June 1, 2018',
  'Height': '152 cm',
  'Unit': 'hololive 1st Generation',
  'Illustrator': 'Minamura Haruki',
  'Illustrator Link': 'https://twitter.com/halllki',
  'Dream': 'To be the top idol & voice actress!',
  'Fan Name': 'Matsurisu (abbreviation of “Matsuri viewers” 

In [31]:
main_df = pd.DataFrame()

for talent in talent_info:
    talent_df = pd.DataFrame(talent, index=[0])
    main_df = pd.concat([main_df, talent_df], ignore_index=True)

main_df.head()

Unnamed: 0,English Name,Japanese Name,Youtube,Twitter/X,Birthday,Debut,Height,Unit,3D Modeler,3D Modeler Link,...,Heigh,birthday,fan name,hashtag,The type of streams you often do,The Difference of Ayunda and Risu’s Voice,First Appearance,Skills,Memes,Referred to as:
0,Robocosan,ロボ子さん,https://www.youtube.com/channel/UCDqI2jOz0weum...,https://twitter.com/robocosan,May 23,"March 9, 2018",154 cm,hololive Generation 0,Kuromaru9,https://twitter.com/kuromaru9,...,,,,,,,,,,
1,Natsuiro Matsuri,夏色まつり,https://youtube.com/channel/UCQ0UDLQCjY0rmuxCD...,https://twitter.com/natsuiromatsuri,July 22,,152 cm,hololive 1st Generation,,,...,,,,,,,,,,
2,Kazama Iroha,風真いろは,https://www.youtube.com/channel/UC_vMYWcDjmfdp...,https://twitter.com/kazamairohach,June 18,,156 cm,holoX,,,...,,,,,,,,,,
3,Aki Rosenthal,アキ・ローゼンタール,https://www.youtube.com/channel/UCFTLzh12_nrtz...,https://twitter.com/akirosenthal,February 17,,162 cm,hololive 1st Generation,,,...,,,,,,,,,,
4,Tokino Sora,ときのそら,https://www.youtube.com/channel/UCp6993wxpyDPH...,https://twitter.com/tokino_sora,May 15,,160 cm,hololive Generation 0,,,...,,,,,,,,,,
