In [142]:
import requests
import re
import pandas as pd
import concurrent.futures as cf
from bs4 import BeautifulSoup

HOLOLIVE_TALENT_MAIN: str = "https://hololive.hololivepro.com/en/talents"
OUTPUT_CSV: str = "./talent_info.csv"

In [143]:
def getSoup(url: str) -> BeautifulSoup:
    r = requests.get(url)

    return BeautifulSoup(r.content, "html.parser")

In [144]:
soup = getSoup(HOLOLIVE_TALENT_MAIN)
talent_list = soup.find("ul", class_="talent_list clearfix")
talent_list_item = talent_list.find_all("li")
talent_list_item

[<li>
 <a href="https://hololive.hololivepro.com/en/talents/tokino-sora/">
 <figure>
 <img alt="" class="attachment-large size-large wp-post-image" decoding="async" fetchpriority="high" height="340" sizes="(max-width: 340px) 100vw, 340px" src="https://hololive.hololivepro.com/wp-content/uploads/2021/05/tokino_sora_thumb.png" srcset="https://hololive.hololivepro.com/wp-content/uploads/2021/05/tokino_sora_thumb.png 340w, https://hololive.hololivepro.com/wp-content/uploads/2021/05/tokino_sora_thumb-300x300.png 300w, https://hololive.hololivepro.com/wp-content/uploads/2021/05/tokino_sora_thumb-150x150.png 150w" width="340"/></figure>
 <h3>
 Tokino Sora<span>ときのそら</span>
 </h3>
 </a>
 </li>,
 <li>
 <a href="https://hololive.hololivepro.com/en/talents/roboco-san/">
 <figure>
 <img alt="" class="attachment-large size-large wp-post-image" decoding="async" height="340" sizes="(max-width: 340px) 100vw, 340px" src="https://hololive.hololivepro.com/wp-content/uploads/2020/06/roboco-san_thumb.png" 

In [145]:
talent_link = [link.find("a").get("href") for link in talent_list_item]
talent_link

['https://hololive.hololivepro.com/en/talents/tokino-sora/',
 'https://hololive.hololivepro.com/en/talents/roboco-san/',
 'https://hololive.hololivepro.com/en/talents/aki-rosenthal/',
 'https://hololive.hololivepro.com/en/talents/akai-haato/',
 'https://hololive.hololivepro.com/en/talents/shirakami-fubuki/',
 'https://hololive.hololivepro.com/en/talents/natsuiro-matsuri/',
 'https://hololive.hololivepro.com/en/talents/minato-aqua/',
 'https://hololive.hololivepro.com/en/talents/murasaki-shion/',
 'https://hololive.hololivepro.com/en/talents/nakiri-ayame/',
 'https://hololive.hololivepro.com/en/talents/yuzuki-choco/',
 'https://hololive.hololivepro.com/en/talents/oozora-subaru/',
 'https://hololive.hololivepro.com/en/talents/azki/',
 'https://hololive.hololivepro.com/en/talents/ookami-mio/',
 'https://hololive.hololivepro.com/en/talents/sakuramiko/',
 'https://hololive.hololivepro.com/en/talents/nekomata-okayu/',
 'https://hololive.hololivepro.com/en/talents/inugami-korone/',
 'https://

In [146]:
soraSoup: BeautifulSoup = getSoup(talent_link[0])
kiryuSoup: BeautifulSoup = getSoup(talent_link[67])
sanaSoup: BeautifulSoup = getSoup(talent_link[68])
irysSoup: BeautifulSoup = getSoup(talent_link[48])

In [147]:
def getName(soup: BeautifulSoup) -> dict:
    result: dict = {}
    talent_article = soup.find("article", class_="in_talent single")
    talent_name = talent_article.find("div", class_="talent_top").find("h1")

    # For some reason, the simple talent_name.get_text(strip=True) returns both the English and Japanese names concatenated
    # r"^[A-Za-z\s]+&" did not work either
    talent_name_en = [char for char in talent_name.get_text(strip=True) if "A" <= char <= "Z" or "a" <= char <= "z" or char == " " or "(" <= char <= ")"]
    talent_name_en_joined = "".join(talent_name_en)
    talent_name_en_final = ""

    if talent_name_en_joined[0:4] != "Alum":
        talent_name_en_final = talent_name_en_joined
        result["Status"] = "Active"
    else:
        talent_name_en_final = talent_name_en_joined[4:].strip()
        result["Status"] = "Alumni"

    print(talent_name_en_final)

    result["English Name"] = talent_name_en_final if talent_name_en_final[0:4] != "IRyS" else "IRyS"
    result["Japanese Name"] = talent_name.find("span").get_text(strip=True)

    return result

getName(irysSoup)

IRySIRyS


{'Status': 'Active', 'English Name': 'IRyS', 'Japanese Name': 'IRyS'}

In [148]:
def getLinks(soup: BeautifulSoup) -> dict:
    result: dict = {}
    talent_article = soup.find("article", class_="in_talent single")
    
    external_links_list = talent_article.find("ul", class_="t_sns clearfix")
    external_links_list_item = [link.find("a").get("href") for link in external_links_list.find_all("li")]

    result["Youtube"] = external_links_list_item[0].split('?', 1)[0]
    result["Twitter/X"] = external_links_list_item[1]

    return result

getLinks(soraSoup)

{'Youtube': 'https://www.youtube.com/channel/UCp6993wxpyDPHUpavwDFqgg',
 'Twitter/X': 'https://twitter.com/tokino_sora'}

In [149]:
def getData(soup: BeautifulSoup) -> dict:
    result: dict = {}
    talent_article = soup.find("article", class_="in_talent single")

    talent_data_box = talent_article.find("div", class_="talent_data").find("div", class_="table_box")
    talent_data_list = talent_data_box.find_all("dl")
    
    for dl in talent_data_list:
        dt = dl.find("dt").get_text(strip=True)
        dd = dl.find("dd").get_text(strip=True)
        result[dt] = dd

        if dl.find("a") is not None:
            link = dl.find("a").get("href")
            result[dt + " Link"] = link

    return result

getData(soraSoup)

{'Birthday': 'May 15',
 'Debut Stream': 'September 7, 2017',
 'Height': '160 cm',
 'Unit': 'hololive Generation 0',
 'Illustrator': 'Ordan',
 'Illustrator Link': 'https://twitter.com/ordan',
 'Dream': 'To perform a solo concert in Yokohama',
 'Fan Name': 'Sora-tomo (Sora’s Pals)',
 'Hashtags': 'Stream Tags: #ときのそら生放送#ときのそら実況するってよFan Art: #そらArt',
 'Catchphrases': '“Sora-tomo,” or “Sora’s Pals,” a name given to her fans.“Nun-nun,” a sound she makes when agreeing and nodding along or to pump herself up.Appears frequently in stream chat along with the emoji(๑╹ᆺ╹)',
 'Regular/Specialty Streams': 'Singing streams and chill gaming',
 'Hobbies': 'Karaoke, collecting temple stamps, rhythm games',
 'Likes': 'Sweets, Chinese food, cats, Vocaloid',
 'Special Skills': 'Clearing horror games without batting an eye'}

In [150]:
def getInfo(url: str, url_index: int) -> dict:
    soup = getSoup(url)

    name = getName(soup)
    links = getLinks(soup)
    data = getData(soup)

    return {"Index": url_index} | name | links | data

getInfo(talent_link[0], 0)

Tokino Sora


{'Index': 0,
 'Status': 'Active',
 'English Name': 'Tokino Sora',
 'Japanese Name': 'ときのそら',
 'Youtube': 'https://www.youtube.com/channel/UCp6993wxpyDPHUpavwDFqgg',
 'Twitter/X': 'https://twitter.com/tokino_sora',
 'Birthday': 'May 15',
 'Debut Stream': 'September 7, 2017',
 'Height': '160 cm',
 'Unit': 'hololive Generation 0',
 'Illustrator': 'Ordan',
 'Illustrator Link': 'https://twitter.com/ordan',
 'Dream': 'To perform a solo concert in Yokohama',
 'Fan Name': 'Sora-tomo (Sora’s Pals)',
 'Hashtags': 'Stream Tags: #ときのそら生放送#ときのそら実況するってよFan Art: #そらArt',
 'Catchphrases': '“Sora-tomo,” or “Sora’s Pals,” a name given to her fans.“Nun-nun,” a sound she makes when agreeing and nodding along or to pump herself up.Appears frequently in stream chat along with the emoji(๑╹ᆺ╹)',
 'Regular/Specialty Streams': 'Singing streams and chill gaming',
 'Hobbies': 'Karaoke, collecting temple stamps, rhythm games',
 'Likes': 'Sweets, Chinese food, cats, Vocaloid',
 'Special Skills': 'Clearing horror ga

In [151]:
talent_info = []
with cf.ThreadPoolExecutor(max_workers=64) as executor:
    futures = [executor.submit(getInfo, url, talent_link.index(url)) for url in talent_link]

    for future in cf.as_completed(futures):
        try:
            talent_info.append(future.result())
        except Exception as e:
            print("%r: ERROR -> %s", (future, e))

talent_info

Minato AquaAki Rosenthal
Natsuiro Matsuri

Hoshimachi Suisei
Tokino Sora
Shirakami Fubuki
Amane KanataMoona Hoshinova

Sakura Miko
Sakamata Chloe
Omaru Polka
Otonose Kanade
Anya Melfissa
Koseki Bijou
Yukihana Lamy
Airani Iofifteen
Takane Lui
Hiodoshi Ao
Murasaki Shion
Himemori Luna
Mococo Abyssgard
Raora Panthera
Akai Haato
Robocosan
Kaela Kovalskia
Inugami Korone
Shiranui Flare
AZKi
IRySIRyS
Tokoyami Towa
Yuzuki Choco
Mori Calliope
Oozora Subaru
Ouro Kronii
Fuwawa Abyssgard
Ninomae Inanis
Usada Pekora
Elizabeth Rose Bloodflame
Nerissa Ravencroft
Ceres Fauna
Kobo Kanaeru
Tsunomaki Watame
Ookami Mio
Momosuzu NeneShiori Novella
Hakui Koyori
Cecilia Immergreen
Watson Amelia
Hakos Baelz
Takanashi Kiara

Vestia Zeta
Nanashi Mumei
Shishiro Botan
Gawr Gura
Tsukumo Sana
Nekomata Okayu
Pavolia Reine
Kazama Iroha
Houshou Marine
Todoroki Hajime
Shirogane Noel
La Darknesss
Ichijou RirikaGigi Murin
Friend A (Achan)A
Nakiri Ayame

Kiryu Coco
Harusaki Nodoka
Juufuutei Raden
Kureiji Ollie
Ayunda Risu


[{'Index': 2,
  'Status': 'Active',
  'English Name': 'Aki Rosenthal',
  'Japanese Name': 'アキ・ローゼンタール',
  'Youtube': 'https://www.youtube.com/channel/UCFTLzh12_nrtzqBPsTCqenA',
  'Twitter/X': 'https://twitter.com/akirosenthal',
  'Birthday': 'February 17',
  'Debut Stream': 'June 1, 2018',
  'Height': '162 cm',
  'Unit': 'hololive 1st Generation',
  'Illustrator': 'Azumi Akitake',
  'Illustrator Link': 'https://twitter.com/akitake_a',
  'Dreams/Goals': 'An AkiRose in every household!I wanna be able to meet everyone frequently using VR and AR technology!I wanna try doing an entertainment show some day!',
  'Fan Name': 'Rose Knights',
  'Hashtags': 'Stream Tag: #アキびゅーわーるどFan Art: #アロ絵',
  'Message': 'I’d love to become your friendly neighborhood lady who makes your lives colorful! Let’s experience lots of things together!',
  'Regular/Specialty Streams': 'Evening drinking chat, karaoke (lots of nostalgic songs), and Let’s PlaysPeople often ask me to streamHorror game Let’s Plays by a pro

In [152]:
main_df = pd.DataFrame()

for talent in talent_info:
    talent_df = pd.DataFrame(talent, index=[talent["Index"]])
    main_df = pd.concat([main_df, talent_df], ignore_index=True)

main_df.head()

Unnamed: 0,Index,Status,English Name,Japanese Name,Youtube,Twitter/X,Birthday,Debut Stream,Height,Unit,...,IllustratorLive2D Designer Link,Heigh,First Appearance,Memes,Skills,birthday,fan name,hashtag,The type of streams you often do,The Difference of Ayunda and Risu’s Voice
0,2,Active,Aki Rosenthal,アキ・ローゼンタール,https://www.youtube.com/channel/UCFTLzh12_nrtz...,https://twitter.com/akirosenthal,February 17,"June 1, 2018",162 cm,hololive 1st Generation,...,,,,,,,,,,
1,5,Active,Natsuiro Matsuri,夏色まつり,https://youtube.com/channel/UCQ0UDLQCjY0rmuxCD...,https://twitter.com/natsuiromatsuri,July 22,"June 1, 2018",152 cm,hololive 1st Generation,...,,,,,,,,,,
2,6,Active,Minato Aqua,湊あくあ,https://www.youtube.com/channel/UC1opHUrw8rvns...,https://twitter.com/minatoaqua,December 1,"August 8, 2018",148 cm,hololive 2nd Generation,...,,,,,,,,,,
3,16,Active,Hoshimachi Suisei,星街すいせい,https://www.youtube.com/channel/UC5CwaMl1eIgY8...,https://twitter.com/suisei_hosimati,March 22,,160 cm,hololive Generation 0,...,,,,,,,,,,
4,0,Active,Tokino Sora,ときのそら,https://www.youtube.com/channel/UCp6993wxpyDPH...,https://twitter.com/tokino_sora,May 15,"September 7, 2017",160 cm,hololive Generation 0,...,,,,,,,,,,


In [153]:
main_df["Debut Stream"] = pd.to_datetime(main_df["Debut Stream"])
main_df = main_df.set_index("Index")
main_df

Unnamed: 0_level_0,Status,English Name,Japanese Name,Youtube,Twitter/X,Birthday,Debut Stream,Height,Unit,Illustrator,...,IllustratorLive2D Designer Link,Heigh,First Appearance,Memes,Skills,birthday,fan name,hashtag,The type of streams you often do,The Difference of Ayunda and Risu’s Voice
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,Active,Aki Rosenthal,アキ・ローゼンタール,https://www.youtube.com/channel/UCFTLzh12_nrtz...,https://twitter.com/akirosenthal,February 17,2018-06-01,162 cm,hololive 1st Generation,Azumi Akitake,...,,,,,,,,,,
5,Active,Natsuiro Matsuri,夏色まつり,https://youtube.com/channel/UCQ0UDLQCjY0rmuxCD...,https://twitter.com/natsuiromatsuri,July 22,2018-06-01,152 cm,hololive 1st Generation,Minamura Haruki,...,,,,,,,,,,
6,Active,Minato Aqua,湊あくあ,https://www.youtube.com/channel/UC1opHUrw8rvns...,https://twitter.com/minatoaqua,December 1,2018-08-08,148 cm,hololive 2nd Generation,gaou,...,,,,,,,,,,
16,Active,Hoshimachi Suisei,星街すいせい,https://www.youtube.com/channel/UC5CwaMl1eIgY8...,https://twitter.com/suisei_hosimati,March 22,NaT,160 cm,hololive Generation 0,,...,,,,,,,,,,
0,Active,Tokino Sora,ときのそら,https://www.youtube.com/channel/UCp6993wxpyDPH...,https://twitter.com/tokino_sora,May 15,2017-09-07,160 cm,hololive Generation 0,Ordan,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,Alumni,Kiryu Coco,桐生ココ,https://www.youtube.com/channel/UCS9uQI-jC3DE0...,https://twitter.com/kiryucoco,June 17,2019-12-28,180 cm (7 m in dragon form),hololive 4th Generation,yaman**,...,,,,,,,,,,
70,Active,Harusaki Nodoka,春先のどか,https://www.youtube.com/channel/UCJFZiqLMntJuf...,https://twitter.com/harusakinodoka,May 7,NaT,154 cm,Office Staff,おるだん,...,,,"April 1, 2022",,Making pretty images,,,,,
65,Active,Juufuutei Raden,儒烏風亭らでん,https://www.youtube.com/@JuufuuteiRaden,https://twitter.com/juufuuteiraden,February 4,2023-09-10,159 cm,ReGLOSS,Kaoming,...,,,,,,,,,,
37,Active,Kureiji Ollie,クレイジー・オリー,https://www.youtube.com/channel/UCYz_5n-uDuChH...,https://twitter.com/kureijiollie,October 13,NaT,,hololive Indonesia,LAM,...,,162 cm,,,,,,,,


In [154]:
main_df = main_df.sort_values(by="Index")
main_df

Unnamed: 0_level_0,Status,English Name,Japanese Name,Youtube,Twitter/X,Birthday,Debut Stream,Height,Unit,Illustrator,...,IllustratorLive2D Designer Link,Heigh,First Appearance,Memes,Skills,birthday,fan name,hashtag,The type of streams you often do,The Difference of Ayunda and Risu’s Voice
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Active,Tokino Sora,ときのそら,https://www.youtube.com/channel/UCp6993wxpyDPH...,https://twitter.com/tokino_sora,May 15,2017-09-07,160 cm,hololive Generation 0,Ordan,...,,,,,,,,,,
1,Active,Robocosan,ロボ子さん,https://www.youtube.com/channel/UCDqI2jOz0weum...,https://twitter.com/robocosan,May 23,NaT,154 cm,hololive Generation 0,,...,,,,,,,,,,
2,Active,Aki Rosenthal,アキ・ローゼンタール,https://www.youtube.com/channel/UCFTLzh12_nrtz...,https://twitter.com/akirosenthal,February 17,2018-06-01,162 cm,hololive 1st Generation,Azumi Akitake,...,,,,,,,,,,
3,Active,Akai Haato,赤井はあと,https://www.youtube.com/channel/UC1CfXB_kRs3C-...,https://twitter.com/akaihaato,August 10,2018-06-02,154 cm,hololive 1st Generation,Haruyuki,...,,,,,,,,,,
4,Active,Shirakami Fubuki,白上フブキ,https://www.youtube.com/channel/UCdn5BQ06XqgXo...,https://twitter.com/shirakamifubuki,October 5,2018-06-01,155cm,hololive 1st Generation/Gamers,Nagishiro Mito,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,Active,Todoroki Hajime,轟はじめ,https://www.youtube.com/@TodorokiHajime,https://twitter.com/todoroki_hajime,June 7,2023-09-10,155 cm,ReGLOSS,Achiki,...,,,,,,,,,,
67,Alumni,Kiryu Coco,桐生ココ,https://www.youtube.com/channel/UCS9uQI-jC3DE0...,https://twitter.com/kiryucoco,June 17,2019-12-28,180 cm (7 m in dragon form),hololive 4th Generation,yaman**,...,,,,,,,,,,
68,Alumni,Tsukumo Sana,九十九佐命,https://www.youtube.com/channel/UCsUj0dszADCGb...,https://twitter.com/tsukumosana,June 10,2021-08-23,"169cm (With limiter, and without limiter 1697 ...",hololive English -Council-,pako,...,,,,,,,,,,
69,Active,Friend A (Achan)A,友人A（えーちゃん）,https://www.youtube.com/channel/UCJFZiqLMntJuf...,https://twitter.com/achan_UGA,February 28,NaT,157.1 cm,Office Staff,Ordan,...,,,"February 28, 2018",MondayAll-out screaminghologra extra,,,,,,


In [155]:
main_df.to_csv(OUTPUT_CSV, index=False)