In [1]:
import re

import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.firefox import GeckoDriverManager

# Create Driver and launch scraper

In [2]:
# Configure Firefox: register each command-line switch on the options object.
# NOTE(review): '--sandbox' is passed through unchanged; confirm that Firefox
# actually recognises this switch.
firefox_options = Options()
for switch in ('--headless', '--sandbox'):
    firefox_options.add_argument(switch)

# The value returned by GeckoDriverManager().install() is handed to the
# Firefox Service, which the driver is then started with.
gecko_service = FirefoxService(GeckoDriverManager().install())
driver = webdriver.Firefox(service=gecko_service, options=firefox_options)

# Open the category page that links to the per-book character categories.
page_url = "https://witcher.fandom.com/wiki/Category:Characters_in_the_stories"
driver.get(page_url)

# Build the list of books and the list of characters per book

In [3]:
# Scrape the book categories and, for each one, its character names.
#
# Produces two lists of dicts:
#   books      -> {"book": <link text>, "url": <link href>}
#   characters -> {"book": <book link text>, "character": <link text>}
#
# NOTE(review): only the member links present on the page that driver.get()
# loads are read; if the site splits long categories over several pages, the
# later pages are not visited — confirm against the row counts.

# CSS class carried by every member link on a category page.
MEMBER_LINK_CLASS = "category-page__member-link"

try:
    # Get all member links on the landing page (one per book category)
    books_elements = driver.find_elements(By.CLASS_NAME, MEMBER_LINK_CLASS)

    # Copy name and url out of the elements before navigating away, because
    # the loop below loads other pages with the same driver.
    books = [
        {"book": element.text, "url": element.get_attribute("href")}
        for element in books_elements
    ]

    # Enter each book url and scrape the character and book name into character list
    characters = []
    for book in books:
        driver.get(book["url"])
        character_elements = driver.find_elements(By.CLASS_NAME, MEMBER_LINK_CLASS)
        characters.extend(
            {"book": book["book"], "character": element.text}
            for element in character_elements
        )
finally:
    # Always end the browser session, even if scraping fails part-way, so no
    # headless Firefox / geckodriver process is left running. Re-run the
    # driver-creation cell before scraping again.
    driver.quit()

In [4]:
# Build the frame with explicit columns so that an empty scrape still yields
# 'book' and 'character' columns (otherwise later df['book'] raises KeyError).
df = pd.DataFrame(characters, columns=["book", "character"])
df

Unnamed: 0,book,character
0,Category:Baptism of Fire characters,Adalia
1,Category:Baptism of Fire characters,Adela
2,Category:Baptism of Fire characters,Aen Saevherne
3,Category:Baptism of Fire characters,Aevenien
4,Category:Baptism of Fire characters,Aglaïs
...,...,...
1270,Category:Time of Contempt characters,Yanna of Murivel
1271,Category:Time of Contempt characters,Yarpen Zigrin
1272,Category:Time of Contempt characters,Yennefer of Vengerberg
1273,Category:Time of Contempt characters,Yiolenta Suarez


# Clean DataFrame and store as CSV

In [5]:
# Strip the wiki boilerplate around each book title, one fragment at a time,
# leaving only the title itself in the book column.
book_titles = df['book']
for fragment in ('Category:', ' characters'):
    book_titles = book_titles.str.replace(fragment, '')
df['book'] = book_titles

In [6]:
# Remove parentheses from character names and text within.
# The pattern is a raw string so the backslashes reach the regex engine
# literally instead of being treated as (invalid) string escapes; the match is
# non-greedy, so each "(...)" group is removed separately.
PARENTHESISED = re.compile(r"\(.*?\)")
df['character'] = (
    df['character']
    .str.replace(PARENTHESISED, "", regex=True)
    # Removing "(...)" leaves the space that preceded it (or followed it, when
    # the name starts with a parenthesis); trim it so names compare cleanly.
    .str.strip()
)

# Create first_name column: the part of the name before the first space
df['first_name'] = df['character'].str.split(' ', n=1).str[0]

In [7]:
# Persist the cleaned character table as CSV in the working directory.
# NOTE(review): with to_csv defaults the row index is written as an unnamed
# first column; pass index=False if downstream readers do not need it.
output_path = 'characters.csv'
df.to_csv(output_path)