In [1]:
import os
import sys

project_path = os.path.abspath("..")

sys.path.append(project_path)

In [2]:
from urllib.parse import urlparse

from bs4 import BeautifulSoup as bs

from src.scrap_hrefs import get_section_hrefs
from src.scrap_infobox import extract_unit_data
from src.utils import fetch_page_content

## Implementation

In [3]:
def get_units_urls(url) -> list:
    
    units_url = url + "wiki/Unit_(Age_of_Empires_III)"
    unit_list_html = fetch_page_content(units_url)
    href_soup = bs(unit_list_html, "html.parser")

    content = href_soup.find('div', class_='mw-parser-output')
    h2s = content.find_all('h2', recursive=False)
    sections = [h2.find('span', class_='mw-headline', recursive=False).text for h2 in h2s]
    # section_unit_hrefs = {section: get_units_hrefs(section, content) for section in sections}
    units_urls = [
        url + href
        for section in sections
        for href in get_section_hrefs(section, content).values()
    ]

    return units_urls

In [4]:
url = "https://ageofempires.fandom.com/"

units_urls = get_units_urls(url)
units_urls

['https://ageofempires.fandom.com//wiki/Tiger#Axehilt_the_Tame_Tiger',
 'https://ageofempires.fandom.com//wiki/Wolf#Buck_the_Tame_Wolf',
 'https://ageofempires.fandom.com//wiki/Carib_Blowgunner',
 'https://ageofempires.fandom.com//wiki/Coyote#Fang_the_Tame_Coyote',
 'https://ageofempires.fandom.com//wiki/Disciple',
 'https://ageofempires.fandom.com//wiki/Eagle_Scout_(Age_of_Empires_III)',
 'https://ageofempires.fandom.com//wiki/Envoy_(Age_of_Empires_III)',
 'https://ageofempires.fandom.com//wiki/Hot_Air_Balloon',
 'https://ageofempires.fandom.com//wiki/Jaguar#Janey_the_Pet_Jaguar',
 'https://ageofempires.fandom.com//wiki/Nootka_War_Chief',
 'https://ageofempires.fandom.com//wiki/Dog#Peruvian_Dog',
 'https://ageofempires.fandom.com//wiki/White_Tiger_(Age_of_Empires_III)#Wagner_the_Pet_White_Tiger',
 'https://ageofempires.fandom.com//wiki/Dog#Age_of_Empires_III',
 'https://ageofempires.fandom.com//wiki/Architect',
 'https://ageofempires.fandom.com//wiki/Barbary_Warrior',
 'https://ageofe

In [5]:
def get_infoboxes(url, target_game):

    try:

        unit_html = fetch_page_content(url)
        unit_soup = bs(unit_html, "html.parser")
        infoboxes = unit_soup.find_all("aside", class_="portable-infobox")

        valid_infoboxes = []
        for infobox in infoboxes:
            
            game_div = infobox.find("div", class_="pi-data-value")
            
            if not game_div:
                continue

            game = game_div.text.strip()

            if target_game in game:
                valid_infoboxes.append(infobox)

        return valid_infoboxes

    except Exception as e:
        raise Exception(f"Error: {e}")

In [9]:
url = "https://ageofempires.fandom.com/wiki/Gatling_Gun"
infoboxes = get_infoboxes(url, "Age of Empires III")
unit_data = extract_unit_data(infoboxes[0])
unit_data

Error extracting data form block 'Bombard attack' in 'Gatling Gun' infobox: Information for label 'Damage' could not be extracted in dict format.
Present values: ['30 (6)']


{'name': 'Gatling Gun',
 'Information': {'Introduced in': 'Age of Empires III: The WarChiefs',
  'Type': ['Artillery'],
  'Civilization(s)': ['Revolutionary',
   'United States',
   'United States (revolutionary)'],
  'Age': ['Fortress Age (United States)', 'Revolution (revolutionary)']},
 'Training': {'Trained at': ['Artillery Foundry',
   'Fort',
   'Steamer (United States)',
   'American Citadel'],
  'Cost': {'Wood': '100', 'Coin': '250', 'Population': '4'},
  'Train time': {'seconds': '38'}},
 'Statistics': {'Hit points': '150',
  'Resistance': {'Ranged': '75%'},
  'Speed': '4.0',
  'Line of Sight': '26',
  'Train XP': '35',
  'Kill XP': '35'}}

In [7]:
data = []
TARGET_GAME = "Age of Empires III"
for url in units_urls:
    infoboxes = get_infoboxes(url, TARGET_GAME)
    for infobox in infoboxes:
        try:
            unit_data = extract_unit_data(infobox)
            data.append(unit_data)
        except Exception as e:
            print(f"Extraction error for {url}: {e}")

print(f"Extracted data for {len(data)} units")

KeyboardInterrupt: 