In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

project_path = os.path.abspath("..")

sys.path.append(project_path)

In [142]:
from itertools import combinations
from urllib.parse import urlparse
from tqdm import tqdm

from bs4 import BeautifulSoup as bs

from src.scrap_hrefs import get_units_hrefs
from src.scrap_infobox import extract_unit_data
from src.utils import fetch_page_content

In [3]:
aoe_wiki_url = "https://ageofempires.fandom.com/"
units_url = aoe_wiki_url + "wiki/Unit_(Age_of_Empires_III)"
unit_list_html = fetch_page_content(units_url)
href_soup = bs(unit_list_html, "html.parser")
main_page = href_soup.find('main', class_='page__main')


content = href_soup.find('div', class_='mw-parser-output')
h2s = content.find_all('h2', recursive=False)
sections = [h2.find('span', class_='mw-headline', recursive=False).text for h2 in h2s]
section_unit_hrefs = {section: get_units_hrefs(section, content) for section in sections}

In [8]:
# Units in more than one building
unique_units = {}
for key, values in section_unit_hrefs.items():
    unique_units[key] = set(values.keys())

# Generar todas las posibles parejas de llaves
key_pairs = combinations(unique_units.keys(), 2)

for k1, k2 in key_pairs:
    intersection = unique_units[k1] & unique_units[k2]
    if len(intersection) > 0:
        pair = (k1, k2)
        print(pair, " :", intersection)

('Explorer/War Chief/Monk/General', 'Town Center')  : {'Envoy'}
('Town Center', 'Trading Post/Native Embassy/Tambo')  : {'Chasqui'}
('Livestock Pen/Farm', 'Village')  : {'Goat'}
('Dock/Port', 'Revolutionary Variants')  : {'Fire Ship'}
('Dock/Port', 'Untrainable')  : {'Fire Ship'}
('Trading Post/Native Embassy/Tambo', 'Community Plaza')  : {'Healer'}
('Church/Mosque/Meeting House/Cathedral', 'Basilica')  : {'Priest', 'Spy'}
('Artillery Foundry', 'Factory')  : {'Rocket'}
('Kallanka', 'Community Plaza')  : {'Maceman'}
('Revolutionary Variants', 'Untrainable')  : {'Buccaneer Captain', 'Native Boy', 'White Jaguar', "Morgan's Flagship", 'Black Powder Wagon', 'Outlaw Musketeer', 'White Buffalo', 'Cherokee Archer', 'White Wolf', 'Aztec Chief', 'Horse', 'Boneguard Swordsman', 'Great Plains Chief', 'Greta', 'Treasure Ship', 'Great Cannon', 'Flat-bottomed Boat', 'Dinghy', 'Cherokee Horse Archer', 'Fierce Cougar', 'Miner', 'Fire Ship', 'Railroad Worker', 'Boneguard Musketeer', 'Native American Chi

In [10]:
all_hrefs = [href for sect in section_unit_hrefs.values() for href in sect.values()]

In [140]:
def find_infobox(fragment_id, infoboxes):
    TARGET_GAME = "Age of Empires III"
    
    fragment_normalized = fragment_id.strip().replace(" ", "_").lower()
    
    for infobox in infoboxes:
        unit_name = infobox.find("h2")
        if not unit_name:
            continue

        unit_name_normalized = unit_name.text.strip().replace(" ", "_").lower()
        game_div = infobox.find("div", class_="pi-data-value")

        if not game_div:
            continue

        game = game_div.text.strip()

        if TARGET_GAME in game and fragment_normalized == unit_name_normalized:
            return infobox
    
    return None

In [145]:
item_types = {
    "text": [
        "Introduced in",
        "Required Home City Card",
        "Hit points",
        "Speed",
        "Line of Sight",
        "XP train bounty",
        "XP kill bounty",
        "Range",
        "Rate of Fire",
        "Train limit",
        "Ability",
        "Special ability",
        "Area of Effect",
        "Requires",
        "Regeneration",
        "Resource amount",
        "Gatherers",
        "Auto gather",
        "Pronunciation",
        "Garrison",
    ],
    "list": ["Type", "Civilization(s)", "Age", "Trained at", "Fatten rate"],
    "dict": ["Cost", "Train time", "Resistance", "Damage", "Bonus damage", "Fatten rate"],
    "ignore": ["Internal name"],
}

# for url in all_hrefs:
for url in tqdm(all_hrefs, colour="blue"):
    try:
        url = all_hrefs[0]
        unit_html = fetch_page_content(url)
        unit_soup = bs(unit_html, "html.parser")
        infoboxes = unit_soup.find_all("aside", class_="portable-infobox")
        fragment_id = urlparse(url).fragment
        if fragment_id:
            infobox = find_infobox(fragment_id, infoboxes)
        else:
            if len(infoboxes) > 1:
                raise ValueError("Multiple infoboxes found")
            infobox = infoboxes[0]
        if infobox is None:
            raise ValueError("Infobox not found")
        unit_data = extract_unit_data(infobox, item_types)
    except Exception as e:
        print(url)
        print(e)

100%|[34m██████████[0m| 420/420 [08:40<00:00,  1.24s/it]


In [144]:
unit_data

{'name': 'Axehilt the Tame Tiger',
 'Information': {'Introduced in': 'Age of Empires III: The Asian Dynasties',
  'Type': ['Pet', 'Hand infantry', 'Sees stealth*'],
  'Civilization(s)': ['Indians'],
  'Age': ['Exploration Age'],
  'Required Home City Card': '"Favorable Karma"'},
 'Training': {'Trained at': ['Brahmin'],
  'Cost': {'Food': '135'},
  'Train time': {'seconds': '22'},
  'Train limit': '10'},
 'Statistics': {'Hit points': '120',
  'Resistance': {'Hand': '10%'},
  'Speed': '5.0',
  'Line of Sight': '12',
  'XP train bounty': '14',
  'XP kill bounty': '14'},
 'Hand attack': {'Damage': {'Hand': '14'},
  'Rate of Fire': '1.5',
  'Bonus damage': {'x vs.': '1.3',
   'Treasure guardian': '1.2x vs.',
   'Villager': '0.2x vs.'}}}