In [1]:
import os
import sys

project_path = os.path.abspath("..")

sys.path.append(project_path)

In [2]:
from urllib.parse import urlparse

from bs4 import BeautifulSoup as bs

from src.scrap_hrefs import get_section_hrefs
from src.scrap_infobox import extract_unit_data, find_infobox
from src.utils import fetch_page_content

## Implementation

In [3]:
def get_units_urls(url) -> list:
    
    units_url = url + "wiki/Unit_(Age_of_Empires_III)"
    unit_list_html = fetch_page_content(units_url)
    href_soup = bs(unit_list_html, "html.parser")

    content = href_soup.find('div', class_='mw-parser-output')
    h2s = content.find_all('h2', recursive=False)
    sections = [h2.find('span', class_='mw-headline', recursive=False).text for h2 in h2s]
    # section_unit_hrefs = {section: get_units_hrefs(section, content) for section in sections}
    units_urls = {
        unit: url + href
        for section in sections
        for unit, href in get_section_hrefs(section, content).items()
    }

    return units_urls

In [4]:
url = "https://ageofempires.fandom.com/"

units_urls = get_units_urls(url)
units_urls

{'Axehilt the Tame Tiger': 'https://ageofempires.fandom.com//wiki/Tiger#Axehilt_the_Tame_Tiger',
 'Buck the Tame Wolf': 'https://ageofempires.fandom.com//wiki/Wolf#Buck_the_Tame_Wolf',
 'Carib Blowgun Ambusher': 'https://ageofempires.fandom.com//wiki/Carib_Blowgunner',
 'Fang the Tame Coyote': 'https://ageofempires.fandom.com//wiki/Coyote#Fang_the_Tame_Coyote',
 'Disciple': 'https://ageofempires.fandom.com//wiki/Disciple',
 'Eagle Scout': 'https://ageofempires.fandom.com//wiki/Eagle_Scout_(Age_of_Empires_III)',
 'Envoy': 'https://ageofempires.fandom.com//wiki/Envoy_(Age_of_Empires_III)',
 'Hot Air Balloon': 'https://ageofempires.fandom.com//wiki/Hot_Air_Balloon',
 'Janey the Pet Jaguar': 'https://ageofempires.fandom.com//wiki/Jaguar#Janey_the_Pet_Jaguar',
 'Nootka War Chief': 'https://ageofempires.fandom.com//wiki/Nootka_War_Chief',
 'Peruvian Dog': 'https://ageofempires.fandom.com//wiki/Dog#Peruvian_Dog',
 'Wagner the Pet White Tiger': 'https://ageofempires.fandom.com//wiki/White_Tige

In [5]:
def norm_string(s):
    return s.strip().replace(" ", "_").lower()

In [6]:
def scrape_unit_data(unit, units_urls) -> list:

    TARGET_GAME = "Age of Empires III"

    try:

        url = units_urls[unit]

        unit_html = fetch_page_content(url)
        unit_soup = bs(unit_html, "html.parser")
        infoboxes = unit_soup.find_all("aside", class_="portable-infobox")

        fragment_id = urlparse(url).fragment

        norm_fragment_id = norm_string(fragment_id) if fragment_id else None
        norm_target_game = norm_string(TARGET_GAME)
        norm_unit = norm_string(unit)

        unit_to_search = (norm_unit if norm_fragment_id == norm_target_game else norm_fragment_id) or norm_unit

        infobox = find_infobox(unit_to_search, infoboxes)

        unit_data = extract_unit_data(infobox)

        return unit_data

    except Exception as e:
        raise Exception(f"Error: {e}")

In [13]:
unit = "Hot Air Balloon"
scrape_unit_data(unit, units_urls)

Exception: Error: Infobox not found

In [33]:
data = {}
for unit in units_urls.keys():
    try:
        data[unit] = scrape_unit_data(unit, units_urls)
    except Exception as e:
        print(f"Extraction error for unit: {unit} - {e}")
print(f"Extracted data for {len(data)} units")

Extraction error for unit: Carib Blowgun Ambusher - Error: Infobox not found
Extraction error for unit: Eagle Scout - Error: Information for label 'Bonus damage' could not be extracted in dict format.
Present values: '['Same as ranged attack']'
Extraction error for unit: Hot Air Balloon - Error: Infobox not found
Extraction error for unit: Wagner the Pet White Tiger - Error: Information for label 'Bonus damage' could not be extracted in dict format.
Present values: '['×1.3 vs. Light infantry']'
Extraction error for unit: Buttercup the Pet Cougar - Error: Infobox not found
Extraction error for unit: Eclaireur - Error: Information for label 'Bonus damage' could not be extracted in dict format.
Present values: '['Same as hand attack']'
Extraction error for unit: Militiaman - Error: Information for label 'Bonus damage' could not be extracted in dict format.
Present values: '['Same as ranged attack']'
Extraction error for unit: Revolutionary - Error: Information for label 'Bonus damage' cou

## Scraped Wiki Data Analysis

In [28]:
unit = "Eagle Scout"
unit_url = units_urls[unit]
print("unit_url: ", unit_url)
print()

parsed_url = urlparse(unit_url)

print("scheme: ", parsed_url.scheme)
print("netloc: ", parsed_url.netloc)
print("path: ", parsed_url.path)
print("query: ", parsed_url.query)
print("fragment: ", parsed_url.fragment)
print()

unit_html = fetch_page_content(unit_url)
unit_soup = bs(unit_html, "html.parser")
infoboxes = unit_soup.find_all("aside", class_="portable-infobox")
print("infoboxes: ", len(infoboxes))

unit_url:  https://ageofempires.fandom.com//wiki/Eagle_Scout_(Age_of_Empires_III)

scheme:  https
netloc:  ageofempires.fandom.com
path:  //wiki/Eagle_Scout_(Age_of_Empires_III)
query:  
fragment:  

infoboxes:  1


In [29]:
unit_html = fetch_page_content(unit_url)
unit_soup = bs(unit_html, "html.parser")
infoboxes = unit_soup.find_all("aside", class_="portable-infobox")
fragment_id = urlparse(unit_url).fragment

In [30]:
TARGET_GAME = "Age of Empires III"

def norm_string(s):
    return s.strip().replace(" ", "_").lower()

norm_fragment_id = norm_string(fragment_id) if fragment_id else None
norm_target_game = norm_string(TARGET_GAME)
norm_unit = norm_string(unit)

unit_to_search = (norm_unit if norm_fragment_id == norm_target_game else norm_fragment_id) or norm_unit
unit_to_search

'eagle_scout'

In [31]:
for infobox in infoboxes:

    # Title verification
    game_div = infobox.find("div", class_="pi-data-value")

    if not game_div:
        continue

    game = game_div.text.strip()

    if TARGET_GAME not in game:
        founded = False
        print("Game doesn't match")
        print("TARGET_GAME: ", TARGET_GAME)
        print("game: ", game)
        print()
        continue

    # Unit name verification
    unit_name = infobox.find("h2")
    if not unit_name:
        continue

    norm_infobox = unit_name.text.strip().replace(" ", "_").lower()

    if not unit_to_search == norm_infobox:

        founded = False
        print("Unit name doesn't match")
        print("unit_to_search: ", unit_to_search)
        print("infobox_name: ", norm_infobox)
        print()
        continue

    else:
        founded = True
        print("Unit name matches")
        print("unit_to_search: ", unit_to_search)
        print("infobox_name: ", norm_infobox)
        print()
        break

if founded:
    print("Game matches")
    print("TARGET_GAME: ", TARGET_GAME)
    print("game: ", game)
    print("Infobox found")
else:
    print("Infobox not found")

Unit name matches
unit_to_search:  eagle_scout
infobox_name:  eagle_scout

Game matches
TARGET_GAME:  Age of Empires III
game:  Age of Empires III: Definitive Edition
Infobox found
