In [1]:
import os
from typing import List
from pydantic import BaseModel
from src.scrape.scrapers import (
    EventsScraper, 
    FightersScraper, 
    ResultsScraper, 
    RoundsScraper)
from src.scrape.utils import (
    load_yaml, 
    save_models_to_json, 
    convert_models_to_dicts, 
    get_event_urls, 
    get_fight_urls, 
    get_fighter_urls)
from src.logger import setup_logger

# Module-level logger obtained from the project's setup_logger(). It is not
# referenced by any of the cells shown here.
# NOTE(review): presumably this call also configures the handlers that format
# the src.* log lines seen in the run output -- confirm against src.logger.
logger = setup_logger()

In [2]:
def scrape_events(config: dict) -> List[BaseModel]:
    """Scrape event models from the configured "all events" URL.

    Args:
        config: Parsed configuration. ``config['event_urls']['all']`` is
            wrapped in a single-element list and handed to the scraper.

    Returns:
        The models produced by ``EventsScraper.scrape_events``.

    Raises:
        KeyError: If ``event_urls`` or its ``all`` entry is absent.
    """
    scraper = EventsScraper()
    seed_urls = [config['event_urls']['all']]
    return scraper.scrape_events(seed_urls)

def extract_event_urls(events: List[BaseModel]) -> List[str]:
    """Derive event URLs from scraped event models.

    The models are first converted to dictionaries with
    ``convert_models_to_dicts`` and then passed to ``get_event_urls``.

    Args:
        events: Event models, as returned by ``scrape_events``.

    Returns:
        The URL list produced by ``get_event_urls``.
    """
    event_records = convert_models_to_dicts(models=events)
    return get_event_urls(events_data=event_records)

def scrape_results(event_urls: List[str]) -> List[BaseModel]:
    """Scrape fight results for the given event pages.

    Args:
        event_urls: Event page URLs, forwarded unchanged to the scraper.

    Returns:
        The models produced by ``ResultsScraper.scrape_results``.
    """
    scraper = ResultsScraper()
    return scraper.scrape_results(event_urls=event_urls)

def extract_fight_urls(results: List[BaseModel]) -> List[str]:
    """Derive fight URLs from scraped result models.

    The models are first converted to dictionaries with
    ``convert_models_to_dicts`` and then passed to ``get_fight_urls``.

    Args:
        results: Result models, as returned by ``scrape_results``.

    Returns:
        The URL list produced by ``get_fight_urls``.
    """
    result_records = convert_models_to_dicts(models=results)
    return get_fight_urls(results_data=result_records)

def scrape_rounds(fight_urls: List[str]) -> List[BaseModel]:
    """Scrape round-level data for the given fight pages.

    Args:
        fight_urls: Fight page URLs, forwarded unchanged to the scraper.

    Returns:
        The models produced by ``RoundsScraper.scrape_rounds``.
    """
    scraper = RoundsScraper()
    return scraper.scrape_rounds(fight_urls=fight_urls)

def extract_fighter_urls(results: List[BaseModel]) -> List[str]:
    """Derive fighter URLs from scraped result models.

    The models are first converted to dictionaries with
    ``convert_models_to_dicts`` and then passed to ``get_fighter_urls``.

    Args:
        results: Result models, as returned by ``scrape_results``.

    Returns:
        The URL list produced by ``get_fighter_urls``.
    """
    result_records = convert_models_to_dicts(models=results)
    return get_fighter_urls(results_data=result_records)

def scrape_fighters(fighter_urls: List[str]) -> List[BaseModel]:
    """Scrape fighter details for the given fighter pages.

    Args:
        fighter_urls: Fighter page URLs, forwarded unchanged to the scraper.

    Returns:
        The models produced by ``FightersScraper.scrape_fighters``.
    """
    scraper = FightersScraper()
    return scraper.scrape_fighters(fighter_urls=fighter_urls)


In [3]:

# =============================================================================
# Pipeline Runner
# =============================================================================

def run_pipeline(config: dict, root_dir: str) -> None:
    """
    Run the complete scraping pipeline and save the outputs to JSON files.

    The pipeline:
      0. Resolves every output path from the configuration (fail fast).
      1. Scrapes events and extracts event URLs.
      2. Scrapes results based on event URLs and extracts fight URLs.
      3. Scrapes rounds using fight URLs.
      4. Extracts fighter URLs from results and scrapes fighters.
      5. Saves all the scraped models to their respective JSON files.

    Args:
        config: Parsed configuration. Must provide ``event_urls.all`` and an
            ``output_files.raw`` mapping with the keys ``events``,
            ``fighters``, ``rounds`` and ``results``.
        root_dir: Directory that the configured output file paths are joined
            onto.

    Raises:
        KeyError: If ``output_files``, ``raw`` or any of the four output keys
            is missing. This is raised before any scraping starts, so a
            configuration mistake cannot discard a completed scrape.
    """
    # Step 0: Resolve output paths up front. Looking these up only at save
    # time would surface a bad config after the whole scrape had already run.
    raw_outputs = config['output_files']['raw']
    output_paths = {
        key: os.path.join(root_dir, raw_outputs[key])
        for key in ('events', 'fighters', 'rounds', 'results')
    }

    # Step 1: Events
    events = scrape_events(config)
    event_urls = extract_event_urls(events)

    # Step 2: Results
    results = scrape_results(event_urls)
    fight_urls = extract_fight_urls(results)

    # Step 3: Rounds
    rounds = scrape_rounds(fight_urls)

    # Step 4: Fighters (URLs come from the results, not from the rounds)
    fighter_urls = extract_fighter_urls(results)
    fighters = scrape_fighters(fighter_urls)

    # Step 5: Save the models, in the same order as the paths were resolved
    scraped = {
        'events': events,
        'fighters': fighters,
        'rounds': rounds,
        'results': results,
    }

    for key, filepath in output_paths.items():
        save_models_to_json(models=scraped[key], filepath=filepath)

In [4]:
def main():
    """Load ``config/config.yaml`` from the working directory and run the pipeline.

    The absolute path of the current working directory is used both to locate
    the configuration file and as the ``root_dir`` passed to ``run_pipeline``.
    """
    project_root = os.path.abspath('.')
    settings = load_yaml(os.path.join(project_root, 'config', 'config.yaml'))
    run_pipeline(settings, project_root)

In [5]:
# Run the full scrape. In the run recorded below, the log timestamps span
# roughly 17:08 to 17:27, with the rounds stage (17:09 -> 17:24) taking longest.
main()

2025-02-11 17:08:11,099 - src.scrape.scrapers - INFO - Starting concurrent scraping of events.
2025-02-11 17:08:12,412 - src.scrape.scrapers - INFO - Completed scraping events.
2025-02-11 17:08:12,418 - src.scrape.utils - INFO - Successfully extracted 719 event URLs.
2025-02-11 17:08:12,420 - src.scrape.scrapers - INFO - Starting concurrent scraping of fight results.
2025-02-11 17:09:13,376 - src.scrape.scrapers - INFO - Completed scraping fight results.
2025-02-11 17:09:13,425 - src.scrape.utils - INFO - Successfully extracted 8024 unique fight URLs.
2025-02-11 17:09:13,429 - src.scrape.scrapers - INFO - Starting concurrent scraping of rounds data.
2025-02-11 17:24:21,955 - src.scrape.scrapers - INFO - Completed scraping rounds data.
2025-02-11 17:24:22,025 - src.scrape.utils - INFO - Successfully extracted 2564 unique fighter URLs.
2025-02-11 17:24:22,030 - src.scrape.scrapers - INFO - Starting concurrent scraping of fighter details.
2025-02-11 17:27:39,737 - src.scrape.scrapers - IN