# In [None]:
# 01_data_collection.ipynb

# Importación de bibliotecas
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

# Scraper for the Rock Werchter history page
def scrape_rock_werchter():
    """
    Scrape historical lineup data from the Rock Werchter festival site.

    Loads the history page in headless Chrome, waits up to 10 seconds for an
    element with class ``history-list`` to be present, then parses every
    ``div.history-item`` section: the text of the section's ``<h2>`` is taken
    as the year and each ``<li>`` as one artist.

    The scrape is best-effort: errors are printed rather than raised, and
    whatever rows were collected before the error are still returned.
    Sections whose year cannot be parsed and blank list items are skipped.

    Returns:
        pandas.DataFrame: One row per artist with the keys ``festival``,
        ``year`` (int), ``artist`` and ``source``. Empty when nothing could
        be collected.
    """
    base_url = "https://www.rockwerchter.be/en/history"
    festival_data = []
    # Bound before the try block so the finally clause can tell whether the
    # browser was ever started; otherwise a startup failure would surface as
    # an UnboundLocalError from driver.quit().
    driver = None

    try:
        # Configure Chrome to run headless (no browser window is opened)
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        driver.get(base_url)

        # Wait until the festival list section is present in the DOM
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "history-list"))
        )

        # Parse the rendered page HTML
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Each matching section holds the lineup of one festival edition
        for section in soup.find_all('div', class_='history-item'):
            heading = section.find('h2')
            year_text = heading.text.strip() if heading is not None else ''
            try:
                year = int(year_text)
            except ValueError:
                # One malformed section should not discard the remaining years
                print(f"Skipping Rock Werchter section with unparseable year: {year_text!r}")
                continue

            for item in section.find_all('li'):
                artist_name = item.text.strip()
                if not artist_name:
                    # Blank list items carry no artist information
                    continue
                festival_data.append({
                    'festival': 'Rock Werchter',
                    'year': year,
                    'artist': artist_name,
                    'source': base_url
                })

    except Exception as e:
        print(f"Error scraping Rock Werchter: {str(e)}")
    finally:
        # Always release the browser, but only if it was actually created
        if driver is not None:
            driver.quit()

    # Return the collected rows as a pandas DataFrame
    return pd.DataFrame(festival_data)

# Scraper for the Pukkelpop history page
def scrape_pukkelpop():
    """
    Scrape historical lineup data from the Pukkelpop festival site.

    Loads the history page in headless Chrome, waits up to 10 seconds for an
    element with class ``history-list`` to be present, then parses every
    ``div.history-year`` section: the text of the section's ``<h2>`` is taken
    as the year and each ``<li>`` as one artist.

    The scrape is best-effort: errors are printed rather than raised, and
    whatever rows were collected before the error are still returned.
    Sections whose year cannot be parsed and blank list items are skipped.

    Returns:
        pandas.DataFrame: One row per artist with the keys ``festival``,
        ``year`` (int), ``artist`` and ``source``. Empty when nothing could
        be collected.
    """
    base_url = "https://www.pukkelpop.be/en/history"
    festival_data = []
    # Bound before the try block so the finally clause can tell whether the
    # browser was ever started; otherwise a startup failure would surface as
    # an UnboundLocalError from driver.quit().
    driver = None

    try:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')  # No GUI (suited to servers)
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        driver.get(base_url)

        # Wait until the festival list section is present in the DOM
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "history-list"))
        )

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Each matching section holds the lineup of one festival edition
        for section in soup.find_all('div', class_='history-year'):
            heading = section.find('h2')
            year_text = heading.text.strip() if heading is not None else ''
            try:
                year = int(year_text)
            except ValueError:
                # One malformed section should not discard the remaining years
                print(f"Skipping Pukkelpop section with unparseable year: {year_text!r}")
                continue

            for item in section.find_all('li'):
                artist_name = item.text.strip()
                if not artist_name:
                    # Blank list items carry no artist information
                    continue
                festival_data.append({
                    'festival': 'Pukkelpop',
                    'year': year,
                    'artist': artist_name,
                    'source': base_url
                })
    except Exception as e:
        print(f"Error scraping Pukkelpop: {str(e)}")
    finally:
        # Always release the browser, but only if it was actually created
        if driver is not None:
            driver.quit()

    return pd.DataFrame(festival_data)

# Main entry point
def main():
    """
    Run both festival scrapers and save the combined result.

    Calls ``scrape_rock_werchter`` and ``scrape_pukkelpop``, concatenates
    their DataFrames with a fresh index, and writes the result to
    ``../data/festival_data_raw.csv`` and ``../data/festival_data_raw.json``
    (paths relative to the current working directory). The ``../data``
    directory is created if it does not exist yet.

    Returns:
        pandas.DataFrame: The combined rows from both scrapers.
    """
    # Local import keeps this function self-contained with respect to the
    # file's top-level import block.
    import os

    print("Scraping Rock Werchter data...")
    werchter_df = scrape_rock_werchter()

    print("Scraping Pukkelpop data...")
    pukkelpop_df = scrape_pukkelpop()

    # Combine the per-festival data
    all_festivals_df = pd.concat([werchter_df, pukkelpop_df], ignore_index=True)

    # Make sure the output directory exists so the scraped data is not lost
    # to an OSError on a fresh checkout.
    os.makedirs('../data', exist_ok=True)

    # Save the collected data
    all_festivals_df.to_csv('../data/festival_data_raw.csv', index=False)
    all_festivals_df.to_json('../data/festival_data_raw.json', orient='records')

    print("Data collection complete!")
    return all_festivals_df

# Run the main function when this module is executed as the main program;
# the combined DataFrame returned by main() is kept in `festival_data`.
if __name__ == "__main__":
    festival_data = main()
