In [None]:
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

def nuxt_getting(url, max_retries=5, delay=5):
    """Load ``url`` in headless Chrome and return the page's ``window.__NUXT__`` object.

    A single browser session is started and reused for every attempt. An
    attempt fails when the page load or script execution raises, or when
    ``window.__NUXT__`` is missing or empty.

    Args:
        url: Page to open.
        max_retries: Maximum number of attempts (at least one is always made).
        delay: Seconds to wait between a failed attempt and the next one.

    Returns:
        The NUXT payload as plain JSON-compatible Python data, or ``None`` if
        every attempt failed.
    """
    options = webdriver.ChromeOptions()
    # The ``options.headless`` setter was removed from recent Selenium 4
    # releases; assigning to it is silently ignored there and a visible
    # browser window opens. The command-line flag works regardless.
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    attempts = max(1, max_retries)
    try:
        for attempt in range(1, attempts + 1):
            try:
                # Open the page
                driver.get(url)

                # Give the page some time to load (adjust the sleep time if necessary)
                time.sleep(3)

                # Extract the window.__NUXT__ variable embedded in the page's JavaScript
                nuxt_data = driver.execute_script('return window.__NUXT__;')
                if not nuxt_data:
                    raise ValueError("NUXT data is None or empty")

                # Round-trip through JSON so callers always receive plain
                # dicts/lists/scalars; a non-serialisable payload counts as a
                # failed attempt.
                return json.loads(json.dumps(nuxt_data, ensure_ascii=False))

            except Exception as e:
                print(f"Error fetching data for URL {url}: {e}")
                if attempt < attempts:
                    print(f"Retrying in {delay} seconds... (Attempt {attempt + 1}/{max_retries})")
                    time.sleep(delay)

        print(f"Failed to fetch data for URL {url} after {max_retries} attempts.")
        return None  # Callers treat None as "skip this link"

    finally:
        # Close the browser whether we succeeded, gave up, or were interrupted
        driver.quit()


In [None]:
# Collect event links from the listing page

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
from bs4 import BeautifulSoup

# Function to load all events dynamically
def load_all_events(driver, url, max_events=200):
    """Open ``url`` and keep pressing "Load more" until enough events are listed.

    The loop ends when at least ``max_events`` event anchors are present, when
    the button can no longer be found/clicked, or when several consecutive
    clicks add no new events.

    Args:
        driver: Selenium WebDriver used to drive the page.
        url: Listing page to open.
        max_events: Stop once this many ``a.event-list-item`` anchors exist.

    Returns:
        None. The loaded page stays in ``driver`` for later parsing.
    """
    # Local import keeps this cell self-contained; selenium is already a
    # dependency of the notebook.
    from selenium.common.exceptions import ElementClickInterceptedException

    driver.get(url)
    time.sleep(3)  # Allow initial page load

    wait = WebDriverWait(driver, 10)
    max_stalled_rounds = 3  # consecutive clicks allowed to add nothing
    stalled_rounds = 0
    events_loaded = 0
    while events_loaded < max_events:
        try:
            # Scroll to the bottom of the page
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Wait for dynamic content to load

            # Locate the "Load more" button
            load_more_button = wait.until(
                EC.element_to_be_clickable((By.XPATH, "//span[contains(@class, 'btn cursor-pointer')]"))
            )
            # Centre the button in the viewport: aligning it to the top edge
            # can leave it underneath another element (presumably a fixed
            # header or overlay), which makes the native click fail.
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", load_more_button)
            try:
                load_more_button.click()
            except ElementClickInterceptedException:
                # Something still covers the button; dispatch the click from
                # JavaScript, which is not subject to the hit test.
                driver.execute_script("arguments[0].click();", load_more_button)
            time.sleep(3)  # Allow new events to load

            # Count the number of events loaded so far
            soup = BeautifulSoup(driver.page_source, "html.parser")
            current_count = len(soup.find_all("a", class_="event-list-item"))
            print(f"Loaded {current_count} events so far...")

            # Guard against looping forever when the button stays clickable
            # but the list has stopped growing.
            if current_count > events_loaded:
                stalled_rounds = 0
            else:
                stalled_rounds += 1
                if stalled_rounds >= max_stalled_rounds:
                    print(f"Event count stopped growing at {current_count}; stopping.")
                    break
            events_loaded = current_count

        except Exception as e:
            print(f"No more 'Load more' button or error occurred: {e}")
            break

# Function to gather event links
def get_event_links(driver):
    """Return the URLs of all event anchors in the driver's current page.

    Anchors are matched by the ``event-list-item`` class. Hrefs that start
    with ``/`` are prefixed with the site root; other non-empty hrefs are
    returned as they are, and anchors without an href are ignored.
    """
    site_root = "https://iticket.az"

    parsed_page = BeautifulSoup(driver.page_source, "html.parser")
    anchors = parsed_page.find_all("a", class_="event-list-item")

    # Pull every href first, then keep only the non-empty ones.
    raw_hrefs = (anchor.get("href") for anchor in anchors)
    event_links = [
        site_root + target if target.startswith("/") else target
        for target in raw_hrefs
        if target
    ]

    print(f"Found {len(event_links)} event links.")
    return event_links



In [None]:
# Save the collected event links to a text file
def run_program():
    """Collect event links from the iTicket events listing and write them to a text file.

    Starts a Chrome session, expands the listing, extracts the links and
    stores them one per line. Any error is printed rather than raised, and
    the browser is always closed.
    """
    browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    browser.maximize_window()

    listing_url = "https://iticket.az/en/events"
    links_path = "event_links_data_team_18.txt"

    try:
        # Expand the listing page
        print("Loading all events...")
        load_all_events(browser, listing_url)

        # Pull the links out of the expanded page
        print("Extracting event links...")
        collected_links = get_event_links(browser)

        # One link per line
        with open(links_path, "w") as out_file:
            out_file.writelines(link + "\n" for link in collected_links)

        print("Event links saved to 'event_links_data_team_18.txt'.")

    except Exception as err:
        print(f"An error occurred: {err}")

    finally:
        # Always release the browser
        browser.quit()
        print("WebDriver closed.")

# Entry point of the script
if __name__ == "__main__":
    run_program()


Loading all events...
Loaded 30 events so far...
Loaded 45 events so far...
Loaded 60 events so far...
Loaded 75 events so far...
Loaded 90 events so far...
Loaded 105 events so far...
Loaded 120 events so far...
Loaded 135 events so far...
Loaded 150 events so far...
Loaded 165 events so far...
Loaded 180 events so far...
No more 'Load more' button or error occurred: Message: element click intercepted: Element is not clickable at point (711, 2418)
  (Session info: chrome=131.0.6778.109)
Stacktrace:
	GetHandleVerifier [0x00943433+25059]
	(No symbol) [0x008CCE34]
	(No symbol) [0x007ABEC3]
	(No symbol) [0x007F5D37]
	(No symbol) [0x007F4189]
	(No symbol) [0x007F1DAD]
	(No symbol) [0x007F10AF]
	(No symbol) [0x007E5FD7]
	(No symbol) [0x00811EFC]
	(No symbol) [0x007E5A24]
	(No symbol) [0x00812194]
	(No symbol) [0x0082B51E]
	(No symbol) [0x00811C96]
	(No symbol) [0x007E3FAC]
	(No symbol) [0x007E4F3D]
	GetHandleVerifier [0x00C35593+3113795]
	GetHandleVerifier [0x00C4A25A+3198986]
	GetHandleVer

In [None]:
def scrape_event_data(nuxt_data):
    """Flatten one event page's NUXT payload into a single-level dict.

    Args:
        nuxt_data: Parsed ``window.__NUXT__`` data. The event is read from
            ``nuxt_data['data'][0]['event']`` and site settings from
            ``nuxt_data['config']``.

    Returns:
        dict: 24 fields describing the event. Keys that are absent from the
        payload fall back to ``'N/A'``, ``0``, ``False`` or ``[]`` depending
        on the field. A missing or null ``config``, ``meta`` or ``poster``
        section yields the defaults for every field read from it.

    Raises:
        KeyError: If ``'data'`` or ``'event'`` is missing.
        IndexError: If ``nuxt_data['data']`` is empty.
    """
    # Main Event Details
    event_details = nuxt_data['data'][0]['event']

    # ``.get(key, {})`` only covers a *missing* key; a key that is present
    # with a JSON null comes back as None and would break the chained lookup,
    # so normalise each nested section to a dict first.
    config = nuxt_data.get('config') or {}
    meta = event_details.get('meta') or {}
    poster = event_details.get('poster') or {}

    # Main Event Data
    event_data = {
        "currency": config.get('currency', 'N/A'),
        "asset_url": config.get('assetsURL', 'N/A'),
        "map_url": config.get('mapURL', 'N/A'),

        "age_limit": event_details.get('age_limit', 'N/A'),
        "artist_bg_url": event_details.get('artist_bg_url', 'N/A'),
        "artist_url": event_details.get('artist_url', 'N/A'),
        "available_tickets": event_details.get('available_tickets_count', 0),
        "category_slug": event_details.get('category_slug', 'N/A'),
        "cover_url": event_details.get('cover_url', 'N/A'),
        "description": event_details.get('description', 'N/A'),
        "event_start": event_details.get('event_starts_at', 'N/A'),
        "event_end": event_details.get('event_ends_at', 'N/A'),
        "is_refundable": event_details.get('is_refundable', False),
        "max_price": event_details.get('max_price', 0),
        "min_price": event_details.get('min_price', 0),
        "gallery_images": event_details.get('gallery', []),
        "facebook_url": event_details.get('facebook_url', 'N/A'),

        # Meta Information
        "description_meta": meta.get('description', 'N/A'),
        "title_meta": meta.get('title', 'N/A'),
        "event_name": event_details.get('name', 'N/A'),

        # Poster Information
        "poster_created_at": poster.get('created_at', 'N/A'),
        "poster_galleryable_type": poster.get('galleryable_type', 'N/A'),
        "poster_updated_at": poster.get('updated_at', 'N/A'),
        # NOTE(review): read from the 'tag' key although the column is named
        # "tag_count" -- confirm this is the intended source field.
        "poster_tag_count": poster.get('tag', 0)
    }

    return event_data


In [None]:
import csv
def create_columns():
    """Create ``scraped_data_team_18.csv`` containing only a header row.

    Any existing file with that name is overwritten. Each column name appears
    exactly once: repeating a name would make ``csv.DictWriter`` emit
    duplicate columns.
    """
    # Define headers
    headers = [
        "currency",
        "asset_url",
        "map_url",
        "age_limit",
        "artist_bg_url",
        "artist_url",
        "available_tickets",
        "category_slug",
        "cover_url",
        "description",
        "event_start",
        "event_end",
        "is_refundable",
        "max_price",
        "min_price",
        "gallery_images",
        "facebook_url",
        "description_meta",
        "title_meta",
        "event_name",
        "poster_created_at",
        "poster_galleryable_type",
        "poster_updated_at",
        "poster_tag_count",
        # NOTE(review): scrape_event_data() does not produce the next four
        # columns; they are kept so the header stays a superset of the
        # previous one -- confirm whether they are still wanted.
        "available_tickets_count",
        "name",
        "sell_ends_at",
        "sell_starts_at",
        "event_link",
    ]

    # Open a file to write; UTF-8 matches the default encoding pandas uses
    # when it later writes to the same path.
    with open("scraped_data_team_18.csv", "w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=headers)

        # Write headers
        writer.writeheader()


if __name__ == "__main__":
    create_columns()



In [None]:
import pandas as pd
import requests  # Assuming you use requests or similar library

def _scrape_link(link):
    """Scrape one event page; return its record dict, or None if the link must be skipped."""
    try:
        # Fetch nuxt_data
        nuxt_data = nuxt_getting(link, max_retries=5, delay=5)

        if not nuxt_data:
            print(f"Nuxt data did not exist for this link: {link}. Skipping...")
            return None

        try:
            # Probe for the event details before doing the full extraction
            nuxt_data['data'][0]['event']
        except (KeyError, IndexError):
            print(f"'event' key not found in nuxt_data for link: {link}. Skipping...")
            return None

        # Process event data
        event_data = scrape_event_data(nuxt_data)
        event_data["event_link"] = link
        return event_data

    except requests.exceptions.Timeout:
        print(f"Request timed out for link: {link}. Skipping...")
    except requests.exceptions.ConnectionError as e:
        print(f"Connection error for link: {link}: {e}. Skipping...")
    except Exception as e:
        print(f"An unexpected error occurred for link: {link}: {e}. Skipping...")
    return None


def _write_chunk(records, output_csv, first_write):
    """Write ``records`` to ``output_csv``: overwrite with a header on the first write, append after."""
    df = pd.DataFrame(records)
    if first_write:
        df.to_csv(output_csv, mode='w', index=False)
    else:
        df.to_csv(output_csv, mode='a', index=False, header=False)
    print(f"Saved {len(records)} records to {output_csv}.")


def process_event_links(filename, output_csv, chunk_size=5):
    """Read event links from a file, scrape each one, and write the results to a CSV in chunks.

    Args:
        filename: Text file with one event URL per line; blank lines are ignored.
        output_csv: Destination CSV. It is overwritten by the first chunk that
            contains data and appended to afterwards.
        chunk_size: Number of links processed between writes.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.

    Links that fail to scrape are reported and skipped. If nothing could be
    scraped at all, ``output_csv`` is left untouched.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    with open(filename, 'r') as file:
        # Drop blank lines so they are not handed to the browser as URLs.
        event_links = [line.strip() for line in file if line.strip()]

    pending = []        # records scraped since the last write
    wrote_any = False   # becomes True once the header has been written

    for i, link in enumerate(event_links, start=1):
        print(f"Scraping data for: {link}")

        record = _scrape_link(link)
        if record is not None:
            pending.append(record)

        # The flush check runs for every link, including skipped ones, so a
        # skipped link on a chunk boundary (or at the very end) can no longer
        # strand already-scraped records in memory.
        if pending and (i % chunk_size == 0 or i == len(event_links)):
            # Header/overwrite is decided by whether anything has been written
            # yet, not by the link index: the first chunk(s) may be empty.
            _write_chunk(pending, output_csv, first_write=not wrote_any)
            wrote_any = True
            pending = []  # Reset for next chunk

    if wrote_any:
        print(f"Data successfully saved to {output_csv}.")
    else:
        print(f"No event data was scraped; {output_csv} was not written.")



In [None]:
import pandas as pd

def main():
    """Scrape every link in the saved links file into the team CSV."""
    process_event_links(
        "event_links_data_team_18.txt",  # input: one event URL per line
        "scraped_data_team_18.csv",      # output: scraped event records
    )


if __name__ == "__main__":
    main()


Scraping data for: https://iticket.az/en/events/concerts/subscription-number-1a
Scraping data for: https://iticket.az/en/events/sport/29th-baku-rhythmic-gymnastics-championship
Scraping data for: https://iticket.az/en/events/master-class/creative-master-class-for-kids
Scraping data for: https://iticket.az/en/events/kids/neon-creativity
Scraping data for: https://iticket.az/en/events/theatre/the-2115-train
Saved 5 records to output.csv.
Scraping data for: https://iticket.az/en/events/concerts/con-tempo-contemporary-music-here-and-now
An unexpected error occurred for link: https://iticket.az/en/events/concerts/con-tempo-contemporary-music-here-and-now: HTTPConnectionPool(host='localhost', port=54063): Read timed out. (read timeout=120). Skipping...
Scraping data for: https://iticket.az/en/events/hayal-kahvesi/azerbaijan-night-party
Scraping data for: https://iticket.az/en/events/concerts/old-city-jazz-nights-at-best-place
An unexpected error occurred for link: https://iticket.az/en/event