# Event Ingestion Pipeline Testing

This notebook tests the event ingestion pipeline using the adapter pattern.
Ra.co uses a GraphQL API adapter.

In [21]:
import sys
import os
import logging

# Setup path
sys.path.insert(0, os.path.abspath(".."))

# Enable logging
logging.basicConfig(
    level=logging.INFO, 
    format='%(name)s - %(levelname)s - %(message)s'
)

print("Setup complete")

Setup complete


## Step 1: Test the Ra.co API Adapter

First, let's test the GraphQL API adapter directly to see raw data.

In [22]:
from src.ingestion.adapters import SourceType
from src.ingestion.adapters.api_adapter import APIAdapterConfig
from src.ingestion.pipelines.apis.ra_co import RaCoAdapter

# Create adapter config
adapter_config = APIAdapterConfig(
    source_id="ra_co",
    source_type=SourceType.API,
    request_timeout=30,
    max_retries=3,
    rate_limit_per_second=1.0,
    graphql_endpoint="https://ra.co/graphql",
)

# Create adapter
adapter = RaCoAdapter(adapter_config)
print(f"Adapter created: {adapter.source_id}")
print(f"Source type: {adapter.source_type.value}")

Adapter created: ra_co
Source type: api


In [26]:
# Fetch raw data from API
fetch_result = adapter.fetch(
    area_id=20,  # Barcelona
    page_size=100,
)

print(f"Fetch success: {fetch_result.success}")
print(f"Total fetched: {fetch_result.total_fetched}")
print(f"Duration: {fetch_result.duration_seconds:.2f}s")
print(f"Metadata: {fetch_result.metadata}")

if fetch_result.errors:
    print(f"Errors: {fetch_result.errors}")

src.ingestion.pipelines.apis.ra_co - INFO - Parsed 100 events from response


Fetch success: True
Total fetched: 100
Duration: 1.24s
Metadata: {'pages_fetched': 1, 'api_calls': 1, 'total_available': 100}


In [27]:
# Show raw event data
if fetch_result.raw_data:
    print(f"Raw events ({len(fetch_result.raw_data)} total):")
    print("=" * 60)
    
    for i, event in enumerate(fetch_result.raw_data[:3]):
        print(f"\nEvent {i+1}:")
        print(f"  ID: {event.get('id')}")
        print(f"  Title: {event.get('title')}")
        print(f"  Date: {event.get('date')}")
        venue = event.get('venue', {}) or {}
        print(f"  Venue: {venue.get('name')}")
        artists = event.get('artists', []) or []
        print(f"  Artists: {[a.get('name') for a in artists]}")
        print(f"  Cost: {event.get('cost')}")
        print(f"  URL: https://ra.co{event.get('contentUrl')}")

Raw events (100 total):

Event 1:
  ID: 2348917
  Title: Memento Xs
  Date: 2026-02-02T00:00:00.000
  Venue: Macarena Club
  Artists: ['queen yasmeen', 'PILAR MOLINERO']
  Cost: 10€/15€
  URL: https://ra.co/events/2348917

Event 2:
  ID: 2336851
  Title: uroz
  Date: 2026-02-02T00:00:00.000
  Venue: Moog Club
  Artists: ['uroz']
  Cost: 
  URL: https://ra.co/events/2336851

Event 3:
  ID: 2328694
  Title: Skylab at Noxe (26th floor W Barcelona)
  Date: 2026-02-02T00:00:00.000
  Venue: Noxe Barcelona
  Artists: ['Nesi']
  Cost: 0
  URL: https://ra.co/events/2328694


## Step 2: Run Full Pipeline

Now run the complete pipeline that normalizes data to EventSchema.

In [28]:
from src.ingestion.base_pipeline import PipelineConfig
from src.ingestion.pipelines.apis.ra_co import create_ra_co_pipeline

# Pipeline configuration
pipeline_config = PipelineConfig(
    source_name="ra_co",
    source_type=SourceType.API,
    request_timeout=30,
    batch_size=50,
)

# Source-specific config
source_config = {
    "graphql_endpoint": "https://ra.co/graphql",
    "request_timeout": 30,
    "max_retries": 3,
    "rate_limit_per_second": 1.0,
}

# Create pipeline
pipeline = create_ra_co_pipeline(pipeline_config, source_config)
print(f"Pipeline created: {pipeline.config.source_name}")
print(f"Source type: {pipeline.source_type.value}")

TypeError: PipelineConfig.__init__() got an unexpected keyword argument 'source_type'

In [29]:
# Execute pipeline
result = pipeline.execute(
    area_id=20,  # Barcelona
    page_size=20,
)

print("Pipeline Execution Results")
print("=" * 60)
print(f"Status: {result.status.value}")
print(f"Source Type: {result.source_type.value}")
print(f"Execution ID: {result.execution_id}")
print(f"Total processed: {result.total_events_processed}")
print(f"Successful: {result.successful_events}")
print(f"Failed: {result.failed_events}")
print(f"Duration: {result.duration_seconds:.2f}s")
print(f"Success rate: {result.success_rate:.1f}%")

if result.errors:
    print(f"\nErrors: {result.errors}")

NameError: name 'pipeline' is not defined

In [None]:
# Show normalized events
if result.events:
    print(f"Normalized Events ({len(result.events)} total):")
    print("=" * 60)
    
    for i, event in enumerate(result.events[:5]):
        print(f"\nEvent {i+1}: {event.title}")
        print(f"  ID: {event.event_id}")
        print(f"  Start: {event.start_datetime}")
        print(f"  Venue: {event.location.venue_name}")
        print(f"  City: {event.location.city}")
        print(f"  Category: {event.primary_category}")
        print(f"  Event Type: {event.event_type.value if event.event_type else 'N/A'}")
        print(f"  Price: {event.price.price_raw_text} (free: {event.price.is_free})")
        print(f"  Quality Score: {event.data_quality_score:.2f}")
        artists = event.custom_fields.get('artists', [])
        if artists:
            print(f"  Artists: {artists[:3]}")
else:
    print("No events normalized.")

## Step 3: Convert to DataFrame

Convert normalized events to pandas DataFrame with target schema.

In [None]:

if result.events:
    df = pipeline.to_dataframe(result.events)
    print(f"DataFrame Shape: {df.shape}")
    print(f"\nColumns ({len(df.columns)}):")
    print(list(df.columns))
else:
    print("No events to convert.")

In [None]:
# Display DataFrame
if result.events:
    display(df)

In [None]:
# Display key columns
if result.events:
    key_cols = [
        "title",
        "start_datetime",
        "venue_name",
        "city",
        "is_free",
        "min_price",
        "currency_code",
        "data_quality_score"
    ]
    print("Key Event Data:")
    display(df[key_cols])

## Step 4: Save Results (Optional)

In [None]:
# Save to parquet
if result.events:
    output_dir = "../data/raw"
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/ra_co_events.parquet"
    df.to_parquet(output_path, index=False)
    print(f"Saved {len(df)} events to {output_path}")

## Cleanup

In [None]:
# Close adapter and pipeline resources
adapter.close()
pipeline.close()
print("Resources released.")