# Event Ingestion Pipeline Testing

This notebook tests the event ingestion pipeline using the adapter pattern.
Ra.co uses a GraphQL API adapter.

In [1]:
import sys
import os
import logging
import pandas as pd 

# Put the parent directory first on the import path so the `src.*` packages
# imported further down resolve.
repo_root = os.path.abspath("..")
sys.path.insert(0, repo_root)

# Emit INFO-and-above log records as "<logger> - <LEVEL> - <message>".
LOG_FORMAT = '%(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

print("Setup complete")

Setup complete


## Step 1: Test the Ra.co API Adapter

First, let's test the GraphQL API adapter directly to see raw data.

In [2]:
from src.ingestion.adapters import SourceType
from src.ingestion.adapters.api_adapter import APIAdapterConfig
from src.ingestion.pipelines.apis.ra_co import RaCoAdapter

# Settings handed to the Ra.co adapter, collected in one mapping first.
adapter_settings = {
    "source_id": "ra_co",
    "source_type": SourceType.API,
    "request_timeout": 30,
    "max_retries": 3,
    "rate_limit_per_second": 1.0,
    "graphql_endpoint": "https://ra.co/graphql",
}
adapter_config = APIAdapterConfig(**adapter_settings)

# Instantiate the adapter and confirm how it identifies itself.
adapter = RaCoAdapter(adapter_config)
print(f"Adapter created: {adapter.source_id}")
print(f"Source type: {adapter.source_type.value}")

Adapter created: ra_co
Source type: api


In [3]:
# Pull one batch of raw listings straight from the API.
fetch_params = {
    "area_id": 20,  # Barcelona
    "page_size": 100,
}
fetch_result = adapter.fetch(**fetch_params)

# Summarise the outcome of the fetch.
print(f"Fetch success: {fetch_result.success}")
print(f"Total fetched: {fetch_result.total_fetched}")
print(f"Duration: {fetch_result.duration_seconds:.2f}s")
print(f"Metadata: {fetch_result.metadata}")

# Surface anything the adapter recorded as an error.
if fetch_result.errors:
    print(f"Errors: {fetch_result.errors}")

src.ingestion.pipelines.apis.ra_co - INFO - Fetching page 1/10...
src.ingestion.pipelines.apis.ra_co - INFO - Parsed 100 events from response
src.ingestion.pipelines.apis.ra_co - INFO - Fetched all 100 available events
src.ingestion.pipelines.apis.ra_co - INFO - Pagination complete: fetched 100 total events across 1 pages


Fetch success: True
Total fetched: 100
Duration: 1.25s
Metadata: {'pages_fetched': 1, 'total_available': 100, 'max_pages': 10}


In [4]:
# Preview the first three raw events as returned by the adapter.
if fetch_result.raw_data:
    raw_events = fetch_result.raw_data
    print(f"Raw events ({len(raw_events)} total):")
    print("=" * 60)

    for position, raw_event in enumerate(raw_events[:3], start=1):
        # Missing or null values fall back to a printable placeholder / empty container.
        description = raw_event.get('content') or 'N/A'
        venue_info = raw_event.get('venue', {}) or {}
        lineup = raw_event.get('artists', []) or []

        print(f"\nEvent {position}:")
        print(f"  ID: {raw_event.get('id')}")
        print(f"  Title: {raw_event.get('title')}")
        print(f"  Content/Desc: {description[:100]}...")
        print(f"  Date: {raw_event.get('date')}")
        print(f"  Venue: {venue_info.get('name')}")
        print(f"  Artists: {[artist.get('name') for artist in lineup]}")
        print(f"  Cost: {raw_event.get('cost')}")
        print(f"  URL: https://ra.co{raw_event.get('contentUrl')}")


Raw events (100 total):

Event 1:
  ID: 2356712
  Title: AFTER THE BELL - EU bussiness welcome party
  Content/Desc: EU Business School is pleased to welcome students to the start of a new academic semester with an ex...
  Date: 2026-02-04T00:00:00.000
  Venue: Negro Rojo Club
  Artists: ['MARTISTA']
  Cost: 
  URL: https://ra.co/events/2356712

Event 2:
  ID: 2348920
  Title: Original Silk
  Content/Desc: N/A...
  Date: 2026-02-04T00:00:00.000
  Venue: Macarena Club
  Artists: ['Pau Guilera', 'Jones May']
  Cost: 10€
  URL: https://ra.co/events/2348920

Event 3:
  ID: 2360908
  Title: RUBI I SHOWCASE DJ SET I GUEST LIST
  Content/Desc: MIÉRCOLES · CITY HALL BARCELONA

El miércoles no pide permiso.
Se infiltra en la semana y convierte ...
  Date: 2026-02-04T00:00:00.000
  Venue: City Hall
  Artists: []
  Cost: 
  URL: https://ra.co/events/2360908


In [5]:
# Introspect additional types: Venue, EventListing, Artist, Area
import requests
def introspect_type(type_name: str):
    """Fetch the field listing of one type from the Ra.co GraphQL schema.

    Posts a ``__type`` introspection query to https://ra.co/graphql. The type
    name is sent as a GraphQL variable instead of being spliced into the query
    text, so a name containing quotes or braces cannot corrupt the query.

    Args:
        type_name: Name of the schema type to look up, e.g. ``"Venue"``.

    Returns:
        The value of ``data.__type`` from the response (a dict carrying
        ``name`` and ``fields`` for a resolved type, or ``None`` if the
        response holds a null ``__type``). ``{}`` when the response has no
        usable ``data`` object, e.g. an error response with ``"data": null``.
        ``None`` when the HTTP status is not 200 or the body is not a JSON
        object.
    """
    query = """
    query IntrospectType($typeName: String!) {
      __type(name: $typeName) {
        name
        fields {
          name
          type {
            name
            kind
            ofType {
              name
              kind
            }
          }
        }
      }
    }
    """

    response = requests.post(
        "https://ra.co/graphql",
        json={"query": query, "variables": {"typeName": type_name}},
        headers={"Content-Type": "application/json"},
        timeout=30,
    )

    if response.status_code != 200:
        return None

    try:
        payload = response.json()
    except ValueError:
        # A 200 with a non-JSON body (e.g. an HTML error page) is treated
        # like any other failed lookup.
        return None

    if not isinstance(payload, dict):
        return None

    # "data" may be present but null when the server reports errors; treat
    # that the same as a missing key instead of calling .get() on None.
    data = payload.get("data") or {}
    return data.get("__type", {})

# Print an alphabetised field listing for each schema type of interest.
for type_name in ["Venue", "EventListing", "Artist", "Area"]:
    type_info = introspect_type(type_name)

    # Nothing to list when the lookup failed or the type exposes no fields.
    if not (type_info and type_info.get("fields")):
        continue

    rule = "=" * 70
    print(rule)
    print(f"AVAILABLE FIELDS ON '{type_name}' TYPE")
    print(rule)

    for field in sorted(type_info["fields"], key=lambda entry: entry["name"]):
        field_name = field["name"]
        outer = field["type"]
        outer_name = outer.get("name") or ""
        outer_kind = outer.get("kind", "")
        wrapped = outer.get("ofType")

        if wrapped:
            # Types with an inner type are shown as KIND[inner]; only one
            # level of nesting is unwrapped.
            inner = wrapped.get("name") or wrapped.get("kind")
            type_str = f"{outer_kind}[{inner}]"
        else:
            type_str = outer_name or outer_kind

        print(f"  {field_name:30} -> {type_str}")
    print()

## Step 2: Run Full Pipeline

Now run the complete pipeline that normalizes data to EventSchema.

In [6]:
from src.ingestion.base_pipeline import PipelineConfig
from src.ingestion.pipelines.apis.ra_co import create_ra_co_pipeline

# Generic pipeline settings.
pipeline_config = PipelineConfig(
    source_name="ra_co",
    source_type=SourceType.API,
    request_timeout=30,
    batch_size=9000,
)

# Feature extraction settings used for taxonomy enrichment.
feature_extraction_config = {
    "enabled": True,  # switch off if no OpenAI API key is available
    "provider": "openai",
    "model_name": "gpt-3.5-turbo",
    "temperature": 0.3,
}

# Source-specific settings: API access plus the enrichment block above.
source_config = {
    "graphql_endpoint": "https://ra.co/graphql",
    "request_timeout": 30,
    "max_retries": 3,
    "rate_limit_per_second": 1.0,
    "feature_extraction": feature_extraction_config,
}

# Build the pipeline and report how it was configured.
pipeline = create_ra_co_pipeline(pipeline_config, source_config)
print(f"Pipeline created: {pipeline.config.source_name}")
print(f"Source type: {pipeline.source_type.value}")
print(f"Feature extractor enabled: {pipeline.feature_extractor is not None}")

Pipeline created: ra_co
Source type: api
Feature extractor enabled: True


In [7]:
# Run the pipeline with the same arguments used for the direct adapter fetch.
execution_args = {
    "area_id": 20,  # Barcelona
    "page_size": 100,
}
result = pipeline.execute(**execution_args)

# Report the outcome of the run.
banner = "=" * 60
print("Pipeline Execution Results")
print(banner)
print(f"Status: {result.status.value}")
print(f"Source Type: {result.source_type.value}")
print(f"Execution ID: {result.execution_id}")
print(f"Total processed: {result.total_events_processed}")
print(f"Successful: {result.successful_events}")
print(f"Failed: {result.failed_events}")
print(f"Duration: {result.duration_seconds:.2f}s")
print(f"Success rate: {result.success_rate:.1f}%")

# Only mention errors when the run recorded any.
if result.errors:
    print(f"\nErrors: {result.errors}")

pipeline.ra_co - INFO - Starting pipeline execution: ra_co_20260204_115815_95eaeb5e
pipeline.ra_co - INFO - Source type: api
src.ingestion.pipelines.apis.ra_co - INFO - Fetching page 1/10...
src.ingestion.pipelines.apis.ra_co - INFO - Parsed 100 events from response
src.ingestion.pipelines.apis.ra_co - INFO - Fetched all 100 available events
src.ingestion.pipelines.apis.ra_co - INFO - Pagination complete: fetched 100 total events across 1 pages
pipeline.ra_co - INFO - Fetched 100 raw events
httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
openai._base_client - INFO - Retrying request to /chat/completions in 0.457489 seconds
httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
openai._base_client - INFO - Retrying request to /chat/completions in 0.880816 seconds
httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"


KeyboardInterrupt: 

In [None]:
# Print the first three normalized events using their default string form.
if result.events:
    normalized_events = result.events
    print(f"Normalized Events ({len(normalized_events)} total):")
    print("=" * 70)

    for number, event in enumerate(normalized_events[:3], start=1):
        print(f"\nEvent {number}: {event}")
       

Normalized Events (91 total):

Event 1: event_id='04516c95-0c54-4c17-8c50-4819a39a6914' title='Beat Lab x WeLove pres. Tech House Night' description='Este martes 3 de Febrero, la escena de Barcelona se enciende con una noche de Tech House gracias a la colaboración entre Beat Lab y WeLove. Después de un verano en Barcelona, WeLove, el colectivo australiano conocido por su comunidad vibrante y sus sets cuidados, trae su sello internacional y energía a la ciudad. Beat Lab, referente local del Tech House, continúa ofreciendo noches consistentes y experiencias musicales envolventes. Juntos, prometen ritmos sólidos, grooves potentes y momentos pensados para los amantes del género más exigentes. Una cita imprescindible para quienes buscan calidad y buen ambiente en medio de la semana. This Tuesday, February 3rd, Barcelona’s scene comes alive with a Tech House night presented by Beat Lab and WeLove. After a summer in the city, WeLove, the Australian collective known for its vibrant community a

In [None]:
def _attr_or_none(obj, name):
    """Return ``obj.<name>``, or ``None`` when ``obj`` is missing (None/falsy)."""
    return getattr(obj, name) if obj else None


# (output column, attribute on the taxonomy dimension) pairs, in column order.
_TAXONOMY_COLUMNS = (
    ("primary_category", "primary_category"),
    ("subcategory_id", "subcategory"),
    ("subcategory_name", "subcategory_name"),
    ("values", "values"),
    ("activity_id", "activity_id"),
    ("activity_name", "activity_name"),
    ("energy_level", "energy_level"),
    ("social_intensity", "social_intensity"),
    ("cognitive_load", "cognitive_load"),
    ("physical_involvement", "physical_involvement"),
    ("cost_level", "cost_level"),
    ("time_scale", "time_scale"),
    ("environment", "environment"),
    ("emotional_output", "emotional_output"),
    ("risk_level", "risk_level"),
    ("age_accessibility", "age_accessibility"),
    ("repeatability", "repeatability"),
)


def flatten_event(event):
    """Flatten one normalized event into a single-level dict (one DataFrame row).

    Only the first entry of ``event.taxonomy_dimensions`` is flattened; any
    further dimensions are ignored. Nested objects (``location``, ``price``,
    ``ticket_info``, ``organizer``, ``source``, and ``location.coordinates``)
    are read through a None-safe accessor, so a missing sub-object yields
    ``None`` in its columns instead of raising ``AttributeError``.

    Args:
        event: The event to flatten. Presumably an ``EventSchema`` instance
            produced by the pipeline; only attribute access is used.

    Returns:
        A dict whose keys are the flat column names, in a fixed order. List
        and dict values (``artist_names``, ``values``, ``custom_fields``, ...)
        are passed through as-is rather than expanded.
    """
    # 1. Taxonomy: use the first dimension as the primary one.
    dimensions = event.taxonomy_dimensions
    main_tax = dimensions[0] if dimensions else None

    # 2. Artists live in custom_fields; tolerate custom_fields being None.
    artists = (event.custom_fields or {}).get('artists', [])

    location = event.location
    coordinates = _attr_or_none(location, "coordinates")
    price = event.price
    ticket_info = event.ticket_info
    source = event.source

    row = {
        "event_id": event.event_id,
        "title": event.title,
        "description": event.description,
    }

    # Taxonomy flattening: every column is None when there is no dimension.
    row.update(
        (column, _attr_or_none(main_tax, attribute))
        for column, attribute in _TAXONOMY_COLUMNS
    )

    row.update({
        # Date & Time
        "start_datetime": event.start_datetime,
        "end_datetime": event.end_datetime,
        "duration_minutes": event.duration_minutes,

        # Location flattening
        "venue_name": _attr_or_none(location, "venue_name"),
        "city": _attr_or_none(location, "city"),
        "street_address": _attr_or_none(location, "street_address"),
        "country_code": _attr_or_none(location, "country_code"),
        "latitude": _attr_or_none(coordinates, "latitude"),
        "longitude": _attr_or_none(coordinates, "longitude"),
        "timezone": _attr_or_none(location, "timezone"),

        "event_type": event.event_type,
        "artist_names": artists,  # Stored as a list in the cell
        "capacity": event.capacity,
        "event_format": event.format,
        "is_recurring": event.is_recurring,
        "recurrence_pattern": event.recurrence_pattern,

        # Price flattening
        "currency": _attr_or_none(price, "currency"),
        "is_free": _attr_or_none(price, "is_free"),
        "min_price": _attr_or_none(price, "minimum_price"),
        "max_price": _attr_or_none(price, "maximum_price"),
        "price_raw_text": _attr_or_none(price, "price_raw_text"),

        # Tickets info
        "tickets_url": _attr_or_none(ticket_info, "url"),
        "is_sold_out": _attr_or_none(ticket_info, "is_sold_out"),

        # Organizer info
        "organizer_name": _attr_or_none(event.organizer, "name"),

        # Source info
        "source_name": _attr_or_none(source, "source_name"),
        "source_event_id": _attr_or_none(source, "source_event_id"),
        "source_url": _attr_or_none(source, "source_url"),
        "last_updated_from_source": _attr_or_none(source, "last_updated_from_source"),
        "ingestion_timestamp": _attr_or_none(source, "ingestion_timestamp"),
        # raw_html is optional on the source object itself.
        "raw_html": getattr(source, 'raw_html', None),

        # Other fields
        "image_url": event.image_url,
        "media_assets": event.media_assets,
        "data_quality_score": event.data_quality_score,
        "normalization_errors": event.normalization_errors,
        "tags": event.tags,
        "custom_fields": event.custom_fields,
        "created_at": event.created_at,
        "updated_at": event.updated_at,
    })
    return row

# Build the DataFrame. `df` is always bound, even when the pipeline returned
# no events, so the preview below cannot raise NameError or silently show a
# stale frame left over from an earlier run of this cell.
flattened_data = [flatten_event(e) for e in (result.events or [])]
df = pd.DataFrame(flattened_data)
if df.empty:
    print("No events to process.")

df.head(2)

Unnamed: 0,event_id,title,description,primary_category,subcategory_id,subcategory_name,values,activity_id,activity_name,energy_level,...,ingestion_timestamp,raw_html,image_url,media_assets,data_quality_score,normalization_errors,tags,custom_fields,created_at,updated_at
0,04516c95-0c54-4c17-8c50-4819a39a6914,Beat Lab x WeLove pres. Tech House Night,"Este martes 3 de Febrero, la escena de Barcelo...",play_and_fun,1.4,Music & Rhythm Play,"[expression, energy, flow, rhythm]",,,high,...,2026-02-03 21:07:37.071631,,https://ra.co/images/events/flyer/https://imag...,[],0.775,[],[],"{'artists': ['HollowFate', 'Oversant']}",2026-02-03 21:07:37.072262,2026-02-03 21:07:37.072267
1,0fc016cd-da29-4e4c-b5c7-a596254a8028,Plastic Night,,play_and_fun,1.4,Music & Rhythm Play,"[expression, energy, flow, rhythm]",,,high,...,2026-02-03 21:07:37.073248,,https://ra.co/images/events/flyer/https://imag...,[],0.775,[],[],{'artists': ['Kanedo']},2026-02-03 21:07:37.073471,2026-02-03 21:07:37.073473


In [None]:
# Sanity check: (row count, column count) of the flattened frame.
df.shape

(91, 58)

In [None]:
# List the flat column names produced by flatten_event.
df.columns

Index(['event_id', 'title', 'description', 'primary_category',
       'subcategory_id', 'subcategory_name', 'values', 'activity_id',
       'activity_name', 'energy_level', 'social_intensity', 'cognitive_load',
       'physical_involvement', 'cost_level', 'time_scale', 'environment',
       'emotional_output', 'risk_level', 'age_accessibility', 'repeatability',
       'start_datetime', 'end_datetime', 'duration_minutes', 'venue_name',
       'city', 'street_address', 'country_code', 'latitude', 'longitude',
       'timezone', 'event_type', 'artist_names', 'capacity', 'event_format',
       'is_recurring', 'recurrence_pattern', 'currency', 'is_free',
       'min_price', 'max_price', 'price_raw_text', 'tickets_url',
       'is_sold_out', 'organizer_name', 'source_name', 'source_event_id',
       'source_url', 'last_updated_from_source', 'ingestion_timestamp',
       'raw_html', 'image_url', 'media_assets', 'data_quality_score',
       'normalization_errors', 'tags', 'custom_fields', 'cre

In [None]:
# Re-check the frame dimensions; the frame is not modified between the two shape cells.
df.shape

(91, 58)

In [None]:
# Show the taxonomy enrichment columns for the first ten events.
if result.events:
    # Event title plus its sub-category identifiers.
    identity_cols = ["title", "subcategory_id", "subcategory_name"]
    # Remaining taxonomy attributes, in display order.
    attribute_cols = [
        "energy_level",
        "social_intensity",
        "cognitive_load",
        "physical_involvement",
        "cost_level",
        "time_scale",
        "environment",
        "emotional_output",
        "age_accessibility",
        "repeatability",
    ]
    enrichment_cols = identity_cols + attribute_cols

    print("Taxonomy Enrichment Data:")
    display(df[enrichment_cols].head(10))

Taxonomy Enrichment Data:


Unnamed: 0,title,subcategory_id,subcategory_name,energy_level,social_intensity,cognitive_load,physical_involvement,cost_level,time_scale,environment,emotional_output,age_accessibility,repeatability
0,Beat Lab x WeLove pres. Tech House Night,1.4,Music & Rhythm Play,high,large_group,low,none,medium,long,indoor,"[excitement, connection, relaxation, inspirati...",adults,medium
1,Plastic Night,1.4,Music & Rhythm Play,high,large_group,low,none,low,long,indoor,[enjoyment],adults,medium
2,Rubén Seoane (All Night Long),1.4,Music & Rhythm Play,high,large_group,low,light,medium,long,indoor,"[joy, energy, inspiration]",adults,medium
3,PANACHEZ / CASAS/DIP PHAN,1.4,Music & Rhythm Play,medium,large_group,low,none,free,long,indoor,[enjoyment],adults,medium
4,Ecler ISE Experience 2026,1.4,Music & Rhythm Play,high,large_group,low,light,medium,long,outdoor,"[excitement, energy, inspiration]",adults,medium
5,ALEXXFOX,1.4,Music & Rhythm Play,medium,large_group,low,none,free,long,indoor,[enjoyment],adults,medium
6,Dr. Dou Social Club meets D.Bird,1.4,Music & Rhythm Play,high,large_group,low,light,medium,long,indoor,"[connection, energy]",adults,medium
7,AFTER THE BELL - EU bussiness welcome party,1.4,Music & Rhythm Play,high,large_group,low,light,medium,long,indoor,"[joy, connection, relaxation, inspiration, bel...",adults,medium
8,Original Silk,1.4,Music & Rhythm Play,medium,large_group,low,none,low,long,indoor,[enjoyment],adults,medium
9,RUBI I SHOWCASE DJ SET I GUEST LIST,1.4,Music & Rhythm Play,high,large_group,low,none,medium,long,indoor,[energy],adults,medium


## Step 3: Save Results (Optional)

In [None]:
# Write the flattened frame to a parquet file, but only if the run produced events.
if result.events:
    output_dir = "../data/raw"
    output_path = f"{output_dir}/ra_co_events.parquet"

    # Create the target directory on first use; a no-op if it already exists.
    os.makedirs(output_dir, exist_ok=True)

    df.to_parquet(output_path, index=False)
    print(f"Saved {len(df)} events to {output_path}")

Saved 91 events to ../data/raw/ra_co_events.parquet


## Cleanup

In [None]:
# Close adapter and pipeline resources. The pipeline is closed in a `finally`
# so it is still released if closing the adapter raises.
try:
    adapter.close()
finally:
    pipeline.close()
print("Resources released.")

Resources released.
