# Event Ingestion Pipeline Testing

This notebook tests the event ingestion pipeline using the adapter pattern.
Ra.co uses a GraphQL API adapter.

In [20]:
import sys
import os
import logging

# Setup path
sys.path.insert(0, os.path.abspath(".."))

# Enable logging
logging.basicConfig(
    level=logging.INFO, 
    format='%(name)s - %(levelname)s - %(message)s'
)

print("Setup complete")

Setup complete


## Step 1: Test the Ra.co API Adapter

First, let's test the GraphQL API adapter directly to see raw data.

In [21]:
from src.ingestion.adapters import SourceType
from src.ingestion.adapters.api_adapter import APIAdapterConfig
from src.ingestion.pipelines.apis.ra_co import RaCoAdapter

# Create adapter config
adapter_config = APIAdapterConfig(
    source_id="ra_co",
    source_type=SourceType.API,
    request_timeout=30,
    max_retries=3,
    rate_limit_per_second=1.0,
    graphql_endpoint="https://ra.co/graphql",
)

# Create adapter
adapter = RaCoAdapter(adapter_config)
print(f"Adapter created: {adapter.source_id}")
print(f"Source type: {adapter.source_type.value}")

Adapter created: ra_co
Source type: api


In [None]:
# Fetch raw data from API
fetch_result = adapter.fetch(
    area_id=20,  # Barcelona
    page_size=100,
)

print(f"Fetch success: {fetch_result.success}")
print(f"Total fetched: {fetch_result.total_fetched}")
print(f"Duration: {fetch_result.duration_seconds:.2f}s")
print(f"Metadata: {fetch_result.metadata}")

if fetch_result.errors:
    print(f"Errors: {fetch_result.errors}")

src.ingestion.pipelines.apis.ra_co - INFO - Fetching page 1/10...
src.ingestion.pipelines.apis.ra_co - ERROR - GraphQL errors: [{'message': 'Limit must not be greater than 100', 'path': ['eventListings'], 'extensions': {'code': 'DOWNSTREAM_SERVICE_ERROR'}}]
src.ingestion.pipelines.apis.ra_co - INFO - Pagination complete: fetched 0 total events across 1 pages


Fetch success: False
Total fetched: 0
Duration: 1.14s
Metadata: {'pages_fetched': 1, 'total_available': 0, 'max_pages': 10}


In [23]:
# Show raw event data
if fetch_result.raw_data:
    print(f"Raw events ({len(fetch_result.raw_data)} total):")
    print("=" * 60)
    
    for i, event in enumerate(fetch_result.raw_data[:3]):
        print(f"\nEvent {i+1}:")
        print(f"  ID: {event.get('id')}")
        print(f"  Title: {event.get('title')}")
        content = event.get('content')
        print(f"  Content/Desc: {(content or 'N/A')[:100]}...")
        print(f"  Date: {event.get('date')}")
        venue = event.get('venue', {}) or {}
        print(f"  Venue: {venue.get('name')}")
        artists = event.get('artists', []) or []
        print(f"  Artists: {[a.get('name') for a in artists]}")
        print(f"  Cost: {event.get('cost')}")
        print(f"  URL: https://ra.co{event.get('contentUrl')}")

Raw events (100 total):

Event 1:
  ID: 2351138
  Title: Beat Lab x WeLove pres. Tech House Night
  Content/Desc: N/A...
  Date: 2026-02-03T00:00:00.000
  Venue: City Hall
  Artists: ['HollowFate', 'Oversant']
  Cost: 
  URL: https://ra.co/events/2351138

Event 2:
  ID: 2348919
  Title: Plastic Night
  Content/Desc: N/A...
  Date: 2026-02-03T00:00:00.000
  Venue: Macarena Club
  Artists: ['Kanedo']
  Cost: 10€
  URL: https://ra.co/events/2348919

Event 3:
  ID: 2336861
  Title: Rubén Seoane (All Night Long)
  Content/Desc: N/A...
  Date: 2026-02-03T00:00:00.000
  Venue: Moog Club
  Artists: ['Rubén Seoane']
  Cost: 
  URL: https://ra.co/events/2336861


## Step 2: Run Full Pipeline

Now run the complete pipeline that normalizes data to EventSchema.

In [36]:
from src.ingestion.base_pipeline import PipelineConfig
from src.ingestion.pipelines.apis.ra_co import create_ra_co_pipeline

# Pipeline configuration
pipeline_config = PipelineConfig(
    source_name="ra_co",
    source_type=SourceType.API,
    request_timeout=30,
    batch_size=9000,
)

# Source-specific config with feature extraction enabled
source_config = {
    "graphql_endpoint": "https://ra.co/graphql",
    "request_timeout": 30,
    "max_retries": 3,
    "rate_limit_per_second": 1.0,
    # Enable feature extraction for taxonomy enrichment
    "feature_extraction": {
        "enabled": True,  # Set to True if you have an OpenAI API key
        "provider": "openai",
        "model_name": "gpt-4o-mini",
        "temperature": 0.1,
    },
}

# Create pipeline
pipeline = create_ra_co_pipeline(pipeline_config, source_config)
print(f"Pipeline created: {pipeline.config.source_name}")
print(f"Source type: {pipeline.source_type.value}")
print(f"Feature extractor enabled: {pipeline.feature_extractor is not None}")

Pipeline created: ra_co
Source type: api
Feature extractor enabled: True


In [38]:
# Execute pipeline
result = pipeline.execute(
    area_id=20,  # Barcelona
    page_size=100,
)

print("Pipeline Execution Results")
print("=" * 60)
print(f"Status: {result.status.value}")
print(f"Source Type: {result.source_type.value}")
print(f"Execution ID: {result.execution_id}")
print(f"Total processed: {result.total_events_processed}")
print(f"Successful: {result.successful_events}")
print(f"Failed: {result.failed_events}")
print(f"Duration: {result.duration_seconds:.2f}s")
print(f"Success rate: {result.success_rate:.1f}%")

if result.errors:
    print(f"\nErrors: {result.errors}")

pipeline.ra_co - INFO - Starting pipeline execution: ra_co_20260203_161009_3e71619e
pipeline.ra_co - INFO - Source type: api
src.ingestion.pipelines.apis.ra_co - INFO - Fetching page 1/10...
src.ingestion.pipelines.apis.ra_co - INFO - Parsed 100 events from response
src.ingestion.pipelines.apis.ra_co - INFO - Fetched all 100 available events
src.ingestion.pipelines.apis.ra_co - INFO - Pagination complete: fetched 100 total events across 1 pages
pipeline.ra_co - INFO - Fetched 100 raw events
pipeline.ra_co - INFO - Deduplication: 100 -> 90 events
pipeline.ra_co - INFO - Pipeline completed: 90/100 successful


Pipeline Execution Results
Status: partial_success
Source Type: api
Execution ID: ra_co_20260203_161009_3e71619e
Total processed: 100
Successful: 90
Failed: 10
Duration: 1.28s
Success rate: 90.0%


In [39]:
# Show normalized events with enrichment
if result.events:
    print(f"Normalized Events ({len(result.events)} total):")
    print("=" * 70)
    
    for i, event in enumerate(result.events[:3]):
        print(f"\nEvent {i+1}: {event.title}")
        print(f"  ID: {event.event_id}")
        print(f"  Description: {(event.description or 'N/A')[:100]}...")
        print(f"  Start: {event.start_datetime}")
        print(f"  Venue: {event.location.venue_name}, {event.location.city}")
        print(f"  Category: {event.primary_category}")
        print(f"  Price: {event.price.price_raw_text} (free: {event.price.is_free})")
        
        # Show taxonomy enrichment
        if event.taxonomy_dimensions:
            dim = event.taxonomy_dimensions[0]
            print(f"  Taxonomy Enrichment:")
            print(f"    - Subcategory: {dim.subcategory} ({dim.subcategory_name})")
            print(f"    - Energy Level: {dim.energy_level}")
            print(f"    - Social Intensity: {dim.social_intensity}")
            print(f"    - Cost Level: {dim.cost_level}")
            print(f"    - Time Scale: {dim.time_scale}")
            print(f"    - Environment: {dim.environment}")
            print(f"    - Emotional Output: {dim.emotional_output}")
            print(f"    - Age Accessibility: {dim.age_accessibility}")
        
        artists = event.custom_fields.get('artists', [])
        if artists:
            print(f"  Artists: {artists[:3]}")
else:
    print("No events normalized.")

Normalized Events (90 total):

Event 1: Beat Lab x WeLove pres. Tech House Night
  ID: ra_co_2351138
  Description: N/A...
  Start: 2026-02-03 23:59:00
  Venue: City Hall, Barcelona
  Category: play_and_fun
  Price:  (free: True)
  Taxonomy Enrichment:
    - Subcategory: 1.4 (Music & Rhythm Play)
    - Energy Level: medium
    - Social Intensity: large_group
    - Cost Level: medium
    - Time Scale: long
    - Environment: indoor
    - Emotional Output: []
    - Age Accessibility: all
  Artists: ['HollowFate', 'Oversant']

Event 2: Plastic Night
  ID: ra_co_2348919
  Description: N/A...
  Start: 2026-02-03 23:59:00
  Venue: Macarena Club, Barcelona
  Category: play_and_fun
  Price: 10€ (free: False)
  Taxonomy Enrichment:
    - Subcategory: 1.4 (Music & Rhythm Play)
    - Energy Level: medium
    - Social Intensity: large_group
    - Cost Level: low
    - Time Scale: long
    - Environment: indoor
    - Emotional Output: []
    - Age Accessibility: all
  Artists: ['Kanedo']

Event 3: 

In [40]:
# Show normalized events with enrichment
if result.events:
    print(f"Normalized Events ({len(result.events)} total):")
    print("=" * 70)
    
    for i, event in enumerate(result.events[:3]):
        print(f"\nEvent {i+1}: {event}")
       

Normalized Events (90 total):

Event 1: event_id='ra_co_2351138' title='Beat Lab x WeLove pres. Tech House Night' description=None long_description=None primary_category='play_and_fun' taxonomy_dimensions=[TaxonomyDimension(primary_category=<PrimaryCategory.PLAY_AND_PURE_FUN: 'play_and_fun'>, subcategory='1.4', subcategory_name='Music & Rhythm Play', values=['expression', 'energy', 'flow', 'rhythm'], confidence=0.95, activity_id=None, activity_name=None, energy_level='medium', social_intensity='large_group', cognitive_load='low', physical_involvement='none', cost_level='medium', time_scale='long', environment='indoor', emotional_output=[], risk_level='very_low', age_accessibility='all', repeatability='medium'), TaxonomyDimension(primary_category=<PrimaryCategory.SOCIAL_CONNECTION: 'social_connection'>, subcategory='5.7', subcategory_name='Shared Activities & Co-Experience', values=['connection', 'belonging', 'shared joy'], confidence=0.8, activity_id=None, activity_name=None, energy_le

## Step 3: Convert to DataFrame

Convert normalized events to pandas DataFrame with target schema.

In [27]:
if result.events:
    df = pipeline.to_dataframe(result.events)
    print(f"DataFrame Shape: {df.shape}")
    print(f"\nColumns ({len(df.columns)}):")
    for i, col in enumerate(df.columns, 1):
        print(f"  {i:2}. {col}")
else:
    print("No events to convert.")

DataFrame Shape: (90, 21)

Columns (21):
   1. event_id
   2. title
   3. description
   4. start_datetime
   5. end_datetime
   6. city
   7. country_code
   8. venue_name
   9. artists
  10. primary_category
  11. taxonomy
  12. event_type
  13. format
  14. is_free
  15. min_price
  16. max_price
  17. currency_code
  18. organizer
  19. source_url
  20. image_url
  21. data_quality_score


In [32]:
df.head()

Unnamed: 0,event_id,title,description,start_datetime,end_datetime,city,country_code,venue_name,artists,primary_category,...,event_type,format,is_free,min_price,max_price,currency_code,organizer,source_url,image_url,data_quality_score
0,ra_co_2351138,Beat Lab x WeLove pres. Tech House Night,,2026-02-03 23:59:00,2026-02-04 05:00:00,Barcelona,ES,City Hall,"HollowFate, Oversant",play_and_fun,...,nightlife,in_person,True,,,EUR,City Hall,https://ra.co/events/2351138,https://ra.co/images/events/flyer/https://imag...,0.725
1,ra_co_2348919,Plastic Night,,2026-02-03 23:59:00,2026-02-04 05:00:00,Barcelona,ES,Macarena Club,Kanedo,play_and_fun,...,nightlife,in_person,False,10.0,,EUR,Macarena Club,https://ra.co/events/2348919,https://ra.co/images/events/flyer/https://imag...,0.775
2,ra_co_2336861,Rubén Seoane (All Night Long),,2026-02-03 23:59:00,2026-02-04 05:00:00,Barcelona,ES,Moog Club,Rubén Seoane,play_and_fun,...,nightlife,in_person,True,,,EUR,Moog Club,https://ra.co/events/2336861,https://ra.co/images/events/flyer/https://imag...,0.725
3,ra_co_2347628,Ecler ISE Experience 2026,,2026-02-03 10:00:00,2026-02-06 16:00:00,Barcelona,ES,Fira Gran Via,"Marc Piñol, Gee Lane, Inner Desires, Memory Pa...",play_and_fun,...,nightlife,in_person,True,,,EUR,Fira Gran Via,https://ra.co/events/2347628,https://ra.co/images/events/flyer/https://imag...,0.705
4,ra_co_2360448,PANACHEZ / CASAS/DIP PHAN,,2026-02-01 22:30:00,2026-02-08 03:00:00,Barcelona,ES,The Supermercat Raval,,play_and_fun,...,nightlife,in_person,True,,,EUR,The Supermercat Raval,https://ra.co/events/2360448,https://ra.co/images/events/flyer/https://imag...,0.705


In [33]:
df.shape

(90, 21)

In [30]:
# Display taxonomy enrichment columns
if result.events:
    enrichment_cols = [
        "title",
        "subcategory",
        "subcategory_name",
        "energy_level",
        "social_intensity",
        "cognitive_load",
        "physical_involvement",
        "cost_level",
        "time_scale",
        "environment",
        "emotional_output",
        "age_accessibility",
        "repeatability",
    ]
    print("Taxonomy Enrichment Data:")
    display(df[enrichment_cols].head(10))

Taxonomy Enrichment Data:


KeyError: "['subcategory', 'subcategory_name', 'energy_level', 'social_intensity', 'cognitive_load', 'physical_involvement', 'cost_level', 'time_scale', 'environment', 'emotional_output', 'age_accessibility', 'repeatability'] not in index"

## Step 4: Save Results (Optional)

In [14]:
# Save to parquet
if result.events:
    output_dir = "../data/raw"
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/ra_co_events.parquet"
    df.to_parquet(output_path, index=False)
    print(f"Saved {len(df)} events to {output_path}")

Saved 17 events to ../data/raw/ra_co_events.parquet


## Cleanup

In [31]:
# Close adapter and pipeline resources
adapter.close()
pipeline.close()
print("Resources released.")

Resources released.
