# Event Ingestion Pipeline Testing

This notebook tests the event ingestion pipeline using the adapter pattern.
Ra.co uses a GraphQL API adapter.

In [3]:
import sys
import os
import logging

# Setup path
sys.path.insert(0, os.path.abspath(".."))

# Enable logging
logging.basicConfig(
    level=logging.INFO, 
    format='%(name)s - %(levelname)s - %(message)s'
)

print("Setup complete")

Setup complete


## Step 1: Test the Ra.co API Adapter

First, let's test the GraphQL API adapter directly to see raw data.

In [4]:
from src.ingestion.adapters import SourceType
from src.ingestion.adapters.api_adapter import APIAdapterConfig
from src.ingestion.pipelines.apis.ra_co import RaCoAdapter

# Create adapter config
adapter_config = APIAdapterConfig(
    source_id="ra_co",
    source_type=SourceType.API,
    request_timeout=30,
    max_retries=3,
    rate_limit_per_second=1.0,
    graphql_endpoint="https://ra.co/graphql",
)

# Create adapter
adapter = RaCoAdapter(adapter_config)
print(f"Adapter created: {adapter.source_id}")
print(f"Source type: {adapter.source_type.value}")

Adapter created: ra_co
Source type: api


In [5]:
# Fetch raw data from API
fetch_result = adapter.fetch(
    area_id=20,  # Barcelona
    page_size=100,
)

print(f"Fetch success: {fetch_result.success}")
print(f"Total fetched: {fetch_result.total_fetched}")
print(f"Duration: {fetch_result.duration_seconds:.2f}s")
print(f"Metadata: {fetch_result.metadata}")

if fetch_result.errors:
    print(f"Errors: {fetch_result.errors}")

src.ingestion.pipelines.apis.ra_co - INFO - Fetching page 1/10...
src.ingestion.pipelines.apis.ra_co - INFO - Parsed 100 events from response
src.ingestion.pipelines.apis.ra_co - INFO - Fetched all 100 available events
src.ingestion.pipelines.apis.ra_co - INFO - Pagination complete: fetched 100 total events across 1 pages


Fetch success: True
Total fetched: 100
Duration: 1.24s
Metadata: {'pages_fetched': 1, 'total_available': 100, 'max_pages': 10}


In [6]:
# Show raw event data
if fetch_result.raw_data:
    print(f"Raw events ({len(fetch_result.raw_data)} total):")
    print("=" * 60)
    
    for i, event in enumerate(fetch_result.raw_data[:3]):
        print(f"\nEvent {i+1}:")
        print(f"  ID: {event.get('id')}")
        print(f"  Title: {event.get('title')}")
        print(f"  Date: {event.get('date')}")
        venue = event.get('venue', {}) or {}
        print(f"  Venue: {venue.get('name')}")
        artists = event.get('artists', []) or []
        print(f"  Artists: {[a.get('name') for a in artists]}")
        print(f"  Cost: {event.get('cost')}")
        print(f"  URL: https://ra.co{event.get('contentUrl')}")

Raw events (100 total):

Event 1:
  ID: 2351138
  Title: Beat Lab x WeLove pres. Tech House Night
  Date: 2026-02-03T00:00:00.000
  Venue: City Hall
  Artists: ['HollowFate', 'Oversant']
  Cost: 
  URL: https://ra.co/events/2351138

Event 2:
  ID: 2348919
  Title: Plastic Night
  Date: 2026-02-03T00:00:00.000
  Venue: Macarena Club
  Artists: ['Kanedo']
  Cost: 10€
  URL: https://ra.co/events/2348919

Event 3:
  ID: 2336861
  Title: Rubén Seoane (All Night Long)
  Date: 2026-02-03T00:00:00.000
  Venue: Moog Club
  Artists: ['Rubén Seoane']
  Cost: 
  URL: https://ra.co/events/2336861


## Step 2: Run Full Pipeline

Now run the complete pipeline that normalizes data to EventSchema.

In [7]:
from src.ingestion.base_pipeline import PipelineConfig
from src.ingestion.pipelines.apis.ra_co import create_ra_co_pipeline

# Pipeline configuration
pipeline_config = PipelineConfig(
    source_name="ra_co",
    source_type=SourceType.API,
    request_timeout=30,
    batch_size=50,
)

# Source-specific config
source_config = {
    "graphql_endpoint": "https://ra.co/graphql",
    "request_timeout": 30,
    "max_retries": 3,
    "rate_limit_per_second": 1.0,
}

# Create pipeline
pipeline = create_ra_co_pipeline(pipeline_config, source_config)
print(f"Pipeline created: {pipeline.config.source_name}")
print(f"Source type: {pipeline.source_type.value}")

Pipeline created: ra_co
Source type: api


In [None]:
# Execute pipeline
result = pipeline.execute(
    area_id=20,  # Barcelona
    page_size=100,
)

print("Pipeline Execution Results")
print("=" * 60)
print(f"Status: {result.status.value}")
print(f"Source Type: {result.source_type.value}")
print(f"Execution ID: {result.execution_id}")
print(f"Total processed: {result.total_events_processed}")
print(f"Successful: {result.successful_events}")
print(f"Failed: {result.failed_events}")
print(f"Duration: {result.duration_seconds:.2f}s")
print(f"Success rate: {result.success_rate:.1f}%")

if result.errors:
    print(f"\nErrors: {result.errors}")

pipeline.ra_co - INFO - Starting pipeline execution: ra_co_20260203_111940_57b8bc7a
pipeline.ra_co - INFO - Source type: api
src.ingestion.pipelines.apis.ra_co - INFO - Fetching page 1/10...
src.ingestion.pipelines.apis.ra_co - INFO - Parsed 20 events from response
src.ingestion.pipelines.apis.ra_co - INFO - Fetched all 20 available events
src.ingestion.pipelines.apis.ra_co - INFO - Pagination complete: fetched 20 total events across 1 pages
pipeline.ra_co - INFO - Fetched 20 raw events
pipeline.ra_co - INFO - Deduplication: 20 -> 17 events
pipeline.ra_co - INFO - Pipeline completed: 17/20 successful


Pipeline Execution Results
Status: partial_success
Source Type: api
Execution ID: ra_co_20260203_111940_57b8bc7a
Total processed: 20
Successful: 17
Failed: 3
Duration: 1.21s
Success rate: 85.0%


In [10]:
# Show normalized events
if result.events:
    print(f"Normalized Events ({len(result.events)} total):")
    print("=" * 60)
    
    for i, event in enumerate(result.events[:5]):
        print(f"\nEvent {i+1}: {event.title}")
        print(f"  ID: {event.event_id}")
        print(f"  Start: {event.start_datetime}")
        print(f"  Venue: {event.location.venue_name}")
        print(f"  City: {event.location.city}")
        print(f"  Category: {event.primary_category}")
        print(f"  Event Type: {event.event_type if event.event_type else 'N/A'}")
        print(f"  Price: {event.price.price_raw_text} (free: {event.price.is_free})")
        print(f"  Quality Score: {event.data_quality_score:.2f}")
        artists = event.custom_fields.get('artists', [])
        if artists:
            print(f"  Artists: {artists[:3]}")
else:
    print("No events normalized.")

Normalized Events (17 total):

Event 1: Beat Lab x WeLove pres. Tech House Night
  ID: ra_co_2351138
  Start: 2026-02-03 23:59:00
  Venue: City Hall
  City: Barcelona
  Category: play_and_fun
  Event Type: nightlife
  Price:  (free: True)
  Quality Score: 0.73
  Artists: ['HollowFate', 'Oversant']

Event 2: Plastic Night
  ID: ra_co_2348919
  Start: 2026-02-03 23:59:00
  Venue: Macarena Club
  City: Barcelona
  Category: play_and_fun
  Event Type: nightlife
  Price: 10€ (free: False)
  Quality Score: 0.78
  Artists: ['Kanedo']

Event 3: Rubén Seoane (All Night Long)
  ID: ra_co_2336861
  Start: 2026-02-03 23:59:00
  Venue: Moog Club
  City: Barcelona
  Category: play_and_fun
  Event Type: nightlife
  Price:  (free: True)
  Quality Score: 0.73
  Artists: ['Rubén Seoane']

Event 4: Ecler ISE Experience 2026
  ID: ra_co_2347628
  Start: 2026-02-03 10:00:00
  Venue: Fira Gran Via
  City: Barcelona
  Category: play_and_fun
  Event Type: nightlife
  Price:  (free: True)
  Quality Score: 0.71

## Step 3: Convert to DataFrame

Convert normalized events to pandas DataFrame with target schema.

In [11]:

if result.events:
    df = pipeline.to_dataframe(result.events)
    print(f"DataFrame Shape: {df.shape}")
    print(f"\nColumns ({len(df.columns)}):")
    print(list(df.columns))
else:
    print("No events to convert.")

DataFrame Shape: (17, 21)

Columns (21):
['event_id', 'title', 'description', 'start_datetime', 'end_datetime', 'city', 'country_code', 'venue_name', 'artists', 'primary_category', 'taxonomy', 'event_type', 'format', 'is_free', 'min_price', 'max_price', 'currency_code', 'organizer', 'source_url', 'image_url', 'data_quality_score']


In [12]:
# Display DataFrame
if result.events:
    display(df)

Unnamed: 0,event_id,title,description,start_datetime,end_datetime,city,country_code,venue_name,artists,primary_category,...,event_type,format,is_free,min_price,max_price,currency_code,organizer,source_url,image_url,data_quality_score
0,ra_co_2351138,Beat Lab x WeLove pres. Tech House Night,,2026-02-03 23:59:00,2026-02-04 05:00:00,Barcelona,ES,City Hall,"HollowFate, Oversant",play_and_fun,...,nightlife,in_person,True,,,EUR,City Hall,https://ra.co/events/2351138,https://ra.co/images/events/flyer/https://imag...,0.725
1,ra_co_2348919,Plastic Night,,2026-02-03 23:59:00,2026-02-04 05:00:00,Barcelona,ES,Macarena Club,Kanedo,play_and_fun,...,nightlife,in_person,False,10.0,,EUR,Macarena Club,https://ra.co/events/2348919,https://ra.co/images/events/flyer/https://imag...,0.775
2,ra_co_2336861,Rubén Seoane (All Night Long),,2026-02-03 23:59:00,2026-02-04 05:00:00,Barcelona,ES,Moog Club,Rubén Seoane,play_and_fun,...,nightlife,in_person,True,,,EUR,Moog Club,https://ra.co/events/2336861,https://ra.co/images/events/flyer/https://imag...,0.725
3,ra_co_2347628,Ecler ISE Experience 2026,,2026-02-03 10:00:00,2026-02-06 16:00:00,Barcelona,ES,Fira Gran Via,"Marc Piñol, Gee Lane, Inner Desires, Memory Pa...",play_and_fun,...,nightlife,in_person,True,,,EUR,Fira Gran Via,https://ra.co/events/2347628,https://ra.co/images/events/flyer/https://imag...,0.705
4,ra_co_2360448,PANACHEZ / CASAS/DIP PHAN,,2026-02-01 22:30:00,2026-02-08 03:00:00,Barcelona,ES,The Supermercat Raval,,play_and_fun,...,nightlife,in_person,True,,,EUR,The Supermercat Raval,https://ra.co/events/2360448,https://ra.co/images/events/flyer/https://imag...,0.705
5,ra_co_2360449,ALEXXFOX,,2026-02-01 22:30:00,2026-02-08 03:00:00,Barcelona,ES,The Supermercat Gotico,,play_and_fun,...,nightlife,in_person,True,,,EUR,The Supermercat Gotico,https://ra.co/events/2360449,https://ra.co/images/events/flyer/https://imag...,0.705
6,ra_co_2361238,Dr. Dou Social Club meets D.Bird,,2026-02-03 19:00:00,2026-02-03 22:00:00,Barcelona,ES,Dr. Dou Social Club,D.Bird,play_and_fun,...,nightlife,in_person,True,,,EUR,Dr. Dou Social Club,https://ra.co/events/2361238,https://ra.co/images/events/flyer/https://imag...,0.725
7,ra_co_2356712,AFTER THE BELL - EU bussiness welcome party,,2026-02-04 23:30:00,2026-02-05 05:00:00,Barcelona,ES,Negro Rojo Club,MARTISTA,play_and_fun,...,party,in_person,True,,,EUR,Negro Rojo Club,https://ra.co/events/2356712,https://ra.co/images/events/flyer/https://imag...,0.725
8,ra_co_2348920,Original Silk,,2026-02-04 23:59:00,2026-02-05 05:00:00,Barcelona,ES,Macarena Club,"Pau Guilera, Jones May",play_and_fun,...,nightlife,in_person,False,10.0,,EUR,Macarena Club,https://ra.co/events/2348920,https://ra.co/images/events/flyer/https://imag...,0.775
9,ra_co_2360908,RUBI I SHOWCASE DJ SET I GUEST LIST,,2026-02-04 23:59:00,2026-02-05 05:00:00,Barcelona,ES,City Hall,,play_and_fun,...,nightlife,in_person,True,,,EUR,City Hall,https://ra.co/events/2360908,https://ra.co/images/events/flyer/https://imag...,0.725


In [17]:
df.columns

Index(['event_id', 'title', 'description', 'start_datetime', 'end_datetime',
       'city', 'country_code', 'venue_name', 'artists', 'primary_category',
       'taxonomy', 'event_type', 'format', 'is_free', 'min_price', 'max_price',
       'currency_code', 'organizer', 'source_url', 'image_url',
       'data_quality_score'],
      dtype='str')

In [13]:
# Display key columns
if result.events:
    key_cols = [
        "title",
        "start_datetime",
        "venue_name",
        "city",
        "is_free",
        "min_price",
        "currency_code",
        "data_quality_score"
    ]
    print("Key Event Data:")
    display(df[key_cols])

Key Event Data:


Unnamed: 0,title,start_datetime,venue_name,city,is_free,min_price,currency_code,data_quality_score
0,Beat Lab x WeLove pres. Tech House Night,2026-02-03 23:59:00,City Hall,Barcelona,True,,EUR,0.725
1,Plastic Night,2026-02-03 23:59:00,Macarena Club,Barcelona,False,10.0,EUR,0.775
2,Rubén Seoane (All Night Long),2026-02-03 23:59:00,Moog Club,Barcelona,True,,EUR,0.725
3,Ecler ISE Experience 2026,2026-02-03 10:00:00,Fira Gran Via,Barcelona,True,,EUR,0.705
4,PANACHEZ / CASAS/DIP PHAN,2026-02-01 22:30:00,The Supermercat Raval,Barcelona,True,,EUR,0.705
5,ALEXXFOX,2026-02-01 22:30:00,The Supermercat Gotico,Barcelona,True,,EUR,0.705
6,Dr. Dou Social Club meets D.Bird,2026-02-03 19:00:00,Dr. Dou Social Club,Barcelona,True,,EUR,0.725
7,AFTER THE BELL - EU bussiness welcome party,2026-02-04 23:30:00,Negro Rojo Club,Barcelona,True,,EUR,0.725
8,Original Silk,2026-02-04 23:59:00,Macarena Club,Barcelona,False,10.0,EUR,0.775
9,RUBI I SHOWCASE DJ SET I GUEST LIST,2026-02-04 23:59:00,City Hall,Barcelona,True,,EUR,0.725


In [19]:
# give me the count of events per start_datetime in a plot

import matplotlib.pyplot as plt
import pandas as pd

event_counts = df['start_datetime'].value_counts().sort_index()
plt.figure(figsize=(10, 6))
plt.plot(event_counts.index, event_counts.values, marker='o')
plt.title('Event Counts by Start DateTime')
plt.xlabel('Start DateTime')

ModuleNotFoundError: No module named 'matplotlib'

## Step 4: Save Results (Optional)

In [14]:
# Save to parquet
if result.events:
    output_dir = "../data/raw"
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/ra_co_events.parquet"
    df.to_parquet(output_path, index=False)
    print(f"Saved {len(df)} events to {output_path}")

Saved 17 events to ../data/raw/ra_co_events.parquet


## Cleanup

In [15]:
# Close adapter and pipeline resources
adapter.close()
pipeline.close()
print("Resources released.")

Resources released.
