# Multi-Modal Event Intelligence Pipeline

This notebook tests the **spatial / activity data layer** — TripAdvisor attractions, Civitatis guided tours, and GetYourGuide experiences — built on top of the config-driven `BaseAPIPipeline`.

**Temporal vs Spatial layers:**
- **Temporal** (disabled): `ra_co`, `ticketmaster` — time-boxed events with `start_datetime`
- **Spatial** (active): `tripadvisor`, `civitatis`, `getyourguide` — recurring activities without fixed start times

**Pipeline flow:**
1. Factory reads `ingestion.yaml` → creates pipelines for all enabled sources
2. REST adapter fetches raw JSON via config-driven params + pagination
3. `FieldMapper` extracts + transforms fields per `field_mappings`
4. `TaxonomyMapper` assigns Human Experience Taxonomy (primary + subcategory)
5. Events normalized to `EventSchema` with `future_events_only: false`
6. Optional enrichment: geocoding + compressed_html (TripAdvisor only)

In [None]:
import sys
import os
import logging
from dotenv import load_dotenv

# Make `src.*` importable by putting ../services/api (resolved against the
# current working directory) at the front of the module search path.
API_ROOT = os.path.abspath(os.path.join("..", "services", "api"))
if API_ROOT not in sys.path:
    sys.path.insert(0, API_ROOT)

# Read API keys from the .env file one directory above the notebook.
load_dotenv(os.path.join("..", ".env"))

# Send log records to the notebook output at INFO level and above.
logging.basicConfig(
    format="%(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Report whether each API key is present; the key values themselves are never printed.
ta_key = os.environ.get("TRIPADVISOR_API_KEY", "")
cv_key = os.environ.get("CIVITATIS_API_KEY", "")
gyg_key = os.environ.get("GETYOURGUIDE_API_KEY", "")
for key_label, key_value in (
    ("TRIPADVISOR_API_KEY:  ", ta_key),
    ("CIVITATIS_API_KEY:    ", cv_key),
    ("GETYOURGUIDE_API_KEY: ", gyg_key),
):
    print(f"{key_label}{'SET' if key_value else 'MISSING — requests will fail with 401'}")

print(f"\nAPI root: {API_ROOT}")
print("Setup complete")

## Step 1: PipelineFactory — Assert Only Spatial Sources Enabled

After the config change:
- `ra_co`: `enabled: false` ✓
- `ticketmaster`: `enabled: false` ✓
- `tripadvisor`: `enabled: true` ✓
- `civitatis`: `enabled: true` ✓
- `getyourguide`: `enabled: true` ✓

In [None]:
from src.ingestion.factory import PipelineFactory

# Build the factory and list every configured source with its on/off state.
factory = PipelineFactory()

print("Configured Sources:")
print("=" * 50)
for name, info in factory.list_sources().items():
    if info["enabled"]:
        status = "ENABLED"
    else:
        status = "disabled"
    print(f"  {name:20} type={info['type']:10} [{status}]")

enabled = factory.list_enabled_sources()
print(f"\nEnabled sources: {enabled}")

# The spatial sources must be switched on and the temporal ones switched off.
for expected_on in ("tripadvisor", "civitatis", "getyourguide"):
    assert expected_on in enabled, f"{expected_on} should be enabled"
for expected_off in ("ra_co", "ticketmaster"):
    assert expected_off not in enabled, f"{expected_off} should be disabled"
print("\nAll source state assertions PASSED")

## Step 2: Create TripAdvisor Pipeline — Config Summary

In [None]:
# Create the TripAdvisor pipeline and print the configuration it was built from.
ta = factory.create_pipeline("tripadvisor")

print(f"Pipeline:          {ta.config.source_name}")
print(f"Source type:       {ta.source_type.value}")

# Shorthand for the source configuration read by the remaining summary lines.
ta_cfg = ta.source_config
print(f"Protocol:          {ta_cfg.protocol}")
print(f"Endpoint:          {ta_cfg.endpoint}")
print(f"Cities:            {ta_cfg.defaults.get('location', {}).get('cities', [])}")
print(f"Page size:         {ta_cfg.defaults.get('page_size')} (TripAdvisor hard cap)")
print(f"Max pages (config):{ta_cfg.pagination.get('max_pages')}")
print(f"future_events_only:{ta_cfg.validation.get('future_events_only')}")

ta_taxonomy = ta_cfg.taxonomy_suggestions
print(f"Default taxonomy:  primary={ta_taxonomy.get('default_primary')} "
      f"subcategory={ta_taxonomy.get('default_subcategory')}")
print(f"HTML enrichment:   {ta_cfg.html_enrichment.get('enabled')}")
print(f"Field mappings:    {len(ta_cfg.field_mappings)} fields")

## Step 3: Execute TripAdvisor — Barcelona (max_pages=1)

TripAdvisor `/location/search` hard-caps at 10 results per call. `max_pages=1` is the architectural limit.

In [None]:
# Restrict to Barcelona only for mock run
ta.source_config.defaults.setdefault('location', {})['cities'] = ['Barcelona']

ta_result = await ta.execute(max_pages=1, page_size=10)

print("TripAdvisor Pipeline Results")
print("=" * 60)
print(f"Status:            {ta_result.status.value}")
print(f"Total raw events:  {ta_result.total_events_processed}")
print(f"Successful:        {ta_result.successful_events}")
print(f"Failed:            {ta_result.failed_events}")
print(f"Duration:          {ta_result.duration_seconds:.2f}s")
print(f"Success rate:      {ta_result.success_rate:.1f}%")
print(f"Cities:            {ta_result.metadata.get('cities', [])}")

if ta_result.errors:
    print(f"\nErrors: {ta_result.errors[:3]}")

## Step 4: Inspect TripAdvisor Events — Location, Taxonomy, Quality

In [None]:
ta_events = ta_result.events

if not ta_events:
    print("No events fetched. Check API key and pipeline logs above.")
else:
    print(f"Sample TripAdvisor Events ({len(ta_events)} total):")
    print("=" * 70)

    # Print the first ten events at most, numbered from 1.
    for idx, event in enumerate(ta_events[:10], start=1):
        taxonomy = event.taxonomy_dimension
        point = event.location.coordinates
        if point:
            coord_str = f"({point.latitude:.4f}, {point.longitude:.4f})"
        else:
            coord_str = "None"

        print(f"\n[{idx}] {event.title}")
        print(f"    City: {event.location.city} | Country: {event.location.country_code}")
        print(f"    start_datetime: {event.start_datetime}  (None expected for attractions)")
        print(f"    Type: {event.event_type} | Coords: {coord_str}")
        print(f"    Source URL: {event.source.source_url}")
        if taxonomy:
            print(f"    Taxonomy: primary={taxonomy.primary_category_id} subcategory={taxonomy.subcategory_id}")
        print(f"    Quality: {event.data_quality_score:.2f}")

        # Only the names of the custom fields are listed, not their values.
        extras = event.custom_fields
        if extras:
            print(f"    Custom fields: {list(extras.keys())}")

## Step 5: Create + Execute Civitatis — Barcelona (max_pages=2)

Civitatis uses 1-indexed pagination. `destination` maps to area ID from `defaults.areas`.

In [None]:
cv = factory.create_pipeline("civitatis")

print(f"Pipeline:          {cv.config.source_name}")
print(f"Endpoint:          {cv.source_config.endpoint}")
print(f"Areas:             {cv.source_config.defaults.get('areas', {})}")
print(f"Start page:        {cv.source_config.pagination.get('start_page')} (1-indexed)")
print(f"Default taxonomy:  primary={cv.source_config.taxonomy_suggestions.get('default_primary')} "
      f"subcategory={cv.source_config.taxonomy_suggestions.get('default_subcategory')}")
print()

# Restrict to Barcelona only for mock run
cv.source_config.defaults['areas'] = {'Barcelona': 5}

cv_result = await cv.execute(max_pages=2, page_size=50)

print("\nCivitatis Pipeline Results")
print("=" * 60)
print(f"Status:            {cv_result.status.value}")
print(f"Total raw events:  {cv_result.total_events_processed}")
print(f"Successful:        {cv_result.successful_events}")
print(f"Failed:            {cv_result.failed_events}")
print(f"Duration:          {cv_result.duration_seconds:.2f}s")
print(f"Success rate:      {cv_result.success_rate:.1f}%")

if cv_result.errors:
    print(f"\nErrors: {cv_result.errors[:3]}")

## Step 6: Inspect Civitatis Events — Price, Duration, Categories, Taxonomy

In [None]:
# Print a readable sample (first ten at most) of the normalized Civitatis events.
cv_events = cv_result.events

if cv_events:
    print(f"Sample Civitatis Events ({len(cv_events)} total):")
    print("=" * 70)

    for i, event in enumerate(cv_events[:10]):
        td = event.taxonomy_dimension
        print(f"\n[{i+1}] {event.title}")
        print(f"    City: {event.location.city} | Country: {event.location.country_code}")
        print(f"    Type: {event.event_type}")
        # `price` is treated as optional by the summary cell of this notebook
        # (`e.price and ...`), so guard it here too instead of letting one
        # price-less event abort the whole listing with an AttributeError.
        price = event.price
        if price is not None:
            print(f"    Price: min={price.cost_min} {price.currency} | free={price.is_free}")
        else:
            print("    Price: n/a")
        if event.custom_fields:
            # Missing keys are shown as 'N/A' rather than raising.
            dur = event.custom_fields.get('duration_minutes_raw', 'N/A')
            cats = event.custom_fields.get('categories', 'N/A')
            print(f"    Duration (min): {dur} | Categories: {cats}")
        print(f"    Source URL: {event.source.source_url}")
        if td:
            print(f"    Taxonomy: primary={td.primary_category_id} subcategory={td.subcategory_id}")
        print(f"    Quality: {event.data_quality_score:.2f}")
else:
    print("No events fetched. Check API key and pipeline logs above.")

## Step 5b: Create + Execute GetYourGuide — Barcelona (max_pages=2)

GetYourGuide uses offset pagination and header-based auth (`X-ACCESS-TOKEN`). `destination_id=45` maps to Barcelona.

In [None]:
gyg = factory.create_pipeline("getyourguide")

print(f"Pipeline:          {gyg.config.source_name}")
print(f"Endpoint:          {gyg.source_config.endpoint}")
print(f"Areas:             {gyg.source_config.defaults.get('areas', {})}")
print(f"Start page:        {gyg.source_config.pagination_start_page} (offset pagination)")
print(f"Default taxonomy:  primary={gyg.source_config.taxonomy_config.get('default_primary')} "
      f"subcategory={gyg.source_config.taxonomy_config.get('default_subcategory')}")
print()

# Restrict to Barcelona only for mock run
gyg.source_config.defaults['areas'] = {'Barcelona': 45}

gyg_result = await gyg.execute(max_pages=2, page_size=50)

print("\nGetYourGuide Pipeline Results")
print("=" * 60)
print(f"Status:            {gyg_result.status.value}")
print(f"Total raw events:  {gyg_result.total_events_processed}")
print(f"Successful:        {gyg_result.successful_events}")
print(f"Failed:            {gyg_result.failed_events}")
print(f"Duration:          {gyg_result.duration_seconds:.2f}s")
print(f"Success rate:      {gyg_result.success_rate:.1f}%")

if gyg_result.errors:
    print(f"\nErrors: {gyg_result.errors[:3]}")

## Step 6b: Inspect GetYourGuide Events — Price, Duration, Categories, Taxonomy

In [None]:
# Print a readable sample (first ten at most) of the normalized GetYourGuide events.
gyg_events = gyg_result.events

if gyg_events:
    print(f"Sample GetYourGuide Events ({len(gyg_events)} total):")
    print("=" * 70)

    for i, event in enumerate(gyg_events[:10]):
        td = event.taxonomy_dimension
        print(f"\n[{i+1}] {event.title}")
        print(f"    City: {event.location.city} | Country: {event.location.country_code}")
        print(f"    Type: {event.event_type}")
        # `price` is treated as optional by the summary cell of this notebook
        # (`e.price and ...`), so guard it here too instead of letting one
        # price-less event abort the whole listing with an AttributeError.
        price = event.price
        if price is not None:
            print(f"    Price: min={price.cost_min} {price.currency} | free={price.is_free}")
        else:
            print("    Price: n/a")
        if event.custom_fields:
            # Missing keys are shown as 'N/A' rather than raising.
            dur_min = event.custom_fields.get('duration_min_secs', 'N/A')
            cats = event.custom_fields.get('categories', 'N/A')
            print(f"    Duration (secs): {dur_min} | Categories: {cats}")
        print(f"    Source URL: {event.source.source_url}")
        if td:
            print(f"    Taxonomy: primary={td.primary_category_id} subcategory={td.subcategory_id}")
        print(f"    Quality: {event.data_quality_score:.2f}")
else:
    print("No events fetched. Check API key and pipeline logs above.")

## Step 7: Field Coverage Table

Side-by-side percentage of events with a non-empty value (not `None`, `''`, or `[]`) for each field, for TripAdvisor, Civitatis, and GetYourGuide.

In [None]:
import pandas as pd

def coverage_dict(events, fields):
    """Return {field: pct_non_null} for a list of EventSchema objects.

    Args:
        events: Sized collection of event objects; values are read with
            ``getattr``.
        fields: Dotted attribute paths such as ``"location.city"``.

    Returns:
        Dict mapping every entry of *fields* to the percentage (0-100, rounded
        to one decimal) of events that have a value at that path. A value
        counts as missing when any attribute along the path is absent or
        ``None``, or when the final value equals ``''`` or ``[]``. When
        *events* is empty every field maps to ``0.0``.
    """
    total = len(events)
    if total == 0:
        return {f: 0.0 for f in fields}

    result = {}
    for f in fields:
        # The path does not depend on the event, so split it once per field.
        parts = f.split('.')
        count = sum(1 for e in events if _has_value(e, parts))
        result[f] = round(100 * count / total, 1)
    return result


def _has_value(event, parts):
    """Return True when the attribute path *parts* resolves to a non-empty value on *event*."""
    try:
        obj = event
        for p in parts:
            obj = getattr(obj, p, None)
            if obj is None:
                return False
        # bool() keeps the truth test inside the try block, so values with
        # unusual comparison behaviour are handled by the except below.
        return bool(obj != '' and obj != [])
    except Exception as exc:
        # Deliberate best-effort: a value that cannot be read or compared is
        # counted as missing, but the failure is logged instead of vanishing.
        logging.getLogger(__name__).debug(
            "coverage_dict: could not read %s: %r", ".".join(parts), exc
        )
        return False

# Dotted attribute paths whose fill rate is reported, grouped by theme.
field_groups = [
    # Core
    "title", "source.source_event_id", "source.source_url", "start_datetime", "event_type",
    # Location
    "location.city", "location.country_code", "location.street_address", "location.coordinates",
    # Media
    "source.image_url",
    # Pricing
    "price.cost_min", "price.is_free",
    # Description
    "description",
    # Taxonomy
    "taxonomy_dimension",
]


def _coverage_or_zero(events):
    """Return per-field coverage for *events*, or all zeros when *events* is falsy."""
    if events:
        return coverage_dict(events, field_groups)
    return {f: 0.0 for f in field_groups}


ta_cov = _coverage_or_zero(ta_events)
cv_cov = _coverage_or_zero(cv_events)
gyg_cov = _coverage_or_zero(gyg_events)

# One column per source, each listing percentages in field_groups order.
coverage_by_column = {
    "TripAdvisor %": ta_cov,
    "Civitatis %": cv_cov,
    "GetYourGuide %": gyg_cov,
}
table_columns = {"Field": field_groups}
for column_name, coverage in coverage_by_column.items():
    table_columns[column_name] = [coverage[f] for f in field_groups]
df_cov = pd.DataFrame(table_columns)

print(df_cov.to_string(index=False))

## Step 8: Taxonomy Distribution

Verify that all three sources produce multiple unique subcategories (not just the default).

In [None]:
from collections import Counter

def taxonomy_counter(events, label):
    """Print taxonomy and event-type frequency tables for one source.

    Args:
        events: Sized collection of event objects exposing
            ``taxonomy_dimension`` and ``event_type``.
        label: Source name shown in the section header.

    Returns:
        None. The tallies are only printed.
    """
    # Events without a taxonomy contribute to the event-type tally only.
    dimensions = [ev.taxonomy_dimension for ev in events]
    tagged = [dim for dim in dimensions if dim]

    by_primary = Counter(dim.primary_category_id for dim in tagged)
    by_subcategory = Counter(dim.subcategory_id for dim in tagged)
    by_event_type = Counter(ev.event_type for ev in events)

    print(f"\n--- {label} ({len(events)} events) ---")
    print(f"Primary categories: {dict(by_primary.most_common())}")
    print(f"Subcategories:      {dict(by_subcategory.most_common())}")
    print(f"Event types:        {dict(by_event_type.most_common())}")
    print(f"Unique subcategories: {len(by_subcategory)}")

print("TAXONOMY DISTRIBUTION")
print("=" * 60)
# Emit one tally section per source, in a fixed order.
for source_label, source_events in (
    ("TripAdvisor", ta_events),
    ("Civitatis", cv_events),
    ("GetYourGuide", gyg_events),
):
    taxonomy_counter(source_events, source_label)

## Step 9: Multi-Modal Layer Comparison

Contrast the spatial layer (TripAdvisor + Civitatis + GetYourGuide) against the temporal layer (ra_co + Ticketmaster).

| Dimension | Temporal (ra_co / TM) | Spatial (TripAdvisor / Civitatis / GYG) |
|---|---|---|
| `start_datetime` | Required — event-level | None / now() fallback |
| Deduplication key | `(source_name, source_event_id)` | Same — stable across runs |
| Taxonomy primary | `"1"` (Play & Pure Fun) | `"2"` (Exploration & Adventure) |
| Price model | Variable (ticketed / free) | `cost_min` from structured field |
| Pagination | Date-windowed sliding | Page-number / offset (fixed) |
| Refresh cadence | Every 6–12h | Weekly (Monday) |

In [None]:
# Pool the events of the three spatial sources for the combined statistics.
all_spatial = ta_events + cv_events + gyg_events

print("MULTI-MODAL INSIGHT SUMMARY")
print("=" * 60)
print(f"Total spatial events: {len(all_spatial)}")
print(f"  TripAdvisor:        {len(ta_events)}")
print(f"  Civitatis:          {len(cv_events)}")
print(f"  GetYourGuide:       {len(gyg_events)}")

if all_spatial:
    total = len(all_spatial)

    def _share(matched):
        """Format a list of matching events as 'count/total (pct%)'."""
        return f"{len(matched)}/{total} ({100*len(matched)/total:.0f}%)"

    # Events that carry each field of interest.
    has_start = [e for e in all_spatial if e.start_datetime is not None]
    has_coords = [e for e in all_spatial if e.location.coordinates is not None]
    has_price = [e for e in all_spatial if e.price and e.price.cost_min is not None]
    has_desc = [e for e in all_spatial if e.description]
    has_taxonomy = [e for e in all_spatial if e.taxonomy_dimension is not None]

    print("\nField coverage across all sources:")
    print(f"  start_datetime:    {_share(has_start)} — expected low")
    print(f"  coordinates:       {_share(has_coords)}")
    print(f"  price.cost_min:    {_share(has_price)}")
    print(f"  description:       {_share(has_desc)}")
    print(f"  taxonomy_dimension:{_share(has_taxonomy)}")

    # Quality score distribution
    scores = [e.data_quality_score for e in all_spatial]
    mean_score = sum(scores) / len(scores)
    print("\nQuality scores:")
    print(f"  Mean: {mean_score:.2f}")
    print(f"  Min:  {min(scores):.2f}")
    print(f"  Max:  {max(scores):.2f}")

## Step 10: Save Results

In [None]:
import pickle

output_dir = "../data/raw"
os.makedirs(output_dir, exist_ok=True)


def _write_parquet(frame, path):
    """Write *frame* to *path* with pyarrow, retrying with fastparquet on ImportError."""
    try:
        frame.to_parquet(path, index=False, engine='pyarrow')
    except ImportError:
        frame.to_parquet(path, index=False, engine='fastparquet')


# Save one parquet file per source; a source with no events is skipped.
if ta_events:
    ta_df = ta.to_dataframe(ta_events)
    ta_path = f"{output_dir}/tripadvisor_events.parquet"
    _write_parquet(ta_df, ta_path)
    print(f"Saved {len(ta_df)} TripAdvisor events to {ta_path}")

if cv_events:
    cv_df = cv.to_dataframe(cv_events)
    cv_path = f"{output_dir}/civitatis_events.parquet"
    _write_parquet(cv_df, cv_path)
    print(f"Saved {len(cv_df)} Civitatis events to {cv_path}")

if gyg_events:
    gyg_df = gyg.to_dataframe(gyg_events)
    gyg_path = f"{output_dir}/getyourguide_events.parquet"
    _write_parquet(gyg_df, gyg_path)
    print(f"Saved {len(gyg_df)} GetYourGuide events to {gyg_path}")

# Save the raw PipelineExecutionResult objects as pickle, one file per source.
# NOTE(review): pickle files should only ever be loaded back from a trusted location.
results_by_source = {
    "tripadvisor": ta_result,
    "civitatis": cv_result,
    "getyourguide": gyg_result,
}
for label, result in results_by_source.items():
    pkl_path = f"{output_dir}/{label}_result.pkl"
    with open(pkl_path, "wb") as fh:
        pickle.dump(result, fh, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"Saved PipelineExecutionResult → {pkl_path} ({result.successful_events} events)")

## Cleanup

In [None]:
await ta.close()
await cv.close()
await gyg.close()
print("Resources released.")