# Perplexity AI Scraper - Testing Notebook

Test the Perplexity AI scraper implementation:
- AI-powered search with citations
- Country-specific search
- Batch prompt processing

---

## Setup - Use Local Development Version

In [1]:
import os
import sys
from pathlib import Path

from dotenv import load_dotenv

# Add local src to path (use development version, not installed).
# This resolves the project root as the parent of the current working
# directory, so the notebook must be started from a direct subfolder of it.
project_root = Path.cwd().parent
src_path = project_root / "src"
if str(src_path) not in sys.path:
    # Insert at the front so the local sources are found first on import.
    sys.path.insert(0, str(src_path))

print(f"Using source from: {src_path}")

# Load environment variables from the project's .env file
load_dotenv(project_root / ".env")

# Get API token; fail fast here rather than on the first API call
API_TOKEN = os.getenv("BRIGHTDATA_API_TOKEN")
if not API_TOKEN:
    raise ValueError("BRIGHTDATA_API_TOKEN not found in environment")

# Do not echo any part of the token: cell output is stored in the notebook
# file and would be shared or committed along with it.
print(f"API Token: loaded ({len(API_TOKEN)} characters)")
print("Setup complete!")

Using source from: /Users/ns/Desktop/projects/sdk-python/src
API Token: [REDACTED]
Setup complete!


## Initialize Client

In [2]:
import brightdata
from brightdata import BrightDataClient

# Show where the package was imported from, to confirm it is the local version.
print(f"brightdata module location: {brightdata.__file__}")

# Build the client with the token read in the setup cell.
client = BrightDataClient(token=API_TOKEN)

# The Perplexity scraper should be reachable through the scrape namespace.
print(f"\nPerplexityScraper: {type(client.scrape.perplexity).__name__}")

# Collect every public, callable attribute of the scraper.
print("\nAvailable methods:")
public_methods = []
for attr_name in dir(client.scrape.perplexity):
    if attr_name.startswith('_'):
        continue
    if callable(getattr(client.scrape.perplexity, attr_name)):
        public_methods.append(attr_name)
print(public_methods)

brightdata module location: /Users/ns/Desktop/projects/sdk-python/src/brightdata/__init__.py

PerplexityScraper: PerplexityScraper

Available methods:
['normalize_result', 'scrape', 'scrape_async', 'search', 'search_fetch', 'search_fetch_sync', 'search_status', 'search_status_sync', 'search_sync', 'search_trigger', 'search_trigger_sync']


---
## Test 1: Single Prompt Search

Basic search with a single prompt.

In [4]:
# Test single prompt search
PROMPT = "What are the latest trends in artificial intelligence in 2026?"

print("Searching Perplexity with prompt:")
print(f"  '{PROMPT}'")
print("\nThis may take up to 11 minutes...\n")

async with client.scrape.perplexity.engine:
    result = await client.scrape.perplexity.search(
        prompt=PROMPT,
        country="US",
        poll_timeout=660
    )

print(f"Success: {result.success}")
print(f"Status: {result.status}")
print(f"Snapshot ID: {result.snapshot_id}")
print(f"Cost: ${result.cost:.4f}" if result.cost else "Cost: N/A")

if result.success and result.data:
    print("\n--- Perplexity Response ---")
    data = result.data
    
    # Handle list response
    if isinstance(data, list) and len(data) > 0:
        data = data[0]
    
    print(f"Available keys: {list(data.keys()) if isinstance(data, dict) else 'N/A'}")
    
    if isinstance(data, dict):
        # Check for error
        if 'error' in data:
            print(f"\nAPI Error: {data.get('error')}")
            print(f"Error Code: {data.get('error_code')}")
        else:
            print(f"\nPrompt: {data.get('prompt', 'N/A')[:100]}...")
            
            # Answer
            answer = data.get('answer_html', data.get('answer', 'N/A'))
            if answer and answer != 'N/A':
                print("\nAnswer (first 500 chars):")
                print(f"  {str(answer)[:500]}...")
            
            # Citations
            citations = data.get('citations', [])
            if citations:
                print(f"\nCitations ({len(citations)} sources):")
                for i, cite in enumerate(citations[:5]):
                    print(f"  {i+1}. {cite.get('title', 'N/A')[:50]}")
                    print(f"     URL: {cite.get('url', 'N/A')[:60]}")
            
            # Follow-up questions
            followups = data.get('suggested_followup', [])
            if followups:
                print("\nSuggested follow-ups:")
                for q in followups[:3]:
                    print(f"  - {q}")
else:
    print(f"\nError: {result.error}")

Searching Perplexity with prompt:
  'What are the latest trends in artificial intelligence in 2026?'

This may take up to 11 minutes...

Success: True
Status: ready
Snapshot ID: sd_mkuuq1g21ftyx65xqk
Cost: $0.0050

--- Perplexity Response ---
Available keys: ['url', 'prompt', 'answer_html', 'answer_text', 'answer_text_markdown', 'sources', 'source_html', 'is_shopping_data', 'shopping_data', 'index', 'response_raw', 'answer_section_html', 'exported_markdown', 'related_prompts', 'citations', 'web_search_query', 'timestamp', 'input']

Prompt: What are the latest trends in artificial intelligence in 2026?...

Answer (first 500 chars):
  <html lang="ru-RU" data-color-scheme="light" dir="ltr">
<head>
<meta name="min-version" content="1769380062633">
<meta name="version" content="ed51641">
<meta charset="utf-8">
<link rel="preconnect" crossorigin href="https://pplx-next-static-public.perplexity.ai">
<link rel="preconnect" crossorigin href="https://r2cdn.perplexity.ai">
<link rel="preload" hre

---
## Test 2: Search with Different Country

Test country-specific search context.

In [None]:
# Test search with different country
PROMPT = "What are the top news stories today?"

print("Searching with country=GB (UK):")
print(f"  '{PROMPT}'")
print("\nThis may take up to 11 minutes...\n")

async with client.scrape.perplexity.engine:
    result = await client.scrape.perplexity.search(
        prompt=PROMPT,
        country="GB",
        poll_timeout=660
    )

print(f"Success: {result.success}")
print(f"Status: {result.status}")
print(f"Snapshot ID: {result.snapshot_id}")

if result.success and result.data:
    print("\n--- Perplexity Response (UK) ---")
    data = result.data
    
    if isinstance(data, list) and len(data) > 0:
        data = data[0]
    
    if isinstance(data, dict) and 'error' not in data:
        answer = data.get('answer_html', data.get('answer', 'N/A'))
        if answer and answer != 'N/A':
            print("Answer (first 500 chars):")
            print(f"  {str(answer)[:500]}...")
        
        citations = data.get('citations', [])
        if citations:
            print(f"\nCitations ({len(citations)} sources):")
            for i, cite in enumerate(citations[:3]):
                print(f"  {i+1}. {cite.get('domain', 'N/A')} - {cite.get('title', 'N/A')[:40]}")
    elif isinstance(data, dict) and 'error' in data:
        print(f"API Error: {data.get('error')}")
else:
    print(f"\nError: {result.error}")

---
## Test 3: Batch Prompts

Test multiple prompts in a single request.

In [None]:
# Test batch prompts
PROMPTS = [
    "What is Python programming language?",
    "What is machine learning?"
]

print(f"Batch search with {len(PROMPTS)} prompts:")
for i, p in enumerate(PROMPTS):
    print(f"  {i+1}. {p}")
print("\nThis may take up to 11 minutes...\n")

async with client.scrape.perplexity.engine:
    result = await client.scrape.perplexity.search(
        prompt=PROMPTS,
        country="US",
        poll_timeout=660
    )

print(f"Success: {result.success}")
print(f"Status: {result.status}")
print(f"Snapshot ID: {result.snapshot_id}")
print(f"Cost: ${result.cost:.4f}" if result.cost else "Cost: N/A")

if result.success and result.data:
    print("\n--- Batch Results ---")
    data = result.data
    
    if isinstance(data, list):
        print(f"Number of responses: {len(data)}")
        
        for i, item in enumerate(data):
            print(f"\n=== Response {i+1} ===")
            if isinstance(item, dict):
                if 'error' in item:
                    print(f"  Error: {item.get('error')}")
                else:
                    prompt = item.get('prompt', 'N/A')
                    print(f"  Prompt: {prompt[:60]}...")
                    
                    answer = item.get('answer_html', item.get('answer', ''))
                    if answer:
                        print(f"  Answer: {str(answer)[:200]}...")
                    
                    citations = item.get('citations', [])
                    print(f"  Citations: {len(citations)} sources")
    else:
        print(f"Unexpected data type: {type(data)}")
else:
    print(f"\nError: {result.error}")

---
## Test 4: Technical Question

Test with a technical/coding question.

In [None]:
# Test technical question
PROMPT = "How do I implement async/await in Python? Give me a simple example."

print("Technical question:")
print(f"  '{PROMPT}'")
print("\nThis may take up to 11 minutes...\n")

async with client.scrape.perplexity.engine:
    result = await client.scrape.perplexity.search(
        prompt=PROMPT,
        country="US",
        poll_timeout=660
    )

print(f"Success: {result.success}")
print(f"Status: {result.status}")

if result.success and result.data:
    data = result.data
    if isinstance(data, list) and len(data) > 0:
        data = data[0]
    
    if isinstance(data, dict) and 'error' not in data:
        print("\n--- Technical Answer ---")
        answer = data.get('answer_html', data.get('answer', 'N/A'))
        print(f"{str(answer)[:1000]}..." if len(str(answer)) > 1000 else answer)
        
        # Web search queries used
        queries = data.get('web_search_query', [])
        if queries:
            print(f"\nSearch queries used: {queries}")
    elif isinstance(data, dict) and 'error' in data:
        print(f"\nAPI Error: {data.get('error')}")
else:
    print(f"\nError: {result.error}")

---
## Test 5: Export Raw Data

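Export the raw response data to a JSON file for inspection. This cell uses the `result` variable left by the most recently run search cell, so run one of the tests above first.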

In [None]:
# Write the current `result` to perplexity_result.json in the working
# directory so the raw payload can be inspected outside the notebook.
import json
from pathlib import Path

if not (result.success and result.data):
    print("No data to export")
else:
    output_file = Path.cwd() / "perplexity_result.json"

    # Result attributes to export, in the order they appear in the file.
    exported_fields = (
        "success",
        "status",
        "snapshot_id",
        "cost",
        "row_count",
        "data",
        "error",
    )
    export_data = {field: getattr(result, field) for field in exported_fields}

    # default=str stringifies any value json cannot serialise natively.
    with open(output_file, "w") as handle:
        json.dump(export_data, handle, indent=2, default=str)

    print(f"Exported to: {output_file}")
    print(f"\nData type: {type(result.data)}")

---
## Test 6: Check Timing Metadata

In [None]:
# Print the timing attributes of the current `result`, then its row count
# and cost.
print("=== Timing Metadata ===")
for stamp_name in (
    "trigger_sent_at",
    "snapshot_id_received_at",
    "snapshot_polled_at",
    "data_fetched_at",
):
    print(f"{stamp_name}: {getattr(result, stamp_name)}")

print(f"\nrow_count: {result.row_count}")
print(f"cost: {result.cost}")

---
## Summary

### PerplexityScraper Methods

| Method | Description |
|--------|-------------|
| `search(prompt, country, ...)` | Async search with prompt(s) |
| `search_sync(...)` | Sync version |
| `search_trigger(...)` | Manual trigger (returns Job) |
| `search_status(snapshot_id)` | Check status |
| `search_fetch(snapshot_id)` | Fetch results |

### Response Fields

| Field | Description |
|-------|-------------|
| `url` | Perplexity search URL generated |
| `prompt` | The full prompt with context |
| `answer_html` | HTML-formatted response |
| `suggested_followup` | Suggested follow-up questions (not among the keys returned in the recorded Test 1 run, which included `related_prompts`) |
| `citations` | Citation sources with domain, title, url |
| `web_search_query` | Search queries used |