# Index Articles and Test Search

Interactive notebook for indexing articles and testing search performance.

In [None]:
import json
import requests
import time
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import concurrent.futures
from datetime import datetime, timedelta
import random
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv('.env', override=True)

# Configuration
SEARCH_ENGINE_BASE_URL = os.getenv('SEARCH_ENGINE_BASE_URL', 'https://search_api.toy.x.upstage.ai')
UPSTAGE_API_KEY = os.getenv('UPSTAGE_API_KEY')
COLLECTION_NAME = "themilk"
BATCH_SIZE = 20
MAX_WORKERS = 1

if not UPSTAGE_API_KEY:
    raise ValueError("UPSTAGE_API_KEY not found in .env.local file")


print(f"Target: {SEARCH_ENGINE_BASE_URL}")
print(f"Collection: {COLLECTION_NAME}")

Target: https://search_api.toy.x.upstage.ai
Collection: themilk




## 1. Load Data

In [2]:
articles_path = "./data/articles.json"

print(f"Loading articles from {articles_path}...")
with open(articles_path, 'r', encoding='utf-8') as f:
    articles = json.load(f)
    
print(f"Loaded {len(articles)} articles")

# Preview first article
if articles:
    print("\nSample article:")
    print(json.dumps(articles[0], indent=2, ensure_ascii=False)[:500] + "...")

Loading articles from ./data/articles.json...
Loaded 8730 articles

Sample article:
{
  "id": 3,
  "content": "ÎØ∏Íµ≠ Ï∂îÏàòÍ∞êÏÇ¨Ï†à ÏµúÎåÄ ÏáºÌïë Ìï†Ïù∏ ÌñâÏÇ¨Ïù∏ 'Î∏îÎûô ÌîÑÎùºÏù¥Îç∞Ïù¥'Í∞Ä ÏòÅÌñ•Î†•ÏùÑ ÏûÉÏñ¥Í∞ÑÎã§. Î∏îÎûô¬†ÌîÑÎùºÏù¥Îç∞Ïù¥(Black Friday, Ïù¥Ìïò Î∏îÌîÑ: Ï∂îÏàòÍ∞êÏÇ¨Ï†à Îã§Ïùå¬†ÎÇ† ÏãúÏûëÎêòÎäî ÎåÄÍ∑úÎ™® Ìï†Ïù∏ ÌñâÏÇ¨)ÎùºÍ≥† ÌïòÎäî ÎåÄÎ∞ï Ìï†Ïù∏ ÌñâÏÇ¨Í∞Ä ÏÇ¨Ïã§ÏÉÅ ÏÇ¨ÎùºÏßÑÎã§. Î∏îÌîÑÎäî Ïú†ÌÜµ(Î¶¨ÌÖåÏùº) ÏÇ∞ÏóÖÏóê ÏÉÅÎãπÌïú ÏòÅÌñ•ÏùÑ ÎØ∏ÏπòÎäî ÌñâÏÇ¨Îã§. Ïú†ÌÜµÍ∏∞ÏóÖÏùÄ Î∏îÌîÑÎ•º Í≥ÑÍ∏∞Î°ú ÎìúÎîîÏñ¥ Í∑∏ Ìï¥ ÌùëÏûêÎ°ú ÎèåÏïÑÏÑ†Îã§. ÎØ∏Íµ≠ ÏµúÎåÄ Î∞±ÌôîÏ†ê Î©îÏù¥Ïãú(Macy's)Ïùò Ï†úÌîÑÎ¶¨ Ï†úÎÑ§Ìä∏ ÌöåÏû•ÏùÄ \"Ïó∞Îßê Ïó∞Ïãú ÏàòÏöîÎäî ÏùºÎ∞òÏ†ÅÏúºÎ°ú Ï∂îÏàò Í∞êÏÇ¨Ï†àÍ≥º ÌÅ¨Î¶¨Ïä§ÎßàÏä§ ÏÇ¨Ïù¥Ïóê Î∞úÏÉùÌïúÎã§. ÌïòÏßÄÎßå Ïò¨Ìï¥Îäî Îçî ÏùºÏ∞ç ÏàòÏöîÍ∞Ä ÏãúÏûëÎêòÍ≥† ÏûàÎã§‚ÄùÍ≥† ÎßêÌñàÎã§. Ï∂îÏàòÍ∞êÏÇ¨Ï†à ÏãúÏ¶åÏóê ÎßûÏ∂∞ ÏùºÏ†úÌûà Ìï†Ïù∏ÏùÑ ÌïòÎäî Î∏îÌîÑÏùò ÏãúÎåÄÎäî ÎÅùÎÇ¨Îã§. Î∏îÌîÑÏùò Îß§Î†•ÏùÄ ‚ÄòÎèÑÏñ¥Î≤ÑÏä§ÌÑ∞(doorbuster: Î∏îÎûô ÌîÑÎùºÏù¥Îç∞Ïù¥ ÎãπÏùº ÏÑ†Ï∞©ÏàúÏúºÎ°ú 

## 2. Prepare & Index Documents

In [3]:
def prepare_document(article):
    """Helper to transform article into indexing format"""
    article_id = article['id']
    content = article['content']
    
    # Use first 100 characters as title
    title = content[:100] if len(content) > 100 else content
    
    # Generate a random date in 2024-2025
    start_date = datetime(2024, 1, 1)
    end_date = datetime(2025, 11, 23)
    random_days = random.randint(0, (end_date - start_date).days)
    date = (start_date + timedelta(days=random_days)).strftime("%Y-%m-%d")
    
    return {
        "title": title,
        "url": f"https://themiilk.com/articles/{article_id}",
        "content": content,
        "date": date
    }

def index_batch(batch_data):
    """Helper to index a single batch"""
    batch, batch_idx = batch_data
    documents = [prepare_document(a) for a in batch]

    index_endpoint = f"{SEARCH_ENGINE_BASE_URL}/index/{COLLECTION_NAME}"   
    
    headers = {
        "Authorization": f"Bearer {UPSTAGE_API_KEY}",
        "Content-Type": "application/json"
    }
    
    try:
        response = requests.post(
            index_endpoint, 
            headers=headers, 
            json={"documents": documents}, 
            verify=False, 
            timeout=120
        )
        if response.status_code == 200:
            return len(documents), 0, None
        return 0, len(documents), f"Batch {batch_idx}: {response.status_code} - {response.text}"
    except Exception as e:
        return 0, len(documents), f"Batch {batch_idx}: {str(e)}"

In [4]:
# Create batches
batches = []
for i in range(0, len(articles), BATCH_SIZE):
    batches.append((articles[i:i+BATCH_SIZE], i // BATCH_SIZE))

print(f"Created {len(batches)} batches from {len(articles)} articles")

# Run Indexing
print(f"Starting indexing with {MAX_WORKERS} workers...")
start_time = time.time()
indexed_count = 0
failed_count = 0

with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(index_batch, b) for b in batches]
    
    for future in concurrent.futures.as_completed(futures):
        success, fail, error = future.result()
        indexed_count += success
        failed_count += fail
        
        if error:
            print(f"‚úó {error}")
            
        if (indexed_count + failed_count) % 100 == 0:
            print(f"Progress: {indexed_count + failed_count}/{len(articles)}")

elapsed = time.time() - start_time
print(f"\nDone! Indexed: {indexed_count}, Failed: {failed_count}")
print(f"Time: {elapsed:.2f}s ({indexed_count/elapsed if elapsed > 0 else 0:.2f} docs/s)")

Created 437 batches from 8730 articles
Starting indexing with 1 workers...
Progress: 100/8730
Progress: 200/8730
Progress: 300/8730
Progress: 400/8730
Progress: 500/8730
Progress: 600/8730
Progress: 700/8730
Progress: 800/8730
Progress: 900/8730
Progress: 1000/8730
Progress: 1100/8730
Progress: 1200/8730
Progress: 1300/8730
Progress: 1400/8730
Progress: 1500/8730
Progress: 1600/8730
Progress: 1700/8730
Progress: 1800/8730
Progress: 1900/8730
Progress: 2000/8730
Progress: 2100/8730
Progress: 2200/8730
Progress: 2300/8730
Progress: 2400/8730
Progress: 2500/8730
Progress: 2600/8730
Progress: 2700/8730
Progress: 2800/8730
Progress: 2900/8730
Progress: 3000/8730
Progress: 3100/8730
Progress: 3200/8730
Progress: 3300/8730
Progress: 3400/8730
Progress: 3500/8730
Progress: 3600/8730
Progress: 3700/8730
Progress: 3800/8730
Progress: 3900/8730
Progress: 4000/8730
Progress: 4100/8730
Progress: 4200/8730
Progress: 4300/8730
Progress: 4400/8730
Progress: 4500/8730
Progress: 4600/8730
Progress: 4700

## 3. Test Search

In [6]:
test_queries = [
    "Î∏îÎûô ÌîÑÎùºÏù¥Îç∞Ïù¥",
    "Ìï†Ïù∏",
    "ÏáºÌïë",
    "ÎØ∏Íµ≠",
    "Ïò®ÎùºÏù∏",
    "ÏÜåÎπÑÏûê",
    "Îß§Ïû•",
    "Ïú†ÌÜµ",
    "ÌÅ¨Î¶¨Ïä§ÎßàÏä§",
    "Ï∂îÏàòÍ∞êÏÇ¨Ï†à"
]

print(f"Testing {len(test_queries)} queries...\n")

headers = {
    "Authorization": f"Bearer {UPSTAGE_API_KEY}",
    "Content-Type": "application/json"
}

search_endpoint = f"{SEARCH_ENGINE_BASE_URL}/search/{COLLECTION_NAME}"

for query in test_queries:
    print(f"Query: '{query}'")
    
    try:
        t0 = time.time()
        response = requests.post(
            search_endpoint, 
            headers=headers, 
            json={"query": query}, 
            verify=False, 
            timeout=60
        )
        dt = time.time() - t0
        
        if response.status_code == 200:
            results = response.json().get('results', [])
            print(f"  ‚úì Found {len(results)} results in {dt*1000:.1f}ms")
            if results:
                top = results[0]
                print(f"  Top: {top.get('title', 'No Title')[:50]}... (Score: {top.get('score', 'N/A')})")
        else:
            print(f"  ‚úó Failed: {response.status_code} - {response.text}")
            
    except Exception as e:
        print(f"  ‚úó Error: {e}")
    
    print()

Testing 10 queries...

Query: 'Î∏îÎûô ÌîÑÎùºÏù¥Îç∞Ïù¥'
  ‚úì Found 10 results in 1303.4ms
  Top: Î∞±ÏïÖÍ¥Ä "ÌÖåÏä¨Îùº, Îã§Î•∏ Ï∞®ÎüâÏóê Ï∂©Ï†ÑÏÜå Í∞úÎ∞©"  ÎØ∏Íµ≠Ïùò Ï†ÑÎèôÌôîÍ∞Ä Îπ†Î•¥Í≤å ÏßÑÌñâÎêòÍ≥† ÏûàÎäîÎç∞Ïöî. ... (Score: 0.5)

Query: 'Ìï†Ïù∏'
  ‚úì Found 10 results in 409.5ms
  Top: ÏïàÎÖïÌïòÏÑ∏Ïöî.Ïó¨ÌñâÏùÑ Í≥ÑÌöçÌï†Îïå Ìï≠Í≥µÍ∂å Îã§ÏùåÏúºÎ°ú ÏßÄÏ∂úÏù¥ ÌÅ∞ ÎÇ¥Ïó≠ÏùÄ Î∞îÎ°ú Ìò∏ÌÖîÏûÖÎãàÎã§. ÌïòÏßÄÎßå Ìò∏ÌÖî... (Score: 0.5)

Query: 'ÏáºÌïë'
  ‚úì Found 10 results in 361.5ms
  Top: ÏïàÎÖïÌïòÏÑ∏Ïöî. Ïã§Î¶¨ÏΩòÎ∞∏Î¶¨Ïùò Ïä§ÌÉÄÌä∏ÏóÖ Ï†ÑÎèÑÏÇ¨, ÎçîÎ∞ÄÌÅ¨ Ïä§ÌÉÄÌä∏ÏóÖ Ìè¨Ïª§Ïä§ÏûÖÎãàÎã§.‚ÄòÎπÖÎç∞Ïù¥ÌÑ∞‚ÄôÍ∞Ä ÏÑ∏ÏÉÅ... (Score: 0.5)

Query: 'ÎØ∏Íµ≠'
  ‚úì Found 10 results in 415.1ms
  Top: ÎØ∏Íµ≠ÏùÄ ÏΩîÎ°úÎÇò Î∞îÏù¥Îü¨Ïä§Î°ú Ïù∏Ìï¥ 3Ïõî 29Ïùº ÌòÑÏû¨ 12Îßå2653Î™ÖÏùò ÌôïÏßÑÏûêÏôÄ 2112Î™ÖÏùò ÏÇ¨... (Score: 0.5)

Query: 'Ïò®ÎùºÏù∏'
  ‚úì Found 10 results in 352.7ms
  Top: AI Í∏∞Ïà†ÏóÖÏ≤¥ Ïò§ÌîàAIÍ∞Ä ÏàèÌèº ÎèôÏòÅÏÉÅ ÌîåÎû´Ìèº ‚ÄòÌã±ÌÜ°(Tiktok)‚Äô Í≥µÏãù Í≥ÑÏ†ïÏùÑ Í∞úÏÑ§, Î≥∏... (Score: 0.5)



## 4. AI Overview with the Milk Index

In [10]:
AI_OVERVIEW_URL = "https://solar-web-search.cosmic.upstage.ai/v1/chat/completions"

payload = {
  "model": "solar-pro2-search",
  "messages": [
    {
      "role": "user",
      "content": "Ïú§ÏÜ°Ïù¥ Î∞ïÏÇ¨Îãò Í∞ïÏùò"
    }
  ],
  "stream": False,
  "web_search_options": {
    "search_context_size": "medium",
    "external_search_engine_only": True,
    "servers": [
      {
        "url": search_endpoint,
        "key": UPSTAGE_API_KEY
      }
    ]
  }
}

headers = {
  'Authorization': f'Bearer {UPSTAGE_API_KEY}',
  'Content-Type': 'application/json'
}

print(f"Querying Solar API with external search: {search_endpoint}...")
response = requests.post(AI_OVERVIEW_URL, headers=headers, json=payload)

print(json.dumps(response.json(), indent=2))


Querying Solar API with external search: https://search_api.toy.x.upstage.ai/search/themilk...
{
  "id": "6d3552e5-536c-4dcf-8902-62e1b2c7ddf3",
  "object": "chat.completion",
  "created": 1764422960,
  "model": "solar-pro2-search",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "**\uc724\uc1a1\uc774 \ubc15\uc0ac\ub294 AI\ub97c \uc778\ub958 \uc9c4\ud654\uc758 \ubcc0\uace1\uc810\uc73c\ub85c \ubcf4\uba70, \ubd88\uacfc \uc5b8\uc5b4\uc5d0 \uc774\uc740 \uc138 \ubc88\uc9f8 \uadfc\uc6d0\uc801 \ubcc0\ud654 \ub3c4\uad6c\ub85c \uac15\uc870\ud588\uc2b5\ub2c8\ub2e4 [1][4][8].**  \n\n### \uc0c1\uc138 \uc124\uba85  \n1. **AI\ub97c \uc778\ub958 \ubb38\uba85\uc758 \uc138 \ubc88\uc9f8 \ubd88\uae38\ub85c \uaddc\uc815**  \n   - \uc724\uc1a1\uc774 PVP \ub300\ud45c\ub294 2026 \uae30\uc870\uc5f0\uc124\uc5d0\uc11c \"AI\ub294 \ub300\ud55c\ubbfc\uad6d\uc758 \uc138 \ubc88\uc9f8 \ubd88\"\uc774\ub77c\uace0 \uc8fc\uc7a5\ud558\uba70, \ubd88\uacfc \uc5b8\uc5