# Story Keyword Extraction

This notebook uses an LLM to extract domain-specific keywords (LLM- and DEV-related) from story titles and content. Results are saved incrementally, so an interrupted run over a large dataset can be resumed without reprocessing finished stories.

## Configuration

In [None]:
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv

# Load environment variables from a .env file
# (presumably the credentials used by the `llm` package — confirm).
load_dotenv()

# input_file is read below; output_file is the CSV the later cells
# create and append keyword rows to.
input_file = Path("data/stories_deduplicated.csv")
output_file = Path("data/stories_deduplicated_meta.csv")

# Load every story into memory; the bare `df` on the last line makes the
# notebook render the frame as the cell's output.
df = pd.read_csv(input_file)
df

## Functions

Define the core utilities for LLM-based keyword extraction, progress tracking, CSV I/O, and error handling with resumable processing.

In [None]:
from llm import Agent
import json
import csv
from typing import Set


def initialize_output_file(output_file: Path) -> None:
    """Ensure the results CSV exists, writing only its header row when new.

    An existing file is left untouched, so results from an earlier run
    are preserved. Missing parent directories are created on demand.
    """
    if output_file.exists():
        return

    # The target directory may not exist yet on a first run.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    header = ["story_id", "llm_keywords", "dev_keywords"]
    with output_file.open("w", newline="", encoding="utf-8") as handle:
        csv.writer(handle).writerow(header)

    print(f"Created new output file: {output_file}")


def get_processed_story_ids(output_file: Path) -> Set[str]:
    """Return the distinct values of the ``story_id`` column in the results CSV.

    Returns an empty set when the file does not exist yet.

    NOTE: the element type follows pandas' dtype inference, so numeric IDs
    come back as ints despite the ``Set[str]`` annotation.
    """
    if not output_file.exists():
        return set()

    id_column = pd.read_csv(output_file)["story_id"]
    seen_ids = set(id_column.tolist())
    print(f"Found {len(seen_ids)} already processed stories")
    return seen_ids


def _text_or_empty(value: object) -> str:
    """Return *value* as a string, mapping missing cells (None/NaN/NA) to ""."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value)


def process_single_story(row: pd.Series, output_file: Path) -> bool:
    """Extract keywords for one story and append them to the output CSV.

    Args:
        row: One story record; must contain ``story_id`` and may contain
            ``title`` and ``story_text``.
        output_file: CSV file the keyword row is appended to.

    Returns:
        True if the keywords were extracted and saved, False if any error
        occurred (the error is printed and nothing is written, so the story
        stays unprocessed).
    """
    story_id = row["story_id"]
    # Empty CSV cells are loaded by pandas as NaN; a plain str() would turn
    # them into the literal text "nan" and send that to the model.
    title = _text_or_empty(row.get("title", ""))
    story_text = _text_or_empty(row.get("story_text", ""))

    try:
        keywords = extract_keywords(title, story_text)
        save_story_keywords(output_file, story_id, keywords)
        return True
    except Exception as e:
        # Deliberate catch-all: one bad story must not abort the whole batch.
        print(f"❌ Error processing story {story_id}: {e}")
        return False


def extract_keywords(title: str, content: str) -> dict[str, list[str]]:
    """
    Extract LLM and DEV keywords from a story's title and text.

    Args:
        title: Story title, inserted verbatim into the prompt.
        content: Story body text, inserted verbatim into the prompt.

    Returns:
        Dictionary with 'llm' and 'dev' keys holding the values of the
        model's "LLM" and "DEV" JSON fields (expected to be lists of
        keyword strings; their shape is not validated here).

    Raises:
        OSError: If the system prompt file cannot be read.
        json.JSONDecodeError: If the model response is not valid JSON.
        KeyError: If the response lacks the "LLM" or "DEV" field.
    """

    # Read on every call, so edits to the prompt file are picked up by the
    # next call. UTF-8 is explicit to match the other file I/O in this file
    # rather than depending on the platform default encoding.
    system_prompt = Path("03_system_prompt.md").read_text(encoding="utf-8")
    user_input = f"""
    <story>
      <title>{title}</title>
      <content>{content}</content>
    </story>
    """

    response = (
        Agent("deepseek-v3-0324:free", system_prompt)
        .configure(json_mode=True)
        .complete(user_input)
    )
    result = json.loads(response)

    # Re-key the model's upper-case fields to the lower-case keys the
    # rest of this file uses.
    return {"llm": result["LLM"], "dev": result["DEV"]}


def save_story_keywords(
        output_file: Path, story_id: str, keywords: dict[str, str]
) -> None:
    """Append one ``story_id, llm_keywords, dev_keywords`` row to the CSV.

    Each keyword collection is stored as a JSON string; an empty or missing
    (falsy) collection is stored as ``[]``.
    """
    # `or []` makes every falsy value serialize as the JSON empty list.
    record = [
        story_id,
        json.dumps(keywords["llm"] or []),
        json.dumps(keywords["dev"] or []),
    ]

    with open(output_file, "a", newline="", encoding="utf-8") as handle:
        csv.writer(handle).writerow(record)

## Prompt Fine-Tuning

Test the keyword extraction on one randomly sampled story to validate the LLM prompt and output quality before batch processing.

In [None]:
# Draw one story at random and run the extractor on it as a spot check.
random_story = df.sample(n=1).iloc[0]
keywords = extract_keywords(random_story["title"], random_story["story_text"])

# Show the story next to the extracted keywords for a manual review.
print(
    f"ID: {random_story['story_id']}",
    f"Title: {random_story['title']}",
    f"Content: {random_story['story_text']}",
    "\nExtracted Keywords:",
    f"LLM Keywords: {keywords['llm']}",
    f"DEV Keywords: {keywords['dev']}",
    sep="\n",
)

## Processing

Execute the main pipeline with resume capability to extract keywords from all stories and save results incrementally.

In [None]:
# Resume support: create the output file on a first run, then load the IDs
# already written so those stories are skipped.
initialize_output_file(output_file)
processed_story_ids = get_processed_story_ids(output_file)

total_stories = len(df)
processed_count = len(processed_story_ids)
failed_count = 0

# The DataFrame index is not needed, only the row itself.
for _, row in df.iterrows():
    story_id = row["story_id"]

    # Skip if already processed
    if story_id in processed_story_ids:
        continue

    print(f"🔍 Processing story {processed_count + 1}/{total_stories}: {story_id}")

    if process_single_story(row, output_file):
        processed_count += 1
        # Remember the ID so a repeated story_id later in the frame is not
        # sent to the model a second time within this run.
        processed_story_ids.add(story_id)
    else:
        failed_count += 1

print("\n🎉 Processing complete!")
print(f"Total stories processed: {processed_count}")
if failed_count:
    # Failed stories were not written to the output file, so a re-run of
    # this cell picks them up again.
    print(f"Stories failed (will be retried on the next run): {failed_count}")
print(f"Output saved to: {output_file}")