# In [3]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from keybert import KeyBERT
from nltk.tokenize import sent_tokenize
from nltk.data import find
import nltk
import re
from typing import List
from sentence_transformers import SentenceTransformer, util

# Download required NLTK resources if not already available.
# Older NLTK releases load the "punkt" tokenizer data for sent_tokenize, while
# newer releases load "punkt_tab" instead, so make sure both are present.
for _resource in ("punkt", "punkt_tab"):
    try:
        find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource)

# FastAPI application object; the metadata below is passed to the FastAPI constructor
app = FastAPI(
    version="1.0.0",
    title="Topic Extraction API",
    description="Extracts topics from raw text using KeyBERT + sentence chunking + semantic deduplication",
)

# Load models globally (once, at import time).
# KeyBERT's default embedding backend is the same "all-MiniLM-L6-v2" checkpoint,
# so hand it the already-loaded SentenceTransformer instead of letting it load
# a second copy of the weights.
semantic_model = SentenceTransformer("all-MiniLM-L6-v2")
kw_model = KeyBERT(model=semantic_model)

# Request body model for the /extract-topics endpoint
class TextRequest(BaseModel):
    text: str  # Raw input text to extract topics from
    n_sentences: int = 5  # Number of sentences grouped into each chunk
    top_n: int = 3  # Number of keywords requested from KeyBERT per chunk
    max_topics: int = 10  # Max number of topics after deduplication

# Response model for the /extract-topics endpoint
class TopicResponse(BaseModel):
    topics: List[str]  # Keyword phrases, ordered by the endpoint from highest to lowest score

# Normalise whitespace in the input text
def clean_text(text: str) -> str:
    """Collapse every run of whitespace into one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Group sentences into chunks and collect scored keywords for each chunk
def extract_topics_by_chunk(text: str, n_sentences: int = 5, top_n: int = 3) -> List[tuple]:
    """Run KeyBERT over consecutive groups of sentences.

    Args:
        text: Text to split into sentences with NLTK's ``sent_tokenize``.
        n_sentences: How many consecutive sentences are joined into one chunk.
        top_n: Forwarded to ``kw_model.extract_keywords`` for every chunk.

    Returns:
        The keyword results of all chunks concatenated in chunk order; each
        entry is a ``(keyword, score)`` pair.
    """
    sentence_list = sent_tokenize(text)

    scored_keywords = []
    for start in range(0, len(sentence_list), n_sentences):
        passage = ' '.join(sentence_list[start:start + n_sentences])
        scored_keywords += kw_model.extract_keywords(
            passage,
            keyphrase_ngram_range=(1, 3),
            stop_words='english',
            top_n=top_n,
        )

    return scored_keywords

# Semantic deduplication (keep highest scored topic among similar ones)
def semantically_deduplicate_scored(topics_scored: List[tuple], similarity_threshold: float = 0.75) -> List[tuple]:
    """Drop keywords that are semantically close to a better-scored keyword.

    Candidates are visited from highest to lowest score. Each visited keyword
    that has not been dropped yet is kept, and every lower-ranked keyword whose
    cosine similarity to it exceeds ``similarity_threshold`` is dropped.

    Args:
        topics_scored: ``(keyword, score)`` pairs.
        similarity_threshold: Cosine similarity above which two keywords are
            treated as duplicates.

    Returns:
        The surviving ``(keyword, score)`` pairs in their original input order.
        Inputs with fewer than two entries are returned unchanged.
    """
    if len(topics_scored) < 2:
        return topics_scored

    keywords = [kw for kw, _ in topics_scored]
    embeddings = semantic_model.encode(keywords, convert_to_tensor=True)
    # One batched call yields the full pairwise matrix instead of one
    # cos_sim call per pair inside the nested loop.
    similarity = util.cos_sim(embeddings, embeddings).tolist()

    # Highest score first; sorted() is stable, so ties keep their input order.
    by_score = sorted(
        range(len(topics_scored)),
        key=lambda idx: topics_scored[idx][1],
        reverse=True,
    )

    kept = []
    dropped = set()
    for rank, i in enumerate(by_score):
        if i in dropped:
            continue
        kept.append(i)
        for j in by_score[rank + 1:]:
            if j not in dropped and similarity[i][j] > similarity_threshold:
                dropped.add(j)

    # Report survivors in the order they appeared in the input.
    kept.sort()
    return [(keywords[i], topics_scored[i][1]) for i in kept]

# API endpoint
@app.post("/extract-topics", response_model=TopicResponse)
def extract_topics(request: TextRequest):
    """Extract de-duplicated topics from the submitted text, best-scored first.

    Responds with HTTP 400 when the text is blank or when n_sentences, top_n
    or max_topics is not a positive integer.
    """
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text input cannot be empty.")

    # Reject non-positive sizes up front: n_sentences=0 makes the chunking
    # range() raise ValueError, and a negative max_topics would slice topics
    # off the end of the ranked list instead of limiting it.
    for field_name in ("n_sentences", "top_n", "max_topics"):
        if getattr(request, field_name) < 1:
            raise HTTPException(
                status_code=400,
                detail=f"{field_name} must be a positive integer.",
            )

    cleaned_text = clean_text(request.text)
    topics_with_scores = extract_topics_by_chunk(cleaned_text, request.n_sentences, request.top_n)
    deduped = semantically_deduplicate_scored(topics_with_scores)

    # Sort by importance (score descending), limit to max_topics
    deduped_sorted = sorted(deduped, key=lambda x: x[1], reverse=True)
    final_keywords = [kw for kw, _ in deduped_sorted[:request.max_topics]]

    return {"topics": final_keywords}


# (notebook cell output) from .autonotebook import tqdm as notebook_tqdm
