In [None]:
!pip -q install pdfplumber groq pydantic requests
!pip install requests tqdm openalexapi

import os, io, re, json, requests, pdfplumber
from typing import List, Dict, Any
from pydantic import BaseModel, Field
from groq import Groq

# ====== CONFIG ======
# SECURITY: a live Groq API key used to be hard-coded on this line. Secrets must
# never be committed to source or notebooks: revoke that key and supply a new
# one through the GROQ_API_KEY environment variable (shell `export`, a notebook
# secret store, etc.). LLMBackend reads the key from os.environ at construction.
if not os.environ.get("GROQ_API_KEY"):
    print("GROQ_API_KEY is not set; export it before constructing LLMBackend.")
MODEL = "llama-3.3-70b-versatile"  # Groq model id used by default for every chat call
url = "https://arxiv.org/pdf/1706.03762"  # Transformer paper
# ====================


# -------- LLM Backend --------
class LLMBackend:
    """Thin wrapper around the Groq chat-completions client."""

    def __init__(self, model=MODEL):
        """Create a Groq client authenticated from the GROQ_API_KEY env var.

        Args:
            model: Model identifier sent with every completion request.

        Raises:
            KeyError: If GROQ_API_KEY is not present in the environment.
        """
        self.client = Groq(api_key=os.environ["GROQ_API_KEY"])
        self.model = model

    def chat(self, prompt, system=None, temperature=0.2):
        """Send a single-turn chat request and return the reply text.

        Args:
            prompt: User message content.
            system: Optional system message placed before the user message.
            temperature: Sampling temperature forwarded to the API.

        Returns:
            The first choice's message content with surrounding whitespace
            removed; an empty string when the API returns no content.
        """
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        resp = self.client.chat.completions.create(
            model=self.model, messages=messages, temperature=temperature
        )
        # `content` is optional in the response schema; calling .strip() on
        # None would raise AttributeError, so normalise to "" first.
        content = resp.choices[0].message.content
        return (content or "").strip()


# -------- Schema --------
class ResearchPaperAnalysis(BaseModel):
    """Structured summary of one paper.

    RAGResearchProcessorLLM.analyze_research_paper builds an instance from the
    JSON object returned by the LLM (or from a fallback dict when that JSON
    cannot be parsed), so the field names match the keys requested in its
    prompt.
    """

    # Paper title; set to "Parsing Error" in the fallback record.
    title: str
    # Author names, one string per author.
    authors: List[str]
    # Abstract text as extracted by the LLM.
    abstract: str
    # Technical terms; the prompt asks the model for 5-10 of them.
    key_concepts: List[str]
    # Free-text description of the paper's methodology.
    methodology: str
    # Key findings; the prompt asks the model for 3-5 of them.
    main_findings: List[str]


# -------- Main Processor --------
class RAGResearchProcessorLLM:
    """Download a PDF paper, summarise it with an LLM, and chunk it for RAG."""

    def __init__(self, llm_backend: "LLMBackend"):
        """Store the chat backend used for structured extraction.

        Args:
            llm_backend: Object exposing ``chat(prompt) -> str``.
        """
        self.llm = llm_backend

    def extract_document_text(self, url):
        """Download the PDF at ``url`` and extract its text page by page.

        Args:
            url: HTTP(S) location of the PDF.

        Returns:
            ``(full_text, pages)`` where ``full_text`` is every page's text
            followed by a newline, and ``pages`` is a list of dicts with
            ``page_number`` (1-based), ``content`` and ``word_count``.

        Raises:
            requests.RequestException: On network failure, timeout, or a
                non-2xx HTTP status.
        """
        # A timeout keeps a stalled server from hanging the run, and
        # raise_for_status stops an HTML error page from reaching the PDF parser.
        response = requests.get(url, timeout=60)
        response.raise_for_status()

        pages = []
        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            for number, page in enumerate(pdf.pages, start=1):
                # extract_text() may return None for image-only pages.
                text = page.extract_text() or ""
                pages.append({
                    "page_number": number,
                    "content": text,
                    "word_count": len(text.split()),
                })
        full_text = "".join(entry["content"] + "\n" for entry in pages)
        return full_text, pages

    @staticmethod
    def _json_candidate(raw):
        """Return the outermost ``{...}`` span of ``raw``, or ``raw`` itself."""
        match = re.search(r"\{.*\}", raw, re.S)
        return match.group() if match else raw

    @staticmethod
    def _parse_json_object(candidate):
        """Decode ``candidate`` as JSON; return the dict, or None if invalid."""
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return None
        return parsed if isinstance(parsed, dict) else None

    def analyze_research_paper(self, text):
        """Ask the LLM for a structured summary of the paper.

        Only the first 10,000 characters of ``text`` are sent. If the reply is
        not valid JSON the model is asked once to repair it; if that fails, or
        the JSON does not satisfy the schema, a placeholder record titled
        "Parsing Error" is returned.

        Args:
            text: Full extracted paper text.

        Returns:
            A ``ResearchPaperAnalysis`` instance.
        """
        prompt = f"""
You are a precise research summarizer.
Extract the following information from the paper text below.
Respond ONLY with valid JSON — no explanations.

Keys:
"title": string,
"authors": list of author names,
"abstract": string,
"key_concepts": list of 5–10 technical terms,
"methodology": string,
"main_findings": list of 3–5 key findings.

--- PAPER TEXT (truncated) ---
{text[:10000]}
--- END OF TEXT ---
"""
        raw = self._json_candidate(self.llm.chat(prompt))
        data = self._parse_json_object(raw)
        if data is None:
            repaired = self.llm.chat(f"Fix and return valid JSON only:\n\n{raw}")
            # The repair reply can be wrapped in prose or code fences too, so
            # it gets the same brace extraction as the first reply.
            data = self._parse_json_object(self._json_candidate(repaired))

        if data is None:
            print("⚠️ Could not parse model output, returning fallback")
        else:
            try:
                return ResearchPaperAnalysis(**data)
            except (TypeError, ValueError) as err:
                # pydantic's ValidationError derives from ValueError; missing
                # or mistyped keys use the fallback rather than crashing.
                print("⚠️ Model output failed schema validation, returning fallback:", err)

        return ResearchPaperAnalysis(
            title="Parsing Error",
            authors=[],
            abstract="",
            key_concepts=[],
            methodology="",
            main_findings=[],
        )

    def create_rag_chunks(self, pages, chunk_size=500):
        """Group each page's sentences into chunks of about ``chunk_size`` chars.

        Sentences are split on runs of ``.``, ``!`` or ``?`` and re-joined with
        ``". "``. Chunks never span pages. A single sentence longer than
        ``chunk_size`` becomes its own chunk.

        Args:
            pages: Page dicts as produced by ``extract_document_text``.
            chunk_size: Soft upper bound on chunk length in characters.

        Returns:
            List of dicts with ``page``, ``content`` and ``word_count``.
        """
        chunks = []

        def flush(page_number, buffer):
            chunks.append({
                "page": page_number,
                "content": buffer.strip(),
                "word_count": len(buffer.split()),
            })

        for page in pages:
            text = page["content"]
            if not text.strip():
                continue
            current = ""
            for fragment in re.split(r"[.!?]+", text):
                sentence = fragment.strip()
                if not sentence:
                    # The split leaves empty fragments (e.g. after the final
                    # period); appending ". " for them polluted the chunk text.
                    continue
                if len(current) + len(sentence) < chunk_size:
                    current += sentence + ". "
                    continue
                if current:
                    # Skip the flush when nothing is buffered, otherwise an
                    # oversized first sentence would emit an empty chunk.
                    flush(page["page_number"], current)
                current = sentence + ". "
            if current:
                flush(page["page_number"], current)
        return chunks

    def process_paper(self, url):
        """Run download, LLM analysis and chunking for the paper at ``url``.

        Returns:
            Dict with ``paper_analysis`` (the analysis as a plain dict) and
            ``chunks`` (output of ``create_rag_chunks``).
        """
        print("📥 Downloading and extracting...")
        text, pages = self.extract_document_text(url)
        print("🔍 LLM extracting structured info...")
        analysis = self.analyze_research_paper(text)
        print("📚 Chunking text...")
        chunks = self.create_rag_chunks(pages)
        return {"paper_analysis": analysis.model_dump(), "chunks": chunks}


# -------- LLM-Based UCR Evaluator --------
class LLMUCREvaluator:
    """Estimate the unsupported-claim rate (UCR) of generated text via an LLM."""

    def __init__(self, llm_backend: "LLMBackend"):
        """Store the chat backend used to judge each claim.

        Args:
            llm_backend: Object exposing ``chat(prompt) -> str``.
        """
        self.llm = llm_backend

    @staticmethod
    def _is_supported(verdict: str) -> bool:
        """Interpret the model's one-word verdict.

        "UNSUPPORTED" contains the substring "SUPPORTED", so the negative forms
        must be ruled out before the positive match is accepted.
        """
        text = verdict.upper()
        if re.search(r"UNSUPPORTED|NOT\s+SUPPORTED", text):
            return False
        return "SUPPORTED" in text

    def analyze_claim_support(self, generated_text: str, chunks: List[Dict]) -> Dict:
        """Classify each sentence-level claim as supported or unsupported.

        Claims are the fragments of ``generated_text`` (split on ``.``, ``!``,
        ``?``) with more than five words. Every claim is judged against the
        same context: the ``content`` of the first five ``chunks``.

        Args:
            generated_text: Text whose claims should be checked.
            chunks: Chunk dicts carrying a ``content`` key.

        Returns:
            Dict with ``total_claims``, ``supported_claims``,
            ``unsupported_claims``, ``ucr_rate`` (unsupported / total, 0 when
            there are no claims) and up to two examples of each class.
        """
        claims = [
            s.strip()
            for s in re.split(r"[.!?]+", generated_text)
            if len(s.split()) > 5
        ]
        # The context does not depend on the claim, so build it once.
        context = "\n\n".join(ch["content"] for ch in chunks[:5])

        supported, unsupported = [], []
        for claim in claims:
            prompt = f"""
Determine if the following claim is supported by the given research context.

CLAIM:
{claim}

CONTEXT:
{context}

Respond only with one word: SUPPORTED or UNSUPPORTED.
"""
            resp = self.llm.chat(prompt)
            if self._is_supported(resp):
                supported.append(claim)
            else:
                unsupported.append(claim)

        total = len(claims)
        return {
            "total_claims": total,
            "supported_claims": len(supported),
            "unsupported_claims": len(unsupported),
            "ucr_rate": len(unsupported) / total if total else 0,
            "supported_examples": supported[:2],
            "unsupported_examples": unsupported[:2],
        }


# -------- CIR Estimator --------
class CIREstimator:
    """Simple CIR score from a Semantic Scholar citation count plus LLM ratings.

    NOTE: a second class with the same name is defined later in this file and
    rebinds ``CIREstimator`` at module level, so this version is only
    reachable if that later definition is removed or renamed.
    """

    def __init__(self, llm_backend: "LLMBackend"):
        """Store the chat backend used for impact/novelty ratings.

        Args:
            llm_backend: Object exposing ``chat(prompt) -> str``.
        """
        self.llm = llm_backend

    @staticmethod
    def _unit_score(value, default: float) -> float:
        """Coerce ``value`` to a float clamped to [0, 1], or return ``default``."""
        try:
            score = float(value)
        except (TypeError, ValueError):
            return default
        if score != score:  # NaN never equals itself
            return default
        return min(1.0, max(0.0, score))

    def fetch_citation_count(self, title: str) -> int:
        """Look up the citation count of the best Semantic Scholar title match.

        This is best-effort: any network, HTTP or decoding problem is printed
        and reported as 0 citations.
        """
        try:
            # Passing the title via `params` lets requests URL-encode it;
            # spaces, '&' or '#' in a title would otherwise corrupt the query.
            r = requests.get(
                "https://api.semanticscholar.org/graph/v1/paper/search",
                params={"query": title, "limit": 1, "fields": "citationCount"},
                timeout=15,
            )
            if r.status_code == 200:
                papers = r.json().get("data")
                if papers:
                    # The field may be present but null.
                    return papers[0].get("citationCount") or 0
        except Exception as e:
            print("⚠️ Citation fetch failed:", e)
        return 0

    def estimate_impact_and_relevance(self, abstract: str) -> Dict[str, float]:
        """Ask the LLM to rate impact and novelty of ``abstract`` on [0, 1].

        Returns:
            ``{"impact": float, "novelty": float}``. A score that is missing,
            non-numeric, or lost to a failed call defaults to 0.5;
            out-of-range scores are clamped to [0, 1].
        """
        prompt = f"""
Rate the following research abstract on a scale of 0 to 1 for:
(1) Scientific impact (importance of results)
(2) Research novelty (originality of contribution)

Return only JSON:
{{"impact": value, "novelty": value}}

Abstract:
{abstract}
"""
        defaults = {"impact": 0.5, "novelty": 0.5}
        try:
            raw = self.llm.chat(prompt)
            match = re.search(r"\{.*\}", raw, re.S)
            parsed = json.loads(match.group()) if match else {}
        except Exception as e:
            # Best-effort rating: report the problem instead of hiding it.
            print("⚠️ Impact/novelty estimation failed:", e)
            return defaults
        if not isinstance(parsed, dict):
            return defaults
        return {
            key: self._unit_score(parsed.get(key), fallback)
            for key, fallback in defaults.items()
        }

    def compute_cir_score(self, title: str, abstract: str) -> Dict[str, Any]:
        """Combine citations (50%), impact (30%) and novelty (20%) into a score.

        Citations are normalised linearly and saturate at 100.

        Returns:
            Dict with the raw citation count and the rounded component and
            final scores.
        """
        citations = self.fetch_citation_count(title)
        scores = self.estimate_impact_and_relevance(abstract)
        # The previous expression, min(1.0, 1 + citations / 100), was always
        # 1.0, so the citation term never varied between papers.
        norm_citations = min(1.0, max(0, citations) / 100)
        impact = scores["impact"]
        novelty = scores["novelty"]
        cir = 0.5 * norm_citations + 0.3 * impact + 0.2 * novelty
        return {
            "citations": citations,
            "impact_score": round(impact, 2),
            "novelty_score": round(novelty, 2),
            "normalized_citations": round(norm_citations, 2),
            "estimated_CIR": round(cir, 2),
        }


    # ---------- CIR 2 ----------



import math, time
from tqdm import tqdm

class CIREstimator:
    """Semi-realistic CIR using Semantic Scholar + CrossRef + OpenAlex data."""

    def __init__(self, llm_backend: "LLMBackend"):
        """Store the chat backend used for the novelty rating.

        Args:
            llm_backend: Object exposing ``chat(prompt) -> str``.
        """
        self.llm = llm_backend

    @staticmethod
    def _unit_score(value, default: float) -> float:
        """Coerce ``value`` to a float clamped to [0, 1], or return ``default``."""
        try:
            score = float(value)
        except (TypeError, ValueError):
            return default
        if score != score:  # NaN never equals itself
            return default
        return min(1.0, max(0.0, score))

    # ---------- Citation count ----------
    def fetch_citations_semanticscholar(self, title: str) -> int:
        """Citation count of the best Semantic Scholar title match (0 on failure)."""
        try:
            # `params` URL-encodes the title; interpolating it into the URL
            # breaks on titles containing '&', '#', '+' and similar.
            r = requests.get(
                "https://api.semanticscholar.org/graph/v1/paper/search",
                params={"query": title, "limit": 1, "fields": "citationCount"},
                timeout=15,
            )
            if r.status_code == 200:
                papers = r.json().get("data")
                if papers:
                    # The field may be present but null.
                    return papers[0].get("citationCount") or 0
        except Exception as e:
            print("⚠️  Semantic Scholar fetch failed:", e)
        return 0

    def fetch_citations_crossref(self, title: str) -> int:
        """Fallback citation count via CrossRef (0 on failure)."""
        try:
            r = requests.get(
                "https://api.crossref.org/works",
                params={"query.title": title, "rows": 1},
                timeout=15,
            )
            if r.status_code == 200:
                items = r.json().get("message", {}).get("items", [])
                if items:
                    return items[0].get("is-referenced-by-count") or 0
        except Exception as e:
            print("⚠️  CrossRef fetch failed:", e)
        return 0

    # ---------- Field normalization ----------
    def fetch_field_avg_from_openalex(self, title: str) -> float:
        """Rough expected citation baseline derived from the paper's age.

        Looks the title up on OpenAlex and returns ``max(1.0, 10.0 / age)``,
        where ``age`` is the number of years since publication (minimum 1).
        Returns 10.0 when no match is found or the request fails.
        """
        from datetime import date  # local import: only this method needs it

        try:
            # Commas and pipes are operators in OpenAlex filter expressions,
            # so they are removed from the search value before sending.
            safe_title = re.sub(r"[,|]", " ", title)
            r = requests.get(
                "https://api.openalex.org/works",
                params={"filter": f"title.search:{safe_title}", "per-page": 1},
                timeout=15,
            )
            if r.status_code == 200:
                works = r.json().get("results", [])
                if works and "cited_by_count" in works[0]:
                    # publication_year may be null; fall back like a missing key.
                    year = works[0].get("publication_year") or 2020
                    # Use the real current year rather than a frozen constant.
                    age = max(1, date.today().year - year)
                    # NOTE(review): the original comment here said "older ->
                    # higher baseline", but 10 / age shrinks as the paper ages.
                    # The formula is unchanged; confirm the intended direction.
                    return max(1.0, 10.0 / age)
        except Exception as e:
            print("⚠️  OpenAlex fetch failed:", e)
        return 10.0  # default expected baseline

    # ---------- Optional novelty estimation ----------
    def estimate_novelty_llm(self, abstract: str) -> float:
        """Ask the LLM for a novelty rating in [0, 1]; 0.5 when unavailable."""
        prompt = f"""
Rate the following research abstract for scientific novelty (0–1).
Return JSON: {{"novelty": value}}
Abstract: {abstract}
"""
        try:
            raw = self.llm.chat(prompt)
            m = re.search(r"\{.*\}", raw, re.S)
            parsed = json.loads(m.group()) if m else {}
        except Exception as e:
            # Best-effort rating: report the problem instead of hiding it.
            print("⚠️  Novelty estimation failed:", e)
            return 0.5
        if not isinstance(parsed, dict):
            return 0.5
        # The model may return a string or an out-of-range number, which would
        # otherwise break the rounding and weighting in compute_realistic_cir.
        return self._unit_score(parsed.get("novelty"), 0.5)

    # ---------- Final composite CIR ----------
    def compute_realistic_cir(self, title: str, abstract: str) -> Dict[str, Any]:
        """Combine field-normalised citations, a saturating impact term and novelty.

        Weights: 0.5 normalised citations, 0.3 tanh impact curve, 0.2 novelty.

        Returns:
            Dict with both raw citation counts, the count used, the expected
            baseline, and the rounded component and final scores.
        """
        # Step 1: Gather data
        ss_cit = self.fetch_citations_semanticscholar(title)
        cr_cit = self.fetch_citations_crossref(title)
        citations = max(ss_cit, cr_cit)
        expected = self.fetch_field_avg_from_openalex(title)
        novelty = self.estimate_novelty_llm(abstract)

        # Step 2: Normalize
        c_norm = min(1.0, citations / (expected * 10))  # field-normalized
        i_norm = math.tanh(citations / (expected * 5))  # impact saturation curve

        # Step 3: Combine
        cir = 0.5 * c_norm + 0.3 * i_norm + 0.2 * novelty
        return {
            "semantic_citations": ss_cit,
            "crossref_citations": cr_cit,
            "citations_final": citations,
            "expected_field_avg": round(expected, 2),
            "novelty_score": round(novelty, 2),
            "normalized_citations": round(c_norm, 2),
            "impact_norm": round(i_norm, 2),
            "estimated_CIR": round(cir, 2),
        }








# ======== RUN DEMO ========
# End-to-end demo: download the paper at `url`, extract a structured summary,
# chunk the text, estimate CIR metrics, then check a hand-written summary for
# unsupported claims. Every step below performs network and/or LLM calls.
llm = LLMBackend()
processor = RAGResearchProcessorLLM(llm)
# `results` holds {"paper_analysis": <dict>, "chunks": <list of chunk dicts>}.
results = processor.process_paper(url)
evaluator = LLMUCREvaluator(llm)

# CIR estimation runs after `results` exists because it needs the title and
# abstract that the LLM extracted from the paper.
cir_estimator = CIREstimator(llm)
cir_results = cir_estimator.compute_realistic_cir(
    results["paper_analysis"]["title"],
    results["paper_analysis"]["abstract"]
)


# Attach the CIR metrics to the combined results dict.
results["cir_estimation"] = cir_results

print("\nüéØ PAPER ANALYSIS SUMMARY:")
print(json.dumps(results["paper_analysis"], indent=2))

print("\nüìà ESTIMATED CIR METRICS:")
print(json.dumps(cir_results, indent=2))

# Example generated claims for UCR check
generated_summary = """
The Transformer architecture introduced self-attention for sequence modeling.
It eliminated recurrence and convolution, achieving state-of-the-art results in translation tasks.
However, it requires large computational resources for training.
"""
# Each sentence of more than five words is judged against the first five chunks.
ucr = evaluator.analyze_claim_support(generated_summary, results["chunks"])

print("\nüìä LLM-BASED UCR RESULTS:")
print(json.dumps(ucr, indent=2))


📥 Downloading and extracting...
🔍 LLM extracting structured info...
📚 Chunking text...

🎯 PAPER ANALYSIS SUMMARY:
{
  "title": "Attention Is All You Need",
  "authors": [
    "Ashish Vaswani",
    "Noam Shazeer",
    "Niki Parmar",
    "Jakob Uszkoreit",
    "Llion Jones",
    "Aidan N. Gomez",
    "\u0141ukasz Kaiser",
    "Illia Polosukhin"
  ],
  "abstract": "We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train.",
  "key_concepts": [
    "Transformer",
    "Attention Mechanism",
    "Self-Attention",
    "Multi-Head Attention",
    "Scaled Dot-Product Attention",
    "Sequence Transduction",
    "Machine Translation",
    "Recurrent Neural Networks",
    "Convolutional Neural Networks"
  ],
  "methodology"