In [22]:
import sys
import os

sys.path.append(os.path.abspath(".."))

from langsmith import Client
from typing import TypedDict
from typing_extensions import Annotated
from langchain.chat_models import init_chat_model
from src.agent import graph

# Define your golden dataset

In [23]:
dataset_examples = [
    # 1
    {
        "inputs": {
            "topic": "An Analytical Study of the Impact of Social Media Usage on Academic Performance among University Students"
        },
        "outputs": {
            "sections": [
                {
                    "title": "Title Page",
                    "description": "This section should contain the paper title ('An Analytical Study of the Impact of Social Media Usage on Academic Performance among University Students'), the full list of authors, their institutional affiliations, and the contact information of the corresponding author. Its purpose is administrative, establishing authorship and ownership of the work.",
                },
                {
                    "title": "Abstract",
                    "description": "This section should provide a concise (150–250 words) summary of the research question, the analytical approach, and the key insights. It should emphasize interpretation and patterns identified, rather than making strong claims. The abstract should not include citations or references.",
                },
                {
                    "title": "Introduction",
                    "description": "This section should introduce the context of social media usage among university students, outline the research problem, and present the guiding questions. It should define the scope and boundaries of the study (e.g., types of social media, academic performance indicators) and highlight the contributions of the paper. It should end with an outline of the paper’s structure.",
                },
                {
                    "title": "Related Work / Literature Context",
                    "description": "This section should synthesize existing research on the relationship between social media and academic performance. It should highlight key debates (e.g., distraction vs. collaboration), trends in prior studies, and knowledge gaps that justify the current analysis. The purpose is to position this paper within the broader academic conversation.",
                },
                {
                    "title": "Analytical Framework / Method",
                    "description": "This section should explain the analytical framework or model used to examine the relationship between social media usage and academic performance. It should specify the sources of evidence (e.g., prior studies, datasets), inclusion/exclusion criteria, and assumptions. The goal is to make the analysis transparent and reproducible.",
                },
                {
                    "title": "Analysis",
                    "description": "This section should present the systematic breakdown of evidence. It can be organized thematically (e.g., social media as a learning aid vs. distraction), chronologically, or comparatively. Subsections may be used for clarity. Tables, figures, or diagrams should be included if they help illustrate the patterns found.",
                },
                {
                    "title": "Discussion",
                    "description": "This section should interpret the findings, connect them back to the literature, and explain their significance. It should also reflect on the strengths and limitations of the analysis. The discussion should go beyond summarizing results, offering insights into why the patterns matter and how they relate to broader educational practices.",
                },
                {
                    "title": "Conclusion",
                    "description": "This section should summarize the key findings of the study and their implications. It should also discuss the limitations of the study and suggest areas for future research. The conclusion should provide a clear answer to the research question, based on the results and discussion.",
                },
                {
                    "title": "Future Work",
                    "description": "This section should outline open questions and opportunities for further research. For example, future studies could investigate long-term effects of social media usage, cross-cultural comparisons, or the role of emerging platforms. Recommendations should be actionable and point toward ways of building on the current analysis.",
                },
                {
                    "title": "References",
                    "description": "This section should list all the sources that were cited in the study. The references should be formatted according to a specific citation style (e.g., APA, MLA, Chicago). This section ensures that all the information used in the study is properly credited and allows readers to find the original sources if they wish to learn more.",
                },
                {
                    "title": "Appendices (optional)",
                    "description": "This section should provide supplementary material that supports the analysis but is too detailed for the main text. Examples include extended data tables, coding schemes, or additional figures. Appendices improve transparency and replicability without interrupting the flow of the paper.",
                },
            ]
        },
    },
    # 2
    {
        "inputs": {
            "topic": "An Analytical Study of Renewable Energy Adoption in Rural Communities"
        },
        "outputs": {
            "sections": [
                {
                    "title": "Title Page",
                    "description": "Include the paper title, author names, institutional affiliations, and correspondence details. This establishes ownership and allows readers to contact the authors.",
                },
                {
                    "title": "Abstract",
                    "description": "Summarize in 150–250 words the research question (how rural communities adopt renewable energy), the analytical methods, and the main insights (barriers, enablers, trends). No citations here.",
                },
                {
                    "title": "Introduction",
                    "description": "Provide background on energy challenges in rural areas, highlight the importance of renewable energy adoption, state guiding research questions, scope, and contributions. Close with paper organization.",
                },
                {
                    "title": "Related Work / Literature Context",
                    "description": "Synthesize previous studies on renewable energy adoption, highlighting debates on cost, policy, infrastructure, and cultural acceptance. Identify research gaps.",
                },
                {
                    "title": "Analytical Framework / Method",
                    "description": "Explain the framework used to analyze adoption (e.g., technology adoption models, policy analysis). Describe inclusion criteria, data sources, and assumptions.",
                },
                {
                    "title": "Analysis",
                    "description": "Present systematic evidence of adoption patterns, barriers (financial, technical, social), and enablers (policy incentives, community engagement). Use figures/tables where useful.",
                },
                {
                    "title": "Discussion",
                    "description": "Interpret results, compare them with prior literature, highlight insights such as the role of subsidies, discuss strengths and weaknesses of current approaches.",
                },
                {
                    "title": "Conclusion",
                    "description": "Summarize findings, provide clear answers to research questions, highlight implications for policy and practice.",
                },
                {
                    "title": "Future Work",
                    "description": "Identify areas for further study, such as long-term sustainability, comparative studies across regions, or new financing models.",
                },
                {
                    "title": "References",
                    "description": "List all cited studies in the required style.",
                },
                {
                    "title": "Appendices (optional)",
                    "description": "Include supplementary data tables, regional breakdowns, or survey instruments.",
                },
            ]
        },
    },
    # 3
    {
        "inputs": {
            "topic": "An Analytical Study of the Effects of Remote Work on Employee Productivity"
        },
        "outputs": {
            "sections": [
                {
                    "title": "Title Page",
                    "description": "Include paper title, authors, affiliations, and correspondence.",
                },
                {
                    "title": "Abstract",
                    "description": "Summarize question (remote work and productivity), analytical method, and key insights (benefits, drawbacks, hybrid trends).",
                },
                {
                    "title": "Introduction",
                    "description": "Provide background on remote work, state guiding question, contributions, and scope.",
                },
                {
                    "title": "Related Work / Literature Context",
                    "description": "Review literature on productivity measures, work-life balance, organizational impact.",
                },
                {
                    "title": "Analytical Framework / Method",
                    "description": "Explain framework for analyzing productivity (quantitative metrics, qualitative interviews). State data sources and assumptions.",
                },
                {
                    "title": "Analysis",
                    "description": "Present evidence of productivity changes, thematic breakdowns (communication, autonomy, burnout).",
                },
                {
                    "title": "Discussion",
                    "description": "Interpret findings, connect to literature, note strengths and weaknesses.",
                },
                {
                    "title": "Conclusion",
                    "description": "Summarize insights and implications for employers/employees.",
                },
                {
                    "title": "Future Work",
                    "description": "Suggest studies on long-term remote work impact, hybrid models.",
                },
                {"title": "References", "description": "Cite all relevant sources."},
                {
                    "title": "Appendices (optional)",
                    "description": "Supplementary data tables or survey results.",
                },
            ]
        },
    },
    # 4
    {
        "inputs": {
            "topic": "An Analytical Study of Climate Change Communication in the Media"
        },
        "outputs": {
            "sections": [
                {
                    "title": "Title Page",
                    "description": "Paper title, authors, affiliations, correspondence.",
                },
                {
                    "title": "Abstract",
                    "description": "Summarize analytical question (how media frames climate change), approach, and key insights.",
                },
                {
                    "title": "Introduction",
                    "description": "Background on climate communication, guiding questions, scope, and contributions.",
                },
                {
                    "title": "Related Work / Literature Context",
                    "description": "Review literature on media influence, framing, misinformation, and public perception.",
                },
                {
                    "title": "Analytical Framework / Method",
                    "description": "Explain discourse/content analysis approach, inclusion criteria, data sources.",
                },
                {
                    "title": "Analysis",
                    "description": "Break down media framing, themes (alarmist vs. solution-focused), cross-platform comparisons.",
                },
                {
                    "title": "Discussion",
                    "description": "Interpret what these framings mean for public understanding and policy debates.",
                },
                {
                    "title": "Conclusion",
                    "description": "Summarize findings and implications for communication strategies.",
                },
                {
                    "title": "Future Work",
                    "description": "Suggest further research into social media, international comparisons.",
                },
                {"title": "References", "description": "Cite all studies."},
                {
                    "title": "Appendices (optional)",
                    "description": "Include coding schemes or extended examples.",
                },
            ]
        },
    },
    # 5
    {
        "inputs": {
            "topic": "An Analytical Study of Artificial Intelligence in Healthcare Diagnostics"
        },
        "outputs": {
            "sections": [
                {
                    "title": "Title Page",
                    "description": "Paper title, authors, affiliations, correspondence.",
                },
                {
                    "title": "Abstract",
                    "description": "Summarize research question (AI in diagnostics), methods, and insights.",
                },
                {
                    "title": "Introduction",
                    "description": "Provide background, outline significance, state scope and contributions.",
                },
                {
                    "title": "Related Work / Literature Context",
                    "description": "Summarize literature on AI tools, accuracy, and ethical issues.",
                },
                {
                    "title": "Analytical Framework / Method",
                    "description": "Explain frameworks (comparative evaluation of algorithms, ethical review).",
                },
                {
                    "title": "Analysis",
                    "description": "Systematic breakdown of AI performance, adoption challenges, ethical dilemmas.",
                },
                {
                    "title": "Discussion",
                    "description": "Interpret significance, align with existing research, note limitations.",
                },
                {
                    "title": "Conclusion",
                    "description": "Summarize key findings, implications for practice.",
                },
                {
                    "title": "Future Work",
                    "description": "Suggest work on interpretability, regulation, cross-country adoption.",
                },
                {"title": "References", "description": "Cite all relevant sources."},
                {
                    "title": "Appendices (optional)",
                    "description": "Include algorithm comparison tables or supplementary datasets.",
                },
            ]
        },
    },
    # 6
    {
        "inputs": {
            "topic": "An Analytical Study of Online Learning Platforms and Student Engagement"
        },
        "outputs": {
            "sections": [
                {
                    "title": "Title Page",
                    "description": "Include title, authors, affiliations.",
                },
                {
                    "title": "Abstract",
                    "description": "Summarize question, approach, insights on engagement.",
                },
                {
                    "title": "Introduction",
                    "description": "Background on e-learning, problem definition, scope.",
                },
                {
                    "title": "Related Work / Literature Context",
                    "description": "Synthesize prior studies on MOOCs, LMS adoption, and student engagement.",
                },
                {
                    "title": "Analytical Framework / Method",
                    "description": "Explain models for measuring engagement (behavioral, emotional, cognitive).",
                },
                {
                    "title": "Analysis",
                    "description": "Breakdown engagement factors: interactivity, content quality, peer support.",
                },
                {
                    "title": "Discussion",
                    "description": "Interpret meaning, compare across platforms, highlight implications.",
                },
                {
                    "title": "Conclusion",
                    "description": "Summarize contributions and insights.",
                },
                {
                    "title": "Future Work",
                    "description": "Identify directions like VR/AR in learning, personalization.",
                },
                {"title": "References", "description": "List all references."},
                {
                    "title": "Appendices (optional)",
                    "description": "Extended survey data or codebooks.",
                },
            ]
        },
    },
    # 7
    {
        "inputs": {
            "topic": "An Analytical Study of Gender Representation in STEM Careers"
        },
        "outputs": {
            "sections": [
                {
                    "title": "Title Page",
                    "description": "Include authors, affiliations.",
                },
                {
                    "title": "Abstract",
                    "description": "Summarize research question, analytical approach, key insights.",
                },
                {
                    "title": "Introduction",
                    "description": "Background on gender disparities in STEM, scope, contributions.",
                },
                {
                    "title": "Related Work / Literature Context",
                    "description": "Summarize studies on gender bias, retention, career progression.",
                },
                {
                    "title": "Analytical Framework / Method",
                    "description": "Explain framework (statistical analysis, policy review).",
                },
                {
                    "title": "Analysis",
                    "description": "Systematic review of gender participation patterns, barriers, interventions.",
                },
                {
                    "title": "Discussion",
                    "description": "Interpret meaning, connect to wider policy debates.",
                },
                {
                    "title": "Conclusion",
                    "description": "Summarize findings and their implications.",
                },
                {
                    "title": "Future Work",
                    "description": "Suggest future studies on intersectionality, long-term tracking.",
                },
                {"title": "References", "description": "List sources cited."},
                {
                    "title": "Appendices (optional)",
                    "description": "Supplementary tables or case studies.",
                },
            ]
        },
    },
    # 8
    {
        "inputs": {
            "topic": "An Analytical Study of Cybersecurity Threats in Financial Institutions"
        },
        "outputs": {
            "sections": [
                {
                    "title": "Title Page",
                    "description": "Paper title, authors, affiliations.",
                },
                {
                    "title": "Abstract",
                    "description": "Summarize question (cyber threats in finance), approach, insights.",
                },
                {
                    "title": "Introduction",
                    "description": "Provide background on digital finance and rising threats.",
                },
                {
                    "title": "Related Work / Literature Context",
                    "description": "Summarize prior studies on malware, phishing, insider threats.",
                },
                {
                    "title": "Analytical Framework / Method",
                    "description": "Explain frameworks for assessing threats and risk models.",
                },
                {
                    "title": "Analysis",
                    "description": "Present systematic breakdown of threat categories, attack trends.",
                },
                {
                    "title": "Discussion",
                    "description": "Interpret meaning, align with regulatory practices, implications for resilience.",
                },
                {
                    "title": "Conclusion",
                    "description": "Summarize insights and contributions.",
                },
                {
                    "title": "Future Work",
                    "description": "Suggest work on AI-driven threat detection, global cooperation.",
                },
                {"title": "References", "description": "List cited sources."},
                {
                    "title": "Appendices (optional)",
                    "description": "Extended case data or logs.",
                },
            ]
        },
    },
    # 9
    {
        "inputs": {
            "topic": "An Analytical Study of Urban Transportation and Traffic Congestion"
        },
        "outputs": {
            "sections": [
                {"title": "Title Page", "description": "Title, authors, affiliations."},
                {
                    "title": "Abstract",
                    "description": "Summarize research question (traffic congestion), analytical method, and insights.",
                },
                {
                    "title": "Introduction",
                    "description": "Background on urban transport challenges, scope of study.",
                },
                {
                    "title": "Related Work / Literature Context",
                    "description": "Summarize literature on transport policy, congestion management, urban planning.",
                },
                {
                    "title": "Analytical Framework / Method",
                    "description": "Explain methods such as traffic flow models or policy review.",
                },
                {
                    "title": "Analysis",
                    "description": "Present breakdown of causes, effects, and interventions for congestion.",
                },
                {
                    "title": "Discussion",
                    "description": "Interpret findings, align with policy debates, implications.",
                },
                {
                    "title": "Conclusion",
                    "description": "Summarize contributions and insights.",
                },
                {
                    "title": "Future Work",
                    "description": "Suggest further research into smart transport systems, autonomous vehicles.",
                },
                {"title": "References", "description": "Cite references used."},
                {
                    "title": "Appendices (optional)",
                    "description": "Supplementary traffic data or simulation outputs.",
                },
            ]
        },
    },
    # 10
    {
        "inputs": {
            "topic": "An Analytical Study of Mental Health Awareness Campaigns in Universities"
        },
        "outputs": {
            "sections": [
                {
                    "title": "Title Page",
                    "description": "Include title, authors, affiliations.",
                },
                {
                    "title": "Abstract",
                    "description": "Summarize question (effectiveness of awareness campaigns), analytical approach, and key insights.",
                },
                {
                    "title": "Introduction",
                    "description": "Provide background on student mental health, guiding questions, and contributions.",
                },
                {
                    "title": "Related Work / Literature Context",
                    "description": "Summarize studies on awareness campaigns, stigma reduction, engagement methods.",
                },
                {
                    "title": "Analytical Framework / Method",
                    "description": "Explain framework used to assess campaigns (survey analysis, message framing).",
                },
                {
                    "title": "Analysis",
                    "description": "Break down campaign effectiveness, thematic insights, and limitations.",
                },
                {
                    "title": "Discussion",
                    "description": "Interpret meaning, connect to broader discussions on student well-being.",
                },
                {
                    "title": "Conclusion",
                    "description": "Summarize findings and implications for future awareness strategies.",
                },
                {
                    "title": "Future Work",
                    "description": "Suggest directions such as digital campaigns, long-term impact measurement.",
                },
                {"title": "References", "description": "List cited works."},
                {
                    "title": "Appendices (optional)",
                    "description": "Include extended survey instruments or campaign materials.",
                },
            ]
        },
    },
]

In [24]:
from langsmith import Client

client = Client()
dataset_name = "analytical_paper_sections_dataset"

# Create (or append to) the dataset using converted inputs
if not client.has_dataset(dataset_name=dataset_name):
    dataset = client.create_dataset(dataset_name)
else:
    dataset = client.read_dataset(dataset_name=dataset_name)

client.create_examples(dataset_id=dataset.id, examples=dataset_examples)


{'example_ids': ['4fccc253-0f95-4f57-9e56-343e542aa500',
  '99e6b2b2-709b-47aa-a5e7-fc2ef8a666c7',
  'fa15b2a1-5174-4edb-91ad-3033644ce8bd',
  'b4a87601-4051-44d3-91b0-d6bdd7ab23c0',
  '1fa83d27-b22a-4196-aff7-7fd8f9d1369b',
  'aa1e57a8-65af-4b73-89fd-11805e5a38c3',
  '9351694d-003a-4628-94ff-416f9d6b75c4',
  '82ceee9e-20c8-4f37-8e34-ae5ec9734370',
  'a4861e44-11a7-48c7-bafb-17b0a2248d70',
  '1bb42d14-6a37-4a0c-aac9-6c735b9b3e04'],
 'count': 10}

# Define the target function (Part of your application to be evaluated)

In [25]:
def target_function(inputs: dict) -> dict:
    """
    This function takes in a dictionary of inputs and returns a dictionary of outputs.
    """
    
    response = graph.nodes['generate_sections'].invoke(inputs)
    
    return response



# Define Evaluator

In [26]:
    
import json

def format_objects_for_llm_judge(actual_output, expected_output) -> str:
    """Format actual vs expected section outputs into a readable string for LLM evaluation."""
    
    def format_sections(obj, object_name: str) -> str:
        # Handle stringified JSON
        if isinstance(obj, str):
            obj = json.loads(obj)
        
        formatted = [f"\n{object_name} contains the following sections:"]
        
        # Handle both list and dict inputs
        if isinstance(obj, list):
            sections = obj
        elif isinstance(obj, dict):
            sections = obj.get("sections", [])
        else:
            sections = []
        
        for idx, section in enumerate(sections, start=1):
            # Handle both dict and object with attributes
            if hasattr(section, 'title') and hasattr(section, 'description'):
                # Pydantic model object
                title = section.title
                desc = section.description
            elif isinstance(section, dict):
                # Dictionary
                title = section.get("title", "Untitled")
                desc = section.get("description", "")
            else:
                # Fallback
                title = "Untitled"
                desc = str(section)
            
            formatted.append(f"\nSection {idx}: {title}\nDescription: {desc}")
        
        return "\n".join(formatted)
    
    actual_text = format_sections(actual_output, "Actual Output")
    expected_text = format_sections(expected_output, "Reference Output")
    
    return actual_text + "\n\n" + expected_text


In [27]:
from pydantic import BaseModel, Field


class Score(BaseModel):
    """Score for correctness of the generated research paper sections."""

    reason: str = Field(
        description=(
            "A clear explanation of how well the model’s generated sections align with the reference sections. "
            "Focus on whether each section title is present and whether the description captures the intended purpose, "
            "scope, and key elements of that section. Variations in wording are acceptable as long as the same core "
            "intent is preserved."
        )
    )
    correctness: bool = Field(
        description=(
            "A binary score (0 or 1) that indicates whether the model’s generated sections adequately match the reference. "
            "Score 1 if the output includes all required sections (or close equivalents) with descriptions that cover the "
            "same essential content. Score 0 if critical sections are missing, misinterpreted, or if the descriptions "
            "contradict or omit the core intent of the reference."
        )
    )


CORRECTNESS_PROMPT = """
You are evaluating how well a model’s generated research paper sections match a reference ground truth.

Context:
- The reference output is the ground truth section plan for the paper.
- The model’s output is being evaluated against this reference for accuracy and completeness.
- Both contain structured sections with titles and descriptions.

Evaluation Guidelines:
- Score 1 (true) if the model’s output:
  * Includes all reference section titles (or clear equivalents).
  * Provides descriptions that capture the same purpose, scope, and key elements as the reference.
  * Allows for paraphrasing or slightly different expression as long as meaning is preserved.
  * May contain additional valid details beyond the reference.

- Score 0 (false) only if the model’s output:
  * Misses or omits critical sections from the reference.
  * Provides descriptions that contradict, distort, or ignore the core intent of the reference.
  * Contains placeholders, irrelevant content, or fabricated information that undermines correctness.

Remember: The reference is the ground truth. Evaluate correctness by judging whether the model captured the same essential
structure and intent, not whether the wording is identical.

Outputs to Evaluate:
- **Model Output:** {model_output}
- **Reference Ground Truth:** {reference_output}
"""


def evaluate_correctness(outputs: dict, reference_outputs: dict) -> dict:
    """
    This function evaluates the correctness of the output by comparing it to the reference output.
    """

    actual_output = outputs["sections"]
    expected_output = reference_outputs["sections"]

    # Convert Pydantic objects to dictionaries for JSON serialization
    def convert_to_dict(obj):
        if hasattr(obj, 'model_dump'):
            return obj.model_dump()
        elif hasattr(obj, 'dict'):
            return obj.dict()
        elif isinstance(obj, list):
            return [convert_to_dict(item) for item in obj]
        elif isinstance(obj, dict):
            return {key: convert_to_dict(value) for key, value in obj.items()}
        else:
            return obj

    actual_output_dict = convert_to_dict(actual_output)
    expected_output_dict = convert_to_dict(expected_output)

    system_instruction = CORRECTNESS_PROMPT.format(
        model_output=json.dumps(actual_output_dict),
        reference_output=json.dumps(expected_output_dict),
    )

    messages = [
        {
            "role": "system",
            "content": system_instruction,
        },
    ]

    structured_llm = init_chat_model(
        model="gpt-4.1",
        model_provider="openai",
    ).with_structured_output(Score)

    response = structured_llm.invoke(messages)

    return {
        "key": "correctness",
        "score": response.correctness,
        "reason": response.reason,
    }

# Run evaluation

In [28]:
experiment_results = client.evaluate(
    target_function,
    data=dataset_name,
    evaluators=[evaluate_correctness],
    experiment_prefix="gpt-4.1-nano",
    max_concurrency=5,
    num_repetitions=3
)


View the evaluation results for experiment: 'gpt-4.1-nano-54b59506' at:
https://smith.langchain.com/o/a2fe08bb-88d4-4fd6-a6a1-d326bdac58ad/datasets/3b297d70-2670-4b3f-a7a2-2e930efdf98d/compare?selectedSessions=150c32a5-c517-4f37-b001-1958451343f6




120it [07:18,  3.65s/it]


## LInk to the dataset and experiments

https://smith.langchain.com/public/3a942270-3bf0-4707-aee8-656d03256dd7/d/compare?selectedSessions=150c32a5-c517-4f37-b001-1958451343f6&baseline=undefined