In [None]:
pip install -U requests langchain langchain-aws langchain-community pydantic boto3 tavily-python

In [None]:
import csv
import json
import os
import re
from typing import List, Dict, Any, Tuple

import requests

# --- LangChain Imports ---
from langchain_core.tools import tool
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_aws import ChatBedrock
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_community.tools.tavily_search import TavilySearchResults
from pydantic import BaseModel, Field

# The API key is taken from the HUDL_API_KEY environment variable when it is
# set, so the real secret never has to be saved inside the notebook. The
# literal is only a placeholder fallback for local experimentation.
HUDL_API_KEY = os.environ.get("HUDL_API_KEY", "your_hudl_auth_key_here")
GRAPHQL_ENDPOINT = "https://master.thorhudl.com/api/graphql/query"
# Headers attached to every GraphQL request issued by the tools in this file.
HEADERS = {
    "Authorization": f"Bearer {HUDL_API_KEY}",
    "Content-Type": "application/json",
}

# ---Step 1: Tool Definitions ---
# --- Tool 1: Get Entity to be Merged ---
# Argument schema for the `get_entity_by_id` tool: a single GSL ID string.
# Documented with comments (not a class docstring) so the schema generated
# from this model is left exactly as it was.
class MergeEntityDetailsInput(BaseModel):
    gsl_id: str = Field(
        description="The unique GSL ID of the source entity to be merged.",
    )

@tool(args_schema=MergeEntityDetailsInput)
def get_entity_by_id(gsl_id: str) -> Dict[str, Any]:
    """Fetches the full details for a single entity by its unique GSL ID."""
    # NOTE: the docstring above is kept verbatim because the @tool decorator
    # uses it as the tool description shown to the LLM; extra documentation
    # therefore lives in comments.
    #
    # Returns the first item of `searchableTeams.items` as a dict, or a dict
    # with an "error" key (plus "details" for GraphQL errors) when the lookup
    # fails. HTTP and parsing failures are reported this way rather than
    # raised, so the agent can read the failure.

    # json.dumps renders the ID as a quoted, escaped string literal, so quotes
    # or backslashes in the argument cannot break out of the query text.
    id_literal = json.dumps(gsl_id, ensure_ascii=False)
    get_team_query = f"""
        query getTeams {{
          searchableTeams(query: [{{ field: ID, operator: EQUALS, values: [{id_literal}] }}]) {{
            items {{ id name sport gender teamMembers {{ individual {{ id commonName {{ fullName }} }} }} competitions {{ id name }} }}
          }}
        }}
    """
    try:
        response = requests.post(
            GRAPHQL_ENDPOINT,
            headers=HEADERS,
            data=json.dumps({"query": get_team_query}),
            # (connect, read) seconds: without a timeout a stalled connection
            # would block the whole batch run indefinitely.
            timeout=(10, 60),
        )
        response.raise_for_status()
        data = response.json()
    except json.JSONDecodeError:
        # Checked before RequestException: recent `requests` versions raise a
        # decode error that is also a RequestException subclass.
        return {"error": "Failed to parse API response or find entity."}
    except requests.exceptions.RequestException as e:
        return {"error": f"API call failed: {e}"}

    if not isinstance(data, dict):
        return {"error": "Failed to parse API response or find entity."}
    if "errors" in data:
        return {"error": "GraphQL query failed.", "details": data['errors']}

    # Any level may be present but null, so fall back to an empty container at
    # each step instead of calling .get() on None.
    payload = data.get("data") or {}
    teams = payload.get("searchableTeams") or {}
    items = teams.get("items") or []
    return items[0] if items else {"error": f"No entity found with GSL ID {gsl_id}"}


# --- Tool 2: Search for Matching Entities ---
# Argument schema for the `find_matching_entities` tool: one free-text term.
# Documented with comments (not a class docstring) so the schema generated
# from this model is left exactly as it was.
class TargetEntityDetailsInput(BaseModel):
    search_term: str = Field(
        description="A name or keyword to search for matching entities.",
    )
    
@tool(args_schema=TargetEntityDetailsInput)
def find_matching_entities(search_term: str) -> List[Dict[str, Any]]:
    """Searches for entities by a name or keyword and returns a list of potential matches."""
    # NOTE: the docstring above is kept verbatim because the @tool decorator
    # uses it as the tool description shown to the LLM; extra documentation
    # therefore lives in comments.
    #
    # Returns the `searchableTeams.items` list (id, name, sport, gender per
    # item). Every failure is printed and reported as an empty list, so the
    # agent simply sees "no candidates".

    # json.dumps renders the term as a quoted, escaped string literal, so names
    # containing quotes or backslashes cannot break the query text.
    term_literal = json.dumps(search_term, ensure_ascii=False)
    get_teams_query = f"""
        query searchTeamsByName {{
          searchableTeams(query: [{{ field: NAME, operator: CONTAINS, values: [{term_literal}] }}]) {{
            items {{ id name sport gender }}
          }}
        }}
    """
    try:
        response = requests.post(
            GRAPHQL_ENDPOINT,
            headers=HEADERS,
            data=json.dumps({"query": get_teams_query}),
            # (connect, read) seconds: without a timeout a stalled connection
            # would block the whole batch run indefinitely.
            timeout=(10, 60),
        )
        response.raise_for_status()
        data = response.json()
    except json.JSONDecodeError:
        # Checked before RequestException: recent `requests` versions raise a
        # decode error that is also a RequestException subclass.
        print("Failed to parse API response.")
        return []
    except requests.exceptions.RequestException as e:
        print(f"API call failed: {e}")
        return []

    if not isinstance(data, dict):
        print("Failed to parse API response.")
        return []
    if "errors" in data:
        print(f"GraphQL API returned an error: {data['errors']}")
        return []

    # Any level may be present but null, so fall back to an empty container at
    # each step; this also guarantees the declared list return type.
    payload = data.get("data") or {}
    teams = payload.get("searchableTeams") or {}
    return teams.get("items") or []

# --- Tool 3: Web search tool (Tavily), limited to 3 results per query ---
# NOTE: the system prompt calls this a "Google search"; the tool actually
# wired in here is Tavily.
tavily_tool = TavilySearchResults(max_results=3)
# Set the tool name explicitly to the identifier the system prompt tells the
# agent to call.
tavily_tool.name = "tavily_search_results_json"


# --- Step 2: Create the Agent with a Modified Prompt for Parsing ---
# Every tool the agent may call: the two GraphQL-backed lookups plus web search.
tools = [get_entity_by_id, find_matching_entities, tavily_tool]

# Bedrock-hosted Claude 3 Haiku; temperature is set to 0.0.
llm = ChatBedrock(
    model_id="anthropic.claude-3-haiku-20240307-v1:0",
    model_kwargs={"temperature": 0.0}
)

# System prompt for the agent: it spells out the step-by-step comparison
# protocol and the exact final-answer layout ("Best Match ID:" line followed
# by "Justification:") that parse_agent_output relies on.
# The tool names mentioned here must match the tools registered in `tools`.
# NOTE(review): this text is passed to ChatPromptTemplate as a message
# template, where `{name}` is presumably treated as a variable - keep literal
# curly braces out of it (or escape them); confirm against LangChain docs.
BATCH_PROCESSING_PROMPT = """
You are a highly meticulous data analyst specializing in entity resolution. Your goal is to combine internal API data with external information from a Google search to find the most accurate merge candidate for a source entity.

**Your Comprehensive and Strict Analysis Protocol:**

1.  **Get Source Data:** First, use the `get_entity_by_id` tool to fetch the complete data for the source entity using its GSL ID. This gives you the source entity's details, including its own ID.

2.  **Conduct External Research:** After retrieving the source entity's data, use the `tavily_search_results_json` tool to perform a Google search using the source entity's name to gather external context.

3.  **Find Internal Candidates:** Next, execute your internal search protocol. Perform at least two distinct searches using the `find_matching_entities` tool (e.g., by the entity's name and by a competition name) to gather a list of potential merge candidates.

4.  **Consolidate and Deduplicate:** Combine the results from all your internal searches into a single, consolidated list of unique candidate entities based on their ID.

5.  **CRITICAL - Filter Self-Matches:** This is a mandatory step. Before you begin your final analysis, you MUST filter the original source entity out of your consolidated list of candidates. Compare the `id` of each candidate to the `id` of the source entity you fetched in Step 1. Remove any candidate where the IDs are an exact match. **It is a critical error to suggest merging an entity with itself.**

6.  **Synthesize and Conclude:** Now, with a clean list of *other* entities, perform your final analysis. Use all the information you have gathered (source data, external context, and filtered candidate data) to make your recommendation.

**CRITICAL: Final Answer Formatting**
Your final answer MUST be in the following format. First, provide the ID of the best match on its own line. Then, provide your justification.
- If a good match is found, format it like this:
Best Match ID: [GSL_ID_of_the_best_match]
Justification: [Your detailed reasoning here...]

- If no suitable match is found after your research, format it like this:
Best Match ID: no match found
Justification: [Your reasoning for why no match was found...]
"""

# Chat template: the fixed system protocol, then the per-entity request
# (the "input" key passed to agent_executor.invoke), then a placeholder named
# "agent_scratchpad" (presumably where the agent's intermediate tool calls and
# results are inserted - confirm against LangChain docs).
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", BATCH_PROCESSING_PROMPT),
        ("human", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)

# Build the tool-calling agent and the executor that runs it; verbose=True is
# passed to the executor. The same tool list is given to both.
agent = create_tool_calling_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)


# --- Step 3: Main Batch Processing Logic ---

def parse_agent_output(output_text: str) -> Tuple[str, str]:
    """Extract the best-match ID and the justification from the agent's answer.

    The expected layout is a ``Best Match ID: <id>`` line followed by a
    ``Justification: <text>`` section. Labels are matched case-insensitively,
    and Markdown bold markers, backticks, quotes or the ``[...]`` brackets
    from the prompt's template are stripped from the ID.

    Args:
        output_text: The agent's final answer as plain text.

    Returns:
        ``(match_id, justification)``. ``match_id`` is ``"no match found"``
        when the ID line is missing, empty, or says so itself.
        ``justification`` is the text after the first ``Justification:``
        label, or the whole input when that label is absent.
    """
    no_match = "no match found"
    text = output_text or ""

    match_id = no_match
    # Capture only the remainder of the ID line: letting the pattern run past
    # the newline would turn an empty ID into the "Justification: ..." line.
    id_line = re.search(r"Best Match ID:(?:\*\*)?([^\r\n]*)", text, re.IGNORECASE)
    if id_line:
        candidate = id_line.group(1).strip().strip("*`[]\"' \t").rstrip(".").strip()
        if candidate and candidate.lower() != no_match:
            match_id = candidate

    # Default to the full output so nothing the agent said is lost when the
    # label is missing.
    justification = text
    just = re.search(r"Justification:(?:\*\*)?(.*)", text, re.IGNORECASE | re.DOTALL)
    if just:
        justification = just.group(1).strip()

    return match_id, justification

def _extract_output_text(output_data: Any) -> str:
    """Flatten the agent's ``output`` value (string or list of parts) into text."""
    if isinstance(output_data, str):
        return output_data
    if isinstance(output_data, list):
        # The output may arrive as parts like {'type': 'text', 'text': '...'};
        # keep every text part, not just the first one.
        parts = []
        for part in output_data:
            if isinstance(part, str):
                parts.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                parts.append(part["text"])
        return "\n".join(parts)
    return ""


def run_batch_process(input_file: str, output_file: str):
    """Run the agent for every source ID in a CSV and write the results to a CSV.

    The input file must have a header row; the first column of each following
    row is used as the source GSL ID. Blank rows are skipped. The output file
    is overwritten and gets the columns ``source_gsl_id``,
    ``best_match_gsl_id`` and ``justification``.

    A failure while processing one ID is recorded as a ``processing error``
    row and the batch continues with the next ID.

    Args:
        input_file: Path of the CSV with the source GSL IDs.
        output_file: Path of the CSV to write. It must differ from
            ``input_file``, because it is opened for writing (and truncated)
            while the input is being read.
    """
    print(f"Starting batch process from '{input_file}'...")

    with open(input_file, mode='r', newline='', encoding='utf-8') as infile, \
         open(output_file, mode='w', newline='', encoding='utf-8') as outfile:

        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        # Write header to the output file
        writer.writerow(["source_gsl_id", "best_match_gsl_id", "justification"])

        # Skip the input header; an empty file must not raise StopIteration.
        header = next(reader, None)
        if header is None:
            print("Input file is empty; nothing to process.")
        else:
            print(f"Processing input file with header: {header}")

        for row in reader:
            # csv.reader yields [] for blank lines; skip them and blank IDs
            # instead of failing on row[0] outside the per-row try block.
            source_id = row[0].strip() if row else ""
            if not source_id:
                continue

            print(f"\n--- Processing Source GSL ID: {source_id} ---")

            user_input = f"Please research and compare the entity with GSL ID {source_id} to find the best merge candidate. Follow your instructions precisely."

            try:
                response = agent_executor.invoke({"input": user_input})
                output_text = _extract_output_text(response.get("output", []))

                # Parse the structured output
                best_match_id, justification = parse_agent_output(output_text)

                # The prompt forbids merging an entity with itself; enforce it
                # here as well rather than trusting the model's filtering.
                if best_match_id == source_id:
                    justification = (
                        "Agent proposed the source entity itself; self-match discarded. "
                        f"Original justification: {justification}"
                    )
                    best_match_id = "no match found"

                print(f"--- Result for {source_id}: Found match '{best_match_id}' ---")
                writer.writerow([source_id, best_match_id, justification])

            except Exception as e:
                # Top-level boundary for one row: record the failure and keep
                # the rest of the batch going.
                print(f"!! An error occurred while processing {source_id}: {e} !!")
                writer.writerow([source_id, "processing error", str(e)])

            # Persist each result immediately so an interrupted run keeps
            # everything processed so far.
            outfile.flush()

    print(f"\nBatch process complete. Results saved to '{output_file}'.")


if __name__ == "__main__":
    # Define input and output file names (fill both in before running).
    input_csv_file = ""
    output_csv_file = ""

    if not input_csv_file or not output_csv_file:
        # Running with the empty placeholders would only fail with an opaque
        # "No such file or directory: ''" error.
        print("Set input_csv_file and output_csv_file before running the batch process.")
    elif input_csv_file == output_csv_file:
        # run_batch_process opens the output in write mode while reading the
        # input, so identical paths would truncate the input file.
        print("input_csv_file and output_csv_file must be different files.")
    else:
        # Run the main process
        run_batch_process(input_file=input_csv_file, output_file=output_csv_file)