In [9]:
import pandas as pd
from custom_agents_LG import load_session_data, agent_executor
from llm import get_llama_3dot3_70b_versatile,get_llama_3dot1_8b_instant,get_70b_8192
from loaders_and_chroma_utils import vectorstore
from langchain_core.prompts import PromptTemplate
from typing_extensions import List, TypedDict
from langchain_core.prompts import PromptTemplate
from langgraph.graph import START, StateGraph
from typing import TypedDict, List, Annotated
from langgraph.graph import StateGraph, END
from langchain_core.prompts import PromptTemplate
from typing import TypedDict, List
from langchain_core.documents import Document
llm = get_70b_8192()

In [10]:
# # Modified prompt template that explicitly references player metrics
template = """Using the following research data and current player metrics, analyze the potential injury risks and performance insights.

Research Context:
{context}

Current Player Metrics:
{player_metrics}

Analysis Request: {question}

Based on both the research findings and the current metrics, provide:
1. Specific risk indicators identified in the current metrics
2. How these metrics compare to research thresholds
3. Evidence-based recommendations for load management

First show the players player metrics before going into the analysis if you cannot find the metrics tell me.

Sports Science Analysis:"""

custom_rag_prompt = PromptTemplate.from_template(template)

In [17]:
class State(TypedDict):
    question: str
    context: List[Document]
    player_metrics: str
    answer: str
    player_name: str  # Added to track player name

def extract_player_name(question: str) -> str:
    """
    Extract player name from the question using the LLM.
    This is more robust than regex as it can handle various phrasings.
    """
    name_extraction_prompt = """
    Extract the player name(s) from the following question. 
    If no specific player is mentioned, return None.
    Only return the name(s) without any additional text.
    
    Question: {question}
    """
    
    prompt = PromptTemplate.from_template(name_extraction_prompt)
    messages = prompt.invoke({"question": question})
    response = llm.invoke(messages).content.strip()
    
    return response if response.lower() != "none" else None

def get_player_metrics(player_name: str) -> dict:
    """Get metrics for a specific player."""
    return agent_executor.invoke({
        "input": f"Get {player_name}'s metrics as a dictionary format"
    })["output"]

def setup_state(question: str) -> State:
    """Initialize state with player name and metrics."""
    player_name = extract_player_name(question)
    
    if not player_name:
        return {
            "question": question,
            "context": [],
            "player_metrics": "No specific player mentioned in query",
            "answer": "",
            "player_name": ""
        }
    
    metrics = get_player_metrics(player_name)
    
    return {
        "question": question,
        "context": [],
        "player_metrics": metrics,
        "answer": "",
        "player_name": player_name
    }

def retrieve(state: State):
    """Retrieve relevant documents and combine with player metrics."""
    retrieved_docs = vectorstore.similarity_search(state["question"])
    
    if state["player_metrics"] != "No specific player mentioned in query":
        metrics_doc = Document(
            page_content=f"Current Player Metrics for {state['player_name']}:\n{state['player_metrics']}",
            metadata={"source": "player_metrics"}
        )
        retrieved_docs.insert(0, metrics_doc)
    
    return {"context": retrieved_docs}

def generate(state: State):
    """Generate response using context and metrics."""
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    
    if state["player_metrics"] == "No specific player mentioned in query":
        analysis_prompt = """
        To analyze a player's injury risks, please specify a player name in your query.
        Available players can be queried from the system.
        """
        return {"answer": analysis_prompt}
    
    messages = custom_rag_prompt.invoke({
        "question": state["question"],
        "context": docs_content,
        "player_metrics": state["player_metrics"],
        "player_name": state["player_name"]
    })
    
    response = llm.invoke(messages)
    return {"answer": response.content}

# Build and run the graph
def analyze_player_risk(question: str):
    """Main function to analyze player risk based on a question."""
    workflow = StateGraph(State)
    
    # Add nodes
    workflow.add_node("retrieve", retrieve)
    workflow.add_node("generate", generate)
    
    # Add edges
    workflow.add_edge("retrieve", "generate")
    workflow.add_edge("generate", END)
    
    # Set entry point
    workflow.set_entry_point("retrieve")
    
    # Compile graph
    graph = workflow.compile()
    
    # Initialize state with player info
    initial_state = setup_state(question)
    
    # Run analysis
    response = graph.invoke(initial_state)

    
    return response


In [18]:
query = "What are Hawk's injury risks based on his recent activity pattern?"
print(f"\nAnalyzing query: {query}")
print("-" * 50)
response = analyze_player_risk(query)
print("Analysis Results:")
print(response["answer"])
print("-" * 50)


Analyzing query: What are Hawk's injury risks based on his recent activity pattern?
--------------------------------------------------
Analysis Results:
**Hawk's Player Metrics:**

```
{
  'player_metrics': {
    'Duration': {
      'mean': 1.8324786324786322, 
      'min': 0.6, 
      'max': 3.4, 
      'std': 0.7400872571841712, 
      'count': 117
    }, 
    'Distance': {
      'mean': 5.457008547008546, 
      'min': 0.0, 
      'max': 15.81, 
      'std': 4.655160885578066, 
      'count': 117
    }, 
    'Magnitude': {
      'mean': 3.778290598290598, 
      'min': 3.01, 
      'max': 5.84, 
      'std': 0.5849510598424844, 
      'count': 117
    }, 
    'Avg Metabolic Power': {
      'mean': 37.13606837606838, 
      'min': 0.0, 
      'max': 91.29, 
      'std': 19.310866110206423, 
      'count': 117
    }, 
    'Dynamic Stress Load': {
      'mean': 0.18863247863247865, 
      'min': 0.0, 
      'max': 1.04, 
      'std': 0.1919406606529916, 
      'count': 117
    }
  }, 

In [19]:
response

{'question': "What are Hawk's injury risks based on his recent activity pattern?",
 'context': [Document(metadata={'source': 'player_metrics'}, page_content="Current Player Metrics for Hawk:\n{'player_metrics': {'Duration': {'mean': 1.8324786324786322, 'min': 0.6, 'max': 3.4, 'std': 0.7400872571841712, 'count': 117}, 'Distance': {'mean': 5.457008547008546, 'min': 0.0, 'max': 15.81, 'std': 4.655160885578066, 'count': 117}, 'Magnitude': {'mean': 3.778290598290598, 'min': 3.01, 'max': 5.84, 'std': 0.5849510598424844, 'count': 117}, 'Avg Metabolic Power': {'mean': 37.13606837606838, 'min': 0.0, 'max': 91.29, 'std': 19.310866110206423, 'count': 117}, 'Dynamic Stress Load': {'mean': 0.18863247863247865, 'min': 0.0, 'max': 1.04, 'std': 0.1919406606529916, 'count': 117}}, 'Activity_Distribution': {'Acceleration': 53.84615384615385, 'Deceleration': 46.15384615384615}, 'Time_Metrics': {'avg_time_between_actions': '01:40.12', 'min_time_between_actions': '00:00.00', 'max_time_between_actions': '03

In [20]:
from typing import TypedDict, List
from langchain_core.documents import Document
from langgraph.graph import StateGraph, END

class State(TypedDict):
    question: str
    context: List[Document]
    player_metrics: str
    answer: str
    player_name: str

def extract_player_name(question: str) -> str:
    """Extract player name from the question using the LLM."""
    name_extraction_prompt = """
    Extract the player name(s) from the following question. 
    If no specific player is mentioned, return None.
    Only return the name(s) without any additional text.
    
    Question: {question}
    """
    
    prompt = PromptTemplate.from_template(name_extraction_prompt)
    messages = prompt.invoke({"question": question})
    response = llm.invoke(messages).content.strip()
    
    return response if response.lower() != "none" else None

def setup_initial_state(question: str) -> State:
    """Setup initial state without metrics - moved to separate node."""
    return {
        "question": question,
        "context": [],
        "player_metrics": "",
        "answer": "",
        "player_name": ""
    }

def extract_name_node(state: State):
    """Node for extracting player name."""
    player_name = extract_player_name(state["question"])
    return {"player_name": player_name}

def get_metrics_node(state: State):
    """Node for getting player metrics."""
    if not state["player_name"]:
        return {"player_metrics": "No specific player mentioned in query"}
    
    metrics = agent_executor.invoke({
        "input": f"Get {state['player_name']}'s metrics as a dictionary format"
    })["output"]
    
    return {"player_metrics": metrics}

def retrieve_node(state: State):
    """Node for retrieving relevant documents."""
    retrieved_docs = vectorstore.similarity_search(state["question"])
    
    if state["player_metrics"] != "No specific player mentioned in query":
        metrics_doc = Document(
            page_content=f"Current Player Metrics for {state['player_name']}:\n{state['player_metrics']}",
            metadata={"source": "player_metrics"}
        )
        retrieved_docs.insert(0, metrics_doc)
    
    return {"context": retrieved_docs}

def generate_node(state: State):
    """Node for generating the analysis."""
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    
    if state["player_metrics"] == "No specific player mentioned in query":
        return {"answer": "To analyze a player's injury risks, please specify a player name in your query."}
    
    messages = custom_rag_prompt.invoke({
        "question": state["question"],
        "context": docs_content,
        "player_metrics": state["player_metrics"],
        "player_name": state["player_name"]
    })
    
    response = llm.invoke(messages)
    return {"answer": response.content}

def build_streaming_graph():
    """Build graph with streaming support."""
    workflow = StateGraph(State)
    
    # Add nodes
    workflow.add_node("extract_name", extract_name_node)
    workflow.add_node("get_metrics", get_metrics_node)
    workflow.add_node("retrieve", retrieve_node)
    workflow.add_node("generate", generate_node)
    
    # Add edges
    workflow.add_edge("extract_name", "get_metrics")
    workflow.add_edge("get_metrics", "retrieve")
    workflow.add_edge("retrieve", "generate")
    workflow.add_edge("generate", END)
    
    # Set entry point
    workflow.set_entry_point("extract_name")
    
    return workflow.compile()

def analyze_player_risk_streaming(question: str):
    """Stream the analysis process."""
    graph = build_streaming_graph()
    initial_state = setup_initial_state(question)
    
    print("Starting Analysis Stream:")
    print("-" * 50)
    
    for step in graph.stream(initial_state, stream_mode="updates"):
        print("Step Update:")
        for key, value in step.items():
            if key == "context":
                print(f"{key}: {len(value)} documents retrieved")
            else:
                print(f"{key}: {value}")
        print("-" * 50)
    
    return graph.invoke(initial_state)

# Example usage
if __name__ == "__main__":
    query = "What are Lee's injury risks based on his recent activity pattern?"
    
    print("Streaming Analysis Process:")
    result = analyze_player_risk_streaming(query)
    
    print("\nFinal Analysis:")
    print(result["answer"])

Streaming Analysis Process:
Starting Analysis Stream:
--------------------------------------------------
Step Update:
extract_name: {'player_name': 'Lee'}
--------------------------------------------------
Step Update:
get_metrics: {'player_metrics': {'player_metrics': {'Duration': {'mean': 1.9452702702702702, 'min': 0.7, 'max': 4.2, 'std': 0.7416963879507882, 'count': 148}, 'Distance': {'mean': 6.405405405405405, 'min': 0.36, 'max': 21.21, 'std': 4.574265396340822, 'count': 148}, 'Magnitude': {'mean': 3.984121621621621, 'min': 3.0, 'max': 5.62, 'std': 0.7037440484919418, 'count': 148}, 'Avg Metabolic Power': {'mean': 43.192635135135134, 'min': 2.41, 'max': 156.06, 'std': 24.524950071346165, 'count': 148}, 'Dynamic Stress Load': {'mean': 0.4142567567567567, 'min': 0.0, 'max': 2.11, 'std': 0.4290173730941682, 'count': 148}}, 'Activity_Distribution': {'Acceleration': 57.432432432432435, 'Deceleration': 42.567567567567565}, 'Time_Metrics': {'avg_time_between_actions': '01:28.39', 'min_tim

In [29]:
def retrieve_node(state: State):
    """Node for retrieving relevant documents with specific research focus."""
    # First get player metrics document if available
    documents = []
    if state["player_metrics"] != "No specific player mentioned in query":
        metrics_doc = Document(
            page_content=f"Current Player Metrics for {state['player_name']}:\n{state['player_metrics']}",
            metadata={"source": "player_metrics"}
        )
        documents.append(metrics_doc)
    
    # Create specific research queries based on the metrics
    research_queries = [
        f"injury risk thresholds for {state['player_name']}'s activity pattern",
        "critical thresholds for Dynamic Stress Load DSL",
        "metabolic power thresholds for injury risk",
        "recovery indicators and patterns research",
        "acceleration deceleration ratio research findings"
    ]
    
    # Get relevant research for each aspect
    for query in research_queries:
        retrieved_docs = vectorstore.similarity_search(
            query,
            k=2  # Get top 2 most relevant documents for each query
        )
        documents.extend(retrieved_docs)
    
    # Deduplicate documents
    seen_contents = set()
    unique_docs = []
    for doc in documents:
        if doc.page_content not in seen_contents:
            seen_contents.add(doc.page_content)
            unique_docs.append(doc)
    print(f"Retried : {'-'*80 } \n {unique_docs}")
    return {"context": unique_docs}

def generate_node(state: State):
    """Node for generating the analysis with structured research incorporation."""
    if state["player_metrics"] == "No specific player mentioned in query":
        return {"answer": "To analyze a player's injury risks, please specify a player name in your query."}
    
    # Organize documents by type
    metrics_doc = None
    threshold_docs = []
    recovery_docs = []
    
    for doc in state["context"]:
        if doc.metadata.get("source") == "player_metrics":
            metrics_doc = doc
        elif "threshold" in doc.page_content.lower():
            threshold_docs.append(doc)
        elif "recovery" in doc.page_content.lower():
            recovery_docs.append(doc)
    
    # Create structured context
    context_parts = []
    if metrics_doc:
        context_parts.append("PLAYER METRICS:\n" + metrics_doc.page_content)
    
    if threshold_docs:
        context_parts.append("INJURY RISK THRESHOLDS:\n" + 
                           "\n".join(doc.page_content for doc in threshold_docs))
    
    if recovery_docs:
        context_parts.append("RECOVERY PATTERNS:\n" + 
                           "\n".join(doc.page_content for doc in recovery_docs))
    
    context = "\n\n".join(context_parts)
    
    # Enhanced prompt to focus on research-based analysis
    analysis_prompt = """
    Based on the player metrics and research data provided, conduct a thorough injury risk analysis.
    
    Focus on comparing the player's current metrics to established research thresholds:
    1. Dynamic Stress Load (DSL) patterns
    2. Metabolic power thresholds
    3. Acceleration/Deceleration ratios
    4. Recovery patterns
    
    Research Context:
    {context}
    
    Player: {player_name}
    Question: {question}
    
    Provide:
    1. Specific comparisons between player metrics and research thresholds
    2. Identified risk factors based on research findings
    3. Evidence-based recommendations
    """
    
    messages = PromptTemplate.from_template(analysis_prompt).invoke({
        "context": context,
        "player_name": state["player_name"],
        "question": state["question"]
    })
    
    response = llm.invoke(messages)
    return {"answer": response.content}

# Example usage
if __name__ == "__main__":
    query = "What are the thresholds for injury according to the research?"
    
    print("Streaming Analysis Process:")
    result = analyze_player_risk_streaming(query)
    
    print("\nFinal Analysis:")
    print(result["answer"])

Streaming Analysis Process:
Starting Analysis Stream:
--------------------------------------------------
Step Update:
extract_name: {'player_name': None}
--------------------------------------------------
Step Update:
get_metrics: {'player_metrics': 'No specific player mentioned in query'}
--------------------------------------------------
Retried : -------------------------------------------------------------------------------- 
 []
Step Update:
retrieve: {'context': []}
--------------------------------------------------
Step Update:
generate: {'answer': "To analyze a player's injury risks, please specify a player name in your query."}
--------------------------------------------------
Retried : -------------------------------------------------------------------------------- 
 []

Final Analysis:
To analyze a player's injury risks, please specify a player name in your query.


In [38]:
from loaders_and_chroma_utils import load_and_split_document
def verify_document_loading(file_path: str):
    """Load, index, and verify document loading with detailed debugging."""
    print("\nSTEP 1: Loading document...")
    try:
        # First load and split the document
        splits = load_and_split_document(file_path)
        print(f"Successfully split document into {len(splits)} chunks")
        print("\nFirst chunk preview:")
        if splits:
            print(splits[0].page_content[:200])
        
        print("\nSTEP 2: Clearing existing vectorstore...")
        # Get all existing IDs
        existing_ids = vectorstore._collection.get()['ids']
        if existing_ids:
            vectorstore._collection.delete(ids=existing_ids)
        print("Vectorstore cleared")
        
        print("\nSTEP 3: Indexing document chunks...")
        # Add metadata and IDs to chunks
        for i, split in enumerate(splits):
            split.metadata['file_id'] = 1
            split.metadata['chunk_id'] = i
        
        # Add documents to vectorstore
        vectorstore.add_documents(splits)
        print(f"Added {len(splits)} chunks to vectorstore")
        
        print("\nSTEP 4: Verifying indexed content...")
        # Verify the count
        collection = vectorstore._collection
        count = len(collection.get()['ids'])
        print(f"Number of documents in vectorstore: {count}")
        
        if count > 0:
            print("\nSTEP 5: Testing retrieval...")
            test_queries = [
                "injury risk thresholds",
                "Dynamic Stress Load",
                "metabolic power",
                "critical thresholds"
            ]
            
            for query in test_queries:
                results = vectorstore.similarity_search(query, k=1)
                print(f"\nQuery: {query}")
                if results:
                    print(f"Found document. Preview: {results[0].page_content[:200]}")
                    print(f"Metadata: {results[0].metadata}")
                else:
                    print("No results found")
                    
            return True
        else:
            print("ERROR: No documents found in vectorstore after indexing")
            return False
            
    except Exception as e:
        print(f"ERROR during document processing: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

# Test the function
if __name__ == "__main__":
    file_path = r"C:\Users\j.mundondo\OneDrive - Statsports\Desktop\statsportsdoc\Projects\frequency_chat_PH\data\multi_session_hias\Research Paper.docx"
    print("Starting document verification process...")
    success = verify_document_loading(file_path)
    print(f"\nDocument verification {'successful' if success else 'failed'}")

if __name__ == "__main__":
    file_path = r"C:\Users\j.mundondo\OneDrive - Statsports\Desktop\statsportsdoc\Projects\frequency_chat_PH\data\multi_session_hias\Research Paper.docx"
    print("Starting document verification process...")
    success = verify_document_loading(file_path)
    print(f"\nDocument verification {'successful' if success else 'failed'}")

Starting document verification process...

STEP 1: Loading document...
Successfully loaded DOCX!
Successfully split document into 11 chunks

First chunk preview:
# Relationship Between High-Intensity Activity Metrics and Injury Risk in Professional Athletes: A Longitudinal Analysis



## Abstract



This study investigated the relationship between high-intensi

STEP 2: Clearing existing vectorstore...
Vectorstore cleared

STEP 3: Indexing document chunks...
Added 11 chunks to vectorstore

STEP 4: Verifying indexed content...
Number of documents in vectorstore: 11

STEP 5: Testing retrieval...

Query: injury risk thresholds
Found document. Preview: # Relationship Between High-Intensity Activity Metrics and Injury Risk in Professional Athletes: A Longitudinal Analysis



## Abstract



This study investigated the relationship between high-intensi
Metadata: {'chunk_id': 0, 'file_id': 1, 'source': 'C:\\Users\\j.mundondo\\OneDrive - Statsports\\Desktop\\statsportsdoc\\Projects\\frequency_ch

In [39]:
# Import necessary libraries
from typing import TypedDict, List
from langchain_core.documents import Document
from langgraph.graph import StateGraph, END
from langchain_core.prompts import PromptTemplate
from langchain_community.document_loaders import (
    PyPDFLoader,
    Docx2txtLoader, 
    UnstructuredPowerPointLoader, 
    CSVLoader, 
    UnstructuredExcelLoader
)
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_chroma import Chroma
import chromadb

# Initialize components
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Initialize Chroma with persistent client
client = chromadb.PersistentClient(path="./chroma_db")
vectorstore = Chroma(
    client=client,
    embedding_function=embedding_function,
    collection_name="my_collection"
)

# Define State class
class State(TypedDict):
    question: str
    context: List[Document]
    player_metrics: str
    answer: str
    player_name: str

# Document loading functions
def load_and_split_document(file_path: str) -> List[Document]:
    """Load and split a document into chunks."""
    if file_path.endswith('.pdf'):
        loader = PyPDFLoader(file_path)
    elif file_path.endswith('.docx'):
        loader = Docx2txtLoader(file_path)
    elif file_path.endswith(('.ppt', '.pptx')):
        loader = UnstructuredPowerPointLoader(file_path, mode="elements")
    elif file_path.endswith(('.xls', '.xlsx')):
        loader = UnstructuredExcelLoader(file_path)
    elif file_path.endswith('.csv'):
        loader = CSVLoader(file_path)
    else:
        raise ValueError(f"Unsupported file type: {file_path}")

    documents = loader.load()
    return text_splitter.split_documents(documents)

def verify_document_loading(file_path: str):
    """Load, index, and verify document loading with detailed debugging."""
    print("\nSTEP 1: Loading document...")
    try:
        splits = load_and_split_document(file_path)
        print(f"Successfully split document into {len(splits)} chunks")
        print("\nFirst chunk preview:")
        if splits:
            print(splits[0].page_content[:200])
        
        print("\nSTEP 2: Clearing existing vectorstore...")
        existing_ids = vectorstore._collection.get()['ids']
        if existing_ids:
            vectorstore._collection.delete(ids=existing_ids)
        print("Vectorstore cleared")
        
        print("\nSTEP 3: Indexing document chunks...")
        for i, split in enumerate(splits):
            split.metadata['file_id'] = 1
            split.metadata['chunk_id'] = i
        
        vectorstore.add_documents(splits)
        print(f"Added {len(splits)} chunks to vectorstore")
        
        return True
    except Exception as e:
        print(f"ERROR during document processing: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

# Graph nodes
def extract_player_name(question: str) -> str:
    """Extract player name from the question using the LLM."""
    name_extraction_prompt = """
    Extract the player name(s) from the following question. 
    If no specific player is mentioned, return None.
    Only return the name(s) without any additional text.
    
    Question: {question}
    """
    
    prompt = PromptTemplate.from_template(name_extraction_prompt)
    messages = prompt.invoke({"question": question})
    response = llm.invoke(messages).content.strip()
    
    return response if response.lower() != "none" else None

def get_player_metrics(player_name: str) -> dict:
    """Get metrics for a specific player."""
    return agent_executor.invoke({
        "input": f"Get {player_name}'s metrics as a dictionary format"
    })["output"]

def setup_initial_state(question: str) -> State:
    """Setup initial state without metrics - moved to separate node."""
    return {
        "question": question,
        "context": [],
        "player_metrics": "",
        "answer": "",
        "player_name": ""
    }

def extract_name_node(state: State):
    """Node for extracting player name."""
    player_name = extract_player_name(state["question"])
    return {"player_name": player_name}

def get_metrics_node(state: State):
    """Node for getting player metrics."""
    if not state["player_name"]:
        return {"player_metrics": "No specific player mentioned in query"}
    
    metrics = get_player_metrics(state["player_name"])
    return {"player_metrics": metrics}

def retrieve_node(state: State):
    """Node for retrieving relevant documents."""
    documents = []
    
    if state["player_metrics"] != "No specific player mentioned in query":
        metrics_doc = Document(
            page_content=f"Current Player Metrics for {state['player_name']}:\n{state['player_metrics']}",
            metadata={"source": "player_metrics"}
        )
        documents.append(metrics_doc)
    
    # Get research documents with specific queries
    research_queries = [
        f"injury risk thresholds for {state['player_name']}'s activity pattern",
        "critical thresholds for Dynamic Stress Load DSL",
        "metabolic power thresholds for injury risk",
        "recovery indicators and patterns research",
        "acceleration deceleration ratio research findings"
    ]
    
    for query in research_queries:
        retrieved_docs = vectorstore.similarity_search(
            query,
            k=2
        )
        documents.extend(retrieved_docs)
    
    return {"context": documents}

def generate_node(state: State):
    """Generate analysis with explicit reference to research data."""
    if state["player_metrics"] == "No specific player mentioned in query":
        return {"answer": "To analyze a player's injury risks, please specify a player name in your query."}
    
    # Separate metrics and research documents
    metrics_doc = None
    research_docs = []
    
    for doc in state["context"]:
        if doc.metadata.get("source") == "player_metrics":
            metrics_doc = doc
        else:
            research_docs.append(doc)
    
    # Format research findings
    research_findings = "\n\n".join([
        f"RESEARCH FINDING {i+1}:\n{doc.page_content}"
        for i, doc in enumerate(research_docs)
    ])
    
    analysis_prompt = """
    Using ONLY the provided research data and player metrics, conduct an injury risk analysis.
    
    PLAYER METRICS:
    {metrics}
    
    RESEARCH FINDINGS:
    {research}
    
    Analyze the following aspects, citing ONLY the provided research:
    1. Compare the player's metrics to the research thresholds
    2. Identify specific risk factors supported by the research
    3. Make recommendations based on the research findings
    
    Format your response with clear sections and evidence from the provided research.
    """
    
    messages = PromptTemplate.from_template(analysis_prompt).invoke({
        "metrics": metrics_doc.page_content if metrics_doc else "No metrics available",
        "research": research_findings
    })
    
    response = llm.invoke(messages)
    return {"answer": response.content}

def build_streaming_graph():
    """Build graph with streaming support."""
    workflow = StateGraph(State)
    
    # Add nodes
    workflow.add_node("extract_name", extract_name_node)
    workflow.add_node("get_metrics", get_metrics_node)
    workflow.add_node("retrieve", retrieve_node)
    workflow.add_node("generate", generate_node)
    
    # Add edges
    workflow.add_edge("extract_name", "get_metrics")
    workflow.add_edge("get_metrics", "retrieve")
    workflow.add_edge("retrieve", "generate")
    workflow.add_edge("generate", END)
    
    # Set entry point
    workflow.set_entry_point("extract_name")
    
    return workflow.compile()

def analyze_player_risk_streaming(question: str):
    """Stream the analysis process."""
    graph = build_streaming_graph()
    initial_state = setup_initial_state(question)
    
    print("Starting Analysis Stream:")
    print("-" * 50)
    
    for step in graph.stream(initial_state, stream_mode="updates"):
        print("Step Update:")
        for key, value in step.items():
            if key == "context":
                print(f"{key}: {len(value)} documents retrieved")
            else:
                print(f"{key}: {value}")
        print("-" * 50)
    
    return graph.invoke(initial_state)

# Main execution
if __name__ == "__main__":
    # First load and verify research document
    file_path = r"C:\Users\j.mundondo\OneDrive - Statsports\Desktop\statsportsdoc\Projects\frequency_chat_PH\data\multi_session_hias\Research Paper.docx"
    success = verify_document_loading(file_path)
    
    if success:
        # Test queries
        test_queries = [
            "What are Lee's injury risks based on his recent activity pattern?",
            "Analyze injury risks for Hawk",
            "What are the injury risks for this player's recent pattern?"
        ]
        
        for query in test_queries:
            print(f"\nAnalyzing query: {query}")
            print("-" * 50)
            response = analyze_player_risk_streaming(query)
            print("Analysis Results:")
            print(response["answer"])
            print("-" * 50)


STEP 1: Loading document...
Successfully split document into 11 chunks

First chunk preview:
# Relationship Between High-Intensity Activity Metrics and Injury Risk in Professional Athletes: A Longitudinal Analysis



## Abstract



This study investigated the relationship between high-intensi

STEP 2: Clearing existing vectorstore...
Vectorstore cleared

STEP 3: Indexing document chunks...
Added 11 chunks to vectorstore

Analyzing query: What are Lee's injury risks based on his recent activity pattern?
--------------------------------------------------
Starting Analysis Stream:
--------------------------------------------------
Step Update:
extract_name: {'player_name': 'Lee'}
--------------------------------------------------
Step Update:
get_metrics: {'player_metrics': {'player_metrics': {'Duration': {'mean': 1.9452702702702702, 'min': 0.7, 'max': 4.2, 'std': 0.7416963879507882, 'count': 148}, 'Distance': {'mean': 6.405405405405405, 'min': 0.36, 'max': 21.21, 'std': 4.57426539634082