In [13]:
import json
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os
from pymongo import MongoClient
from pymongo.collection import Collection
from typing import List, Dict, Any
from dotenv import load_dotenv
load_dotenv()
# Ensure the Google API key is set
google_api_key = os.getenv("GOOGLE_API_KEY")
mongodb_uri = os.getenv("MONGO_URI")  # Add this to your environment variables

if not google_api_key:
    raise ValueError("Google API key not found in environment variables.")
if not mongodb_uri:
    raise ValueError("MongoDB URI not found in environment variables.")

def get_mongodb_collection() -> Collection:
    """Get MongoDB collection instance."""
    client = MongoClient(mongodb_uri)
    db = client['cricket']
    return db['players_summaries']

def load_player_summaries(file_path: str) -> Dict:
    """Load player summaries from JSON file."""
    with open(file_path, "r") as f:
        return json.load(f)

def create_player_documents(player_summaries: Dict) -> List[Dict[str, Any]]:
    """
    Create player documents with embeddings for MongoDB.
    
    Args:
    - player_summaries (dict): Player summaries data.
    
    Returns:
    - list: List of documents with embeddings.
    """
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    documents = []
    
    for player_id, player_data in player_summaries.items():
        player_name = player_data.get("name", "Unknown")
        summary = player_data.get("summary", "No summary available.")
        
        # Create the text that will be embedded
        text_to_embed = f"Player ID: {player_id}\nName: {player_name}\n\n{summary}"
        
        # Generate embedding
        embedding_vector = embeddings.embed_query(text_to_embed)
        
        # Create document
        document = {
            "player_id": player_id,
            "name": player_name,
            "full_text": text_to_embed,
            "embedding": embedding_vector
        }
        documents.append(document)
    
    return documents

def store_player_documents(documents: List[Dict[str, Any]]) -> None:
    """
    Store player documents in MongoDB.
    
    Args:
    - documents (list): List of player documents with embeddings.
    """
    collection = get_mongodb_collection()
    
    # Drop existing documents and insert new ones
    collection.drop()
    collection.insert_many(documents)
    print(f"Successfully stored {len(documents)} player documents in MongoDB.")

def search_similar_players(query: str, limit: int = 2) -> List[Dict]:
    """
    Search for similar players using vector similarity.
    
    Args:
    - query (str): Search query
    - limit (int): Number of results to return
    
    Returns:
    - list: List of similar player documents
    """
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    query_embedding = embeddings.embed_query(query)
    
    collection = get_mongodb_collection()
    
    # Corrected Vector Search Pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",   # Ensure this matches your MongoDB index name
                "limit": limit,
                "numCandidates": 10,
                "path": "embedding",       # This should match the field where embeddings are stored
                "queryVector": query_embedding
            }
        },
        {
            "$project": {
                "_id": 0,
                "player_id": 1,
                "name": 1,
                "full_text": 1,
                "score": {"$meta": "vectorSearchScore"}  # Fix metadata field name
            }
        }
    ]
    
    results = list(collection.aggregate(pipeline))
    return results


# Update the get_player_context function to use MongoDB
def get_player_context(player_query: str, top_k: int = 2) -> List[Dict]:
    """
    Get player context using vector similarity search.
    
    Args:
    - player_query (str): Query string
    - top_k (int): Number of results to return
    
    Returns:
    - list: List of relevant documents
    """
    results = search_similar_players(player_query, top_k)
    
    # Convert to format compatible with existing code
    from langchain_core.documents import Document
    documents = []
    for result in results:
        doc = Document(
            page_content=result['full_text'],
            metadata={"player_id": result['player_id'], "name": result['name']}
        )
        documents.append(doc)
    
    return documents

# Example usage
if __name__ == "__main__":
    # Load player summaries
    file_path = "player_summaries.json"
    player_summaries = load_player_summaries(file_path)
    
    # Create and store player documents with embeddings
    player_documents = create_player_documents(player_summaries)
    store_player_documents(player_documents)
    
    # Query example
    player_query = "jadeja"
    relevant_contexts = get_player_context(player_query)
    
    # Print relevant context
    print(f"### Relevant Context for {player_query} ###\n")
    for doc in relevant_contexts:
        print(doc.page_content)
        print("\nMetadata:", doc.metadata)



Successfully stored 43 player documents in MongoDB.
### Relevant Context for jadeja ###



In [14]:
from groq import Groq
import os

# Initialize the Groq client
api_key = os.getenv("GROQ_API_KEY")
client = Groq(api_key=api_key)

def query_ai_model(question, context):
    """
    Query the AI model with a user question and relevant context.
    
    Args:
    - question (str): User's question.
    - context (str): Relevant context retrieved from FAISS index.
    
    Returns:
    - str: AI-generated response.
    """
    prompt = f"""
    You are an expert cricket analyst with deep knowledge of the game. Your role is to analyze match data and provide insightful responses to questions.
### Guidelines:
1. Response Format:
   - Begin with a direct answer to the question
   - Support your analysis with relevant statistics from the provided match data
   - Use cricket terminology appropriately
   - Keep responses concise but informative
2. Analysis Guidelines:
   - When analyzing batting:
     * Consider phase-wise performance (powerplay, middle overs, death overs)
     * Look at strike rates and boundary percentages
     * Analyze dismissal patterns and batting control
   - When analyzing bowling:
     * Focus on economy rates and wicket-taking ability in different phases
     * Consider boundary percentage conceded
     * Analyze bowling variations and their effectiveness
3. Special Instructions:
   - Always reference specific data points to support your analysis
   - If asked about comparisons, use actual numbers from the match data
   - If data is not available for a specific query, clearly state that
   - For trend analysis, consider both basic and detailed statistics
   - When discussing match results, reference both venue and head-to-head analysis
4. Response Style:
   - Be professional but engaging
   - Use clear, concise language
   - Present complex statistics in an easy-to-understand manner
   - Highlight key insights and patterns
Remember to maintain objectivity and base all analysis on the provided match data.
    ### Context:
    {context}
    
    ### Question:
    {question}
    """
    
    try:
        response = client.chat.completions.create(
            # model="mixtral-8x7b-32768",
            model="llama3-70b-8192",
            # model="llama-3.1-8b-instant",  # Replace with your model
            messages=[
                {"role": "system", 
                 "content": '''
                    You are an expert cricket analyst. Analyze match data and provide insightful responses.
                    Remember to maintain objectivity and base all analysis on the provided match data.'''
                 },
                {"role": "user", "content": prompt}
            ],
            temperature=0.2,
            max_tokens=512
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error querying AI model: {str(e)}"

# Example usage
def process_user_query(user_question, faiss_index, top_k=5):
    """
    Process the user question by retrieving relevant context and querying the AI model.
    
    Args:
    - user_question (str): User's question.
    - faiss_index: Loaded FAISS index object.
    - top_k (int): Number of top relevant chunks to retrieve.
    
    Returns:
    - None
    """
    # Retrieve relevant context
    relevant_docs = get_player_context(user_question, faiss_index, top_k=top_k)
    relevant_context = "\n".join([doc.page_content for doc in relevant_docs])
    
    if not relevant_context:
        print("No relevant context found for the question.")
        return

    # Query the AI model
    ai_response = query_ai_model(user_question, relevant_context)
    
    # Print the response
    print(f"### User Question:\n{user_question}\n")
    print(f"### AI Response:\n{ai_response}\n")

# Load FAISS index (assuming it's already created and saved locally)
faiss_index = load_faiss_index()


questions = [
    
    'which bowler has good stats against gaikwad in opponent team?'
]
for question in questions:
    process_user_query(question, faiss_index)

# Example question
# user_question = "Which players performed well in the last five matches?"
# process_user_query(user_question, faiss_index)


TypeError: get_player_context() got multiple values for argument 'top_k'