In [1]:
!pip uninstall -qqy jupyterlab kfp  # Remove unused conflicting packages
!pip install -qU "google-genai==1.7.0" "chromadb==0.6.3"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.7/144.7 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m67.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.9/100.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m5.6 MB/s[0

In [2]:
import pandas as pd
import json
from tqdm import tqdm
import random
from IPython.display import Markdown
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
from google.api_core import retry
from google import genai
from google.genai import types

In [3]:
from kaggle_secrets import UserSecretsClient
GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")


In [4]:
client = genai.Client(api_key=GOOGLE_API_KEY)

for m in client.models.list():
    if "embedContent" in m.supported_actions:
        print(m.name)

models/embedding-001
models/text-embedding-004
models/gemini-embedding-exp-03-07
models/gemini-embedding-exp


In [5]:
# Create embedding function for ChromaDB
class GeminiEmbeddingFunction(EmbeddingFunction):
    # Specify whether to generate embeddings for documents, or queries
    document_mode = True

    @retry.Retry(predicate=lambda e: (isinstance(e, genai.errors.APIError) and e.code in {429, 503}))
    def __call__(self, input: Documents) -> Embeddings:
        if self.document_mode:
            embedding_task = "retrieval_document"
        else:
            embedding_task = "retrieval_query"

        response = client.models.embed_content(
            model="models/text-embedding-004",
            contents=input,
            config=types.EmbedContentConfig(
                task_type=embedding_task,
            ),
        )
        return [e.values for e in response.embeddings]



In [6]:
df = pd.read_csv('/kaggle/input/climate-change-faqs/climate_change_faqs.csv')

# Display sample data
print("Sample FAQ data:")
print(df.head())


Sample FAQ data:
                                              source  \
0  https://www.ipcc.ch/site/assets/uploads/2020/0...   
1  https://www.ipcc.ch/site/assets/uploads/2020/0...   
2  https://www.ipcc.ch/site/assets/uploads/2020/0...   
3  https://www.ipcc.ch/site/assets/uploads/2020/0...   
4  https://www.ipcc.ch/site/assets/uploads/2020/0...   

                                                 faq text_type  
0  If Understanding of the Climate System Has Inc...         q  
1  The models used to calculate the IPCC’s temper...         a  
2               How Do We Know the World Has Warmed?         q  
3  Evidence for a warming world comes from multip...         a  
4   Have There Been Any Changes in Climate Extremes?         q  


In [7]:

# Process FAQ data
# Create a dictionary to map questions to answers using the source as a key
faq_dict = {}
documents = []
metadatas = []
ids = []

# First pass to collect all questions and answers by source
for _, row in df.iterrows():
    source = row['source']
    content = row['faq']
    content_type = row['text_type']
    
    if source not in faq_dict:
        faq_dict[source] = {'q': [], 'a': []}
    
    faq_dict[source][content_type].append(content)

In [8]:
# Second pass to pair questions with answers and prepare documents
doc_id = 0
for source, content_dict in faq_dict.items():
    questions = content_dict.get('q', [])
    answers = content_dict.get('a', [])
    
    # Match questions with answers
    for i in range(min(len(questions), len(answers))):
        question = questions[i]
        answer = answers[i]
        
        # Format the document
        document = f"Question: {question}\nAnswer: {answer}"
        documents.append(document)
        
        # Create metadata
        metadata = {
            "source": source,
            "question": question,
            "answer": answer
        }
        metadatas.append(metadata)
        
        # Create unique ID
        ids.append(str(doc_id))
        doc_id += 1

print(f"Processed {len(documents)} FAQ pairs")


Processed 188 FAQ pairs


## ChromaDB

In [9]:
# Initialize ChromaDB
embed_fn = GeminiEmbeddingFunction()
embed_fn.document_mode = True  # Set to document mode for indexing

chroma_client = chromadb.Client()
collection_name = "climate_faq_db"

# Delete collection if it exists (for clean restart)
try:
    chroma_client.delete_collection(collection_name)
except:
    pass

# Create new collection
db = chroma_client.create_collection(name=collection_name, embedding_function=embed_fn)

# Add documents to the database in smaller batches to avoid potential errors
BATCH_SIZE = 50
for i in range(0, len(documents), BATCH_SIZE):
    end_idx = min(i + BATCH_SIZE, len(documents))
    print(f"Adding documents {i} to {end_idx-1}...")
    
    batch_docs = documents[i:end_idx]
    batch_metadata = metadatas[i:end_idx]
    batch_ids = ids[i:end_idx]
    
    db.add(
        documents=batch_docs,
        metadatas=batch_metadata,
        ids=batch_ids
    )

print(f"Total documents in database: {db.count()}")


Adding documents 0 to 49...
Adding documents 50 to 99...
Adding documents 100 to 149...
Adding documents 150 to 187...
Total documents in database: 188


In [10]:
# Function to query the FAQ database
def query_faq_database(query, n_results=3):
    # Switch to query mode for searching
    embed_fn.document_mode = False
    
    # Search the database
    results = db.query(
        query_texts=[query], 
        n_results=min(n_results, db.count())
    )
    
    return results

# Function to determine confidence level
def calculate_confidence(distance):
    # Convert distance to confidence score (0-1)
    # Lower distance means higher confidence
    if distance is None:
        return 0.5  # Default confidence if distance is not available
    return max(0, min(1, 1 - distance / 2))

# Define thresholds for agent handoff
CONFIDENCE_THRESHOLD = 0.75  # Minimum confidence required
UNCERTAIN_THRESHOLD = 0.65   # Below this is uncertain, trigger potential handoff

# Function to generate response with Gemini model
def generate_response(prompt, model="gemini-2.0-flash"):
    try:
        response = client.models.generate_content(
            model=model,
            contents=prompt
        )
        return response.text
    except Exception as e:
        print(f"Error generating response: {e}")
        return f"I'm sorry, I encountered an error processing your request. Please try again."


In [11]:
# Function to handle customer query with structured JSON output
def handle_customer_query(query, include_json=True):
    # Get relevant FAQ matches
    results = query_faq_database(query)
    
    # Check if we have results
    if not results["documents"] or len(results["documents"][0]) == 0:
        # No relevant documents found
        return {
            "answer": "I don't have specific information to answer your question accurately. Let me connect you with a human agent who can help you better.",
            "confidence_score": 0.0,
            "needs_human_handoff": True,
            "reference_sources": [],
            "uncertain_response": True
        }
    
    documents = results["documents"][0]
    distances = results.get("distances", [[0.5] * len(documents)])[0]  # Default distance if not available
    metadatas = results["metadatas"][0]
    
    # Calculate confidence scores
    confidence_scores = [calculate_confidence(dist) for dist in distances]
    max_confidence = max(confidence_scores) if confidence_scores else 0
    
    # Determine if we need human handoff
    needs_human_handoff = max_confidence < CONFIDENCE_THRESHOLD
    uncertain_response = max_confidence < UNCERTAIN_THRESHOLD
    
    # Build context for the model
    context = ""
    for i, doc in enumerate(documents):
        context += f"Reference {i+1} (Confidence: {confidence_scores[i]:.2f}):\n{doc}\n\n"
    
    # Prepare prompt based on confidence level
    if max_confidence >= CONFIDENCE_THRESHOLD:
        prompt_template = f"""You are a helpful climate science assistant. Answer the question using the provided references.
If the references don't contain enough information, say you don't have enough information.

QUESTION: {query}

REFERENCES:
{context}

Generate your response in a friendly, conversational tone. Include relevant facts from the references.
"""
    else:
        prompt_template = f"""You are a helpful climate science assistant that works with human agents.
Based on the query and references below, you need to:
1. Try to provide a helpful preliminary response based on available information
2. Acknowledge that you're not entirely confident in your answer
3. Mention that you're connecting the customer to a human agent for better assistance

QUESTION: {query}

REFERENCES:
{context}

Generate your response in a friendly, conversational tone.
"""
    
    # Generate response using Gemini model
    if include_json:
        # Add structured output format instruction
        prompt_template += """
Return your response in a JSON format with the following structure:
{
  "answer": "Your answer to the question",
  "confidence_score": float between 0 and 1,
  "needs_human_handoff": boolean,
  "reference_sources": ["list of source URLs used"],
  "uncertain_response": boolean
}
"""
    
    response_text = generate_response(prompt_template)
    
    # If JSON output was requested, parse it
    if include_json:
        try:
            # Try to extract and parse JSON from the response
            json_str = response_text.strip()
            if '```json' in json_str:
                json_str = json_str.split('```json')[1].split('```')[0].strip()
            elif '```' in json_str:
                json_str = json_str.split('```')[1].split('```')[0].strip()
                
            result = json.loads(json_str)
            
            # Add sources if not present
            if "reference_sources" not in result or not result["reference_sources"]:
                result["reference_sources"] = [meta.get("source", "") for meta in metadatas]
                
            # Ensure all fields are present
            result.setdefault("confidence_score", max_confidence)
            result.setdefault("needs_human_handoff", needs_human_handoff)
            result.setdefault("uncertain_response", uncertain_response)
            
            return result
        except Exception as e:
            print(f"Error parsing JSON response: {e}")
            # Fallback to text response with added fields
            return {
                "answer": response_text,
                "confidence_score": max_confidence,
                "needs_human_handoff": needs_human_handoff,
                "reference_sources": [meta.get("source", "") for meta in metadatas],
                "uncertain_response": uncertain_response
            }
    else:
        # Return regular text response
        return response_text

## agent handoff

In [12]:
# Live agent handoff function
def transfer_to_human_agent(query, customer_info, ai_response):
    """
    Simulate transferring the conversation to a human agent
    In a real implementation, this would integrate with your support ticket system
    """
    ticket = {
        "query": query,
        "customer_info": customer_info,
        "ai_response": ai_response,
        "ticket_id": f"TICKET-{random.randint(10000, 99999)}",
        "status": "open",
        "priority": "medium",
        "assigned_to": "next_available_agent",
        "timestamp": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")
    }
    
    # In a real system, you would:
    # 1. Create a ticket in your support system
    # 2. Queue for next available agent
    # 3. Transfer the chat context
    
    return ticket

# Sample FAQ for few-shot examples
sample_faqs = [
    {
        "question": "How Do We Know the World Has Warmed?",
        "answer": "Evidence for a warming world comes from multiple independent climate indicators, from high up in the atmosphere to the depths of the oceans. They include changes in surface, atmospheric and oceanic temperatures, glaciers, snow cover, sea ice, sea level and atmospheric water vapor."
    },
    {
        "question": "Have There Been Any Changes in Climate Extremes?",
        "answer": "Since about 1950, changes in many extreme weather and climate events have been observed. Some of these changes have been linked to human influences, including a decrease in cold temperature extremes, an increase in warm temperature extremes, an increase in extreme high sea levels and an increase in the number of heavy precipitation events in various regions."
    }
]

## few-shot prompting

In [13]:
def get_few_shot_prompt(query):
    few_shot_template = """I'll demonstrate how to answer climate science questions with accurate information:

Example 1:
Question: {question1}
Answer: {answer1}

Example 2:
Question: {question2}
Answer: {answer2}

Now, please answer this question in a similar style:
Question: {user_query}
""".format(
        question1=sample_faqs[0]["question"],
        answer1=sample_faqs[0]["answer"],
        question2=sample_faqs[1]["question"],
        answer2=sample_faqs[1]["answer"],
        user_query=query
    )
    
    return few_shot_template

# Enhanced function for handling complex queries with few-shot prompting
def handle_complex_query(query):
    # First try standard RAG approach
    standard_result = handle_customer_query(query)
    
    # If confidence is low, try few-shot approach
    if standard_result["confidence_score"] < 0.6:
        few_shot_prompt = get_few_shot_prompt(query)
        
        # Get relevant FAQ matches
        results = query_faq_database(query)
        if results["documents"] and len(results["documents"][0]) > 0:
            documents = results["documents"][0]
            context = "\n\n".join(documents)
            
            # Create enhanced prompt with few-shot examples and context
            enhanced_prompt = f"{few_shot_prompt}\n\nRelevant information:\n{context}"
            
            # Generate improved response
            response_text = generate_response(enhanced_prompt)
            
            # Update result
            standard_result["answer"] = response_text
            standard_result["confidence_score"] += 0.1  # Slightly boost confidence
    
    return standard_result

## test the system

In [14]:
# Demo function to test the system
def run_customer_support_demo():
    print("🌍 Climate Science Customer Support AI Demo")
    print("-------------------------------------------")
    print("Type 'exit' to quit the demo")
    
    # Keep conversation history
    conversation_history = []
    
    while True:
        query = input("\n👤 Customer: ")
        if query.lower() in ['exit', 'quit']:
            print("Thank you for using our demo!")
            break
            
        # Add to conversation history
        conversation_history.append({"role": "user", "content": query})
        
        # Mock customer info (in production this would come from your system)
        customer_info = {
            "customer_id": "demo-user-123",
            "name": "Demo User",
            "contact": "demo@example.com"
        }
        
        # Process the query
        print("\n⏳ Processing query...")
        
        # Try complex query handling for better responses
        result = handle_complex_query(query)
        
        # Display response
        print(f"\n🤖 AI Assistant: {result['answer']}")
        
        # Add to conversation history
        conversation_history.append({"role": "assistant", "content": result['answer']})
        
        # Show metadata (for demo purposes)
        print(f"\n📊 Response metadata:")
        print(f"Confidence: {result['confidence_score']:.2f}")
        print(f"Sources: {', '.join(result['reference_sources'])}")
        
        # Handle agent transfer if needed
        if result['needs_human_handoff']:
            print("\n🔄 Transferring to human agent...")
            ticket = transfer_to_human_agent(query, customer_info, result)
            print(f"\n👨‍💼 Human Agent: Hello! I'm taking over from our AI assistant (Ticket #{ticket['ticket_id']})")
            print("How can I help you further with your question about climate science?")
            
            # Exit the loop after human handoff
            print("\nDemo ended after human handoff. Type 'run_customer_support_demo()' to start a new session.")
            break

# Demo function for testing structured JSON output
def demo_structured_output():
    query = "What does the IPCC say about sea level rise?"
    print(f"Query: {query}\n")
    
    # Get response with structured output
    result = handle_customer_query(query, include_json=True)
    
    # Print formatted JSON
    print("Structured JSON Output:")
    print(json.dumps(result, indent=2))
    
    # Show how this can be used programmatically
    if result["needs_human_handoff"]:
        print("\nSystem would automatically transfer to human agent")
    else:
        print("\nAI can handle this query confidently")

# Run the demo
if __name__ == "__main__":
    # Print info about the data
    print(f"\nTotal FAQs available: {len(df)}")
    print(f"Questions: {len(df[df['text_type'] == 'q'])}")
    print(f"Answers: {len(df[df['text_type'] == 'a'])}")
    print("\nType run_customer_support_demo() to start the interactive demo")
    print("Type demo_structured_output() to see JSON structured output example")


Total FAQs available: 376
Questions: 188
Answers: 188

Type run_customer_support_demo() to start the interactive demo
Type demo_structured_output() to see JSON structured output example


In [15]:
demo_structured_output()

Query: What does the IPCC say about sea level rise?

Structured JSON Output:
{
  "answer": "Based on the IPCC reports, sea levels are expected to continue rising due to melting ice and thermal expansion of the ocean as it warms. The rate of sea level rise has been increasing. Depending on future emissions, sea level rise could be between 29-59 cm or up to 1 meter by 2100 relative to 1986-2005 levels. There's also a risk of much larger increases from the melting of ice sheets in Greenland and West Antarctica, which could eventually lead to several meters of sea level rise.",
  "confidence_score": 0.7,
  "needs_human_handoff": true,
  "reference_sources": [
    "https://www.imperial.ac.uk/grantham/publications/climate-change-faqs/",
    "https://www.theguardian.com/environment/series/the-ultimate-climate-change-faq",
    "https://www.theguardian.com/environment/series/the-ultimate-climate-change-faq"
  ],
  "uncertain_response": true
}

System would automatically transfer to human agent


### Things to improve
Proper implementation of live agent hang off