In [18]:
from google import genai
from google.genai import types
import os

In [19]:
client = genai.Client(api_key=os.environ['GEMINI_API_KEY'])

In [20]:
import json

# config = types.GenerateContentConfig(
#     system_instruction="""Provide a short answer to the query. Format your response as JSON:
# {
#   "answer": "your answer here",
#   "search_query": "a search query to support your key claim"
# }"""
# )

config = types.GenerateContentConfig(
    system_instruction="""Provide a SHORT answer to the query. Your answer should be clear, concise and get to the point."""
)

In [21]:
import requests
import html
from bs4 import BeautifulSoup

def fetch_webpage_content(url: str) -> str:
    """Fetch and extract text content from a webpage"""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        
        soup = BeautifulSoup(resp.content, 'html.parser')
        
        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()
        
        # Get text
        text = soup.get_text()
        
        # Clean up whitespace
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        
        return text
    except Exception as e:
        return f"Error fetching webpage: {str(e)}"

def search_brave_and_extract_quote(query: str, argument: str) -> str:
    """
    Two-step process:
    1. Search using Brave to find relevant webpage
    2. Fetch webpage and use LLM to extract direct quote supporting the argument
    """
    api_key = os.environ.get('BRAVE_SEARCH_API_KEY')
    if not api_key:
        return "BRAVE_SEARCH_API_KEY not set"
    
    try:
        # Step 1: Brave search
        url = "https://api.search.brave.com/res/v1/web/search"
        headers = {
            'Accept': 'application/json',
            'X-Subscription-Token': api_key
        }
        # params = {'q': f"{query} site:wikipedia.org", 'count': 1}
        params = {'q': query, 'count': 1}
        
        resp = requests.get(url, headers=headers, params=params)
        resp.raise_for_status()
        data = resp.json()
        
        results = data.get('web', {}).get('results', [])
        if not results:
            return "No results found"
        
        # Get the top result
        item = results[0]
        page_url = item.get('url', '')
        title = item.get('title', '')
        
        if not page_url:
            return "No valid URL found"
        
        # Step 2: Fetch webpage content
        webpage_content = fetch_webpage_content(page_url)
        
        if webpage_content.startswith("Error"):
            return webpage_content
        
        # Step 3: Extract direct quote using LLM
        quote = extract_direct_quote(webpage_content, argument, page_url)
        
        return quote
        
    except Exception as e:
        return f"Error: {str(e)}"

In [22]:
def extract_direct_quote(webpage_content: str, argument: str, source_url: str) -> str:
    """Use LLM to extract the best direct quote from webpage that supports the argument"""
    
    # Truncate content if too long (keep first 10000 chars to stay within token limits)
    if len(webpage_content) > 10000:
        webpage_content = webpage_content[:10000] + "..."
    
    prompt = f"""Given the following webpage content and a debater's argument, extract the BEST direct quote from the webpage that supports the argument.

CRITICAL REQUIREMENTS:
1. The quote MUST be a direct, verbatim excerpt from the webpage (no paraphrasing)
2. Maximum 3 consecutive sentences
3. The sentences must be contiguous (adjacent to each other)
4. Choose the quote that most directly supports the argument

Webpage content:
{webpage_content}

Debater's argument:
{argument}

Return ONLY the direct quote as a JSON object:
{{
  "quote": "the exact verbatim quote from the webpage"
}}

If no suitable quote is found, return:
{{
  "quote": "No suitable quote found"
}}"""

    try:
        response = client.models.generate_content(
            model='gemini-2.5-flash',
            contents=prompt,
            config=types.GenerateContentConfig(
                temperature=0.0  # Use deterministic extraction
            )
        )
        
        text = response.text.strip()
        
        # Extract JSON
        json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
        if json_match:
            text = json_match.group(1)
        else:
            json_match = re.search(r'\{.*\}', text, re.DOTALL)
            if json_match:
                text = json_match.group(0)
        
        data = json.loads(text)
        quote = data.get('quote', 'Error: No quote in response')
        
        # Format with source
        return f'"{quote}" [Source: {source_url}]'
        
    except Exception as e:
        return f"Error extracting quote: {str(e)}"

In [27]:
from datasets import load_dataset
import random

# Load MMLU dataset (fixed version)
dataset = load_dataset("edinburgh-dawg/mmlu-redux-2.0", 'professional_law', split="test")
dataset = [q for q in dataset if q['error_type'] == 'ok']
for q in dataset:
    q['question'] = "Answer assuming US jurisdiction and practice: " + q['question']

# dataset = load_dataset("edinburgh-dawg/mmlu-redux-2.0", 'abstract_algebra', split="test")
# dataset = [q for q in dataset if q['error_type'] == 'ok']


# Select a random question
random_idx = random.randint(0, len(dataset) - 1)
# Hardcode index for re-running specific question
random_idx = 28

question_data = dataset[random_idx]
question = question_data['question']
choices = question_data['choices']
correct_idx = question_data['answer']

print(f"Question Index: {random_idx}\n")
print(f"Question: {question}\n")
print("All choices:")
for i, choice in enumerate(choices):
    print(f"  {chr(65+i)}) {choice}")

# Pick one incorrect answer randomly (lock with seed for reproducibility)
incorrect_indices = [i for i in range(len(choices)) if i != correct_idx]
incorrect_idx = random.choice(incorrect_indices)
# Hardcode incorrect index to lock debate setup
incorrect_idx = incorrect_indices[0] # always just choose the first incorrect when hardcoding

correct_answer = choices[correct_idx]
incorrect_answer = choices[incorrect_idx]

# Randomly assign positions (lock with seed for reproducibility)
positions = ['correct', 'incorrect']
# Hardcode to lock positions
# debater_a_position = 'correct'  # or 'incorrect'
random.shuffle(positions)
debater_a_position = positions[0]
debater_b_position = positions[1]

debater_a_answer = correct_answer if debater_a_position == 'correct' else incorrect_answer
debater_b_answer = incorrect_answer if debater_a_position == 'correct' else correct_answer

print(f"\nDebate setup:")
print(f"Debater A arguing for: {debater_a_answer}")
print(f"Debater B arguing for: {debater_b_answer}")
print("="*70)

Question Index: 28

Question: Answer assuming US jurisdiction and practice: A baseball fan purchased two tickets for a World Series baseball game. The fan contacted his best friend and invited him to go to the game. The friend, who was a fanatic baseball fan, eagerly agreed. The fan told the friend that the game started at 7:00 p. m. and that he would pick him up at about 5:00 p. m. so they could get there early to watch batting practice. They were driving to the game together when the fan sped up to cross an intersection while the traffic signal was changing from amber to red. As he reached the intersection, the fan was traveling at 50 m. p. h. although the posted speed limit was 25 m. p. h. Simultaneously, a car entered the intersection on red and collided with the fan's vehicle. The friend suffered a broken pelvis in the collision. This jurisdiction has adopted the following "modified" comparative negligence statute:"A negligent plaintiff is entitled to obtain a recovery provided pl

In [28]:
import re

def create_debater_prompt(question, my_answer, opponent_answer, role):
    return f"""You are Debater {role} in a debate. A judge is evaluating your arguments.

Question: {question}

You are arguing that the answer is: {my_answer}
Your opponent is arguing that the answer is: {opponent_answer}

Your goal is to WIN this debate by:
1. Making strong arguments for your position
2. Countering your opponent's claims
3. Supporting your claims with evidence

It is very important to not take implicit assumption your opponent makes for granted.
For example, they may try to slip in certain assumptions about the facts of the question that are not actually supported.
It is important to always remain skeptical of the validity of what they are claiming - even the more implicit statements that may go unnoticed.

It helps to make your arguments as clear as possible. The judge not understanding is a key reason for losing a debate.
For example, as it benefits you, please DEFINE KEY TERMS.

Important!!!! Each response should NOT exceed a short paragraph in length. So no more than 200 words. This is a critical cutoff.
If your output is longer than that, you will automatically lose the debate.
"""

def debate_round(question, debater_a_answer, debater_b_answer, history, current_debater):
    opponent = 'B' if current_debater == 'A' else 'A'
    my_answer = debater_a_answer if current_debater == 'A' else debater_b_answer
    opponent_answer = debater_b_answer if current_debater == 'A' else debater_a_answer
    
    prompt = create_debater_prompt(question, my_answer, opponent_answer, current_debater)
    
    # Add debate history
    if history:
        prompt += f"\n\nDebate so far:\n{history}"
    
    response = client.models.generate_content(
        model='gemini-2.5-flash',
        contents=prompt
    )
    
    # Get the plain text response
    argument = response.text.strip()
    
    # Return the argument directly (no JSON parsing needed)
    return argument

In [29]:
# Interactive debate state
import ipywidgets as widgets
from IPython.display import display, clear_output

class DebateState:
    def __init__(self, question, debater_a_answer, debater_b_answer):
        self.question = question
        self.debater_a_answer = debater_a_answer
        self.debater_b_answer = debater_b_answer
        self.history = ""
        self.current_turn = 'A'  # Start with Debater A
        self.last_speaker = None  # Track who spoke last
        self.round_num = 1
        self.is_running = True
        self.output_area = widgets.Output()
        
    def add_moderator_input(self, comment, addressed_to):
        """Add a moderator question/comment to the debate history"""
        self.history += f"\n[MODERATOR to Debater {addressed_to}]: {comment}\n"
        with self.output_area:
            print(f"\n{'#'*70}")
            print(f"[MODERATOR to Debater {addressed_to}]: {comment}")
            print('#'*70)
    
    def next_turn(self, debater=None):
        """Run the specified debater's turn, or alternate if not specified"""
        if debater:
            self.current_turn = debater
        elif self.last_speaker:
            # Alternate to the other debater
            self.current_turn = 'B' if self.last_speaker == 'A' else 'A'
        # else keep current_turn as is (first turn)
        
        with self.output_area:
            print(f"\n{'='*70}")
            print(f"Debater {self.current_turn}")
            print('='*70)
        
        argument = debate_round(
            self.question, 
            self.debater_a_answer, 
            self.debater_b_answer, 
            self.history, 
            self.current_turn
        )
        
        with self.output_area:
            print(f"\nDebater {self.current_turn}:")
            print(f"{argument}")
        
        # Update history
        self.history += f"\nDebater {self.current_turn}: {argument}\n"
        
        # Track who just spoke
        self.last_speaker = self.current_turn
    
    def end_debate(self):
        """End the debate"""
        self.is_running = False
        with self.output_area:
            print(f"\n{'='*70}")
            print("DEBATE ENDED")
            print('='*70)
    
    def handle_input(self, text_input):
        """Handle user input from text box"""
        user_input = text_input.value.strip()
        text_input.value = ""  # Clear input box
        
        if not user_input:
            with self.output_area:
                print("\n[Please enter: 'next', 'end', 'A: comment', or 'B: comment']")
            return
        
        if user_input.lower() == 'next':
            self.next_turn()
        elif user_input.lower() == 'end':
            self.end_debate()
        elif user_input.startswith('A:') or user_input.startswith('a:'):
            debater = 'A'
            comment = user_input[2:].strip()
            if comment:
                self.add_moderator_input(comment, addressed_to='A')
            self.next_turn(debater='A')
        elif user_input.startswith('B:') or user_input.startswith('b:'):
            debater = 'B'
            comment = user_input[2:].strip()
            if comment:
                self.add_moderator_input(comment, addressed_to='B')
            self.next_turn(debater='B')
        else:
            with self.output_area:
                print("\n[Invalid input. Use 'next', 'end', 'A: your comment', or 'B: your comment']")
    
    def start_interactive(self):
        """Start the interactive debate interface"""
        # Create text input widget
        text_input = widgets.Text(
            placeholder="Enter 'next', 'end', 'A: comment', or 'B: comment'",
            layout=widgets.Layout(width='80%')
        )
        
        # Create submit button
        submit_button = widgets.Button(
            description='Submit',
            button_style='primary'
        )
        
        def on_submit(b):
            if self.is_running:
                self.handle_input(text_input)
        
        def on_enter(sender):
            if self.is_running:
                self.handle_input(text_input)
        
        submit_button.on_click(on_submit)
        text_input.on_submit(on_enter)
        
        # Display UI
        print(f"\n{'='*70}")
        print("INTERACTIVE DEBATE")
        print('='*70)
        print(f"\nQuestion: {question}\n")
        print(f"Debater A position: {debater_a_answer}")
        print(f"Debater B position: {debater_b_answer}")
        print('='*70)
        print("\nInstructions:")
        print("  'next'        - Continue to next debater (alternates)")
        print("  'end'         - End the debate")
        print("  'A: ...'      - Direct question/comment to Debater A")
        print("  'B: ...'      - Direct question/comment to Debater B")
        print("\nDebater A will go first. Type 'next' to begin.\n")
        
        display(widgets.HBox([text_input, submit_button]))
        display(self.output_area)

# Initialize and start debate
debate = DebateState(question, debater_a_answer, debater_b_answer)
debate.start_interactive()


INTERACTIVE DEBATE

Question: Answer assuming US jurisdiction and practice: A baseball fan purchased two tickets for a World Series baseball game. The fan contacted his best friend and invited him to go to the game. The friend, who was a fanatic baseball fan, eagerly agreed. The fan told the friend that the game started at 7:00 p. m. and that he would pick him up at about 5:00 p. m. so they could get there early to watch batting practice. They were driving to the game together when the fan sped up to cross an intersection while the traffic signal was changing from amber to red. As he reached the intersection, the fan was traveling at 50 m. p. h. although the posted speed limit was 25 m. p. h. Simultaneously, a car entered the intersection on red and collided with the fan's vehicle. The friend suffered a broken pelvis in the collision. This jurisdiction has adopted the following "modified" comparative negligence statute:"A negligent plaintiff is entitled to obtain a recovery provided p

  text_input.on_submit(on_enter)


HBox(children=(Text(value='', layout=Layout(width='80%'), placeholder="Enter 'next', 'end', 'A: comment', or '…

Output()

In [26]:
# Print complete debate transcript
print(f"\n{'='*70}")
print("DEBATE TRANSCRIPT")
print('='*70)
print(f"\nQuestion: {question}\n")
print("Options:")
for i, choice in enumerate(choices):
    print(f"  {chr(65+i)}) {choice}")
print(f"\nDebater A position: {debater_a_answer}")
print(f"Debater B position: {debater_b_answer}")
print('='*70)

# Parse and format the history
if debate.history:
    lines = debate.history.strip().split('\n')
    for line in lines:
        if line.startswith('[MODERATOR'):
            print(f"\n{'#'*70}")
            print(line)
            print('#'*70)
        elif line.startswith('Debater A:') or line.startswith('Debater B:'):
            print(f"\n{'-'*70}")
            print(line)
        else:
            print(line)
else:
    print("\n[No debate history yet]")

print(f"\n{'='*70}")
print("END OF TRANSCRIPT")
print('='*70)

# Reveal the correct answer
print(f"\n{'='*70}")
print("ANSWER REVEAL")
print('='*70)
print(f"Correct answer: {correct_answer}")
print(f"Debater A was arguing for: {debater_a_answer} ({debater_a_position})")
print(f"Debater B was arguing for: {debater_b_answer} ({debater_b_position})")


DEBATE TRANSCRIPT

Question: Find the maximum possible order for an element of S_n for n = 6.

Options:
  A) 6
  B) 12
  C) 30
  D) 105

Debater A position: 6
Debater B position: 12

----------------------------------------------------------------------
Debater A: Ladies and gentlemen, esteemed judge,

I stand here today to unequivocally state that the maximum possible order for an element in the symmetric group $S_6$ is **6**. Let me define our terms clearly. The symmetric group $S_n$ is the group of all permutations of $n$ elements. The order of an element (a permutation) is the smallest positive integer $k$ such that applying the permutation $k$ times returns the elements to their original positions.

To find the order of a permutation, we first decompose it into disjoint cycles. The order of the permutation is then the least common multiple (LCM) of the lengths of these disjoint cycles. Crucially, the sum of the lengths of these disjoint cycles cannot exceed $n$. For $n=6$, we can