In [1]:
import os

import google.generativeai as genai

# Read the key from the environment instead of embedding it in the notebook:
# a literal key in a cell is exposed to everyone who can read the file.
# NOTE(review): the key that used to be hardcoded here should be treated as
# leaked and revoked/rotated.
API_KEY = os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before starting the kernel."
    )

# Configure with your API key
genai.configure(api_key=API_KEY)

# Check available models
models = genai.list_models()
print("Available models:")
for model in models:
    print(f" - {model.name}: {model.display_name}")


  from .autonotebook import tqdm as notebook_tqdm


Available models:
 - models/chat-bison-001: PaLM 2 Chat (Legacy)
 - models/text-bison-001: PaLM 2 (Legacy)
 - models/embedding-gecko-001: Embedding Gecko
 - models/gemini-1.0-pro-vision-latest: Gemini 1.0 Pro Vision
 - models/gemini-pro-vision: Gemini 1.0 Pro Vision
 - models/gemini-1.5-pro-latest: Gemini 1.5 Pro Latest
 - models/gemini-1.5-pro-001: Gemini 1.5 Pro 001
 - models/gemini-1.5-pro-002: Gemini 1.5 Pro 002
 - models/gemini-1.5-pro: Gemini 1.5 Pro
 - models/gemini-1.5-flash-latest: Gemini 1.5 Flash Latest
 - models/gemini-1.5-flash-001: Gemini 1.5 Flash 001
 - models/gemini-1.5-flash-001-tuning: Gemini 1.5 Flash 001 Tuning
 - models/gemini-1.5-flash: Gemini 1.5 Flash
 - models/gemini-1.5-flash-002: Gemini 1.5 Flash 002
 - models/gemini-1.5-flash-8b: Gemini 1.5 Flash-8B
 - models/gemini-1.5-flash-8b-001: Gemini 1.5 Flash-8B 001
 - models/gemini-1.5-flash-8b-latest: Gemini 1.5 Flash-8B Latest
 - models/gemini-1.5-flash-8b-exp-0827: Gemini 1.5 Flash 8B Experimental 0827
 - models

In [8]:
import json
import random
import os
import google.generativeai as genai
from google.api_core import retry
import time

# Configuration: the values main() reads when the pipeline runs.
# Input dataset; load_jsonl() parses it as one JSON object per non-empty line.
JSONL_FILE_PATH = "math_problems.jsonl"  # Update with your actual JSONL file path
# Compared case-insensitively against each record's 'subject' field.
CONCEPT = "Number Theory"  # Target concept
# Compared with == against each record's 'level' field.
DIFFICULTY_LEVEL = 5       # Target difficulty level
# Destination passed to save_results(); written as a single JSON document.
OUTPUT_FILE = "modified_math_problems.json"

# Prompt template for Gemini.
# Rendered with PROMPT_TEMPLATE.format(problem=...): {problem} is the only
# placeholder, and the braces of the JSON example are doubled ({{ }}) so that
# str.format() emits them as literal { }.
# The five "problem_type" values in the example are the labels the model is
# asked to return, one object per category, in this order.
PROMPT_TEMPLATE = """
Generate 5 different versions of the following math problem according to these specific categories:

1. Large numbers: Change values to extremely large numbers (e.g., billions or scientific notation)
2. Impossible context: Create a mathematically impossible or inconsistent scenario 
3. Ambiguous: Remove key assumptions or make the problem underspecified
4. Paradox: Create a counter-intuitive or paradoxical version
5. Irrelevant info: Add extraneous details unrelated to the solution

Important: 
- Preserve the core mathematical concept
- Keep LaTeX formatting intact for mathematical expressions
- For each version, maintain the same subject and difficulty level

Original Problem:
{problem}

Output must be valid JSON (without code blocks or extra text) formatted exactly like this:
[
  {{
    "problem_type": "large_numbers",
    "modified_problem": "problem text with LaTeX formatting...",
    "explanation": "brief explanation of what was changed"
  }},
  {{
    "problem_type": "impossible_context",
    "modified_problem": "problem text with LaTeX formatting...",
    "explanation": "brief explanation of what was changed"
  }},
  {{
    "problem_type": "ambiguous",
    "modified_problem": "problem text with LaTeX formatting...",
    "explanation": "brief explanation of what was changed"
  }},
  {{
    "problem_type": "paradox",
    "modified_problem": "problem text with LaTeX formatting...",
    "explanation": "brief explanation of what was changed"
  }},
  {{
    "problem_type": "irrelevant_info",
    "modified_problem": "problem text with LaTeX formatting...",
    "explanation": "brief explanation of what was changed"
  }}
]
"""

def load_jsonl(file_path):
    """Read a JSON Lines file and return the parsed records as a list.

    Whitespace-only lines are ignored. A line that is not valid JSON is
    reported on stdout (error plus the first 100 characters of the line)
    and skipped; it does not stop the load.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # blank line: nothing to parse
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {e}")
                print(f"Problematic line: {line[:100]}...")
            else:
                data.append(record)
    return data

def filter_problems(data, subject, level):
    """Return the records whose subject and level match the request.

    Args:
        data: Iterable of parsed JSONL records.
        subject: Subject name; compared case-insensitively with each
            record's 'subject' field (a missing field counts as '').
        level: Difficulty level; compared with == against the record's
            'level' field (a missing field counts as 0).

    Returns:
        A list of the matching records, in their original order.

    Records that are not dicts, or whose 'subject' is not a string (for
    example JSON null), are skipped instead of raising AttributeError, so
    one malformed line cannot abort the whole run.
    """
    matches = []
    for item in data:
        if not isinstance(item, dict):
            continue  # e.g. a JSONL line holding a list or a bare number
        item_subject = item.get('subject', '')
        if not isinstance(item_subject, str):
            continue  # null / numeric subject cannot match a subject name
        if item_subject.lower() == subject.lower() and item.get('level', 0) == level:
            matches.append(item)
    return matches

def generate_modified_versions(problem_data):
    """Ask Gemini for five rewritten variants of one problem record.

    The record's 'problem' text is substituted into PROMPT_TEMPLATE and sent
    to the 'gemini-1.5-flash' model. The reply is accepted only when the text
    between its first '[' and last ']' parses as a JSON list of exactly five
    items; each item is then updated with the original problem, answer,
    subject, level and unique_id. Up to three attempts are made, with a
    two-second pause after an attempt that raised.

    Returns:
        The list of five annotated versions, or [] when every attempt failed.
    """
    genai.configure(api_key=API_KEY)
    model = genai.GenerativeModel('gemini-1.5-flash')

    prompt = PROMPT_TEMPLATE.format(problem=problem_data.get('problem', ''))
    max_attempts = 3

    try:
        for attempts in range(max_attempts):
            try:
                response = model.generate_content(prompt)

                # The reply text lives in different places depending on the
                # shape of the response object.
                if hasattr(response, 'text'):
                    response_text = response.text
                elif hasattr(response, 'parts') and response.parts:
                    response_text = response.parts[0].text
                else:
                    response_text = str(response)
                response_text = response_text.strip()

                # Keep only the outermost [...] span so surrounding prose or
                # code fences do not break parsing.
                start_idx = response_text.find('[')
                end_idx = response_text.rfind(']') + 1
                if start_idx < 0 or end_idx <= start_idx:
                    print("No valid JSON array found in response")
                    continue

                modified_versions = json.loads(response_text[start_idx:end_idx])

                if not (isinstance(modified_versions, list) and len(modified_versions) == 5):
                    print(f"Expected 5 versions, got {len(modified_versions) if isinstance(modified_versions, list) else 'not a list'}")
                    continue

                # Attach provenance from the source record to every version.
                for version in modified_versions:
                    version.update({
                        "original_problem": problem_data.get("problem", ""),
                        "original_answer": problem_data.get("answer", ""),
                        "subject": problem_data.get("subject", ""),
                        "level": problem_data.get("level", 0),
                        "unique_id": problem_data.get("unique_id", "")
                    })
                return modified_versions

            except Exception as e:
                print(f"Attempt {attempts+1} failed: {str(e)}")
                time.sleep(2)  # Wait before retrying

        print(f"Failed to generate versions after {max_attempts} attempts")
        return []

    except Exception as e:
        print(f"Error generating versions: {str(e)}")
        return []

def save_results(modified_problems, output_file):
    """Write the collected versions to ``output_file`` as one JSON document.

    The file is UTF-8, indented by two spaces, and keeps non-ASCII characters
    as-is rather than escaping them. A one-line summary is printed afterwards.
    """
    with open(output_file, mode='w', encoding='utf-8') as out_handle:
        json.dump(modified_problems, out_handle, ensure_ascii=False, indent=2)

    print(f"Saved {len(modified_problems)} modified problems to {output_file}")

def main():
    """Run the pipeline: load, filter, generate variants, save.

    Reads JSONL_FILE_PATH, keeps the problems matching CONCEPT and
    DIFFICULTY_LEVEL, requests modified versions for each one, and writes
    everything that was generated to OUTPUT_FILE. Progress is printed as it
    goes; the function returns early (with a message) when the input file is
    missing or no problem matches.
    """
    # Check if file exists
    if not os.path.exists(JSONL_FILE_PATH):
        print(f"Error: File not found at {JSONL_FILE_PATH}")
        return

    # Load and filter data
    print(f"Loading JSONL file from {JSONL_FILE_PATH}...")
    all_problems = load_jsonl(JSONL_FILE_PATH)
    print(f"Loaded {len(all_problems)} problems")

    filtered_problems = filter_problems(all_problems, CONCEPT, DIFFICULTY_LEVEL)
    print(f"Found {len(filtered_problems)} problems in {CONCEPT} at level {DIFFICULTY_LEVEL}")
    if not filtered_problems:
        print("No matching problems found. Try different criteria.")
        return

    # Process each problem, accumulating every generated version
    all_modified_problems = []
    for i, problem in enumerate(filtered_problems):
        print(f"\nProcessing problem {i+1}/{len(filtered_problems)}:")
        print(f"ID: {problem.get('unique_id', 'Unknown')}")

        # Show at most 100 characters, marking truncation with an ellipsis
        text = problem.get('problem', '')
        excerpt = text if len(text) <= 100 else text[:100] + "..."
        print("Problem excerpt:", excerpt)

        modified_versions = generate_modified_versions(problem)
        if not modified_versions:
            print("Failed to generate modified versions for this problem")
            continue

        all_modified_problems += modified_versions
        print(f"Successfully generated {len(modified_versions)} versions")

    # Save results
    if not all_modified_problems:
        print("No modified problems were generated")
        return
    save_results(all_modified_problems, OUTPUT_FILE)


In [6]:
import json
import random
import os
import time
import sys

# Try importing google.generativeai, with a helpful error message if it fails
try:
    import google.generativeai as genai
except ImportError:
    print("Error: The 'google-generativeai' package is not installed.")
    print("Please install it using: pip install google-generativeai")
    sys.exit(1)

# Configuration: the values main() reads when the pipeline runs.
JSONL_FILE_PATH = "math_problems.jsonl"
CONCEPT = "Number Theory"
DIFFICULTY_LEVEL = 5
OUTPUT_FILE = "modified_math_problems.json"

# Prompt template (simplified for reliability).
# Rendered with PROMPT_TEMPLATE.format(problem=...); the braces of the JSON
# example are doubled ({{ }}) so str.format() emits them literally.
# The example spells out all five objects: with only the first one shown the
# model had to invent the remaining "problem_type" labels, so downstream
# consumers could not rely on them.
PROMPT_TEMPLATE = """
Create 5 modified versions of this math problem:

Original Problem:
{problem}

Create these versions:
1. Version with extremely large numbers
2. Version with mathematically impossible scenario
3. Version missing key assumptions
4. Version with counter-intuitive elements
5. Version with irrelevant information

Respond with only a JSON array (no code fences, no extra text) containing exactly these 5 objects, in this order:
[
  {{
    "problem_type": "large_numbers",
    "modified_problem": "problem text...",
    "explanation": "explanation..."
  }},
  {{
    "problem_type": "impossible_context",
    "modified_problem": "problem text...",
    "explanation": "explanation..."
  }},
  {{
    "problem_type": "ambiguous",
    "modified_problem": "problem text...",
    "explanation": "explanation..."
  }},
  {{
    "problem_type": "paradox",
    "modified_problem": "problem text...",
    "explanation": "explanation..."
  }},
  {{
    "problem_type": "irrelevant_info",
    "modified_problem": "problem text...",
    "explanation": "explanation..."
  }}
]
"""

def load_jsonl(file_path):
    """Parse a JSON Lines file into a list of records.

    Whitespace-only lines are skipped. Lines that fail to parse are reported
    on stdout and left out of the result.
    """
    def parsed_records(lines):
        # Lazily yield one decoded object per usable line.
        for line in lines:
            if not line.strip():
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error parsing JSONL: {e}")

    with open(file_path, 'r', encoding='utf-8') as f:
        return list(parsed_records(f))

def filter_problems(data, subject, level):
    """Select the records that match a subject and difficulty level.

    Args:
        data: Iterable of parsed JSONL records.
        subject: Subject name, matched case-insensitively against the
            record's 'subject' field (absent field is treated as '').
        level: Level matched with == against the record's 'level' field
            (absent field is treated as 0).

    Returns:
        List of matching records in input order.

    Entries that are not dicts, or whose 'subject' is not a string (such as
    JSON null), never match; previously they raised AttributeError and
    stopped the run.
    """
    def matches(item):
        if not isinstance(item, dict):
            return False
        item_subject = item.get('subject', '')
        if not isinstance(item_subject, str):
            return False
        return item_subject.lower() == subject.lower() and item.get('level', 0) == level

    return [item for item in data if matches(item)]

def get_available_model(preferred=("gemini-1.5-flash",)):
    """Choose a model that supports the ``generateContent`` method.

    Every model returned by ``genai.list_models()`` is printed. Among those
    whose ``supported_generation_methods`` contains ``'generateContent'``,
    the first name in ``preferred`` that is available wins (matched with or
    without the ``models/`` prefix); if none of the preferred names is
    available, the first qualifying model in listing order is used.

    Picking blindly by listing order is fragile because the order is decided
    by the API, so the first qualifying entry is not necessarily a model
    suited to this task.

    Args:
        preferred: A model name or a sequence of names to try first, in
            priority order. Pass ``()`` to always take the first
            qualifying model.

    Returns:
        The selected model name, or None when no model qualifies or the
        listing call fails (the error is printed).
    """
    if preferred is None:
        preferred = ()
    elif isinstance(preferred, str):
        preferred = (preferred,)

    try:
        # Configure API
        genai.configure(api_key=API_KEY)

        print("Available models:")
        available_models = []
        for model in genai.list_models():
            print(f" - {model.name}")
            # A missing or None method list simply means "does not qualify".
            methods = getattr(model, 'supported_generation_methods', None) or ()
            if 'generateContent' in methods:
                available_models.append(model.name)

        if not available_models:
            print("No models available that support text generation!")
            return None

        # Fall back to the first qualifying model unless a preferred one exists.
        model_name = available_models[0]
        for wanted in preferred:
            accepted = (wanted, f"models/{wanted}")
            match = next((name for name in available_models if name in accepted), None)
            if match is not None:
                model_name = match
                break

        print(f"Selected model: {model_name}")
        return model_name

    except Exception as e:
        print(f"Error getting available models: {str(e)}")
        return None

# Model name chosen by get_available_model(), remembered across calls so the
# model listing is fetched (and printed) once instead of once per problem.
_MODEL_CACHE = {"name": None}

# Seconds to wait between failed attempts.
_RETRY_DELAY_SECONDS = 2


def _extract_json_array_text(response_text):
    """Return the slice of ``response_text`` expected to hold the JSON array, or None."""
    text = response_text.strip()
    start_idx = text.find('[')
    end_idx = text.rfind(']') + 1
    if start_idx < 0 or end_idx <= start_idx:
        return None

    json_part = text[start_idx:end_idx]
    # A code fence inside the slice means the brackets spanned several fenced
    # sections; use the first section that contains a bracketed span.
    if "```" in json_part:
        for part in json_part.split("```"):
            candidate = part.strip()
            if candidate and '[' in candidate and ']' in candidate:
                json_part = candidate[candidate.find('['):candidate.rfind(']') + 1]
                break
    return json_part


def generate_modified_versions(problem_data):
    """Generate modified versions of one problem with an available Gemini model.

    The model name comes from get_available_model() on the first successful
    call and is reused afterwards. The record's 'problem' text is substituted
    into PROMPT_TEMPLATE; the reply is accepted when it contains a non-empty
    JSON list of objects, each of which is then updated with the original
    problem, answer, subject, level and unique_id. Up to three attempts are
    made, pausing between attempts but not after the last one.

    Args:
        problem_data: Record providing 'problem' and, optionally, 'answer',
            'subject', 'level' and 'unique_id'.

    Returns:
        The list of annotated versions, or [] when no model is available or
        every attempt failed. Errors are printed, never raised.
    """
    model_name = _MODEL_CACHE["name"]
    if not model_name:
        model_name = get_available_model()
        if not model_name:
            print("No suitable model found. Cannot proceed.")
            return []
        _MODEL_CACHE["name"] = model_name

    # Configure Gemini
    genai.configure(api_key=API_KEY)
    model = genai.GenerativeModel(model_name)

    prompt = PROMPT_TEMPLATE.format(problem=problem_data.get('problem', ''))

    # Simple generation config for better compatibility.
    # NOTE(review): 1024 output tokens may be too few for five LaTeX-heavy
    # versions plus explanations; a truncated reply fails JSON parsing. Confirm.
    generation_config = {
        "temperature": 0.7,
        "max_output_tokens": 1024,
    }

    # Provenance copied onto every generated version.
    metadata = {
        "original_problem": problem_data.get("problem", ""),
        "original_answer": problem_data.get("answer", ""),
        "subject": problem_data.get("subject", ""),
        "level": problem_data.get("level", 0),
        "unique_id": problem_data.get("unique_id", "")
    }

    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            print(f"Attempt {attempt+1}: Generating with model {model_name}")
            response = model.generate_content(prompt, generation_config=generation_config)

            # Extract text based on response structure
            if hasattr(response, 'text'):
                response_text = response.text
            elif hasattr(response, 'parts') and response.parts:
                response_text = ''.join(part.text for part in response.parts)
            else:
                response_text = str(response)

            # Debug response
            print(f"Response type: {type(response)}")
            print(f"Response preview: {response_text[:100]}...")

            json_data = None
            json_part = _extract_json_array_text(response_text)
            if json_part is not None:
                try:
                    json_data = json.loads(json_part)
                except json.JSONDecodeError:
                    print(f"Error parsing JSON: {json_part[:100]}...")

            # Accept only a non-empty list of objects; anything else (for
            # example a list of numbers) cannot carry the metadata.
            if (isinstance(json_data, list) and json_data
                    and all(isinstance(version, dict) for version in json_data)):
                if len(json_data) != 5:
                    print(f"Note: expected 5 versions, got {len(json_data)}")
                for version in json_data:
                    version.update(metadata)
                return json_data

            print("Failed to extract valid JSON from response")

        except Exception as e:
            print(f"Attempt {attempt+1} failed: {str(e)}")

        # Pause only when another attempt follows.
        if attempt + 1 < max_attempts:
            time.sleep(_RETRY_DELAY_SECONDS)

    print("All attempts to generate versions failed")
    return []

def save_results(modified_problems, output_file):
    """Dump ``modified_problems`` to ``output_file`` as pretty-printed JSON.

    Output is UTF-8 with a two-space indent and non-ASCII characters kept
    unescaped; a confirmation line is printed once the file is written.
    """
    dump_options = {"indent": 2, "ensure_ascii": False}
    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(modified_problems, sink, **dump_options)
    print(f"Saved {len(modified_problems)} modified problems to {output_file}")

def main():
    """Drive the whole run: load problems, filter, generate variants, save.

    Uses JSONL_FILE_PATH as input, CONCEPT and DIFFICULTY_LEVEL as the
    filter, and OUTPUT_FILE as destination. Prints progress throughout and
    stops early with a message if the input file does not exist or nothing
    matches the filter.
    """
    # Check if file exists
    if not os.path.exists(JSONL_FILE_PATH):
        print(f"Error: File not found at {JSONL_FILE_PATH}")
        return

    # Load and filter data
    print(f"Loading JSONL file from {JSONL_FILE_PATH}...")
    all_problems = load_jsonl(JSONL_FILE_PATH)
    print(f"Loaded {len(all_problems)} problems")

    filtered_problems = filter_problems(all_problems, CONCEPT, DIFFICULTY_LEVEL)
    print(f"Found {len(filtered_problems)} problems in {CONCEPT} at level {DIFFICULTY_LEVEL}")
    if not filtered_problems:
        print("No matching problems found. Try different criteria.")
        return

    # Process problems, collecting every version that was generated
    all_modified_problems = []
    for i, problem in enumerate(filtered_problems):
        print(f"\nProcessing problem {i+1}/{len(filtered_problems)}:")
        print(f"ID: {problem.get('unique_id', 'Unknown')}")
        excerpt = problem.get('problem', '')[:100] + "..."
        print("Problem excerpt:", excerpt)

        modified_versions = generate_modified_versions(problem)
        if not modified_versions:
            print("Failed to generate modified versions for this problem")
            continue

        all_modified_problems += modified_versions
        print(f"Successfully generated {len(modified_versions)} versions")

    # Save results
    if not all_modified_problems:
        print("No modified problems were generated")
        return
    save_results(all_modified_problems, OUTPUT_FILE)

# Run the pipeline when this cell is executed. Inside a Jupyter/IPython kernel
# __name__ is "__main__", so the guard is true and main() runs immediately.
if __name__ == "__main__":
    main()

Loading JSONL file from math_problems.jsonl...
Loaded 500 problems
Found 12 problems in Number Theory at level 5

Processing problem 1/12:
ID: test/number_theory/737.json
Problem excerpt: The proper divisors of 12 are 1, 2, 3, 4 and 6. A proper divisor of an integer $N$ is a positive div...
Available models:
 - models/chat-bison-001
 - models/text-bison-001
 - models/embedding-gecko-001
 - models/gemini-1.0-pro-vision-latest
 - models/gemini-pro-vision
 - models/gemini-1.5-pro-latest
 - models/gemini-1.5-pro-001
 - models/gemini-1.5-pro-002
 - models/gemini-1.5-pro
 - models/gemini-1.5-flash-latest
 - models/gemini-1.5-flash-001
 - models/gemini-1.5-flash-001-tuning
 - models/gemini-1.5-flash
 - models/gemini-1.5-flash-002
 - models/gemini-1.5-flash-8b
 - models/gemini-1.5-flash-8b-001
 - models/gemini-1.5-flash-8b-latest
 - models/gemini-1.5-flash-8b-exp-0827
 - models/gemini-1.5-flash-8b-exp-0924
 - models/gemini-2.5-pro-exp-03-25
 - models/gemini-2.5-pro-preview-03-25
 - models/gemi

In [9]:
# Run the pipeline.
# NOTE(review): more than one cell in this notebook defines main(); this call
# uses whichever definition was executed most recently in the kernel, so confirm
# the intended cell was run last.
main()

Loading JSONL file from math_problems.jsonl...
Loaded 500 problems
Found 12 problems in Number Theory at level 5

Processing problem 1/12:
ID: test/number_theory/737.json
Problem excerpt: The proper divisors of 12 are 1, 2, 3, 4 and 6. A proper divisor of an integer $N$ is a positive div...
Successfully generated 5 versions

Processing problem 2/12:
ID: test/number_theory/1055.json
Problem excerpt: You have seven bags of gold coins. Each bag has the same number of gold coins. One day, you find a b...
Successfully generated 5 versions

Processing problem 3/12:
ID: test/number_theory/838.json
Problem excerpt: If $x^3$ is a positive factor of $10!,$ how many possible integer values of $x$ are there?  (Reminde...
Successfully generated 5 versions

Processing problem 4/12:
ID: test/number_theory/631.json
Problem excerpt: One gear turns $33\frac{1}{3}$ times in a minute.  Another gear turns   45 times in a minute.  Initi...
Successfully generated 5 versions

Processing problem 5/12:
ID: test