# Gemini Pydantic


In [3]:
import os
from dotenv import load_dotenv
import google.generativeai as genai

# Load environment variables from .env file
load_dotenv()

# Get the API key from environment variables
api_key = os.getenv("GEMINI_API_KEY")

if not api_key:
    raise ValueError("GEMINI_API_KEY not found in environment variables")

# Configure the Gemini client
genai.configure(api_key=api_key)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Initialize the Gemini model
model = genai.GenerativeModel('gemini-2.5-flash')

# Test connection with a simple "hello world" example
def test_connection():
    """Test the Gemini API connection with a simple prompt."""
    try:
        response = model.generate_content("Tell me a joke about programming.")
        print("✅ Connection successful!")
        print(f"Response: {response.text}")
        return True
    except Exception as e:
        print(f"❌ Connection failed: {e}")
        return False

# Test the connection
if test_connection():
    print("Ready to proceed with structured data generation!")

✅ Connection successful!
Response: Why do programmers prefer dark mode?

Because light attracts bugs!
Ready to proceed with structured data generation!


In [5]:
def generate_library_data():
    """Generate library data from Gemini API."""
    
    # Multi-line prompt for generating structured JSON
    prompt = """
    Generate a JSON object representing a library with the following structure:
    - A library name (string)
    - A list of books, where each book has:
      - title (string)
      - author (string) 
      - year (positive integer greater than 1000 and less than or equal to the current year)
    
    Please include exactly 5 books from different time periods.
    Return ONLY the JSON object, no additional text or markdown formatting.
    """
    
    try:
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        print(f"Error generating content: {e}")
        return None

# Generate the data
raw_response = generate_library_data()
print("Raw response from Gemini:")
print(raw_response)
print(f"Response type: {type(raw_response)}")

Raw response from Gemini:
```json
{
  "library_name": "Central Fictional Library",
  "books": [
    {
      "title": "Pride and Prejudice",
      "author": "Jane Austen",
      "year": 1813
    },
    {
      "title": "1984",
      "author": "George Orwell",
      "year": 1949
    },
    {
      "title": "To Kill a Mockingbird",
      "author": "Harper Lee",
      "year": 1960
    },
    {
      "title": "The Hitchhiker's Guide to the Galaxy",
      "author": "Douglas Adams",
      "year": 1979
    },
    {
      "title": "Project Hail Mary",
      "author": "Andy Weir",
      "year": 2021
    }
  ]
}
```
Response type: <class 'str'>


In [6]:
from pydantic import BaseModel, Field
from typing import List
from datetime import datetime

### Creating the Book Model

In [9]:
class Book(BaseModel):
    """Pydantic model for a book with validation."""
    
    title: str
    author: str
    year: int = Field(
        gt=1,  # Greater than 1
        le=datetime.now().year,  # Less than or equal to current year
        description="Publication year must be between 1000 and current year"
    )
    
    class Config:
        # Enable additional validations
        str_strip_whitespace = True  # Automatically strip whitespace from strings
        validate_assignment = True   # Validate on assignment after creation

### Creating the Library Model

In [11]:
class Library(BaseModel):
    """Pydantic model for a library containing books."""
    
    library_name: str = Field(min_length=1, description="Library name cannot be empty")
    books: List[Book] = Field(description="List of books in the library")
    
    class Config:
        str_strip_whitespace = True
        validate_assignment = True
    
    def get_books_by_author(self, author: str) -> List[Book]:
        """Get all books by a specific author."""
        return [book for book in self.books if book.author.lower() == author.lower()]
    
    def get_books_after_year(self, year: int) -> List[Book]:
        """Get all books published after a specific year."""
        return [book for book in self.books if book.year > year]
    
    def get_average_publication_year(self) -> float:
        """Calculate average publication year of all books."""
        if not self.books:
            return 0
        return sum(book.year for book in self.books) / len(self.books)

In [12]:
def clean_json_response(response_text):
    """Remove markdown formatting from JSON response."""
    
    # Remove common markdown JSON block formatting
    cleaned = response_text.strip()
    
    # Remove ```json and ``` markers
    if cleaned.startswith('```json'):
        cleaned = cleaned[7:]  # Remove ```json
    elif cleaned.startswith('```'):
        cleaned = cleaned[3:]   # Remove ```
        
    if cleaned.endswith('```'):
        cleaned = cleaned[:-3]  # Remove trailing ```
    
    return cleaned.strip()

# Example usage
raw_response = generate_library_data()
clean_json = clean_json_response(raw_response)
print("Cleaned JSON:")
print(clean_json)

Cleaned JSON:
{"library_name": "Literary Haven Library", "books": [{"title": "Pride and Prejudice", "author": "Jane Austen", "year": 1813}, {"title": "The Great Gatsby", "author": "F. Scott Fitzgerald", "year": 1925}, {"title": "1984", "author": "George Orwell", "year": 1949}, {"title": "Beloved", "author": "Toni Morrison", "year": 1987}, {"title": "Project Hail Mary", "author": "Andy Weir", "year": 2021}]}


### Converting JSON to Pydantic Objects

In [13]:
def parse_library_data(json_string: str) -> Library:
    """Parse JSON string into a validated Library object."""
    
    try:
        # This is the key method - it validates and parses in one step
        library = Library.model_validate_json(json_string)
        print("✅ Successfully parsed and validated library data!")
        return library
        
    except Exception as e:
        print(f"❌ Validation error: {e}")
        raise

# Complete workflow example
def create_validated_library():
    """Complete workflow: Generate -> Clean -> Validate -> Return"""
    
    # Step 1: Generate raw data from Gemini
    raw_response = generate_library_data()
    
    # Step 2: Clean the response
    clean_json = clean_json_response(raw_response)
    
    # Step 3: Parse and validate
    library = parse_library_data(clean_json)
    
    return library

# Create your validated library
library = create_validated_library()
print(f"Library: {library.library_name}")
print(f"Number of books: {len(library.books)}")

✅ Successfully parsed and validated library data!
Library: City Central Library
Number of books: 5


### demonstrate Validation

In [14]:
# Example of validation error handling
def demonstrate_validation():
    """Show how Pydantic handles invalid data."""
    
    invalid_json = '''
    {
        "name": "",
        "books": [
            {
                "title": "Invalid Book",
                "author": "Test Author",
                "year": 3000
            }
        ]
    }
    '''
    
    try:
        library = Library.model_validate_json(invalid_json)
    except Exception as e:
        print("Validation errors:")
        print(e)
        
# This will show detailed error messages about invalid fields
demonstrate_validation()

Validation errors:
2 validation errors for Library
library_name
  Field required [type=missing, input_value={'name': '', 'books': [{'...Author', 'year': 3000}]}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
books.0.year
  Input should be less than or equal to 2025 [type=less_than_equal, input_value=3000, input_type=int]
    For further information visit https://errors.pydantic.dev/2.11/v/less_than_equal


### Exporting to JSON

In [15]:
import json

def export_library_to_json(library: Library, filename: str = "library_data.json"):
    """Export library data to a JSON file."""
    
    # Convert Pydantic object to JSON string with formatting
    json_string = library.model_dump_json(indent=4)
    
    # Write to file
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(json_string)
    
    print(f"✅ Library data exported to {filename}")
    
    # Also print formatted JSON
    print("\nFormatted JSON:")
    print(json_string)

# Export the library
export_library_to_json(library)

✅ Library data exported to library_data.json

Formatted JSON:
{
    "library_name": "City Central Library",
    "books": [
        {
            "title": "Pride and Prejudice",
            "author": "Jane Austen",
            "year": 1813
        },
        {
            "title": "The Great Gatsby",
            "author": "F. Scott Fitzgerald",
            "year": 1925
        },
        {
            "title": "To Kill a Mockingbird",
            "author": "Harper Lee",
            "year": 1960
        },
        {
            "title": "A Brief History of Time",
            "author": "Stephen Hawking",
            "year": 1988
        },
        {
            "title": "The Midnight Library",
            "author": "Matt Haig",
            "year": 2020
        }
    ]
}


### Integrating with Pandas for Analysis

In [16]:
import pandas as pd
import matplotlib.pyplot as plt

def library_to_dataframe(library: Library) -> pd.DataFrame:
    """Convert library data to a Pandas DataFrame."""
    
    # Extract data into separate lists
    titles = [book.title for book in library.books]
    authors = [book.author for book in library.books] 
    years = [book.year for book in library.books]
    
    # Create DataFrame
    df = pd.DataFrame({
        'title': titles,
        'author': authors,
        'year': years
    })
    
    return df

def analyze_library_data(library: Library):
    """Perform basic analysis on library data."""
    
    # Create DataFrame
    df = library_to_dataframe(library)
    
    print("📊 Library Data Analysis")
    print("=" * 50)
    
    # Basic statistics
    print(f"Total books: {len(df)}")
    print(f"Unique authors: {df['author'].nunique()}")
    print(f"Year range: {df['year'].min()} - {df['year'].max()}")
    print(f"Average publication year: {df['year'].mean():.1f}")
    
    print("\n📈 Publication Years Distribution:")
    print(df['year'].describe())
    
    # Group by decade
    df['decade'] = (df['year'] // 10) * 10
    decade_counts = df['decade'].value_counts().sort_index()
    print(f"\n📅 Books by Decade:")
    for decade, count in decade_counts.items():
        print(f"{decade}s: {count} book(s)")
    
    return df

# Perform analysis
df = analyze_library_data(library)
print("\n📋 DataFrame Preview:")
print(df)

📊 Library Data Analysis
Total books: 5
Unique authors: 5
Year range: 1813 - 2020
Average publication year: 1941.2

📈 Publication Years Distribution:
count       5.000000
mean     1941.200000
std        79.766534
min      1813.000000
25%      1925.000000
50%      1960.000000
75%      1988.000000
max      2020.000000
Name: year, dtype: float64

📅 Books by Decade:
1810s: 1 book(s)
1920s: 1 book(s)
1960s: 1 book(s)
1980s: 1 book(s)
2020s: 1 book(s)

📋 DataFrame Preview:
                     title               author  year  decade
0      Pride and Prejudice          Jane Austen  1813    1810
1         The Great Gatsby  F. Scott Fitzgerald  1925    1920
2    To Kill a Mockingbird           Harper Lee  1960    1960
3  A Brief History of Time      Stephen Hawking  1988    1980
4     The Midnight Library            Matt Haig  2020    2020
