In [None]:
# Notebook bootstrap: when running inside Google Colab, install dependencies
# and authenticate; then build the Vertex AI `client` used by the later cells.
import sys
# True when the google.colab module is already loaded in this interpreter.
IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
    # IPython shell escape (not plain Python): quietly install the packages
    # this notebook imports.
    !pip install -q google-genai google-auth python-dotenv
    from google.colab import auth
    auth.authenticate_user()
    try:
        PROJECT_ID = input("Enter your Google Cloud Project ID (press Enter to use default ADC): ").strip()
    except Exception:
        # Any failure while prompting is treated the same as pressing Enter.
        PROJECT_ID = ""
    if PROJECT_ID:
        import os
        # Publish the entered ID through the environment so the lookup
        # further down in this cell picks it up.
        os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID

import os
import google.auth
from google import genai
from google.genai import types

# Default credentials plus the project google.auth associates with them.
creds, project = google.auth.default()
# GOOGLE_CLOUD_PROJECT, when set, takes precedence over the project returned
# by google.auth.default().
project = os.environ.get("GOOGLE_CLOUD_PROJECT", project)
client = genai.Client(vertexai=True, project=project, location="us-central1")
print(f"Using project: {project}")

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kgweber-cwru/coding-with-ai-wn26/blob/main/series-2-coding-llms/week-1-llm-basics-and-api/concepts.ipynb)

# Week 1: Understanding LLMs and API Basics

## Learning Objectives
By the end of this session, you will:
- Understand how large language models work at a conceptual level
- Successfully make API calls to Vertex AI
- Understand key API parameters and their effects
- Build simple text generation scripts

## Part 1: How LLMs Work (Conceptual Overview)

### Tokens: The Building Blocks
- LLMs don't see words; they see **tokens**
- A token is roughly 3-4 characters or about 0.75 words
- "Hello world" ≈ 2-3 tokens
- This matters for cost and context limits!

### Prediction and Probability
- LLMs predict the next token based on all previous tokens
- They assign probabilities to many possible next tokens
- They don't "know" things - they predict statistically likely continuations
- Temperature controls randomness in selection

### Key Limitations
- No real-time information (knowledge cutoff dates)
- Can "hallucinate" plausible-sounding but false information
- Cannot count tokens or characters perfectly
- Context window limits (how much text they can "remember")

## Part 2: Setting Up Your Environment

### Load Environment Variables
We use `python-dotenv` to load settings from a local `.env` file, which keeps credentials and other configuration (such as API keys) secure and separate from code.

In [None]:
import os
from dotenv import load_dotenv
from google import genai
from google.genai import types
import google.auth

# Pull any settings defined in a local .env file into the process environment
load_dotenv()

# Resolve default credentials, then build the Vertex AI client shared by the
# remaining cells
creds, project = google.auth.default()
client = genai.Client(
    location="us-central1",
    project=project,
    vertexai=True,
)

print("✓ Environment loaded successfully!", f"✓ Project ID found: {project}", sep="\n")

## Part 3: Your First API Call

The basic structure of a Vertex AI API call:
```python
response = client.models.generate_content(
    model="gemini-1.5-flash",
    contents="Your prompt here"
)
```

In [None]:
# Smallest possible request: a model name plus a plain-text prompt
response = client.models.generate_content(
    contents="Say hello!",
    model="gemini-1.5-flash",
)

# Show only the generated text
print(response.text)

### Understanding the Response Object

Let's examine what the API returns:

In [None]:
# Issue a second request so the returned object can be inspected piece by piece
response = client.models.generate_content(
    contents="What is 2+2?",
    model="gemini-1.5-flash",
)

# Separator printed between the sections below
divider = "\n" + "="*50 + "\n"

print("Full response object:")
print(response)
print(divider)

print("Just the content:")
print(response.text)
print(divider)

print("Token usage:")
# Only report counts when the response carries usage metadata
usage = response.usage_metadata
if usage:
    print(f"Prompt tokens: {usage.prompt_token_count}")
    print(f"Completion tokens: {usage.candidates_token_count}")
    print(f"Total tokens: {usage.total_token_count}")

## Part 4: Key API Parameters

### Temperature (0.0 to 2.0)
Controls randomness:
- **0.0**: Deterministic, always picks most likely token
- **0.7**: Balanced (a good starting point for most uses)
- **1.5+**: Very creative/random

In [None]:
# Send the same prompt at three temperature settings to compare the outputs
prompt = "Complete this sentence: The best thing about learning to code is"

for temp in (0.0, 0.7, 1.5):
    # Only the temperature varies between iterations; the output cap is fixed
    sampling_config = types.GenerateContentConfig(
        max_output_tokens=50,
        temperature=temp,
    )
    response = client.models.generate_content(
        model="gemini-1.5-flash",
        contents=prompt,
        config=sampling_config,
    )
    print(f"Temperature {temp}:")
    print(response.text, "\n" + "-"*50 + "\n", sep="\n")

### Max Tokens
The `max_output_tokens` parameter limits the length of the response. This is important for cost control!

In [None]:
# Ask the same question under three different output-token caps
prompt = "Explain what a large language model is."

for max_tok in (20, 50, 150):
    response = client.models.generate_content(
        config=types.GenerateContentConfig(max_output_tokens=max_tok),
        contents=prompt,
        model="gemini-1.5-flash",
    )
    print(f"Max tokens: {max_tok}")
    print(response.text)
    # Report the candidate token count when usage metadata is present
    usage = response.usage_metadata
    if usage:
        print(f"Actual tokens used: {usage.candidates_token_count}")
    print("\n" + "-"*50 + "\n")

### System Instructions
Set the behavior and personality of the model:

In [None]:
# Baseline: the question on its own, with no system instruction
response = client.models.generate_content(
    contents="What is DNA?",
    model="gemini-1.5-flash",
)
print("Without system message:")
print(response.text, "\n" + "="*50 + "\n", sep="\n")

# Same question again, this time with a system instruction attached
teacher_config = types.GenerateContentConfig(
    system_instruction="You are a biology teacher explaining concepts to 10-year-olds. Use simple language and fun analogies."
)
response = client.models.generate_content(
    contents="What is DNA?",
    model="gemini-1.5-flash",
    config=teacher_config,
)
print("With system message (10-year-old level):")
print(response.text)

## Part 5: Practical Examples

### Example 1: Text Summarization

In [None]:
long_text = """
Large language models are artificial intelligence systems trained on vast amounts of text data. 
They learn patterns in language by predicting the next word in a sequence. These models have billions 
of parameters and can generate human-like text, answer questions, write code, and perform various 
language tasks. They work by converting text into numerical representations called tokens, processing 
these tokens through neural network layers, and generating probability distributions for likely next tokens.
"""

# The task goes in the system instruction; the passage itself is the contents
summary_config = types.GenerateContentConfig(
    temperature=0.3,
    system_instruction="Summarize the following text in one sentence.",
)
response = client.models.generate_content(
    model="gemini-1.5-flash",
    contents=long_text,
    config=summary_config,
)

print("Summary:", response.text, sep="\n")

### Example 2: Text Classification

In [None]:
def classify_sentiment(text):
    """Classify the sentiment of ``text`` with a single model call.

    The system instruction asks the model for a one-word answer (positive,
    negative, or neutral). The request uses temperature 0 and caps the
    output at 10 tokens.

    Args:
        text: The text to classify.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the response contains no text.
    """
    response = client.models.generate_content(
        model="gemini-1.5-flash",
        contents=text,
        config=types.GenerateContentConfig(
            system_instruction="Classify the sentiment of the text as: positive, negative, or neutral. Respond with only one word.",
            temperature=0,
            max_output_tokens=10
        )
    )
    # `response.text` is None when the response carries no text parts;
    # calling .strip() on None would raise AttributeError.
    return (response.text or "").strip()

# Run the classifier over a few sample sentences
test_texts = [
    "I love this new feature!",
    "This is the worst experience ever.",
    "The product arrived on time.",
]

for text in test_texts:
    sentiment = classify_sentiment(text)
    # The trailing empty string produces the blank line between results
    print(f"Text: {text}", f"Sentiment: {sentiment}", "", sep="\n")

### Example 3: Information Extraction

In [None]:
def extract_info(text):
    """Ask the model to pull contact details out of ``text``.

    The system instruction requests the person's name, email, and phone
    number in the form ``Name: X | Email: Y | Phone: Z``; the request is
    sent with temperature 0.

    Args:
        text: Free-form text to extract from.

    Returns:
        The ``text`` attribute of the model response, unmodified.
    """
    extraction_config = types.GenerateContentConfig(
        temperature=0,
        system_instruction="Extract the person's name, email, and phone number from the text. Format as: Name: X | Email: Y | Phone: Z",
    )
    reply = client.models.generate_content(
        model="gemini-1.5-flash",
        contents=text,
        config=extraction_config,
    )
    return reply.text

# Sample message containing a name, an email address, and a phone number
contact_text = "Hi, I'm John Smith. You can reach me at john.smith@email.com or call me at 555-123-4567."

extracted = extract_info(contact_text)
print("Extracted information:", extracted, sep="\n")

## Part 6: Cost Awareness

Understanding and tracking your API costs:

In [None]:
# Pricing (Example pricing for Gemini 1.5 Flash, check official pricing!)
# Gemini 1.5 Flash: ~$0.075 per 1M input tokens, ~$0.30 per 1M output tokens (approximate)

def estimate_cost(response, model="gemini-1.5-flash"):
    """Estimate the cost of an API call from its token usage.

    Args:
        response: An object exposing ``usage_metadata`` with
            ``prompt_token_count``, ``candidates_token_count`` and
            ``total_token_count`` attributes. ``usage_metadata`` itself, or
            any individual count, may be missing or ``None``.
        model: Key into the example pricing table. Unknown models are
            priced as ``"gemini-1.5-flash"``.

    Returns:
        A dict that always has the keys ``input_tokens``, ``output_tokens``,
        ``total_tokens``, ``cost_usd`` and ``cost_cents``. All values are
        zero when no usage metadata is available.
    """
    # Example pricing in USD per 1M tokens
    # (verify at cloud.google.com/vertex-ai/pricing)
    pricing = {
        "gemini-1.5-flash": {"input": 0.075, "output": 0.30},
        "gemini-1.5-pro": {"input": 3.50, "output": 10.50},
    }

    # Handle unknown models by falling back to the default price row
    rates = pricing.get(model, pricing["gemini-1.5-flash"])

    usage = getattr(response, "usage_metadata", None)

    # A missing usage object or a None count is treated as zero tokens so the
    # arithmetic below cannot fail and the result keeps a stable shape.
    input_tokens = (usage.prompt_token_count or 0) if usage else 0
    output_tokens = (usage.candidates_token_count or 0) if usage else 0
    total_tokens = (usage.total_token_count if usage else None)
    if total_tokens is None:
        total_tokens = input_tokens + output_tokens

    input_cost = (input_tokens / 1_000_000) * rates["input"]
    output_cost = (output_tokens / 1_000_000) * rates["output"]
    total_cost = input_cost + output_cost

    return {
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "total_tokens": total_tokens,
        "cost_usd": total_cost,
        "cost_cents": total_cost * 100,
    }

# Try the estimator on a real request
response = client.models.generate_content(
    contents="Write a haiku about programming.",
    model="gemini-1.5-flash",
)

# Generated text first, then a divider, then the usage/cost summary
print(response.text, "\n" + "="*50, sep="\n")
cost_info = estimate_cost(response, model="gemini-1.5-flash")
print(f"\nTokens used: {cost_info['total_tokens']}")
print(f"Estimated cost: ${cost_info['cost_usd']:.8f} (or {cost_info['cost_cents']:.6f} cents)")

## Key Takeaways

1. **LLMs predict tokens** based on probability; they don't "know" facts
2. **API structure** is simple: model + contents + parameters
3. **Temperature** controls randomness (0 = deterministic, higher = creative)
4. **max_output_tokens** limits response length and controls costs
5. **System instructions** shape the model's behavior
6. **Always monitor costs** - even small calls add up!

## Next Steps

In Week 2, we'll learn how to:
- Maintain conversation history
- Build multi-turn conversations
- Manage context effectively

Complete the assignment to practice these concepts!