### Comparing Groq model and Gemini 2.5 pro on UCI Diabetes Dataset
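- Goal: obtain differential-privacy (DP-SGD) settings from each LLM, train with them, and compare the results against a non-private baseline.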


In [3]:
# --- Rest of imports and setup from previous Cell 1 ---
import pandas as pd
import numpy as np
import json
import os
import warnings
from dotenv import load_dotenv

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_auc_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Opacus
from opacus import PrivacyEngine
from opacus.validators import ModuleValidator # Optional

In [4]:
# Description: Import libraries, set up constants, initialize/manage results list.

# --- Make sure results_list is either reset or you append with clear names ---
# Option 1: Reset for a fresh run on this dataset
# results_list = []
# Option 2: Keep previous results (make sure Run Type names are distinct)
#print(f"Starting Diabetes run. Current results count: {len(results_list)}")

# --- Constants for Diabetes ---
DATA_FILE_DIABETES = 'diabetic_data.csv' # Assuming this is the filename
TARGET_COLUMN_DIABETES = 'readmitted_binary' # Our new binary target

# --- Constants for Insurance ---
DATA_FILE_INSURANCE = 'insurance_cleaned.csv' # YOUR FILENAME HERE
TARGET_COLUMN_INSURANCE = 'charges' # Target for regression

# LLM Clients
from groq import Groq
from google import genai
from google.genai import types

# Suppress warnings (blanket filter: hides ALL warnings for the rest of the session)
warnings.filterwarnings('ignore')
# Load variables from a local .env file into the process environment, if one exists
load_dotenv()

# --- Base Training Hyperparameters (can be reused) ---
LEARNING_RATE = 0.01
EPOCHS = 10
BATCH_SIZE = 64
RANDOM_STATE = 42
TEST_SIZE = 0.2

# --- Reproducibility ---
# RANDOM_STATE was previously only handed to train_test_split, so weight
# initialisation and DataLoader shuffling differed on every kernel restart.
# Seed NumPy and PyTorch here so a fresh "Restart & Run All" repeats the
# non-private numbers.
# NOTE(review): a fixed seed also makes any noise drawn from torch's default
# generator repeatable - fine for experiments, not for a real private release.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# --- Default DP Parameters (will be recalculated) ---
DEFAULT_TARGET_EPSILON = 1.0
DEFAULT_TARGET_DELTA = 1e-5 # Placeholder, recalculate based on N
DEFAULT_MAX_GRAD_NORM = 1.0

# Set device: use the GPU when PyTorch reports one, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- LLM Clients Initialization (reuse from previous) ---
# Keys and model names are read from the environment; the two model names fall
# back to the literal defaults below when the variable is not set.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
GROQ_MODEL_NAME = os.environ.get("GROQ_MODEL_NAME", "llama3-70b-8192")
GEMINI_MODEL_NAME = os.environ.get("GEMINI_MODEL_NAME", "gemini-1.5-flash")

# Both clients stay None unless their construction below succeeds.
groq_client = gemini_client = None

# ... (rest of client initialization logic from previous Cell 1) ...
if not GROQ_API_KEY:
    print("Warning: GROQ_API_KEY not found. Groq LLM will not be used.")
else:
    try:
        groq_client = Groq(api_key=GROQ_API_KEY)
        print(f"Groq client initialized successfully for model: {GROQ_MODEL_NAME}")
    except Exception as groq_init_error:
        print(f"Error initializing Groq client: {groq_init_error}")

if not GEMINI_API_KEY:
    print("Warning: GEMINI_API_KEY not found. Gemini LLM will not be used.")
else:
    try:
        gemini_client = genai.Client(api_key=GEMINI_API_KEY)
        print(f"Gemini client initialized successfully for model: {GEMINI_MODEL_NAME}")
    except Exception as gemini_init_error:
        print(f"Error initializing Gemini client: {gemini_init_error}")

# --- LLM Helper Functions (reuse from previous Cell 6) ---
# Includes: create_llm_prompt, get_gemini_config, get_groq_config
# Make sure create_llm_prompt is the latest version including ML context

# --- DP Training Function (reuse from previous Cell 7) ---
# Includes: train_evaluate_dp_model (ensure it handles classification)

# --- Logistic Regression Model Class (reuse from previous Cell 4) ---
class LogisticRegression(nn.Module):
    """Single linear layer mapping `n_features` inputs to one raw output.

    `forward` returns the un-activated output of the linear layer; no sigmoid
    is applied inside the module.
    """

    def __init__(self, n_features):
        super().__init__()
        self.linear = nn.Linear(in_features=n_features, out_features=1)

    def forward(self, x):
        raw_output = self.linear(x)
        return raw_output
print("\nBasic setup complete.")

Using device: cpu
Groq client initialized successfully for model: deepseek-r1-distill-llama-70b
Gemini client initialized successfully for model: gemini-2.5-pro-exp-03-25

Basic setup complete.


In [5]:
# Description: Load the Insurance (Medical Cost) dataset.

print(f"\n--- Loading Dataset: {DATA_FILE_INSURANCE} ---")

# Start from the failure state; both names are only upgraded once the file has
# been read and the target column has been confirmed to exist.
df_insurance = None
original_columns_insurance = []

try:
    _loaded_insurance = pd.read_csv(DATA_FILE_INSURANCE)
    print("Insurance Dataset loaded successfully.")
    print("Dataset shape:", _loaded_insurance.shape)
    print("\nFirst 5 rows (Insurance):")
    print(_loaded_insurance.head())
    print("\nDataset Info (Insurance):")
    _loaded_insurance.info()

    # Report missing values per column before deciding whether the frame is usable
    print("\nMissing values per column (Insurance):")
    print(_loaded_insurance.isnull().sum())

    if TARGET_COLUMN_INSURANCE in _loaded_insurance.columns:
        _insurance_column_names = _loaded_insurance.columns.tolist()
        print("\nOriginal Columns (Insurance):", _insurance_column_names)
        df_insurance = _loaded_insurance
        original_columns_insurance = _insurance_column_names
    else:
        print(f"ERROR: Target column '{TARGET_COLUMN_INSURANCE}' not found!")
except FileNotFoundError:
    print(f"Error: File not found at {DATA_FILE_INSURANCE}.")
except Exception as load_error:
    print(f"An error occurred: {load_error}")


--- Loading Dataset: insurance_cleaned.csv ---
Insurance Dataset loaded successfully.
Dataset shape: (1338, 7)

First 5 rows (Insurance):
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520

Dataset Info (Insurance):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   f

In [7]:
# Description: Preprocess insurance data for regression task, including log transformation of target.
#
# Reads:  df_insurance, TARGET_COLUMN_INSURANCE, TEST_SIZE, RANDOM_STATE, BATCH_SIZE, device.
# Writes (on success): n_features_ins, train_loader_ins, test_loader_ins,
#         y_test_original_scale_ins, DEFAULT_TARGET_DELTA_INS, plus the intermediate
#         frames/tensors/preprocessor defined along the way.
# Writes (on any failure path): n_features_ins, train_loader_ins and test_loader_ins are
#         set to None, DEFAULT_TARGET_DELTA_INS to 1e-5 and y_test_original_scale_ins to None.

print("\n--- Preprocessing Insurance Data (Regression with Log Transform) ---")
if df_insurance is not None:
    # Features are every column except the target
    X_insurance = df_insurance.drop(TARGET_COLUMN_INSURANCE, axis=1)
    
    # --- Log transform the target variable ---
    # Using np.log1p to handle potential zeros gracefully (log(1+x))
    # Ensure target is numeric before transformation
    if pd.api.types.is_numeric_dtype(df_insurance[TARGET_COLUMN_INSURANCE]):
        y_insurance_original_scale = df_insurance[TARGET_COLUMN_INSURANCE].copy() # Keep original for final eval
        y_insurance = np.log1p(df_insurance[TARGET_COLUMN_INSURANCE])
        print(f"Target variable '{TARGET_COLUMN_INSURANCE}' log-transformed (using np.log1p).")
    else:
        print(f"ERROR: Target column '{TARGET_COLUMN_INSURANCE}' is not numeric. Cannot apply log transform.")
        y_insurance = None # Prevent downstream errors
        y_insurance_original_scale = None
    # -----------------------------------------

    if y_insurance is not None:
        # Identify column types for Insurance dataset (by dtype, not by name)
        categorical_features_insurance = X_insurance.select_dtypes(include=['object', 'category']).columns.tolist()
        numerical_features_insurance = X_insurance.select_dtypes(include=np.number).columns.tolist()

        print(f"\nIdentified Categorical Features (Insurance): {categorical_features_insurance}")
        print(f"Identified Numerical Features (Insurance): {numerical_features_insurance}")

        # Create preprocessing pipelines: standardise numeric columns, one-hot encode
        # categorical columns (first level dropped, dense output).
        numerical_transformer_insurance = StandardScaler()
        categorical_transformer_insurance = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)

        # Columns matched by neither list are passed through unchanged (remainder='passthrough')
        preprocessor_insurance = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer_insurance, numerical_features_insurance),
                ('cat', categorical_transformer_insurance, categorical_features_insurance)
            ],
            remainder='passthrough'
        )

        # Split data - NO STRATIFICATION for regression
        # Use the log-transformed y for training
        X_train_ins, X_test_ins, y_train_log_ins, y_test_log_ins = train_test_split(
            X_insurance, y_insurance, test_size=TEST_SIZE, random_state=RANDOM_STATE
        )
        # Also keep a split of the original scale y_test for final evaluation.
        # NOTE(review): this relies on a second call with the same test_size/random_state
        # selecting the same rows as the split above - confirm if either argument changes.
        _, _, _, y_test_original_scale_ins = train_test_split(
            X_insurance, y_insurance_original_scale, test_size=TEST_SIZE, random_state=RANDOM_STATE
        )


        # Fit preprocessor on training data and transform (test data is transformed only)
        try:
            X_train_processed_ins = preprocessor_insurance.fit_transform(X_train_ins)
            X_test_processed_ins = preprocessor_insurance.transform(X_test_ins)
        except Exception as e:
            print(f"Error during preprocessing transformation (Insurance): {e}")
            X_train_processed_ins, X_test_processed_ins = None, None

        if X_train_processed_ins is not None:
            n_features_ins = X_train_processed_ins.shape[1]
            print(f"\nNumber of features after preprocessing (Insurance): {n_features_ins}")

            # Convert to Tensors (using log-transformed y for training/prediction)
            # Targets get a trailing dimension via unsqueeze(1).
            X_train_tensor_ins = torch.tensor(X_train_processed_ins.astype(np.float32)).to(device)
            y_train_tensor_log_ins = torch.tensor(y_train_log_ins.values.astype(np.float32)).unsqueeze(1).to(device)
            X_test_tensor_ins = torch.tensor(X_test_processed_ins.astype(np.float32)).to(device)
            y_test_tensor_log_ins = torch.tensor(y_test_log_ins.values.astype(np.float32)).unsqueeze(1).to(device) # For calculating loss on log scale

            # Create DataLoaders (only the training loader shuffles; the test loader
            # keeps row order)
            train_dataset_ins = TensorDataset(X_train_tensor_ins, y_train_tensor_log_ins)
            test_dataset_ins = TensorDataset(X_test_tensor_ins, y_test_tensor_log_ins) # Loader for test features and LOGGED target
            train_loader_ins = DataLoader(train_dataset_ins, batch_size=BATCH_SIZE, shuffle=True)
            test_loader_ins = DataLoader(test_dataset_ins, batch_size=BATCH_SIZE, shuffle=False)

            print("\nInsurance Data preprocessing and splitting complete (target log-transformed).")
            print(f"Training set size: {len(train_dataset_ins)}")

            # Dataset-specific default delta: one over the number of training rows
            DEFAULT_TARGET_DELTA_INS = 1 / len(train_dataset_ins)
            print(f"Default Target Delta for Insurance dataset (1/N): {DEFAULT_TARGET_DELTA_INS:.2e}")
        else:
            print("Skipping Insurance data tensor conversion due to preprocessing error.")
            # Fallback values so later cells can test for None instead of hitting NameError
            n_features_ins, train_loader_ins, test_loader_ins, DEFAULT_TARGET_DELTA_INS = None, None, None, 1e-5
            y_test_original_scale_ins = None # Also ensure this is None
    else:
        print("Skipping Insurance preprocessing further due to target transformation error.")
        # Fallback values so later cells can test for None instead of hitting NameError
        n_features_ins, train_loader_ins, test_loader_ins, DEFAULT_TARGET_DELTA_INS = None, None, None, 1e-5
        y_test_original_scale_ins = None # Also ensure this is None
else:
    print("Skipping Insurance preprocessing due to data loading error.")
    # Fallback values so later cells can test for None instead of hitting NameError
    n_features_ins, train_loader_ins, test_loader_ins, DEFAULT_TARGET_DELTA_INS = None, None, None, 1e-5
    y_test_original_scale_ins = None # Also ensure this is None


--- Preprocessing Insurance Data (Regression with Log Transform) ---
Target variable 'charges' log-transformed (using np.log1p).

Identified Categorical Features (Insurance): ['sex', 'smoker', 'region']
Identified Numerical Features (Insurance): ['age', 'bmi', 'children']

Number of features after preprocessing (Insurance): 9

Insurance Data preprocessing and splitting complete (target log-transformed).
Training set size: 1070
Default Target Delta for Insurance dataset (1/N): 9.35e-04


In [8]:
# Start with an empty results collection; re-running this cell discards anything already recorded.
results_list = []


In [9]:
# Description: Non-private baseline for Insurance cost prediction (Regression).
# The model is fitted on the log1p-transformed target and scored on the original scale.

print("\n--- Running: Insurance Non-DP Linear Regression (Log-Target) ---")

# True when any input expected from the preprocessing cell is absent or None.
_non_dp_inputs_missing = (
    n_features_ins is None
    or train_loader_ins is None
    or test_loader_ins is None
    or 'y_test_original_scale_ins' not in locals()
    or y_test_original_scale_ins is None
)

if _non_dp_inputs_missing:
    print("Skipping Insurance non-DP run (Log-Target) due to missing components.")
else:
    model_non_dp_ins = LogisticRegression(n_features_ins).to(device)
    criterion_ins_reg = nn.MSELoss()  # MSE is measured on the log-transformed scale
    optimizer_non_dp_ins = optim.SGD(model_non_dp_ins.parameters(), lr=LEARNING_RATE)

    # ---- Training: plain mini-batch SGD, targets are log1p(charges) ----
    print("Training Standard Linear Regression (Insurance - Log-Target)...")
    model_non_dp_ins.train()
    for epoch in range(EPOCHS):
        for batch_X, batch_y_log in train_loader_ins:
            batch_X = batch_X.to(device)
            batch_y_log = batch_y_log.to(device)
            optimizer_non_dp_ins.zero_grad()
            loss = criterion_ins_reg(model_non_dp_ins(batch_X), batch_y_log)
            loss.backward()
            optimizer_non_dp_ins.step()
    print("Standard Training Complete (Insurance - Log-Target).")

    # ---- Evaluation: collect log-scale predictions in test-loader order ----
    print("Evaluating Standard Model (Insurance - Log-Target)...")
    model_non_dp_ins.eval()
    all_preds_log_non_dp_ins = []
    with torch.no_grad():
        for batch_X, _ in test_loader_ins:  # the logged targets in the loader are not needed here
            outputs_log = model_non_dp_ins(batch_X.to(device))
            all_preds_log_non_dp_ins.extend(outputs_log.cpu().numpy().reshape(-1))

    # Undo log1p with expm1, then floor at zero so no prediction is negative.
    all_preds_original_scale_non_dp_ins = np.maximum(
        0, np.expm1(np.array(all_preds_log_non_dp_ins))
    )

    # Metrics are computed against the untransformed test targets.
    true_targets_original_scale = y_test_original_scale_ins.values
    mae_non_dp_ins = mean_absolute_error(true_targets_original_scale, all_preds_original_scale_non_dp_ins)
    mse_non_dp_ins = mean_squared_error(true_targets_original_scale, all_preds_original_scale_non_dp_ins)
    r2_non_dp_ins = r2_score(true_targets_original_scale, all_preds_original_scale_non_dp_ins)

    print(f"MAE (Original Scale): {mae_non_dp_ins:.2f}, MSE (Original Scale): {mse_non_dp_ins:.2f}, R2 Score: {r2_non_dp_ins:.4f}")

    _non_dp_record = {
        "Run Type": "Insurance Non-DP Regression (Log-Target)",
        "Accuracy": "N/A",
        "MAE": mae_non_dp_ins,
        "MSE": mse_non_dp_ins,
        "R2 Score": r2_non_dp_ins,
        "LLM Epsilon Suggestion": "N/A",
        "LLM Reasoning": "N/A",
    }
    results_list.append(_non_dp_record)
    print("Insurance Non-DP (Log-Target) results recorded.")


--- Running: Insurance Non-DP Linear Regression (Log-Target) ---
Training Standard Linear Regression (Insurance - Log-Target)...
Standard Training Complete (Insurance - Log-Target).
Evaluating Standard Model (Insurance - Log-Target)...
MAE (Original Scale): 12114.29, MSE (Original Scale): 334369426.56, R2 Score: -1.1538
Insurance Non-DP (Log-Target) results recorded.


In [10]:
# Description: Define the standard prompt structure and functions to call LLMs.

# # v2
# def create_llm_prompt(task_config, schema_string, data_shape):
#     """Creates a more detailed prompt string for the LLM, guiding parameter choices."""
#     prompt = f"""
# Analyze the provided dataset context and task to recommend **optimized and justified** Differential Privacy (DP) settings for training a Logistic Regression model using DP-SGD.
# The goal is to predict the target variable '{task_config['target_variable']}'.

# **Dataset Context:**
# - Name: {task_config['dataset_name']}
# - Domain: {task_config['data_domain']} (Note: Healthcare data is generally considered sensitive).
# - Task: {task_config['task_description']}
# - Schema (Original Columns): {schema_string}
# - Extra details: {task_config['details']} (Pay close attention to class imbalance).

# **Parameter Guidance - IMPORTANT:** Avoid generic default values. Base your recommendations *specifically* on the context provided above.

# Provide your recommendations ONLY in a structured JSON format. The JSON object must include the following keys:
# - "dp_algorithm": String, the specific DP algorithm variant recommended (e.g., "DP-SGD with Gaussian Noise").
# - "target_epsilon": Float, recommended privacy budget epsilon (e.g., 1.5). Justify this based on sensitivity, utility needs, and domain.
# - "target_delta": Float or String, recommended privacy budget delta (e.g., 1e-5 or suggest calculating as "1/N"). Justify choice.
# - "max_grad_norm": Float, recommended gradient clipping norm (e.g., 1.0). Justify based on model stability and potential gradient explosion, especially considering class imbalance if noted.
# - "preprocessing_suggestions": List of strings, specific preprocessing actions recommended BEFORE applying DP (e.g., "Remove: id", "Normalize: age, avg_glucose_level, bmi").
# - "column_sensitivity_epsilon": A dictionary where keys are original column names and values are *conceptual* relative sensitivity floats (0.0=low, 1.0=high/ID) or labels (Low, Medium, High). This guides understanding, not direct budget split in standard DP-SGD. Exclude the target variable.
# - "reasoning": String, concise reasoning behind the overall recommendations (epsilon, delta, max_grad_norm choices, linking back to context).

# JSON Output ONLY:
# """
#     return prompt

# #v3
# def create_llm_prompt(task_config, schema_string, data_shape):
#     """Creates a more detailed prompt string for the LLM, guiding parameter choices
#        and including general ML best practices context."""

#     # Calculate approximate training size N for context
#     approx_N_train = int(data_shape[0] * (1-TEST_SIZE)) if data_shape else 'Unknown'

#     prompt = f"""
# Analyze the provided dataset context and task to recommend **optimized and justified** Differential Privacy (DP) settings for training a Logistic Regression model using DP-SGD.
# The goal is to predict the target variable '{task_config['target_variable']}'.

# **Dataset Context:**
# - Name: {task_config['dataset_name']}
# - Domain: {task_config['data_domain']} (Note: Healthcare data is generally considered sensitive).
# - Task: {task_config['task_description']}
# - Schema (Original Columns): {schema_string}
# - Data Shape: {data_shape} (Approx. Training N = {approx_N_train})
# - Extra details: {task_config['details']} (Pay close attention to class imbalance).

# **General ML Best Practices Context (Keep these in mind):**
# - **Normalization/Scaling:** Features with different scales (like 'age' vs 'avg_glucose_level') MUST be normalized or standardized (e.g., StandardScaler, MinMaxScaler) for models like Logistic Regression and especially before applying gradient clipping in DP-SGD. This ensures stable gradient computations. Apply scaling AFTER splitting data and ideally AFTER imputation if applicable.
# - **Gradient Stability & Clipping:** DP-SGD uses gradient clipping (`max_grad_norm`) to bound the influence of any single data point. Choosing the norm value is a trade-off:
#     - Too low: Clips potentially useful gradient information, slowing learning or preventing convergence, especially for minority classes or complex patterns.
#     - Too high: Less protection against outliers, potentially higher noise required for the same privacy budget (ε).
#     - Imbalanced Data Impact: Gradients from rare class samples might be infrequent but large; aggressive clipping can disproportionately affect learning for that class.
# - **Imbalanced Data Handling:** Beyond class weighting in the loss (which is assumed here), model evaluation should focus on metrics like F1-score, Precision, Recall for the minority class, not just accuracy. The goal is often to improve detection of the rare class.
# - **Preprocessing Order:** Typically: Split Data -> Impute Missing -> Encode Categorical -> Scale Numerical -> Train Model. DP is applied during the training step.

# **Parameter Guidance - IMPORTANT:** Based on the Dataset Context AND the ML Best Practices above, provide specific, justified recommendations. Avoid generic defaults.

# 1.  **`target_epsilon`**: Balance '{task_config['data_domain']}' sensitivity vs. utility needed for training. Justify the specific trade-off. (Range 1.0-5.0 often considered, but justify *your* choice).
# 2.  **`target_delta`**: Recommend a specific small value (e.g., 1e-5, 1e-6) or suggest "1/N". Justify why (e.g., related to approx N={approx_N_train}).
# 3.  **`max_grad_norm`**: **Connect this directly to the Gradient Stability & Imbalanced Data points above.** Given the heavy imbalance, suggest a value (e.g., range 5.0 - 15.0, or a specific reasoned value) likely higher than a generic default (like 1.0) to preserve minority class signals. Justify based *explicitly* on imbalance and the need for stable yet informative gradients.
# 4.  **`column_sensitivity_epsilon`**: Conceptual relative sensitivity hints (0.0 low, 1.0 high). Reflect potential identifiability/sensitivity based on domain/name. Exclude target.
# 5.  **`reasoning`**: Concise but detailed justification for epsilon, delta, and max_grad_norm, *explicitly linking* choices to dataset context (domain, N, imbalance) and the relevant ML best practices mentioned (normalization, gradient stability).

# **Output Format:**
# Provide recommendations ONLY in a structured JSON format with keys: "dp_algorithm", "target_epsilon", "target_delta", "max_grad_norm", "preprocessing_suggestions", "column_sensitivity_epsilon", "reasoning".

# JSON Output ONLY:
# """
#     return prompt

# Description: Define the standard prompt structure and functions to call LLMs.

def create_llm_prompt(task_config, schema_string, data_shape, task_type="classification", test_size=None):
    """Build the prompt asking an LLM for DP-SGD settings for one dataset/task.

    Args:
        task_config: Mapping with the keys 'target_variable', 'dataset_name',
            'data_domain', 'task_description' and 'details'; each is interpolated
            into the prompt as-is.
        schema_string: Text describing the original columns.
        data_shape: Sequence whose first element is the row count, or a falsy
            value when the shape is unknown.
        task_type: "regression" selects the regression wording; any other value
            gets the classification wording throughout.
        test_size: Fraction of rows held out for testing, used to estimate the
            training-set size. Defaults to the notebook-level TEST_SIZE.

    Returns:
        The prompt string.

    Raises:
        KeyError: if task_config lacks one of the keys listed above.
    """
    # Estimate the training-set size; the held-out fraction is only resolved
    # when a shape was actually supplied.
    if data_shape:
        held_out_fraction = TEST_SIZE if test_size is None else test_size
        approx_N_train = int(data_shape[0] * (1 - held_out_fraction))
    else:
        approx_N_train = 'Unknown'

    # A single flag drives every task-specific fragment, so an unrecognised
    # task_type can no longer mix classification and regression wording.
    is_regression = task_type == "regression"

    if is_regression:
        imbalance_guidance = "" # No class imbalance for regression
        target_guidance_metrics = "metrics like Mean Absolute Error (MAE), Mean Squared Error (MSE), or R2 Score. The goal is often to minimize error."
        max_grad_norm_focus = "focus on overall gradient stability, considering the range and scale of the target variable values, rather than specific class signals. A moderate value (e.g., 1.0-10.0, depending on target scale and feature normalization) is common."
        epsilon_utility_focus = "utility needs for accurate predictions (e.g., low MAE/MSE)"
        model_type_in_prompt = "Linear Regression"
        clipping_task_note = "For regression, consider the scale of the target variable when thinking about gradient magnitudes."
    else: # classification (default)
        imbalance_guidance = "(Pay close attention to class imbalance if mentioned in 'Extra details')."
        target_guidance_metrics = "metrics like F1-score, Precision, Recall for the minority class, not just accuracy. The goal is often to improve detection of the rare class."
        max_grad_norm_focus = "**Connect this directly to the Gradient Stability & Imbalanced Data points above.** Given potential class imbalance (see 'Extra details'), suggest a value (e.g., range 5.0 - 15.0, or a specific reasoned value) likely higher than a generic default (like 1.0) to preserve minority class signals. Justify based *explicitly* on imbalance and the need for stable yet informative gradients."
        epsilon_utility_focus = "utility needs for model training (which often requires sufficient signal)"
        model_type_in_prompt = "Logistic Regression"
        clipping_task_note = "Imbalanced Data Impact: Gradients from rare class samples might be infrequent but large; aggressive clipping can disproportionately affect learning for that class."

    prompt = f"""
Analyze the provided dataset context and task to recommend **optimized and justified** Differential Privacy (DP) settings for training a {model_type_in_prompt} model using DP-SGD.
The goal is to predict the target variable '{task_config['target_variable']}'.

**Dataset Context:**
- Name: {task_config['dataset_name']}
- Domain: {task_config['data_domain']}
- Task: {task_config['task_description']}
- Schema (Original Columns): {schema_string}
- Data Shape: {data_shape} (Approx. Training N = {approx_N_train})
- Extra details: {task_config['details']} {imbalance_guidance}

**General ML Best Practices Context (Keep these in mind):**
- **Normalization/Scaling:** Features MUST be normalized or standardized for models like {model_type_in_prompt} and especially before applying gradient clipping in DP-SGD.
- **Gradient Stability & Clipping (`max_grad_norm`):** Bound influence of single points. Trade-off:
    - Too low: Clips useful info, hinders learning.
    - Too high: Less protection, more noise needed.
    - {clipping_task_note}
- **Evaluation Focus ({task_type}):** Model evaluation should focus on {target_guidance_metrics}
- **Preprocessing Order:** Typically: Split Data -> Impute -> Encode -> Scale -> Train.

**Parameter Guidance - IMPORTANT:** Base recommendations *specifically* on context. Avoid generic defaults.

1.  **`target_epsilon`**: Balance '{task_config['data_domain']}' sensitivity vs. {epsilon_utility_focus}. Justify. (Range 1.0-5.0 often considered, but justify *your* choice).
2.  **`target_delta`**: Recommend small value (e.g., 1e-5) or "1/N". Justify (e.g., N={approx_N_train}).
3.  **`max_grad_norm`**: For this {task_type} task, {max_grad_norm_focus} Justify.
4.  **`column_sensitivity_epsilon`**: Conceptual relative sensitivity hints (0.0 low, 1.0 high). Exclude target.
5.  **`reasoning`**: Detailed justification for epsilon, delta, `max_grad_norm`, linking to dataset context (domain, N, imbalance/target scale) and ML practices.

**Output Format:**
JSON ONLY with keys: "dp_algorithm", "target_epsilon", "target_delta", "max_grad_norm", "preprocessing_suggestions", "column_sensitivity_epsilon", "reasoning".

JSON Output ONLY:
"""
    return prompt



def get_gemini_config(prompt, client, model_name=None):
    """Ask Gemini for a DP configuration and return it as a parsed dict.

    Args:
        prompt: Prompt text sent as the request contents.
        client: Object exposing `models.generate_content(model=..., contents=...)`;
            a falsy value means "no client" and yields None.
        model_name: Model identifier. Defaults to the notebook-level
            GEMINI_MODEL_NAME, mirroring get_groq_config's explicit parameter.

    Returns:
        The decoded JSON object, or None when the client is missing, the call
        fails, the reply is empty, or no parseable JSON object is found. A
        reply that parses but lacks expected keys is still returned after a
        warning is printed.
    """
    if not client:
        print("Gemini client not available.")
        return None

    required_keys = (
        "dp_algorithm", "target_epsilon", "target_delta", "max_grad_norm",
        "preprocessing_suggestions", "column_sensitivity_epsilon", "reasoning",
    )

    # Bound before the try so the error handler can tell "call failed" apart
    # from "call succeeded but the reply was unusable".
    response = None
    print("\nSending request to Gemini...")
    try:
        resolved_model = GEMINI_MODEL_NAME if model_name is None else model_name
        response = client.models.generate_content(model=resolved_model, contents=prompt)
        response_text = response.text
        print("Gemini Response Received.")

        if not response_text:
            # No text at all; show the candidates, which may explain why.
            print("Error: Gemini returned an empty response.")
            print("Gemini Response Content (if available):", getattr(response, "candidates", None))
            return None

        # Extract JSON part: the outermost {...} span, tolerating surrounding
        # prose or code fences.
        start_index = response_text.find('{')
        end_index = response_text.rfind('}')
        if start_index == -1 or end_index <= start_index:
            print("Error: Could not find JSON object in Gemini response.")
            print("Raw Response:", response_text)
            return None

        config = json.loads(response_text[start_index : end_index + 1])
        print("Successfully parsed Gemini config.")
        print(config)

        # Basic validation: warn, but still hand back what was parsed.
        if not all(key in config for key in required_keys):
            print("Warning: Gemini response missing some required keys.")
        return config
    except Exception as e:
        print(f"Error during Gemini API call or parsing: {e}")
        # Only inspect the response when the call itself returned one.
        candidates = getattr(response, "candidates", None) if response is not None else None
        if candidates is not None:
            print("Gemini Response Content (if available):", candidates) # Might show safety blocks
        return None

def get_groq_config(prompt, client, model_name):
    """Request a DP configuration from Groq and return the decoded JSON.

    The prompt is sent as a single user message with JSON-object output
    requested. Returns None when no client is supplied or when the call or the
    JSON decoding raises; a reply missing expected keys is still returned after
    a warning is printed.
    """
    if not client:
        print("Groq client not available.")
        return None

    expected_keys = (
        "dp_algorithm", "target_epsilon", "target_delta", "max_grad_norm",
        "preprocessing_suggestions", "column_sensitivity_epsilon", "reasoning",
    )
    request_kwargs = {
        "messages": [{"role": "user", "content": prompt}],
        "model": model_name,
        "temperature": 0.2,
        "max_tokens": 3024,
        "top_p": 0.8,
        "response_format": {"type": "json_object"},
    }

    print("\nSending request to Groq...")
    try:
        completion = client.chat.completions.create(**request_kwargs)
        raw_reply = completion.choices[0].message.content
        print("Groq Response Received.")
        parsed_config = json.loads(raw_reply)
        print("Successfully parsed Groq config.")
        print(parsed_config)

        # Basic validation: warn only, the parsed reply is returned regardless
        if any(key not in parsed_config for key in expected_keys):
            print("Warning: Groq response missing some required keys.")
        return parsed_config
    except Exception as groq_error:
        print(f"Error during Groq API call or parsing: {groq_error}")
        return None

print("LLM Helper functions defined.")

LLM Helper functions defined.


In [17]:
# Description: Function to train and evaluate a DP model using Opacus, for classification or regression.

def _coerce_positive_float(value, fallback, label):
    """Return ``value`` as a finite float greater than zero, else ``fallback``.

    Configs produced by an LLM may carry numbers as strings (e.g. "2.5"),
    nulls or free text, so the raw value is parsed instead of being trusted.
    A missing value (``None``) falls back silently; any other unusable value
    falls back with a printed warning.
    """
    if value is None:
        return fallback
    try:
        number = float(value)
    except (TypeError, ValueError):
        number = float("nan")
    # bool is an int subclass; True/False is never a meaningful hyperparameter here.
    if isinstance(value, bool) or not np.isfinite(number) or number <= 0:
        print(f"Warning: unusable {label} {value!r} in config; using {fallback} instead.")
        return fallback
    return number


def _resolve_target_delta(raw_delta, train_loader):
    """Turn the configured delta into a concrete value.

    A number (or numeric string such as "1e-5") strictly between 0 and 1 is
    used as given. Anything else - including the "1/N" placeholder - resolves
    to 1 / len(train_loader.dataset), or to DEFAULT_TARGET_DELTA when the
    loader is falsy.
    """
    if raw_delta is not None and not isinstance(raw_delta, bool):
        try:
            delta = float(raw_delta)
        except (TypeError, ValueError):
            delta = float("nan")
        if 0.0 < delta < 1.0:  # False for NaN as well
            return delta
    if train_loader:
        return 1 / len(train_loader.dataset)
    return DEFAULT_TARGET_DELTA


def train_evaluate_dp_model(
    config, run_name, train_loader, test_loader, # test_loader yields (X_test, y_test_LOG_TRANSFORMED) for regression
    n_features, device, epochs, learning_rate,
    task_type="classification",
    pos_weight_tensor=None,       # Only for classification
    target_metric_prefix="Class 1", # For naming classification metrics
    y_test_original_scale_for_eval=None # True targets used for metrics (labels, or original-scale y for regression)
):
    """Train a model with Opacus DP-SGD and evaluate it on the test loader.

    Args:
        config: Dict of DP settings. Keys read here: "target_epsilon",
            "target_delta", "max_grad_norm", "reasoning", "llm_model_name".
            ``None`` skips the run.
        run_name: Label printed in the log and stored under "Run Type".
        train_loader: Loader handed to Opacus; its ``dataset`` length is used
            for the 1/N delta.
        test_loader: Loader iterated for predictions; only the features of each
            batch are used.
        n_features: Input width passed to ``LogisticRegression``.
        device: Device the model and batches are moved to.
        epochs: Number of passes over the private data loader (also given to
            Opacus for noise calibration).
        learning_rate: SGD learning rate.
        task_type: "classification" (BCE-with-logits loss, 0/1 predictions) or
            "regression" (MSE loss; predictions are mapped back with ``expm1``).
        pos_weight_tensor: Optional ``pos_weight`` for the classification loss.
        target_metric_prefix: Suffix used in the classification metric names.
        y_test_original_scale_for_eval: Ground truth for the metrics. Required
            for both task types.

    Returns:
        Dict with run metadata and metrics, or ``None`` if the config or the
        evaluation targets are missing, or if Opacus cannot be attached.

    Raises:
        ValueError: If ``task_type`` is not one of the two supported values.

    NOTE(review): the same ``LogisticRegression`` module is used for both tasks;
    presumably it emits a raw linear output / logit - confirm against its definition.
    """
    print(f"\n--- Running: {run_name} (Task: {task_type}) ---")
    if config is None:
        print("Skipping run due to missing configuration.")
        return None

    # Fail fast: reject unusable arguments before any training is done
    # (previously these were only detected after the full DP training loop).
    if task_type not in ("classification", "regression"):
        raise ValueError("Invalid task_type. Must be 'classification' or 'regression'.")
    if y_test_original_scale_for_eval is None:
        if task_type == "classification":
            print("Error: True labels for classification evaluation not provided (y_test_original_scale_for_eval is None).")
        else:
            print("ERROR: y_test_original_scale_for_eval is None for regression evaluation.")
        return None

    # Hyperparameters may come from an LLM, so they are parsed defensively.
    target_eps = _coerce_positive_float(config.get("target_epsilon"), DEFAULT_TARGET_EPSILON, "target_epsilon")
    max_norm = _coerce_positive_float(config.get("max_grad_norm"), DEFAULT_MAX_GRAD_NORM, "max_grad_norm")
    actual_delta = _resolve_target_delta(config.get("target_delta", "1/N"), train_loader)
    llm_reasoning = config.get("reasoning", "N/A")
    # Raw suggestion is kept verbatim so it can be compared with the value actually used.
    llm_eps_suggestion = config.get("target_epsilon", "N/A (Default used)")

    dp_model = LogisticRegression(n_features).to(device)
    dp_optimizer = optim.SGD(dp_model.parameters(), lr=learning_rate)
    privacy_engine = PrivacyEngine()
    try:
        dp_model, dp_optimizer, dp_data_loader = privacy_engine.make_private_with_epsilon(
            module=dp_model, optimizer=dp_optimizer, data_loader=train_loader,
            max_grad_norm=max_norm, target_epsilon=target_eps, target_delta=actual_delta, epochs=epochs
        )
        print(f"Opacus Attached. Target ε={target_eps:.2f}, Target δ={actual_delta:.2e}, Max Grad Norm={max_norm}")
    except Exception as e:
        print(f"Error attaching Opacus PrivacyEngine: {e}")
        return None

    if task_type == "classification":
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)
    else:
        criterion = nn.MSELoss() # Loss is calculated on log-transformed scale for regression

    print(f"Training DP Model ({task_type})...")
    dp_model.train()
    for _ in range(epochs):
        for batch_X, batch_y_train in dp_data_loader: # batch_y_train is y_log for regression
            batch_X, batch_y_train = batch_X.to(device), batch_y_train.to(device)
            dp_optimizer.zero_grad()
            outputs_train = dp_model(batch_X) # Predicts log-scale for regression
            # Align the target with the output shape: MSELoss would otherwise
            # broadcast (B,) against (B, 1) into a (B, B) loss without raising.
            if batch_y_train.shape != outputs_train.shape and batch_y_train.numel() == outputs_train.numel():
                batch_y_train = batch_y_train.reshape(outputs_train.shape)
            loss = criterion(outputs_train, batch_y_train)
            loss.backward()
            dp_optimizer.step()

    try:
        final_epsilon = privacy_engine.get_epsilon(delta=actual_delta)
        print(f"DP Training Complete. Final ε = {final_epsilon:.4f}")
    except Exception as e:
        print(f"Could not get final epsilon: {e}")
        final_epsilon = float('nan')

    print(f"Evaluating DP Model ({task_type})...")
    dp_model.eval()
    all_preds_eval_log_scale = [] # For regression, these are log-scale predictions
    all_preds_eval_final = []     # For classification, these are 0/1 predictions

    with torch.no_grad():
        for batch_X, _ in test_loader: # Only X is needed from the loader for predictions
            batch_X = batch_X.to(device)
            outputs_eval = dp_model(batch_X)
            if task_type == "classification":
                preds_final = torch.round(torch.sigmoid(outputs_eval))
                all_preds_eval_final.extend(preds_final.cpu().numpy().flatten())
            else: # regression
                all_preds_eval_log_scale.extend(outputs_eval.cpu().numpy().flatten())

    results_metrics = {}
    base_results = {
        "Run Type": run_name,
        "LLM Used": config.get("llm_model_name", "N/A"),
        "Target Epsilon": target_eps, "Final Epsilon": final_epsilon,
        "Target Delta": actual_delta, "Max Grad Norm": max_norm,
        "LLM Epsilon Suggestion": llm_eps_suggestion,
        "LLM Reasoning": llm_reasoning
    }

    if task_type == "classification":
        true_targets_for_metrics = y_test_original_scale_for_eval # Expected to hold the test labels

        accuracy_dp = accuracy_score(true_targets_for_metrics, all_preds_eval_final)
        precision_dp = precision_score(true_targets_for_metrics, all_preds_eval_final, pos_label=1, zero_division=0)
        recall_dp = recall_score(true_targets_for_metrics, all_preds_eval_final, pos_label=1, zero_division=0)
        f1_dp = f1_score(true_targets_for_metrics, all_preds_eval_final, pos_label=1, zero_division=0)
        print(f"Accuracy: {accuracy_dp:.4f}, Precision ({target_metric_prefix}): {precision_dp:.4f}, Recall ({target_metric_prefix}): {recall_dp:.4f}, F1 ({target_metric_prefix}): {f1_dp:.4f}")
        results_metrics.update({
            "Accuracy": accuracy_dp,
            f"Precision ({target_metric_prefix})": precision_dp,
            f"Recall ({target_metric_prefix})": recall_dp,
            f"F1 ({target_metric_prefix})": f1_dp
        })
    else: # regression
        # Inverse transform predictions from log(1+y) scale to original y scale
        all_preds_eval_original_scale = np.expm1(np.array(all_preds_eval_log_scale))
        all_preds_eval_original_scale = np.maximum(0, all_preds_eval_original_scale) # Ensure non-negative

        true_targets_original_scale_eval = y_test_original_scale_for_eval
        if isinstance(true_targets_original_scale_eval, pd.Series):
            true_targets_original_scale_eval = true_targets_original_scale_eval.values

        mae_dp = mean_absolute_error(true_targets_original_scale_eval, all_preds_eval_original_scale)
        mse_dp = mean_squared_error(true_targets_original_scale_eval, all_preds_eval_original_scale)
        r2_dp = r2_score(true_targets_original_scale_eval, all_preds_eval_original_scale)
        print(f"MAE (Original Scale): {mae_dp:.2f}, MSE (Original Scale): {mse_dp:.2f}, R2 Score: {r2_dp:.4f}")
        results_metrics.update({"MAE": mae_dp, "MSE": mse_dp, "R2 Score": r2_dp})

    base_results.update(results_metrics)
    print(f"{run_name} results recorded.")
    return base_results

# Cell-level confirmation that train_evaluate_dp_model above was (re)defined.
print("DP Training/Evaluation function (with task_type and y_test_original_scale_for_eval) defined/updated.")

DP Training/Evaluation function (with task_type and y_test_original_scale_for_eval) defined/updated.


In [12]:
# Description: Build the Insurance (regression) task description and the LLM prompt derived from it.

print("\n--- Preparing Config and Prompt for Insurance Regression ---")
if df_insurance is None or 'original_columns_insurance' not in locals():
    # Without the loaded frame and its column list there is nothing to describe.
    print("Skipping Insurance Regression prompt creation due to data loading error.")
    llm_prompt_insurance = None
else:
    # Facts about the data itself: column names and (rows, columns).
    schema_string_insurance = ", ".join(original_columns_insurance)
    data_shape_tuple_insurance = df_insurance.shape

    # Free-text guidance that ends up in the task config under "details".
    regression_details = f"Predict continuous medical charges ('{TARGET_COLUMN_INSURANCE}'). Focus on minimizing MAE/MSE. Target variable likely has a skewed distribution."

    task_config_insurance = dict(
        dataset_name="Kaggle Medical Cost Personal",
        data_domain="Healthcare/Finance",
        task_description=f"Train a Linear Regression model using DP-SGD to predict medical charges ('{TARGET_COLUMN_INSURANCE}').",
        target_variable=TARGET_COLUMN_INSURANCE,
        model_type="Linear Regression",  # context for the LLM
        dp_mechanism_family="DP-SGD",
        details=regression_details,
    )

    # task_type="regression" selects the regression variant of the prompt.
    llm_prompt_insurance = create_llm_prompt(
        task_config_insurance,
        schema_string_insurance,
        data_shape_tuple_insurance,
        task_type="regression",
    )
    print("Insurance Regression task config and LLM prompt prepared.")
    # To inspect the prompt: print(llm_prompt_insurance)


--- Preparing Config and Prompt for Insurance Regression ---
Insurance Regression task config and LLM prompt prepared.


In [18]:
# Description: Baseline DP run for Insurance Regression (Log-Target) using the fixed default parameters.

print("\n--- Running: Insurance Fixed DP Regression (Log-Target) ---")
# The run needs the feature count, the training loader, the insurance-specific
# default delta and the original-scale y_test produced by the preprocessing cell.
if (
    n_features_ins is None
    or train_loader_ins is None
    or 'DEFAULT_TARGET_DELTA_INS' not in locals()
    or 'y_test_original_scale_ins' not in locals()
    or y_test_original_scale_ins is None
):
    print("Skipping Insurance Fixed DP Regression (Log-Target) due to missing components (check n_features_ins, loaders, or y_test_original_scale_ins).")
else:
    # Same shape as an LLM-produced config, but filled from the notebook defaults.
    fixed_dp_config_ins_reg = dict(
        dp_algorithm="DP-SGD with Gaussian Noise (Fixed)",
        target_epsilon=DEFAULT_TARGET_EPSILON,
        target_delta=DEFAULT_TARGET_DELTA_INS,
        max_grad_norm=DEFAULT_MAX_GRAD_NORM,  # consider whether the default suits a log-scale target
        reasoning="Using standard fixed parameters for Insurance Regression (Log-Target).",
        llm_model_name="N/A (Fixed Defaults)",
        preprocessing_suggestions=["Default"],
        column_sensitivity_epsilon={"Info": "Fixed parameters"},
    )

    results_fixed_ins_reg = train_evaluate_dp_model(
        fixed_dp_config_ins_reg,
        "Insurance Fixed DP Regression (Log-Target)",
        train_loader_ins,
        test_loader_ins,
        n_features_ins,
        device,
        EPOCHS,
        LEARNING_RATE,
        task_type="regression",
        pos_weight_tensor=None,  # classification-only argument
        y_test_original_scale_for_eval=y_test_original_scale_ins,
    )
    # A falsy result means the run was skipped or failed; record only real results.
    if results_fixed_ins_reg:
        results_list.append(results_fixed_ins_reg)


--- Running: Insurance Fixed DP Regression (Log-Target) ---

--- Running: Insurance Fixed DP Regression (Log-Target) (Task: regression) ---
Opacus Attached. Target ε=1.00, Target δ=9.35e-04, Max Grad Norm=1.0
Training DP Model (regression)...
DP Training Complete. Final ε = 0.9917
Evaluating DP Model (regression)...
MAE (Original Scale): 12965.51, MSE (Original Scale): 323325780.57, R2 Score: -1.0826
Insurance Fixed DP Regression (Log-Target) results recorded.


In [20]:
# Description: Ask Gemini for a DP config for Insurance Regression (Log-Target) and run DP training with it.

print("\n--- Running: Insurance Gemini DP Regression (Log-Target) ---")
if not gemini_client:
    print("Skipping Insurance Gemini run - client not initialized.")
elif not (
    llm_prompt_insurance
    and n_features_ins is not None
    and train_loader_ins is not None
    and 'y_test_original_scale_ins' in locals()
    and y_test_original_scale_ins is not None
):
    print("Skipping Insurance Gemini run - missing components (prompt, features, loaders, or y_test_original_scale_ins).")
else:
    gemini_config_ins_reg = get_gemini_config(llm_prompt_insurance, gemini_client)
    if not gemini_config_ins_reg:
        print("Failed to get config from Gemini for Insurance Regression.")
    else:
        # Tag the config so the results table shows which model produced it.
        gemini_config_ins_reg["llm_model_name"] = GEMINI_MODEL_NAME
        results_gemini_ins_reg = train_evaluate_dp_model(
            gemini_config_ins_reg,
            "Insurance Gemini DP Regression (Log-Target)",
            train_loader_ins,
            test_loader_ins,
            n_features_ins,
            device,
            EPOCHS,
            LEARNING_RATE,
            task_type="regression",
            pos_weight_tensor=None,
            y_test_original_scale_for_eval=y_test_original_scale_ins,  # metrics use original-scale targets
        )
        # A falsy result means the run was skipped or failed; record only real results.
        if results_gemini_ins_reg:
            results_list.append(results_gemini_ins_reg)


--- Running: Insurance Gemini DP Regression (Log-Target) ---

Sending request to Gemini...
Gemini Response Received.
Successfully parsed Gemini config.
{'dp_algorithm': 'DP-SGD', 'target_epsilon': 3.0, 'target_delta': 1e-05, 'max_grad_norm': 4.0, 'preprocessing_suggestions': ['Split data into training and testing sets first.', "For categorical features ('sex', 'smoker', 'region'): Apply One-Hot Encoding. Consider using `drop='first'` to avoid multicollinearity, though scaling mitigates this for LR.", "For the target variable 'charges': Apply a log transformation (e.g., `numpy.log1p`) to handle its likely skewed distribution. This helps stabilize variance and improves linear model performance.", "For numerical features ('age', 'bmi', 'children') and the one-hot encoded categorical features: Apply scaling (e.g., `StandardScaler` to achieve mean 0 and unit variance, or `MinMaxScaler` to scale to [0,1]). Consistent scaling is crucial for Linear Regression and DP-SGD.", "For the log-transf

In [21]:

# Description: For each Groq-hosted model, request a DP config and run DP training on the insurance data.

print(f"\n--- Comparing Multiple Groq LLM Models for {DATA_FILE_INSURANCE} ---")
groq_models_to_test = [
    "llama-3.3-70b-versatile",
    "qwen-qwq-32b",
    "gemma2-9b-it",
    "deepseek-r1-distill-llama-70b"
]

# Names this cell depends on, mapped to a human-readable description for error messages.
vars_to_check = {
    'groq_client': 'Groq client',
    'llm_prompt_insurance': 'LLM prompt for insurance regression',
    'n_features_ins': 'Number of features for insurance model',
    'train_loader_ins': 'Training data loader for insurance',
    'test_loader_ins': 'Test data loader for insurance',
    'y_test_original_scale_ins': 'Original scale y_test for insurance evaluation'
}

# Report every missing prerequisite, not just the first one found.
required_vars_defined = True
for var_name, desc in vars_to_check.items():
    # .get() returns None both for an undefined name and for one bound to None.
    if locals().get(var_name) is None:
        print(f"Prerequisite Error: '{desc}' (variable: {var_name}) is not defined or is None.")
        required_vars_defined = False

if required_vars_defined:
    for groq_model_id in groq_models_to_test:
        # Run name carries the dataset, the provider and the model id.
        run_name_groq_dp_ins_reg = f"Insurance Groq ({groq_model_id}) DP Regression (Log-Target)"
        print(f"\n--- Testing Groq Model: {groq_model_id} for Insurance Regression ---")

        current_groq_config_ins_reg = get_groq_config(llm_prompt_insurance, groq_client, groq_model_id)
        if not current_groq_config_ins_reg:
            print(f"Failed to get config from Groq model {groq_model_id} for Insurance Regression.")
            continue

        # Remember which model produced this config for the results table.
        current_groq_config_ins_reg["llm_model_name"] = groq_model_id

        results_current_groq_ins_reg = train_evaluate_dp_model(
            current_groq_config_ins_reg,
            run_name_groq_dp_ins_reg,
            train_loader_ins,
            test_loader_ins,
            n_features_ins,
            device,
            EPOCHS,
            LEARNING_RATE,
            task_type="regression",
            pos_weight_tensor=None,  # classification-only argument
            y_test_original_scale_for_eval=y_test_original_scale_ins,  # metrics use original-scale targets
        )

        # Regression runs contribute MAE / MSE / R2; the results-table cell
        # fills in the classification columns they do not have.
        if results_current_groq_ins_reg:
            results_list.append(results_current_groq_ins_reg)
else:
    print("Skipping Groq model comparison for Insurance Regression due to missing prerequisites.")


--- Comparing Multiple Groq LLM Models for insurance_cleaned.csv ---

--- Testing Groq Model: llama-3.3-70b-versatile for Insurance Regression ---

Sending request to Groq...
Groq Response Received.
Successfully parsed Groq config.
{'dp_algorithm': 'DP-SGD', 'target_epsilon': 2.5, 'target_delta': 1e-05, 'max_grad_norm': 5.0, 'preprocessing_suggestions': ['Split Data', 'Impute missing values', 'One-hot encode categorical variables (sex, smoker, region)', 'Standardize features (age, bmi, children)'], 'column_sensitivity_epsilon': {'age': 0.5, 'sex': 0.8, 'bmi': 0.4, 'children': 0.6, 'smoker': 0.9, 'region': 0.7}, 'reasoning': 'The chosen target epsilon of 2.5 balances the need for accurate predictions in the healthcare/finance domain with the sensitivity of the data. A target delta of 1e-5 is selected to provide a strong privacy guarantee. The max grad norm of 5.0 is moderate, considering the scale of the target variable and the standardized features. Column sensitivity epsilons are ass

In [22]:
# Description: Tabulate every recorded run (classification and regression) in one frame.

if not results_list:
    print("No results recorded yet.")
else:
    # Column layout: run identity, privacy parameters, classification metrics,
    # regression metrics, and the (long) LLM reasoning last.
    cols_order = [
        "Run Type", "LLM Used",
        "Target Epsilon", "Final Epsilon", "Target Delta", "Max Grad Norm",
        "LLM Epsilon Suggestion",
        "Accuracy", "Precision (Stroke)", "Recall (Stroke)", "F1 (Stroke)",
        "Precision (Readmit)", "Recall (Readmit)", "F1 (Readmit)",
        "MAE", "MSE", "R2 Score",
        "LLM Reasoning"
    ]

    # reindex both orders the columns and creates any absent ones filled with NaN;
    # columns not listed in cols_order are dropped.
    results_df = pd.DataFrame(results_list).reindex(columns=cols_order)
    # Metrics that do not apply to a run (or runs that failed) show up as "N/A".
    results_df = results_df.fillna("N/A")

    pd.set_option('display.max_colwidth', 100)
    pd.set_option('display.width', 1200)  # room for the many metric columns
    print("\n--- Combined Experiment Results (Including Insurance Regression) ---")
    display(results_df)


--- Combined Experiment Results (Including Insurance Regression) ---


Unnamed: 0,Run Type,LLM Used,Target Epsilon,Final Epsilon,Target Delta,Max Grad Norm,LLM Epsilon Suggestion,Accuracy,Precision (Stroke),Recall (Stroke),F1 (Stroke),Precision (Readmit),Recall (Readmit),F1 (Readmit),MAE,MSE,R2 Score,LLM Reasoning
0,Insurance Non-DP Regression (Log-Target),,,,,,,,,,,,,,12114.291913,334369400.0,-1.153766,
1,Insurance Fixed DP Regression,N/A (Fixed Defaults),1.0,0.991737,0.000935,1.0,1.0,,,,,,,,7.375744,55.51144,-60.758069,Using standard fixed parameters for Insurance Regression.
2,Insurance Gemini DP Regression,gemini-2.5-pro-exp-03-25,2.5,2.499418,1e-05,5.0,2.5,,,,,,,,3.854968,16.31756,-17.153749,{'target_epsilon': 'A `target_epsilon` of 2.5 is chosen to balance the sensitivity of 'Healthcar...
3,Insurance Fixed DP Regression (Log-Target),N/A (Fixed Defaults),1.0,0.991737,0.000935,1.0,1.0,,,,,,,,12965.510338,323325800.0,-1.082631,Using standard fixed parameters for Insurance Regression (Log-Target).
4,Insurance Gemini DP Regression (Log-Target),gemini-2.5-pro-exp-03-25,3.0,2.992385,1e-05,4.0,3.0,,,,,,,,12836.516386,319556500.0,-1.058352,The recommended DP settings are tailored to the Kaggle Medical Cost Personal dataset for predict...
5,Insurance Groq (llama-3.3-70b-versatile) DP Regression (Log-Target),llama-3.3-70b-versatile,2.5,2.499418,1e-05,5.0,2.5,,,,,,,,12187.239588,296060200.0,-0.907005,The chosen target epsilon of 2.5 balances the need for accurate predictions in the healthcare/fi...
6,Insurance Groq (qwen-qwq-32b) DP Regression (Log-Target),qwen-qwq-32b,5.0,4.992243,1e-05,5.0,5.0,,,,,,,,12262.226502,298213400.0,-0.920875,"Target epsilon set to 5.0 balances healthcare/finance privacy needs with regression utility, lev..."
7,Insurance Groq (gemma2-9b-it) DP Regression (Log-Target),gemma2-9b-it,3.0,2.992385,1e-05,5.0,3.0,,,,,,,,12496.854989,306996000.0,-0.977446,The chosen DP settings aim to balance privacy and utility for this healthcare cost prediction ta...
8,Insurance Groq (deepseek-r1-distill-llama-70b) DP Regression (Log-Target),deepseek-r1-distill-llama-70b,2.0,1.994851,0.000935,5.0,2.0,,,,,,,,12577.803593,311918000.0,-1.00915,The target_epsilon of 2.0 balances the sensitivity of healthcare data with the need for model ac...


In [24]:
# Remove index labels 1 and 2 - in the table displayed above these are the two
# runs without the "(Log-Target)" suffix. errors="ignore" keeps the cell
# idempotent: re-running it after the rows are gone no longer raises KeyError.
results_df = results_df.drop(index=[1, 2], errors="ignore")

In [28]:
# Tag every row with the dataset name, i.e. the data file's name without its
# directory or extension. splitext/basename stay correct for paths such as
# "./data/file.v2.csv", where a plain split('.')[0] would return the wrong stem.
results_df['dataset'] = os.path.splitext(os.path.basename(DATA_FILE_INSURANCE))[0]
results_df.head(10)

Unnamed: 0,Run Type,LLM Used,Target Epsilon,Final Epsilon,Target Delta,Max Grad Norm,LLM Epsilon Suggestion,Accuracy,Precision (Stroke),Recall (Stroke),F1 (Stroke),Precision (Readmit),Recall (Readmit),F1 (Readmit),MAE,MSE,R2 Score,LLM Reasoning,dataset
0,Insurance Non-DP Regression (Log-Target),,,,,,,,,,,,,,12114.291913,334369400.0,-1.153766,,insurance_cleaned
3,Insurance Fixed DP Regression (Log-Target),N/A (Fixed Defaults),1.0,0.991737,0.000935,1.0,1.0,,,,,,,,12965.510338,323325800.0,-1.082631,Using standard fixed parameters for Insurance Regression (Log-Target).,insurance_cleaned
4,Insurance Gemini DP Regression (Log-Target),gemini-2.5-pro-exp-03-25,3.0,2.992385,1e-05,4.0,3.0,,,,,,,,12836.516386,319556500.0,-1.058352,The recommended DP settings are tailored to the Kaggle Medical Cost Personal dataset for predict...,insurance_cleaned
5,Insurance Groq (llama-3.3-70b-versatile) DP Regression (Log-Target),llama-3.3-70b-versatile,2.5,2.499418,1e-05,5.0,2.5,,,,,,,,12187.239588,296060200.0,-0.907005,The chosen target epsilon of 2.5 balances the need for accurate predictions in the healthcare/fi...,insurance_cleaned
6,Insurance Groq (qwen-qwq-32b) DP Regression (Log-Target),qwen-qwq-32b,5.0,4.992243,1e-05,5.0,5.0,,,,,,,,12262.226502,298213400.0,-0.920875,"Target epsilon set to 5.0 balances healthcare/finance privacy needs with regression utility, lev...",insurance_cleaned
7,Insurance Groq (gemma2-9b-it) DP Regression (Log-Target),gemma2-9b-it,3.0,2.992385,1e-05,5.0,3.0,,,,,,,,12496.854989,306996000.0,-0.977446,The chosen DP settings aim to balance privacy and utility for this healthcare cost prediction ta...,insurance_cleaned
8,Insurance Groq (deepseek-r1-distill-llama-70b) DP Regression (Log-Target),deepseek-r1-distill-llama-70b,2.0,1.994851,0.000935,5.0,2.0,,,,,,,,12577.803593,311918000.0,-1.00915,The target_epsilon of 2.0 balances the sensitivity of healthcare data with the need for model ac...,insurance_cleaned


In [29]:
# reset_index returns a new frame, so its result must be kept; drop=True
# discards the old, gappy labels so the exported "SrNo" column is a clean 0..n-1 sequence.
results_df = results_df.reset_index(drop=True)
results_df.to_csv('regression_multiple_llm.csv', index_label='SrNo')

In [43]:
# Description: Tabulate the recorded runs using the classification-oriented column layout.

if not results_list:
    print("No results recorded yet.")
else:
    # Display order: run identity, privacy parameters, metrics, then the LLM's suggestion and reasoning.
    cols_order = [
        "Run Type", "LLM Used", "Target Epsilon", "Final Epsilon", "Target Delta",
        "Max Grad Norm", "Accuracy", "Precision (Stroke)", "Recall (Stroke)", "F1 (Stroke)",
        "LLM Epsilon Suggestion", "LLM Reasoning"
    ]

    # reindex orders the columns and creates any absent ones filled with "N/A";
    # values missing inside existing columns are left as they are.
    results_df = pd.DataFrame(results_list).reindex(columns=cols_order, fill_value="N/A")

    pd.set_option('display.max_colwidth', 100)  # show more of the reasoning column
    pd.set_option('display.width', 1000)
    print("\n--- Combined Experiment Results ---")
    display(results_df)


--- Combined Experiment Results ---


Unnamed: 0,Run Type,LLM Used,Target Epsilon,Final Epsilon,Target Delta,Max Grad Norm,Accuracy,Precision (Stroke),Recall (Stroke),F1 (Stroke),LLM Epsilon Suggestion,LLM Reasoning
0,Diabetes Non-DP SGD,,,,,,0.606169,,,,,
1,Diabetes Gemini DP SGD,gemini-2.5-pro-exp-03-25,3.0,3.004022,1e-05,1.5,0.595428,,,,3.0,Recommendations are tailored for DP-SGD training of a Logistic Regression model on the UCI Diabe...
2,Groq (deepseek-r1-distill-llama-70b) DP SGD,deepseek-r1-distill-llama-70b,2.0,1.997348,1e-05,10.0,0.595153,0.616633,0.502479,0.553734,2.0,"The target_epsilon of 2.0 balances privacy and utility, considering the healthcare domain's sens..."
3,Diabetes Gemini DP SGD,gemini-2.5-pro-exp-03-25,3.0,3.004022,1e-05,7.5,0.608923,0.628164,0.533333,0.576877,3.0,The recommended DP settings balance the sensitivity of healthcare data with the need for model u...
4,Diabetes Non-DP SGD,,,,,,0.604792,0.628726,0.511295,0.563962,,


In [45]:
# Remove index labels 0 and 1 - in the table displayed above these are the two
# rows whose Precision/Recall/F1 cells are empty. A single non-inplace drop with
# errors="ignore" keeps the cell idempotent: re-running it no longer raises KeyError.
results_df = results_df.drop(index=[0, 1], errors="ignore")
results_df

Unnamed: 0,Run Type,LLM Used,Target Epsilon,Final Epsilon,Target Delta,Max Grad Norm,Accuracy,Precision (Stroke),Recall (Stroke),F1 (Stroke),LLM Epsilon Suggestion,LLM Reasoning
2,Groq (deepseek-r1-distill-llama-70b) DP SGD,deepseek-r1-distill-llama-70b,2.0,1.997348,1e-05,10.0,0.595153,0.616633,0.502479,0.553734,2.0,"The target_epsilon of 2.0 balances privacy and utility, considering the healthcare domain's sens..."
3,Diabetes Gemini DP SGD,gemini-2.5-pro-exp-03-25,3.0,3.004022,1e-05,7.5,0.608923,0.628164,0.533333,0.576877,3.0,The recommended DP settings balance the sensitivity of healthcare data with the need for model u...
4,Diabetes Non-DP SGD,,,,,,0.604792,0.628726,0.511295,0.563962,,


In [46]:
# Write the current results table to results2.csv; index=False omits the row labels from the file.
results_df.to_csv('results2.csv', index=False)