# ML Model Training: Predicting sepal.length

This notebook was auto-generated by Intent2Model.

**Task:** regression
**Target Column:** sepal.length
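**Model:** ridge (note: the generated code in section 5 instantiates `LinearRegression`, i.e. the plan's `linear_regression` baseline, not `Ridge`)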


## PLANNING SOURCE

**Planning Method:** FALLBACK
**Error:** Gemini API error: All Gemini models hit rate limits. Tried: gemini-2.5-flash, gemini-2.5-flash-lite, gemini-flash-latest, gemini-2.5-pro. Please wait or provide a different API key. Last error: Model gemini-1.5-pro not found

**Target Confidence:** 0.95
**Task Confidence:** 0.90
**Plan Quality:** High Confidence



## STEP 0 — TASK INFERENCE

⚠️ Rule-based fallback task inference (LLM unavailable). Low confidence.


## STEP 1 — DATASET INTELLIGENCE

⚠️ Rule-based fallback dataset intelligence (LLM unavailable). Limited analysis.


## STEP 2 — TRANSFORMATION STRATEGY

⚠️ Rule-based fallback transformation strategy (LLM unavailable). Conservative defaults.


## STEP 3 — MODEL CANDIDATE SELECTION

⚠️ Rule-based fallback model selection (LLM unavailable). Baseline models only.


## STEP 4 — TRAINING & VALIDATION

Use cross-validation by default with task-appropriate metrics.


## STEP 5 — ERROR & BEHAVIOR ANALYSIS

Analyze residuals/confusion matrix and error slices.


## STEP 6 — EXPLAINABILITY

Use feature_importances_ when available and align post-encoding names.


## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults shared by every figure in this notebook:
# seaborn's white grid theme and a 12x6 inch default canvas.
plt.rcParams.update({'figure.figsize': (12, 6)})
sns.set_style('whitegrid')

## 2. Load Data

In [None]:
# Load your dataset.
# On a fresh kernel, uncomment the next line and point it at your CSV file:
# df = pd.read_csv('your_dataset.csv')

# This generated notebook expects a DataFrame named `df` to exist already
# (the uploaded data). Nothing above assigns it, so fail early with an
# actionable message instead of an unexplained NameError on the first use.
if 'df' not in globals():
    raise NameError(
        "`df` is not defined. Load your dataset first, e.g. "
        "df = pd.read_csv('your_dataset.csv')"
    )

print(f'Dataset shape: {df.shape}')
print(f'Columns: {list(df.columns)}')
df.head()

## 3. Prepare Data

In [None]:
# Target series first; every other column of `df` becomes a candidate feature.
y = df['sepal.length']
X = df.drop(columns=['sepal.length'])

# Handle categorical target if needed
# (nothing to do here: the target is used as-is for regression)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f'Training set: {X_train.shape}')
print(f'Test set: {X_test.shape}')

## 4. Build Preprocessing Pipeline (from AutoMLPlan)

In [None]:
# Preprocessing compiled from AutoMLPlan
# Each feature transform is based on plan.feature_transforms

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer


def _make_one_hot_encoder(min_frequency=5):
    """Return a dense OneHotEncoder that ignores categories unseen during fit.

    The keyword spelling of OneHotEncoder changed across scikit-learn
    releases, so the argument sets are tried from newest to oldest:
    `sparse_output` + `min_frequency`, then the legacy `sparse` keyword with
    `min_frequency`, then `sparse` alone. The previous inline fallback retried
    with `sparse_output` again, so it could never succeed on a release that
    had already rejected that keyword.

    Parameters
    ----------
    min_frequency : int, default 5
        Forwarded to OneHotEncoder when the installed release accepts it.

    Raises
    ------
    TypeError
        If none of the keyword combinations is accepted.
    """
    candidate_kwargs = (
        {'sparse_output': False, 'min_frequency': min_frequency},
        {'sparse': False, 'min_frequency': min_frequency},
        {'sparse': False},
    )
    last_error = None
    for kwargs in candidate_kwargs:
        try:
            return OneHotEncoder(handle_unknown='ignore', **kwargs)
        except TypeError as exc:
            # Unknown constructor keyword on this release: try the older spelling.
            last_error = exc
    raise last_error


transformers = []

# Numeric features without scaling (from plan): ['sepal.width', 'petal.length', 'petal.width']
num_plain_cols = ['sepal.width', 'petal.length', 'petal.width']
if num_plain_cols:
    transformers.append((
        'num_plain',
        'passthrough',
        num_plain_cols
    ))

# Categorical features: one-hot encoding (from plan): ['variety']
cat_onehot_cols = ['variety']
if cat_onehot_cols:
    transformers.append((
        'cat_onehot',
        Pipeline([('onehot', _make_one_hot_encoder())]),
        cat_onehot_cols
    ))

# Create preprocessor from plan-driven transformers

if len(transformers) == 0:
    # ⚠️ WARNING: No transformers generated from plan.feature_transforms! Using runtime fallback.
    # Only reached when both plan column lists above are empty.
    numeric_cols = ['sepal.width', 'petal.length', 'petal.width']
    transformers.append((
        'num_scaled',
        Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]),
        numeric_cols
    ))
    categorical_cols = ['variety']
    transformers.append((
        'cat_onehot',
        Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', _make_one_hot_encoder())]),
        categorical_cols
    ))

# Columns not named in any transformer are dropped rather than passed through.
preprocessor = ColumnTransformer(transformers, remainder='drop')



## 5. Build Model (from AutoMLPlan)

In [None]:
# Estimator chosen from AutoMLPlan.model_candidates
#
# Plan selection : linear_regression
# Plan rationale : Baseline linear model for calibration.
#
# NOTE(review): the notebook header lists the model as "ridge", but the plan
# compiled into this cell is ordinary least squares. Confirm which is intended.

from sklearn.linear_model import LinearRegression

# Default-configured ordinary least squares regressor.
model = LinearRegression()



## 6. Assemble Pipeline (from AutoMLPlan)

In [None]:
# Chain the plan-driven preprocessing and the estimator into one object so
# that fit/predict always apply the same column transforms.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])



## 7. Train Model

In [None]:
# Metrics compiled from AutoMLPlan
# Primary metric: rmse
# Additional metrics: ['mae', 'r2']
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

# Cross-validated estimate of the primary metric on the training split only
# (Step 4 of the plan: "Use cross-validation by default"). cross_val_score
# fits clones, so `pipeline` itself is still unfitted after this call.
n_cv_folds = min(5, len(X_train))
if n_cv_folds >= 2:
    cv_rmse = -cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=n_cv_folds,
        scoring='neg_root_mean_squared_error',  # scorer is negated RMSE, hence the sign flip
    )
    print(f'CV RMSE ({n_cv_folds}-fold, training split): {cv_rmse.mean():.4f} ± {cv_rmse.std():.4f}')
else:
    print('Cross-validation skipped: fewer than 2 training rows.')

# Train the model on the full training split
pipeline.fit(X_train, y_train)

# Make predictions on the held-out test split
y_pred = pipeline.predict(X_test)

# Primary metric: RMSE, computed as the square root of the MSE
primary_score = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Primary Metric (RMSE): {primary_score:.4f}')

# Additional metrics from plan:
print(f'MAE: {mean_absolute_error(y_test, y_pred):.4f}')
print(f'R²: {r2_score(y_test, y_pred):.4f}')


## 8. Feature Importance (from plan.explainability_md)

In [None]:
# Get feature importance (aligned with plan)
# Only estimators exposing `feature_importances_` are handled here; for any
# other estimator the cell just reports that importances are unavailable.
fitted_model = pipeline.named_steps['model']
if hasattr(fitted_model, 'feature_importances_'):
    importances = fitted_model.feature_importances_

    # Ask the fitted ColumnTransformer for its output column names. This keeps
    # the names in the same order as the matrix handed to the model and leaves
    # out dropped / remainder columns, which a manual walk over `transformers_`
    # would otherwise include.
    # NOTE: Do NOT use numeric_cols or categorical_cols - they may not be defined
    fitted_preprocessor = pipeline.named_steps['preprocessor']
    try:
        feature_names = list(fitted_preprocessor.get_feature_names_out())
    except Exception as exc:
        # Best effort: the plot is still useful with placeholder names, but
        # say why the real names are missing instead of hiding the failure.
        print(f'Could not recover feature names ({type(exc).__name__}: {exc}); using generic names.')
        feature_names = [f'feature_{i}' for i in range(len(importances))]

    # A length mismatch means names and importances cannot be paired reliably;
    # use placeholders rather than silently truncating to a misaligned list.
    if len(feature_names) != len(importances):
        print(
            f'Feature name count ({len(feature_names)}) does not match '
            f'importance count ({len(importances)}); using generic names.'
        )
        feature_names = [f'feature_{i}' for i in range(len(importances))]

    # Create importance DataFrame, most important feature first
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)

    # Plot the ten largest importances
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=importance_df.head(10), x='importance', y='feature', ax=ax)
    ax.set_title('Top 10 Feature Importance')
    fig.tight_layout()
    plt.show()

    print(importance_df)
else:
    print('Feature importance not available for this model type.')

## 9. Save Model

In [None]:
# Save the trained model
with open('model.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

print('Model saved successfully!')

## 10. Make Predictions

In [None]:
# Load model for predictions
# with open('model.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)

# Example prediction
# new_data = pd.DataFrame({...})
# prediction = loaded_model.predict(new_data)
# print(f'Prediction: {prediction}')