In [1]:
import os
import uuid
import pickle
from typing import Union, Tuple, List, Dict
import numpy as np

import pandas as pd

import mlflow

In [2]:
TRACKING_SERVER_HOST = "127.0.0.1" # fill in with the public IP
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5001")

In [3]:
mdl_path = "/home/habeeb/Mlops-proj/04-deployment/streaming/lambda_functions/model.pkl"
with open(mdl_path, "rb") as f:
    model2 = pickle.load(f)

In [4]:
model_name = "nyc-taxi-regressor-weighted-main9"
model = mlflow.pyfunc.load_model(f"models:/{model_name}@production")
print(f"✅ Loaded model: {model_name} @ production")

✅ Loaded model: nyc-taxi-regressor-weighted-main9 @ production


In [5]:
model

mlflow.pyfunc.loaded_model:
  artifact_path: model
  flavor: mlflow.sklearn
  run_id: da62ead94d284c08aca4594e17885e42

In [6]:
def generate_uuids(n):
    return [str(uuid.uuid4()) for i in range(n)] 

In [7]:
def prepare_df(data: Union[pd.DataFrame, List[Dict], Dict]) -> Tuple[pd.DataFrame, Union[np.ndarray, None]]:
    """
    Prepare taxi trip data for model input.
    
    Supports:
      - DataFrame input
      - List of dictionaries
      - Single dictionary input
    
    Handles both lpep and tpep datetime columns.
    
    Args:
        data: Input data
    
    Returns:
        Tuple of (processed DataFrame, target if available else None)
    """
    # Normalize input to DataFrame
    if isinstance(data, dict):
        df = pd.DataFrame([data])
    elif isinstance(data, list):
        df = pd.DataFrame(data)
    elif isinstance(data, pd.DataFrame):
        df = data.copy()
    else:
        raise ValueError("Input must be a DataFrame, list of dicts, or single dict.")

    # Determine pickup/dropoff column names
    if 'lpep_pickup_datetime' in df.columns and 'lpep_dropoff_datetime' in df.columns:
        pickup_col, dropoff_col = 'lpep_pickup_datetime', 'lpep_dropoff_datetime'
    elif 'tpep_pickup_datetime' in df.columns and 'tpep_dropoff_datetime' in df.columns:
        pickup_col, dropoff_col = 'tpep_pickup_datetime', 'tpep_dropoff_datetime'
    else:
        pickup_col, dropoff_col = None, None

    # Handle datetime conversion and duration calculation
    if pickup_col and dropoff_col:
        if not np.issubdtype(df[pickup_col].dtype, np.datetime64):
            df[pickup_col] = pd.to_datetime(df[pickup_col])
        if not np.issubdtype(df[dropoff_col].dtype, np.datetime64):
            df[dropoff_col] = pd.to_datetime(df[dropoff_col])

        df["duration"] = (df[dropoff_col] - df[pickup_col]).dt.total_seconds() / 60
        df = df[(df["duration"] >= 1) & (df["duration"] <= 60)]
    else:
        df["duration"] = None

    # Convert categorical columns to string if present
    for col in ["PULocationID", "DOLocationID"]:
        if col in df.columns:
            df[col] = df[col].astype(str)

    # Create combined PU_DO feature
    if "PULocationID" in df.columns and "DOLocationID" in df.columns:
        df["PU_DO"] = df["PULocationID"] + "_" + df["DOLocationID"]

    df['ride_id'] = generate_uuids(len(df))

    # Return target if fully computed
    target = df["duration"].values if df["duration"].notna().all() else None
    return df, target

def predict(features):
    preds = model.predict(features)
    return preds

In [8]:
data = {
        "lpep_pickup_datetime": "2021-02-01T08:00:00",
        "lpep_dropoff_datetime": "2021-02-01T08:15:00",
        "PULocationID": 132,
        "DOLocationID": 138,
        'trip_distance': 50
    }

In [9]:
dfs, targ = prepare_df(data)

In [10]:
dfs

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,trip_distance,duration,PU_DO,ride_id
0,2021-02-01 08:00:00,2021-02-01 08:15:00,132,138,50,15.0,132_138,fb425df3-4955-4f8e-9ae6-67b15988b43c


In [11]:
predict(dfs)

array([39.710823], dtype=float32)

In [12]:
model2.predict(dfs)

array([39.710823], dtype=float32)

In [6]:
!mlflow models --help

Usage: mlflow models [OPTIONS] COMMAND [ARGS]...

  Deploy MLflow models locally.

  To deploy a model associated with a run on a tracking server, set the
  MLFLOW_TRACKING_URI environment variable to the URL of the desired server.

Options:
  --help  Show this message and exit.

Commands:
  build-docker             Builds a Docker image whose default entrypoint...
  generate-dockerfile      Generates a directory with Dockerfile whose...
  predict                  Generate predictions in json format using a...
  prepare-env              Performs any preparation necessary to predict...
  serve                    Serve a model saved with MLflow by launching a...
  update-pip-requirements  Add or remove requirements from a model's...


In [21]:
import numpy as np
from typing import Dict, List, Union, Any
import uuid
from datetime import datetime
from typing import Optional

class MinimalDataFrame:
    """Enhanced DataFrame-like class for ColumnTransformer compatibility"""
    
    def __init__(self, data: Dict[str, List]):
        self.data = data
        self.columns = list(data.keys())
        self.shape = (len(next(iter(data.values()))) if data else 0, len(data))
        self.index = list(range(self.shape[0]))
        self._setup_dtypes()
    
    def _setup_dtypes(self):
        """Determine data types for each column"""
        self.dtypes = {}
        for col, values in self.data.items():
            if not values:
                self.dtypes[col] = 'object'
                continue
            
            # Check if all values are numeric
            sample_val = next((v for v in values if v is not None), None)
            if isinstance(sample_val, (int, float)):
                self.dtypes[col] = 'float64'
            else:
                self.dtypes[col] = 'object'
    
    def __getitem__(self, key):
        if isinstance(key, str):
            return self.data[key]
        elif isinstance(key, list):
            return MinimalDataFrame({k: self.data[k] for k in key if k in self.data})
        elif isinstance(key, tuple) and len(key) == 2:
            # Handle df[rows, cols] indexing
            rows, cols = key
            if isinstance(cols, str):
                cols = [cols]
            elif cols is None:
                cols = self.columns
            
            # Get row indices
            if isinstance(rows, slice):
                row_indices = list(range(*rows.indices(self.shape[0])))
            elif hasattr(rows, '__iter__'):
                row_indices = list(rows)
            else:
                row_indices = [rows]
            
            # Extract data
            result_data = {}
            for col in cols:
                if col in self.data:
                    result_data[col] = [self.data[col][i] for i in row_indices]
            
            return MinimalDataFrame(result_data)
        else:
            raise KeyError(f"Unsupported key type: {type(key)}")
    
    def __len__(self):
        return self.shape[0]
    
    def __iter__(self):
        """Iterate over column names"""
        return iter(self.columns)
    
    def __array__(self):
        """Convert to numpy array for sklearn compatibility - DO NOT USE"""
        # This method should NOT be called by ColumnTransformer
        # ColumnTransformer should use column selection instead
        raise NotImplementedError("ColumnTransformer should use column selection, not __array__")
    
    @property
    def values(self):
        """Return values as numpy array - only when explicitly requested"""
        if not self.data:
            return np.array([]).reshape(0, 0)
        
        # Convert all data to appropriate types
        arrays = []
        for col in self.columns:
            col_data = self.data[col]
            try:
                # Try to convert to numeric
                numeric_data = [float(x) if x is not None else np.nan for x in col_data]
                arrays.append(numeric_data)
            except (ValueError, TypeError):
                # If conversion fails, keep as object
                arrays.append(col_data)
        
        return np.column_stack(arrays) if arrays else np.array([]).reshape(self.shape)
    
    def iloc(self, row_indexer, col_indexer=None):
        if col_indexer is None:
            col_indexer = slice(None)
        
        # Handle column selection
        if isinstance(col_indexer, slice):
            selected_cols = self.columns[col_indexer]
        elif isinstance(col_indexer, (list, tuple)):
            selected_cols = [self.columns[i] if isinstance(i, int) else i for i in col_indexer]
        elif isinstance(col_indexer, int):
            selected_cols = [self.columns[col_indexer]]
        else:
            selected_cols = self.columns
        
        # Handle row selection
        if isinstance(row_indexer, slice):
            row_indices = list(range(*row_indexer.indices(self.shape[0])))
        elif hasattr(row_indexer, '__iter__') and not isinstance(row_indexer, str):
            row_indices = list(row_indexer)
        elif isinstance(row_indexer, int):
            row_indices = [row_indexer]
        else:
            row_indices = list(range(self.shape[0]))
        
        # Extract data
        result_data = {}
        for col in selected_cols:
            if col in self.data:
                result_data[col] = [self.data[col][i] for i in row_indices]
        
        return MinimalDataFrame(result_data)
    
    def loc(self, row_indexer, col_indexer=None):
        """Basic loc implementation"""
        return self.iloc(row_indexer, col_indexer)
    
    def copy(self):
        return MinimalDataFrame({k: v.copy() for k, v in self.data.items()})
    
    def select_dtypes(self, include=None, exclude=None):
        """Select columns by dtype"""
        if include is None and exclude is None:
            return self.copy()
        
        selected_cols = []
        for col in self.columns:
            dtype = self.dtypes.get(col, 'object')
            
            include_col = True
            if include is not None:
                if isinstance(include, str):
                    include = [include]
                include_col = any(inc in dtype for inc in include)
            
            if exclude is not None and include_col:
                if isinstance(exclude, str):
                    exclude = [exclude]
                include_col = not any(exc in dtype for exc in exclude)
            
            if include_col:
                selected_cols.append(col)
        
        return MinimalDataFrame({k: self.data[k] for k in selected_cols})
    
    def drop(self, columns=None, axis=1):
        """Drop columns"""
        if axis != 1:
            raise NotImplementedError("Only column dropping supported")
        
        if isinstance(columns, str):
            columns = [columns]
        
        remaining_cols = [col for col in self.columns if col not in columns]
        return MinimalDataFrame({k: self.data[k] for k in remaining_cols})
    
    def reset_index(self, drop=True):
        """Reset index (no-op for this implementation)"""
        return self.copy()

def features_to_minimal_df(features: List[Dict]) -> MinimalDataFrame:
    """Convert feature list to MinimalDataFrame"""
    if not features:
        return MinimalDataFrame({})
    
    # Get all unique keys
    all_keys = set()
    for feature in features:
        all_keys.update(feature.keys())
    
    # Create column-oriented dictionary
    df_dict = {}
    for key in all_keys:
        df_dict[key] = [feature.get(key) for feature in features]
    
    return MinimalDataFrame(df_dict)

def predict2(features):
    """Simple predict function - let ColumnTransformer handle column selection"""
    # Handle single dictionary input
    if isinstance(features, dict):
        features = [features]
    
    if not features:
        return []
    
    # Convert to MinimalDataFrame with all available columns
    df_like = features_to_minimal_df(features)
    
    # Let the ColumnTransformer select only the columns it needs
    preds = model.predict(df_like)
    return preds

import uuid
from datetime import datetime
from typing import Optional

def parse_datetime_fast(dt_str: str) -> Optional[datetime]:
    """Optimized datetime parsing with minimal try/except overhead"""
    if not dt_str:
        return None
    
    try:
        # Handle ISO format first (most common)
        if "T" in dt_str:
            return datetime.fromisoformat(dt_str.replace("Z", "+00:00"))
        else:
            # Handle space-separated format
            return datetime.strptime(dt_str, "%Y-%m-%d %H:%M:%S")
    except Exception:
        print(f"⚠️ Could not parse datetime: {dt_str}")
        return None

def calculate_duration(pickup_str: str, dropoff_str: str) -> Optional[float]:
    """Calculate duration in minutes with validation"""
    pickup_dt = parse_datetime_fast(pickup_str)
    dropoff_dt = parse_datetime_fast(dropoff_str)
    
    if not pickup_dt or not dropoff_dt:
        return None
    
    try:
        duration = (dropoff_dt - pickup_dt).total_seconds() / 60.0
        # Filter outliers (1-60 minutes)
        return duration if 1 <= duration <= 60 else None
    except Exception:
        return None

def prepare_single_feature(row: Dict) -> Optional[Dict]:
    """Process a single row efficiently"""
    # Extract datetime fields
    pickup = row.get("lpep_pickup_datetime") or row.get("tpep_pickup_datetime")
    dropoff = row.get("lpep_dropoff_datetime") or row.get("tpep_dropoff_datetime")
    
    if not pickup or not dropoff:
        return None
    
    # Calculate duration
    duration = calculate_duration(pickup, dropoff)
    if duration is None:
        return None
    
    # Create feature dict with minimal copying
    feature = {
        "duration": duration,
        "ride_id": str(uuid.uuid4())
    }
    
    # Add location IDs as strings
    if "PULocationID" in row:
        feature["PULocationID"] = str(row["PULocationID"])
    if "DOLocationID" in row:
        feature["DOLocationID"] = str(row["DOLocationID"])
    
    # Create PU_DO combination if both locations exist
    if "PULocationID" in feature and "DOLocationID" in feature:
        feature["PU_DO"] = f"{feature['PULocationID']}_{feature['DOLocationID']}"
    
    # Copy other relevant fields without deep copying
    for key in ["passenger_count", "trip_distance", "fare_amount", "total_amount"]:
        if key in row:
            feature[key] = row[key]
    
    return feature

def prepare_features_batch(data: Union[Dict, List[Dict]]) -> tuple[List[Dict], int]:
    """Optimized batch feature preparation"""
    if isinstance(data, dict):
        data = [data]
    
    features = []
    processed_count = 0
    
    for row in data:
        try:
            feature = prepare_single_feature(row)
            if feature:
                features.append(feature)
                processed_count += 1
        except Exception as e:
            print(f"⚠️ Error processing row: {str(e)}")
            continue
    
    return features, processed_count

# Usage example:
def predict_from_raw_data(data):
    """Complete pipeline: raw data -> features -> prediction"""
    # Prepare features using your existing function
    features, count = prepare_features_batch(data)
    
    if not features:
        return []
    
    # Make prediction
    return predict2(features)

In [22]:
predict_from_raw_data(data)

ValueError: X has 6 features, but ColumnTransformer is expecting 22 features as input.

In [23]:
prepare_features_batch(data)

([{'duration': 15.0,
   'ride_id': '47acc44a-4e29-4c86-b74f-78357d294eff',
   'PULocationID': '132',
   'DOLocationID': '138',
   'PU_DO': '132_138',
   'trip_distance': 50}],
 1)

In [13]:
import pickle
import os
import inspect
from typing import List, Optional

def inspect_pickle_model(model_path: str) -> dict:
    """
    Inspect a pickled scikit-learn model to extract column information.
    """
    results = {}
    
    try:
        with open(model_path, 'rb') as f:
            model = pickle.load(f)
        
        print(f"✅ Loaded model from: {model_path}")
        print(f"📊 Model type: {type(model)}")
        
        # Check if it's a Pipeline
        if hasattr(model, 'steps'):
            print(f"🔗 Pipeline detected with steps: {[step[0] for step in model.steps]}")
            results['model_type'] = 'Pipeline'
            results['steps'] = [step[0] for step in model.steps]
            
            # Look for ColumnTransformer in the pipeline
            for step_name, step_transformer in model.steps:
                print(f"\n--- Inspecting step: {step_name} ---")
                step_info = inspect_transformer(step_transformer, step_name)
                results[step_name] = step_info
        
        # Check if it's a direct ColumnTransformer
        elif hasattr(model, 'transformers'):
            print("🔧 Direct ColumnTransformer detected")
            results['model_type'] = 'ColumnTransformer'
            transformer_info = inspect_transformer(model, 'main')
            results.update(transformer_info)
        
        # Check for feature names
        if hasattr(model, 'feature_names_in_'):
            print(f"✨ Found feature_names_in_: {model.feature_names_in_}")
            results['feature_names_in'] = list(model.feature_names_in_)
        
        # Check for n_features_in_
        if hasattr(model, 'n_features_in_'):
            print(f"🔢 Number of input features: {model.n_features_in_}")
            results['n_features_in'] = model.n_features_in_
        
        return results
        
    except Exception as e:
        print(f"❌ Error inspecting model: {str(e)}")
        return {'error': str(e)}

def inspect_transformer(transformer, name: str) -> dict:
    """
    Inspect a specific transformer to extract column information.
    """
    info = {
        'type': type(transformer).__name__,
        'attributes': {}
    }
    
    print(f"  Type: {type(transformer)}")
    
    # Check for ColumnTransformer specific attributes
    if hasattr(transformer, 'transformers'):
        print("  📋 ColumnTransformer transformers:")
        transformers_info = []
        
        for trans_name, trans_obj, columns in transformer.transformers:
            print(f"    - {trans_name}: {type(trans_obj).__name__} on columns {columns}")
            transformers_info.append({
                'name': trans_name,
                'transformer_type': type(trans_obj).__name__,
                'columns': columns
            })
        
        info['transformers'] = transformers_info
        
        # Get all unique columns used
        all_columns = []
        for _, _, columns in transformer.transformers:
            if isinstance(columns, list):
                all_columns.extend(columns)
            elif columns != 'drop':  # Skip dropped transformers
                all_columns.append(columns)
        
        info['all_columns_used'] = list(set(all_columns))
        print(f"  📊 All columns used: {info['all_columns_used']}")
    
    # Check for feature names
    if hasattr(transformer, 'feature_names_in_'):
        info['feature_names_in'] = list(transformer.feature_names_in_)
        print(f"  ✨ Feature names in: {transformer.feature_names_in_}")
    
    if hasattr(transformer, 'n_features_in_'):
        info['n_features_in'] = transformer.n_features_in_
        print(f"  🔢 Number of features in: {transformer.n_features_in_}")
    
    # Check for get_feature_names_out method
    if hasattr(transformer, 'get_feature_names_out'):
        try:
            if hasattr(transformer, 'feature_names_in_') and transformer.feature_names_in_ is not None:
                feature_names_out = transformer.get_feature_names_out(transformer.feature_names_in_)
                info['feature_names_out'] = list(feature_names_out)
                print(f"  🎯 Feature names out: {feature_names_out}")
        except Exception as e:
            print(f"  ⚠️ Could not get feature_names_out: {e}")
    
    # Check other common attributes
    for attr in ['_columns', 'columns_', 'feature_names_', 'get_feature_names']:
        if hasattr(transformer, attr):
            try:
                value = getattr(transformer, attr)
                if callable(value):
                    try:
                        value = value()
                    except:
                        value = "callable (couldn't execute)"
                info['attributes'][attr] = value
                print(f"  📝 {attr}: {value}")
            except Exception as e:
                print(f"  ⚠️ Error getting {attr}: {e}")
    
    return info

def find_training_columns(model_path: str) -> Optional[List[str]]:
    """
    Try to extract the exact column names the model was trained on.
    """
    try:
        with open(model_path, 'rb') as f:
            model = pickle.load(f)
        
        # Strategy 1: Check feature_names_in_ (sklearn 1.0+)
        if hasattr(model, 'feature_names_in_'):
            return list(model.feature_names_in_)
        
        # Strategy 2: Check pipeline steps
        if hasattr(model, 'steps'):
            for step_name, step_transformer in model.steps:
                if hasattr(step_transformer, 'feature_names_in_'):
                    return list(step_transformer.feature_names_in_)
        
        # Strategy 3: For ColumnTransformer, reconstruct from transformers
        if hasattr(model, 'transformers') or (hasattr(model, 'steps') and 
            any(hasattr(step[1], 'transformers') for step in model.steps)):
            
            # Find the ColumnTransformer
            ct = model
            if hasattr(model, 'steps'):
                for step_name, step_transformer in model.steps:
                    if hasattr(step_transformer, 'transformers'):
                        ct = step_transformer
                        break
            
            if hasattr(ct, 'transformers'):
                all_columns = []
                for trans_name, trans_obj, columns in ct.transformers:
                    if columns != 'drop' and isinstance(columns, (list, tuple)):
                        all_columns.extend(columns)
                    elif isinstance(columns, str):
                        all_columns.append(columns)
                
                return all_columns
        
        print("❌ Could not determine training columns automatically")
        return None
        
    except Exception as e:
        print(f"❌ Error finding training columns: {e}")
        return None

def quick_model_check(model_path: str = "model.pkl"):
    """
    Quick check to get the essential information about your model.
    """
    print("🔍 Quick Model Analysis")
    print("=" * 50)
    
    # Check if file exists
    if not os.path.exists(model_path):
        print(f"❌ Model file not found: {model_path}")
        return
    
    # Get full inspection
    results = inspect_pickle_model(model_path)
    
    print("\n" + "=" * 50)
    print("📋 SUMMARY")
    print("=" * 50)
    
    # Try to find training columns
    training_columns = find_training_columns(model_path)
    if training_columns:
        print(f"✅ Found {len(training_columns)} training columns:")
        for i, col in enumerate(training_columns, 1):
            print(f"  {i:2d}. {col}")
        
        print(f"\n🐍 Python list format:")
        print(f"EXPECTED_COLUMNS = {training_columns}")
    else:
        print("❌ Could not automatically determine training columns")
        print("💡 You may need to check your training script or data")
    
    return results

# Example usage functions
def inspect_mlflow_model(model_name: str, tracking_uri: str = "http://127.0.0.1:5000"):
    """
    Inspect an MLflow model for column information.
    """
    try:
        import mlflow
        mlflow.set_tracking_uri(tracking_uri)
        
        # Load model
        model = mlflow.pyfunc.load_model(f"models:/{model_name}@production")
        
        print(f"✅ Loaded MLflow model: {model_name}")
        
        # Check if it has the underlying sklearn model
        if hasattr(model, '_model_impl') and hasattr(model._model_impl, 'python_model'):
            sklearn_model = model._model_impl.python_model
            print(f"📊 Underlying model type: {type(sklearn_model)}")
            
            # Use the same inspection logic
            if hasattr(sklearn_model, 'feature_names_in_'):
                print(f"✨ Found feature_names_in_: {sklearn_model.feature_names_in_}")
                return list(sklearn_model.feature_names_in_)
        
        print("❌ Could not extract column information from MLflow model")
        return None
        
    except ImportError:
        print("❌ MLflow not available")
        return None
    except Exception as e:
        print(f"❌ Error inspecting MLflow model: {e}")
        return None

if __name__ == "__main__":
    # Run the quick check
    quick_model_check(mdl_path)
    
    # If you want to check MLflow model instead:
    # inspect_mlflow_model("nyc-taxi-regressor-weighted-main9")

🔍 Quick Model Analysis
✅ Loaded model from: /home/habeeb/Mlops-proj/04-deployment/streaming/lambda_functions/model.pkl
📊 Model type: <class 'sklearn.pipeline.Pipeline'>
🔗 Pipeline detected with steps: ['preprocessor', 'regressor']

--- Inspecting step: preprocessor ---
  Type: <class 'sklearn.compose._column_transformer.ColumnTransformer'>
  📋 ColumnTransformer transformers:
    - cat: Pipeline on columns ['PU_DO']
    - num: Pipeline on columns ['trip_distance']
  📊 All columns used: ['PU_DO', 'trip_distance']
  ✨ Feature names in: ['VendorID' 'lpep_pickup_datetime' 'lpep_dropoff_datetime'
 'store_and_fwd_flag' 'RatecodeID' 'PULocationID' 'DOLocationID'
 'passenger_count' 'trip_distance' 'fare_amount' 'extra' 'mta_tax'
 'tip_amount' 'tolls_amount' 'ehail_fee' 'improvement_surcharge'
 'total_amount' 'payment_type' 'trip_type' 'congestion_surcharge'
 'duration' 'PU_DO']
  🔢 Number of features in: 22
  🎯 Feature names out: ['cat__PU_DO_100_168' 'cat__PU_DO_100_180' 'cat__PU_DO_100_190' .

In [14]:
model2

In [26]:
# Global variables for model inspection
EXPECTED_COLUMNS = None
DEFAULT_VALUES = None

def extract_model_columns_and_defaults(model_instance):
    """
    Extract expected column names from the trained model and create sensible defaults.
    """
    global EXPECTED_COLUMNS, DEFAULT_VALUES
    
    if EXPECTED_COLUMNS is not None and DEFAULT_VALUES is not None:
        return EXPECTED_COLUMNS, DEFAULT_VALUES
    
    try:
        # Strategy 1: Check feature_names_in_ (sklearn 1.0+)
        if hasattr(model_instance, 'feature_names_in_'):
            EXPECTED_COLUMNS = list(model_instance.feature_names_in_)
        
        # Strategy 2: Check pipeline steps
        elif hasattr(model_instance, 'steps'):
            for step_name, step_transformer in model_instance.steps:
                if hasattr(step_transformer, 'feature_names_in_'):
                    EXPECTED_COLUMNS = list(step_transformer.feature_names_in_)
                    break
                
                # Check for ColumnTransformer in pipeline
                if hasattr(step_transformer, 'transformers'):
                    if hasattr(step_transformer, 'feature_names_in_'):
                        EXPECTED_COLUMNS = list(step_transformer.feature_names_in_)
                        break
        
        # Strategy 3: For direct ColumnTransformer, reconstruct from transformers
        elif hasattr(model_instance, 'transformers'):
            if hasattr(model_instance, 'feature_names_in_'):
                EXPECTED_COLUMNS = list(model_instance.feature_names_in_)
        
        if EXPECTED_COLUMNS is None:
            raise ValueError("Could not extract column names from model")
        
        print(f"✅ Extracted {len(EXPECTED_COLUMNS)} expected columns from model:")
        print(f"   {EXPECTED_COLUMNS}")
        
        # Create intelligent defaults based on column names
        DEFAULT_VALUES = create_intelligent_defaults(EXPECTED_COLUMNS)
        
        return EXPECTED_COLUMNS, DEFAULT_VALUES
        
    except Exception as e:
        print(f"❌ Error extracting model columns: {str(e)}")
        raise

def create_intelligent_defaults(columns: List[str]) -> Dict:
    """
    Create intelligent default values based on column names and common patterns.
    """
    defaults = {}
    
    for col in columns:
        col_lower = col.lower()
        
        # Categorical/ID columns
        if any(keyword in col_lower for keyword in ['id', 'vendor', 'ratecode', 'payment', 'trip_type']):
            defaults[col] = 1
        
        # Boolean/flag columns
        elif any(keyword in col_lower for keyword in ['flag', 'store']):
            defaults[col] = 'N'
        
        # Count columns
        elif 'count' in col_lower:
            defaults[col] = 1
        
        # Amount/price/fee/tax columns
        elif any(keyword in col_lower for keyword in ['amount', 'fare', 'fee', 'tax', 'tip', 'toll', 'surcharge', 'extra']):
            defaults[col] = 0.0
        
        # Distance columns
        elif 'distance' in col_lower:
            defaults[col] = 1.0  # Default to 1 mile
        
        # Duration columns
        elif 'duration' in col_lower:
            defaults[col] = 10.0  # Default to 10 minutes
        
        # Location/coordinate columns
        elif any(keyword in col_lower for keyword in ['location', 'pu_do', 'pulocation', 'dolocation']):
            if col_lower == 'pu_do':
                defaults[col] = '1_1'  # Default pickup_dropoff combination
            else:
                defaults[col] = '1'  # Default location ID
        
        # Datetime columns (these should be handled separately in your data prep)
        elif any(keyword in col_lower for keyword in ['datetime', 'pickup', 'dropoff']):
            defaults[col] = None  # Will be handled by your datetime processing
        
        # Default for unknown columns
        else:
            # Try to infer from column name patterns
            if col_lower.endswith('_id') or col_lower.startswith('id'):
                defaults[col] = 1
            elif any(char.isalpha() for char in col):  # Contains letters, likely categorical
                defaults[col] = 'unknown'
            else:  # Likely numeric
                defaults[col] = 0.0
    
    print(f"📝 Created intelligent defaults:")
    for col, default in defaults.items():
        print(f"   {col}: {default}")
    
    return defaults

def create_full_feature_dict(feature: Dict, expected_columns: List[str], default_values: Dict) -> Dict:
    """
    Create a complete feature dictionary with all expected columns.
    Fill missing columns with intelligent default values.
    """
    full_feature = {}
    
    # Start with defaults for all expected columns
    for col in expected_columns:
        if col in default_values:
            full_feature[col] = default_values[col]
        else:
            # Fallback default if not in our intelligent defaults
            full_feature[col] = 0
    
    # Override with actual values from input
    for key, value in feature.items():
        if key in expected_columns:
            # Use the provided value, but handle None/empty cases
            if value is not None and value != '':
                full_feature[key] = value
            # If value is None/empty, keep the default
    
    return full_feature

def dict_to_dataframe_compatible_array(features_list: List[Dict], expected_columns: List[str], default_values: Dict) -> np.ndarray:
    """
    Convert list of feature dictionaries to a 2D numpy array compatible with sklearn ColumnTransformer.
    Returns array with shape (n_samples, n_features) where columns are in expected_columns order.
    """
    if not features_list:
        return np.array([]).reshape(0, len(expected_columns))
    
    # Ensure all features have the same structure
    structured_features = [create_full_feature_dict(f, expected_columns, default_values) for f in features_list]
    
    # Create 2D array with proper column ordering
    n_samples = len(structured_features)
    n_features = len(expected_columns)
    
    # Initialize array - we'll use object dtype to handle mixed types
    result_array = np.empty((n_samples, n_features), dtype=object)
    
    for i, feature in enumerate(structured_features):
        for j, col in enumerate(expected_columns):
            value = feature[col]
            if value is None:
                # Handle None values based on column type patterns
                if any(keyword in col.lower() for keyword in ['flag', 'store']):
                    result_array[i, j] = 'N'
                elif any(keyword in col.lower() for keyword in ['amount', 'fare', 'fee', 'tax', 'tip', 'toll', 'distance', 'duration']):
                    result_array[i, j] = 0.0
                elif any(keyword in col.lower() for keyword in ['id', 'vendor', 'ratecode', 'payment', 'trip_type', 'count']):
                    result_array[i, j] = 1
                else:
                    result_array[i, j] = 'unknown'
            else:
                result_array[i, j] = value
    
    print(f"📊 Created 2D array with shape: {result_array.shape}")
    print(f"   Columns (first 5): {expected_columns[:5]}")
    print(f"   Sample row: {result_array[0] if len(result_array) > 0 else 'No data'}")
    
    return result_array

def make_prediction_batch(features: List[Dict]) -> List[float]:
    """Make predictions for a batch of features using 2D array compatible with ColumnTransformer"""
    try:
        model_instance = model2
        
        # Extract expected columns and defaults from model on first run
        expected_columns, default_values = extract_model_columns_and_defaults(model_instance)
        
        # Convert to 2D array that's compatible with ColumnTransformer
        prediction_data = dict_to_dataframe_compatible_array(features, expected_columns, default_values)
        
        print(f"📊 Prepared prediction data:")
        print(f"   Shape: {prediction_data.shape}")
        print(f"   Expected columns: {len(expected_columns)}")
        print(f"   Provided features: {len(features)}")
        
        # Make prediction
        predictions = model_instance.predict(prediction_data)
        
        print(f"✅ Predictions successful!")
        print(f"   Predictions shape: {predictions.shape if hasattr(predictions, 'shape') else len(predictions)}")
        
        # Ensure predictions is a list
        if hasattr(predictions, 'tolist'):
            return predictions.tolist()
        elif isinstance(predictions, (list, tuple)):
            return list(predictions)
        else:
            return [float(predictions)]
            
    except Exception as e:
        print(f"❌ Prediction error: {str(e)}")
        print(f"Prediction data shape: {prediction_data.shape if 'prediction_data' in locals() else 'Not created'}")
        print(f"Available columns in features: {list(features[0].keys()) if features else 'No features'}")
        raise

# Alternative approach using pandas DataFrame (if you have pandas available)
def make_prediction_batch_with_pandas(features: List[Dict]) -> List[float]:
    """
    Alternative implementation using pandas DataFrame for maximum compatibility.
    Use this if you have pandas available and want the most reliable approach.
    """
    import pandas as pd
    
    try:
        model_instance = model2
        
        # Extract expected columns and defaults from model on first run
        expected_columns, default_values = extract_model_columns_and_defaults(model_instance)
        
        # Create list of complete feature dictionaries
        complete_features = [create_full_feature_dict(f, expected_columns, default_values) for f in features]
        
        # Convert to DataFrame with proper column order
        df = pd.DataFrame(complete_features, columns=expected_columns)
        
        print(f"📊 Created DataFrame:")
        print(f"   Shape: {df.shape}")
        print(f"   Columns: {list(df.columns)}")
        print(f"   Sample row:\n{df.iloc[0] if len(df) > 0 else 'No data'}")
        
        # Make prediction
        predictions = model_instance.predict(df)
        
        print(f"✅ Predictions successful!")
        
        # Ensure predictions is a list
        if hasattr(predictions, 'tolist'):
            return predictions.tolist()
        elif isinstance(predictions, (list, tuple)):
            return list(predictions)
        else:
            return [float(predictions)]
            
    except Exception as e:
        print(f"❌ Prediction error: {str(e)}")
        raise

In [22]:
minimal_data = {
    "data": [{
        "PULocationID": "100", 
        "DOLocationID": "200",
        "trip_distance": 2.5,
        "lpep_pickup_datetime": "2024-01-01 10:00:00",
        "lpep_dropoff_datetime": "2024-01-01 10:15:00"
    }]
}

In [27]:
make_prediction_batch(minimal_data['data'])

✅ Extracted 22 expected columns from model:
   ['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime', 'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID', 'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge', 'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge', 'duration', 'PU_DO']
📝 Created intelligent defaults:
   VendorID: 1
   lpep_pickup_datetime: None
   lpep_dropoff_datetime: None
   store_and_fwd_flag: N
   RatecodeID: 1
   PULocationID: 1
   DOLocationID: 1
   passenger_count: 1
   trip_distance: 1.0
   fare_amount: 0.0
   extra: 0.0
   mta_tax: 0.0
   tip_amount: 0.0
   tolls_amount: 0.0
   ehail_fee: 0.0
   improvement_surcharge: 0.0
   total_amount: 0.0
   payment_type: 1
   trip_type: 1
   congestion_surcharge: 0.0
   duration: 10.0
   PU_DO: 1_1
📊 Created 2D array with shape: (1, 22)
   Columns (first 5): ['VendorID', 'lpep_pickup_datetime', 'lpep_dro

ValueError: Specifying the columns using strings is only supported for dataframes.