# Medicost - Model Development & Optimization Pipeline
### Machine Learning Model Development

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the raw insurance records and preview the first five rows
df = pd.read_csv(filepath_or_buffer='../data/insurance.csv')
df.head(n=5)

In [None]:
# DATA LOADING AND PREPARATION
# Read the insurance CSV and print a short structural summary of the frame.
df = pd.read_csv('../data/insurance.csv')

# Collect the summary lines first, then emit them in one go
summary_lines = [
    "Dataset Info:",
    f"Shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    f"Missing values: {df.isnull().sum().sum()}",
]
print("\n".join(summary_lines))
print(df.head())


In [None]:
# DATA PREPROCESSING
# Encode the categorical columns and derive bucketed features for modelling.

# Work on a copy so the raw frame `df` stays untouched
data = df.copy()

print("🔄 Encoding categorical variables...")

# Fit one LabelEncoder per categorical column. The fitted encoders are kept
# in `label_encoders` so the same mapping can be re-applied to new inputs.
categorical_columns = ['sex', 'smoker', 'region']
label_encoders = {
    col: LabelEncoder().fit(data[col]) for col in categorical_columns
}
for col, encoder in label_encoders.items():
    data[col] = encoder.transform(data[col])

print("🔄 Creating engineered features...")

# BMI buckets (pd.cut bins are right-closed by default):
# (0, 18.5], (18.5, 25], (25, 30], (30, inf) -> codes 0..3
bmi_edges = [0, 18.5, 25, 30, float('inf')]
bmi_bucket = pd.cut(data['bmi'], bins=bmi_edges, labels=[0, 1, 2, 3])
data['bmi_category'] = bmi_bucket.astype(int)

# Age buckets: (0, 25], (25, 35], (35, 50], (50, 65], (65, inf) -> codes 0..4
age_edges = [0, 25, 35, 50, 65, float('inf')]
age_bucket = pd.cut(data['age'], bins=age_edges, labels=[0, 1, 2, 3, 4])
data['age_group'] = age_bucket.astype(int)

print("✅ Data preprocessing completed!")

In [None]:
# FEATURE PREPARATION AND DATA SPLITTING
# Build the feature matrix and target vector, then hold out a test partition.

# 'charges' is the target; every remaining column is a feature
y = data['charges']
X = data.drop('charges', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"✅ Data split: {X_train.shape[0]} train samples, {X_test.shape[0]} test samples")


In [None]:
# FEATURE SCALING
# Standardize the features using statistics learned from the training rows only.

# Fit once on the training partition ...
scaler = StandardScaler().fit(X_train)

# ... then apply the identical transform to both partitions
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✅ Feature scaling completed!")

In [None]:
# MODEL TRAINING
# Fit every candidate regressor and record its train/test metrics.

print("🔄 Training models...")

# Candidate regressors keyed by their display name
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(
        n_estimators=100, max_depth=10, random_state=42, n_jobs=-1
    ),
    'Gradient Boosting': GradientBoostingRegressor(
        n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42
    ),
}

# Per-model metrics and the fitted estimators, both keyed by display name
results = {}
trained_models = {}

for name, model in models.items():
    print(f"   Training {name}...")

    # Fit on the scaled training data, then predict on both partitions
    model.fit(X_train_scaled, y_train)
    train_pred = model.predict(X_train_scaled)
    test_pred = model.predict(X_test_scaled)

    # Train R² is kept alongside the test metrics so the train/test gap
    # can be reported later
    results[name] = {
        'train_r2': r2_score(y_train, train_pred),
        'test_r2': r2_score(y_test, test_pred),
        'test_rmse': np.sqrt(mean_squared_error(y_test, test_pred)),
        'test_mae': mean_absolute_error(y_test, test_pred),
    }
    trained_models[name] = model

print("✅ All models trained successfully!")

In [None]:
# MODEL COMPARISON AND SELECTION
# Compare all models and select the best performer.
#
# The winner is chosen by 5-fold cross-validated R² on the TRAINING data, not
# by test-set R². Picking the model on the test set and then reporting that
# same test score makes the reported figure optimistically biased; keeping the
# test set out of the selection step leaves it as an honest final estimate.

print("📊 Model Performance Comparison:")
print("=" * 60)

# Cross-validated R² per model. cross_val_score clones each estimator, so the
# already-fitted objects in `trained_models` are left untouched.
# NOTE(review): X_train_scaled was scaled with statistics from the full
# training set, so the CV folds share scaling statistics — acceptable for
# ranking models here, but a Pipeline would be stricter.
cv_r2 = {
    name: cross_val_score(
        trained_models[name], X_train_scaled, y_train, cv=5, scoring='r2'
    ).mean()
    for name in results
}

# One row per model; 'Overfitting' is the train/test R² gap
comparison_df = pd.DataFrame({
    name: {
        'Test R²': metrics['test_r2'],
        'Test RMSE': metrics['test_rmse'],
        'Test MAE': metrics['test_mae'],
        'Overfitting': metrics['train_r2'] - metrics['test_r2'],
        'CV R²': cv_r2[name],
    }
    for name, metrics in results.items()
}).T

print(comparison_df.round(4))

# Select best model (highest cross-validated R²)
best_model_name = comparison_df['CV R²'].idxmax()
best_model = trained_models[best_model_name]

print(f"\n🏆 BEST MODEL: {best_model_name}")
print(f"   CV R² (selection metric): {comparison_df.loc[best_model_name, 'CV R²']:.4f}")
print(f"   Accuracy (R²): {comparison_df.loc[best_model_name, 'Test R²']:.4f} ({comparison_df.loc[best_model_name, 'Test R²']*100:.2f}%)")
print(f"   RMSE: ${comparison_df.loc[best_model_name, 'Test RMSE']:.2f}")
print(f"   MAE: ${comparison_df.loc[best_model_name, 'Test MAE']:.2f}")


In [None]:
# PREDICTION FUNCTION
# Create a function to make predictions for new data

def predict_insurance_cost(age, sex, bmi, children, smoker, region):
    """Predict the insurance charge for a single new customer.

    Re-applies the training-time preprocessing to one record: label-encodes
    the categorical fields with the fitted ``label_encoders``, derives the
    ``bmi_category`` and ``age_group`` buckets, scales with the fitted
    ``scaler`` and predicts with ``best_model``.

    Args:
        age: Age of the customer; must be greater than 0.
        sex: Raw category label, spelled as the fitted encoder saw it.
        bmi: Body-mass index; must be greater than 0.
        children: Number of children.
        smoker: Raw category label, spelled as the fitted encoder saw it.
        region: Raw category label, spelled as the fitted encoder saw it.

    Returns:
        The predicted charge as a float.

    Raises:
        ValueError: If ``age`` or ``bmi`` is not a positive number, or if a
            categorical value was not seen when the encoders were fitted.
    """
    # The buckets below are right-closed and start at 0, so a non-positive
    # (or NaN) value lands in no bucket and the int cast would fail with an
    # unhelpful error. Reject such values up front instead.
    for field, value in (('age', age), ('bmi', bmi)):
        if not value > 0:
            raise ValueError(f"{field} must be a positive number, got {value!r}")

    # Create input DataFrame
    input_data = pd.DataFrame({
        'age': [age],
        'sex': [sex],
        'bmi': [bmi],
        'children': [children],
        'smoker': [smoker],
        'region': [region]
    })

    # Apply same preprocessing as training data
    for col, encoder in label_encoders.items():
        raw_value = input_data[col].iloc[0]
        known_values = encoder.classes_.tolist()
        if raw_value not in known_values:
            raise ValueError(
                f"Unknown {col} value {raw_value!r}; expected one of {known_values}"
            )
        input_data[col] = encoder.transform(input_data[col])

    # Add engineered features (same bin edges as used on the training data)
    input_data['bmi_category'] = pd.cut(input_data['bmi'],
                                      bins=[0, 18.5, 25, 30, float('inf')],
                                      labels=[0, 1, 2, 3]).astype(int)

    input_data['age_group'] = pd.cut(input_data['age'],
                                   bins=[0, 25, 35, 50, 65, float('inf')],
                                   labels=[0, 1, 2, 3, 4]).astype(int)

    # Put the columns in the order the scaler was fitted with, when the scaler
    # recorded it, rather than relying on the dict order above matching
    expected_columns = getattr(scaler, 'feature_names_in_', None)
    if expected_columns is not None:
        input_data = input_data[list(expected_columns)]

    # Scale features
    input_scaled = scaler.transform(input_data)

    # Make prediction; unwrap the single-row result into a plain float
    return float(best_model.predict(input_scaled)[0])

print("✅ Prediction function created!")

In [None]:
# FEATURE IMPORTANCE ANALYSIS
# Rank the input features by the importance scores the selected model exposes.

if not hasattr(best_model, 'feature_importances_'):
    # Only some estimators expose `feature_importances_`
    print("Selected model doesn't support feature importance analysis")
else:
    feature_names = X_train.columns
    importances = best_model.feature_importances_

    # Pair each feature with its score, most important first
    feature_importance_df = pd.DataFrame(
        {'Feature': feature_names, 'Importance': importances}
    ).sort_values('Importance', ascending=False)

    print("📊 Feature Importance Ranking:")
    print("=" * 40)
    ranked_pairs = zip(
        feature_importance_df['Feature'], feature_importance_df['Importance']
    )
    for feature, importance in ranked_pairs:
        print(f"{feature:15} | {importance:.4f}")


In [None]:

# EXAMPLE PREDICTIONS AND TESTING
# Run the prediction function on a few hand-picked customer profiles.

print("🔮 Example Predictions:")
print("=" * 40)

# Each tuple is (age, sex, bmi, children, smoker, region)
test_cases = [
    (25, 'female', 22.0, 0, 'no', 'northeast'),
    (45, 'male', 30.0, 2, 'yes', 'southeast'),
    (35, 'female', 25.0, 1, 'no', 'northwest'),
]

# Display labels, positionally matched to `test_cases`
case_names = ['Young Non-Smoker', 'Middle-aged Smoker', 'Average Case']

for label, case in zip(case_names, test_cases):
    prediction = predict_insurance_cost(*case)
    print(f"{label:18} | ${prediction:.2f}")

In [None]:
# MODEL VALIDATION AND FINAL RESULTS
# Report the selected model's test-set metrics and a short error summary.

# Predict the held-out test rows with the selected model
final_predictions = best_model.predict(X_test_scaled)

# Test-set metrics for the selected model
final_r2 = r2_score(y_test, final_predictions)
final_mae = mean_absolute_error(y_test, final_predictions)
final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))

performance_report = [
    "🎯 FINAL MODEL PERFORMANCE:",
    "=" * 50,
    f"Model: {best_model_name}",
    f"Accuracy (R²): {final_r2:.4f} ({final_r2*100:.2f}%)",
    f"RMSE: ${final_rmse:.2f}",
    f"MAE: ${final_mae:.2f}",
]
print("\n".join(performance_report))

# Signed errors (prediction minus actual): the mean shows systematic bias,
# the standard deviation shows spread
errors = final_predictions - y_test
error_report = [
    "\nError Analysis:",
    f"Mean Error: ${np.mean(errors):.2f}",
    f"Std Error: ${np.std(errors):.2f}",
]
print("\n".join(error_report))

closing_report = [
    "\n🎉 ML PIPELINE COMPLETED!",
    f"Best Model: {best_model_name} with {final_r2*100:.2f}% accuracy",
]
print("\n".join(closing_report))


In [None]:
# SAVE MODEL (Optional)
# Persist the selected model together with the fitted preprocessing objects
# (scaler and label encoders) so predictions can be reproduced later.

from pathlib import Path

import joblib

# Create the output directory first: joblib.dump does not create missing
# parent directories and would otherwise fail with FileNotFoundError.
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# Save the model and preprocessing objects
joblib.dump(best_model, models_dir / 'insurance_model.pkl')
joblib.dump(scaler, models_dir / 'scaler.pkl')
joblib.dump(label_encoders, models_dir / 'encoders.pkl')

print("💾 Model saved successfully!")
print("Files: insurance_model.pkl, scaler.pkl, encoders.pkl")

# To load later (same paths as used for saving above):
# model = joblib.load('../models/insurance_model.pkl')
# scaler = joblib.load('../models/scaler.pkl')
# encoders = joblib.load('../models/encoders.pkl')