# Train Prediction Model

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F  # Add this import
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


def verify_columns(df):
    """Verify and print available columns"""
    print("\nAvailable columns in dataset:")
    for col in df.columns:
        print(f"- {col}")
    return df.columns.tolist()


def preprocess_data(df):
    """Preprocess data with proper type conversion"""
    df = df.copy()

    # 1. Handle categorical features
    cat_cols = ['Gender', 'Health_Status', 'Marital_Status', 'Claim_History']
    label_encoders = {}

    for col in cat_cols:
        label_encoders[col] = LabelEncoder()
        df[col] = label_encoders[col].fit_transform(df[col].astype(str))

    # 2. Handle numeric columns with proper error handling
    numeric_cols = ['Age', 'Annual_Income', 'Sum_Assured', 'Premium']
    for col in numeric_cols:
        try:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            # Fill numeric nulls with median
            df[col] = df[col].fillna(df[col].median())
        except Exception as e:
            print(f"Error converting {col}: {str(e)}")

    # 3. Drop problematic columns
    cols_to_drop = [
        'Current_Insurance',
        'Previous_Policies',
        'Family_Details'  # Will handle family details separately if needed
    ]
    df = df.drop(columns=cols_to_drop, errors='ignore')

    # 4. Feature engineering (only after numeric conversion)
    df['Income_to_Age'] = df['Annual_Income'] / df['Age']
    df['Premium_to_Income'] = df['Premium'] / df['Annual_Income']

    # 5. Scale numeric features
    num_cols_to_scale = [
        'Age',
        'Annual_Income',
        'Sum_Assured',
        'Income_to_Age',
        'Premium_to_Income'
    ]

    # Replace infinities before scaling
    df = df.replace([np.inf, -np.inf], np.nan)
    df[num_cols_to_scale] = df[num_cols_to_scale].fillna(
        df[num_cols_to_scale].median())

    scaler = RobustScaler()
    df[num_cols_to_scale] = scaler.fit_transform(df[num_cols_to_scale])

    return df, scaler, label_encoders


class InsuranceDataset(Dataset):
    def __init__(self, features, targets):
        # Ensure all inputs are float32
        self.features = torch.FloatTensor(features.astype(np.float32))
        self.targets = torch.FloatTensor(
            targets.astype(np.float32)).reshape(-1, 1)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]


class ImprovedInsuranceModel(nn.Module):
    def __init__(self, input_size):
        super().__init__()

        # Print input size for debugging
        print(f"Model input size: {input_size}")

        # Layer architecture
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.fc4 = nn.Linear(16, 1)

        # Batch normalization
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(32)
        self.bn3 = nn.BatchNorm1d(16)

        # Dropout for regularization
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        # First layer
        x1 = self.fc1(x)
        x1 = self.bn1(x1)
        x1 = F.relu(x1)
        x1 = self.dropout(x1)

        # Second layer with residual
        x2 = self.fc2(x1)
        x2 = self.bn2(x2)
        x2 = F.relu(x2)
        x2 = self.dropout(x2)
        x2 = x2 + x1[:, :32]  # Residual connection

        # Third layer
        x3 = self.fc3(x2)
        x3 = self.bn3(x3)
        x3 = F.relu(x3)
        x3 = self.dropout(x3)

        # Output layer
        out = self.fc4(x3)
        return F.relu(out)  # Ensure positive output


def train_model(model, train_loader, val_loader, criterion, optimizer, device, scaler, encoders, epochs=1000):
    model = model.to(device)
    best_val_loss = float('inf')
    patience = 15
    counter = 0

    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
            optimizer.step()

            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                outputs = model(batch_X)
                val_loss += criterion(outputs, batch_y).item()

        val_loss /= len(val_loader)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            counter = 0
            torch.save({
                'model_state_dict': model.state_dict(),
                'scaler': scaler,
                'encoders': encoders,
                'input_size': model.fc1.in_features
            }, 'insurance_model.pth')
        else:
            counter += 1

        if counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss /
                  len(train_loader):.4f}, Val Loss: {val_loss:.4f}")


def calculate_sum_assured(age, annual_income):
    """More conservative sum assured calculation"""
    if age < 30:
        multiplier = 15
    elif age < 40:
        multiplier = 12
    elif age < 50:
        multiplier = 8
    else:
        multiplier = 5
    return min(annual_income * multiplier, 30000000)  # Cap at 3 Cr


def main():
    try:
        # Load data with proper encoding
        df = pd.read_csv(
            'K:/Insurance-Prediction/insurance_data.csv', encoding='utf-8')

        # Print data info for debugging
        print("\nDataset Info:")
        print(df.info())

        processed_df, scaler, encoders = preprocess_data(df)

        # Verify processed data
        print("\nProcessed columns:", processed_df.columns.tolist())
        print("Processed dtypes:", processed_df.dtypes)

        # Split features and target
        X = processed_df.drop('Premium', axis=1).values  # Convert to numpy
        y = processed_df['Premium'].values  # Convert to numpy

        print(f"Features shape: {X.shape}")
        print(f"Target shape: {y.shape}")

        # Train/val split
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Create datasets
        train_dataset = InsuranceDataset(X_train, y_train)
        val_dataset = InsuranceDataset(X_val, y_val)

        # Create loaders
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=32)

        # Initialize model
        model = ImprovedInsuranceModel(input_size=X.shape[1])
        criterion = nn.MSELoss()
        optimizer = torch.optim.AdamW(
            model.parameters(), lr=0.001, weight_decay=0.01
        )

        # Train model
        train_model(model, train_loader, val_loader,
                    criterion, optimizer, device, scaler, encoders)

        # Save model components
        torch.save({
            'model_state_dict': model.state_dict(),
            'scaler': scaler,
            'encoders': encoders
        }, 'insurance_model.pth')

        return model, scaler, encoders

    except Exception as e:
        print(f"Error in main(): {str(e)}")
        import traceback
        traceback.print_exc()
        return None, None, None


if __name__ == "__main__":
    model, scaler, encoders = main()


Using device: cuda

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                2000 non-null   int64  
 1   Gender             2000 non-null   object 
 2   Health_Status      2000 non-null   object 
 3   Marital_Status     2000 non-null   object 
 4   Current_Insurance  1664 non-null   object 
 5   Previous_Policies  2000 non-null   int64  
 6   Claim_History      2000 non-null   object 
 7   Annual_Income      1950 non-null   float64
 8   Premium            2000 non-null   int64  
 9   Sum_Assured        2000 non-null   int64  
 10  Family_Details     1613 non-null   object 
dtypes: float64(1), int64(4), object(6)
memory usage: 172.0+ KB
None

Processed columns: ['Age', 'Gender', 'Health_Status', 'Marital_Status', 'Claim_History', 'Annual_Income', 'Premium', 'Sum_Assured', 'Income_to_Age', 'Premium_to_Inco

# Sample Prediction

In [13]:
class ImprovedInsuranceModel(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        
        self.shared = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU()
        )
        
        # Single output for premium
        self.premium_head = nn.Linear(32, 1)
        
    def forward(self, x):
        features = self.shared(x)
        premium = self.premium_head(features)
        return F.relu(premium)  # Ensure positive premium

def prepare_features(age, gender, health_status, marital_status, 
                    annual_income, claim_history, scaler, encoders):
    """Prepare input features for prediction"""
    
    # Create input dataframe
    input_data = pd.DataFrame({
        'Age': [age],
        'Gender': [gender],
        'Health_Status': [health_status],
        'Marital_Status': [marital_status],
        'Annual_Income': [annual_income],
        'Claim_History': [claim_history]
    })
    
    # Encode categorical variables
    categorical_maps = {
        'Gender': {'Male': 0, 'Female': 1},
        'Health_Status': {'Healthy': 0, 'Minor Issues': 1, 'Chronic': 2},
        'Marital_Status': {'Single': 0, 'Married': 1, 'Divorced': 2, 'Widowed': 3},
        'Claim_History': {'None': 0, 'Minor': 1, 'Major': 2}
    }
    
    # Apply categorical mappings
    for col, mapping in categorical_maps.items():
        input_data[col] = input_data[col].map(mapping)
    
    # Calculate Sum_Assured before feature engineering
    input_data['Sum_Assured'] = calculate_initial_sum_assured(age, annual_income)
    
    # Add engineered features
    input_data['Income_to_Age'] = input_data['Annual_Income'] / input_data['Age']
    input_data['Premium_to_Income'] = 0  # placeholder for new predictions
    
    # Ensure columns match training data order
    required_columns = [
        'Age', 'Gender', 'Health_Status', 'Marital_Status', 
        'Claim_History', 'Annual_Income', 'Sum_Assured',
        'Income_to_Age', 'Premium_to_Income'
    ]
    input_data = input_data[required_columns]
    
    # Scale numeric features
    numeric_cols = ['Age', 'Annual_Income', 'Sum_Assured', 'Income_to_Age', 'Premium_to_Income']
    input_data[numeric_cols] = scaler.transform(input_data[numeric_cols])
    
    # Convert to tensor
    input_tensor = torch.FloatTensor(input_data.values).to(device)
    
    return input_tensor

def calculate_initial_sum_assured(age, annual_income):
    """Calculate initial sum assured for feature preparation"""
    if age < 30:
        multiplier = 15
    elif age < 40:
        multiplier = 12
    elif age < 50:
        multiplier = 8
    else:
        multiplier = 5
    return min(annual_income * multiplier, 30000000)  # Cap at 3 Cr

def predict_insurance_details(
    age, gender, health_status, marital_status,
    annual_income, claim_history, model, scaler, device
):
    try:
        # Prepare input features
        input_tensor = prepare_features(
            age, gender, health_status, marital_status,
            annual_income, claim_history, scaler, encoders
        )
        
        # Get model prediction (just premium)
        model.eval()
        with torch.no_grad():
            premium = model(input_tensor)
            
        # Calculate recommended policy type based on demographics
        policy_types = ['Term Plan', 'Endowment Plan', 'ULIP', 
                       'Retirement Plan', 'Child Plan']
        
        # Simple policy recommendation logic
        if age < 35 and marital_status == 'Single':
            recommended_policy = 'Term Plan'
        elif age > 45:
            recommended_policy = 'Retirement Plan'
        elif marital_status == 'Married' and age < 40:
            recommended_policy = 'ULIP'
        elif health_status != 'Healthy':
            recommended_policy = 'Endowment Plan'
        else:
            recommended_policy = 'Term Plan'
            
        # Calculate sum assured
        sum_assured = calculate_initial_sum_assured(age, annual_income)
        
        # Calculate recommended term
        if age < 30:
            recommended_term = 30
        elif age < 40:
            recommended_term = 25
        elif age < 50:
            recommended_term = 20
        else:
            recommended_term = 15
            
        # Estimate conversion probability
        conversion_prob = 0.7 if health_status == 'Healthy' else 0.4
        
        return {
            'recommended_policy': recommended_policy,
            'premium': premium.item(),
            'sum_assured': sum_assured,
            'term_years': recommended_term,
            'conversion_probability': conversion_prob
        }
    
    except Exception as e:
        print(f"Error during prediction: {str(e)}")
        return None

# Test the prediction
sample_prediction = predict_insurance_details(
    age=35,
    gender='Male',
    health_status='Healthy',
    marital_status='Married', 
    annual_income=1000000,
    claim_history='None',
    model=model,
    scaler=scaler,
    device=device
)

if sample_prediction:
    print(f"""
    Recommended Insurance Plan: {sample_prediction['recommended_policy']}
    Premium: ₹{sample_prediction['premium']:,.2f}/year
    Sum Assured: ₹{sample_prediction['sum_assured']:,.2f}
    Term: {sample_prediction['term_years']} years
    Conversion Probability: {sample_prediction['conversion_probability']:.1%}
    """)


    Recommended Insurance Plan: ULIP
    Premium: ₹9,645.86/year
    Sum Assured: ₹12,000,000.00
    Term: 25 years
    Conversion Probability: 70.0%
    
