# Synthetic Insurance Data Generator for AA Ireland Risk Model

In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random

In [2]:
# Single seed value applied to every random source this notebook draws from:
# Faker, the stdlib `random` module, and NumPy's global generator.
SEED = 42

fake = Faker()
Faker.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Helper function to simulate vehicle types
def random_vehicle():
    """Return one vehicle body style chosen uniformly at random.

    Draws from the stdlib `random` module's global state, so the result is
    reproducible after `random.seed(...)`.
    """
    body_styles = ['sedan', 'suv', 'hatchback', 'convertible']
    picked = random.choice(body_styles)
    return picked

# Generate synthetic dataset
def generate_insurance_data(n_customers=100000):
    """Build a synthetic insurance dataset with a simulated claim flag.

    Every row is drawn independently from the stdlib `random` module,
    `np.random`, and the module-level `fake` Faker instance; the vehicle type
    comes from the module-level `random_vehicle()` helper. The draws happen
    in a fixed order per row, so a seeded run reproduces the same rows.

    Args:
        n_customers: Number of rows to generate. With 0 the result is an
            empty frame that still carries every column.

    Returns:
        A pandas DataFrame with the columns customer_id, age, vehicle_age,
        vehicle_type, annual_premium, previous_insurance and claim, in that
        order, one row per generated customer.
    """
    # Declared up front so the frame has the same schema even when no rows
    # are generated (a bare pd.DataFrame([]) would have no columns at all).
    columns = [
        "customer_id",
        "age",
        "vehicle_age",
        "vehicle_type",
        "annual_premium",
        "previous_insurance",
        "claim",
    ]

    data = []
    for _ in range(n_customers):
        age = random.randint(18, 75)
        vehicle_age = random.randint(0, 10)
        vehicle_type = random_vehicle()
        # Normal draw around 450 with spread 100; the value is not clipped.
        annual_premium = round(np.random.normal(loc=450, scale=100), 2)
        previous_insurance = random.choice([0, 1])
        # Rough risk estimation (not used in real life): older cars, no insurance, young drivers => higher risk
        base_risk = (
            (75 - age) * 0.01 +
            vehicle_age * 0.02 +
            (0.15 if previous_insurance == 0 else -0.05) +
            (0.1 if vehicle_type == 'convertible' else 0.0)
        )
        # base_risk is used directly as a claim probability. It can fall
        # slightly below 0 or above 1; since random.random() is in [0, 1),
        # those cases simply mean "never" / "always" a claim.
        claim = 1 if random.random() < base_risk else 0

        # Cast to plain Python types so the frame does not hold NumPy scalars.
        data.append({
            "customer_id": str(fake.uuid4()),
            "age": age,
            "vehicle_age": vehicle_age,
            "vehicle_type": str(vehicle_type),
            "annual_premium": float(annual_premium),
            "previous_insurance": int(previous_insurance),
            "claim": int(claim)
        })

    return pd.DataFrame(data, columns=columns)

# Generate the dataset with the default row count (it is written to disk in a later cell)
df = generate_insurance_data()

# Preview the first 20 rows
df.head(20)

Unnamed: 0,customer_id,age,vehicle_age,vehicle_type,annual_premium,previous_insurance,claim
0,bdd640fb-0667-4ad1-9c80-317fa3b1799d,58,1,sedan,499.67,1,0
1,23b8c1e9-3924-46de-beb1-3b9046685257,26,1,sedan,436.17,1,1
2,bd9c66b3-ad3c-4d6d-9a3d-1fa7bc8960a9,23,3,suv,514.77,0,1
3,972a8469-1641-4f82-8b9d-2434e465e150,63,10,convertible,602.3,0,1
4,17fc695a-07a0-4a6e-8822-e8f36c031199,35,0,suv,426.58,1,1
5,9a1de644-815e-46d1-bb8f-aa1837f8a88b,27,3,hatchback,426.59,0,1
6,b74d0fb1-32e7-4629-8fad-c1a606cb0fb3,24,5,hatchback,607.92,1,0
7,6b65a6a4-8b81-48f6-b38a-088ca65ed389,64,7,sedan,526.74,1,1
8,47378190-96da-4dac-b2ff-5d2a386ecbe0,36,10,hatchback,403.05,0,1
9,c241330b-01a9-471f-9e8a-774bcf36d58b,20,10,suv,504.26,1,0


In [3]:
# Save to CSV in the current working directory; index=False keeps the
# DataFrame's row index out of the file.
df.to_csv("synthetic_insurance.csv", index=False)

### 🧾 Feature Definitions

- **`previous_insurance`**:
  - `0` → Customer **did not** have a prior insurance policy
  - `1` → Customer **had** a previous insurance policy

- **`claim`** *(target variable)*:
  - `0` → Customer **did not** make a claim
  - `1` → Customer **made** a claim

`previous_insurance` is one of the input features, and `claim` is the label the model learns to predict: the probability of an insurance claim given customer profile data.
