In [1]:
import pandas as pd
import numpy as np

# Load and clean the original dataset again
data = pd.read_csv('../data/raw/MachineLearningRating_v3.txt', sep='|', low_memory=False)


In [2]:
# Convert numeric fields
data['TotalClaims'] = pd.to_numeric(data['TotalClaims'], errors='coerce')
data['TotalPremium'] = pd.to_numeric(data['TotalPremium'], errors='coerce')



In [3]:
# Filter rows with valid claims
data['LossRatio'] = data['TotalClaims'] / data['TotalPremium']
data = data.replace([np.inf, -np.inf], np.nan)
data = data.dropna(subset=['TotalClaims', 'TotalPremium', 'LossRatio'])

In [4]:
# Only rows with a claim > 0
claims_df = data[data['TotalClaims'] > 0].copy()

In [5]:
# Drop irrelevant/high-cardinality columns
drop_cols = [
    'UnderwrittenCoverID', 'PolicyID', 'TransactionMonth', 'VehicleIntroDate',
    'Model', 'Make', 'MainCrestaZone', 'SubCrestaZone',
    'Product', 'Section', 'StatutoryClass', 'StatutoryRiskType'
]
claims_df.drop(columns=drop_cols, inplace=True, errors='ignore')


In [6]:
# One-hot encode categorical variables
categorical_cols = claims_df.select_dtypes(include='object').columns
claims_df = pd.get_dummies(claims_df, columns=categorical_cols, drop_first=True)


In [7]:
# Drop any remaining NaNs
claims_df = claims_df.dropna()

# Split into features and target
X = claims_df.drop(columns=['TotalClaims'])
y = claims_df['TotalClaims']

print(f"✅ Features shape: {X.shape}, Target shape: {y.shape}")

✅ Features shape: (0, 305), Target shape: (0,)
