In [30]:
import pandas as pd
import numpy as np

# Load and clean the original dataset again
data = pd.read_csv('../data/raw/MachineLearningRating_v3.txt', sep='|', low_memory=False)


In [31]:
# Convert numeric fields
data['TotalClaims'] = pd.to_numeric(data['TotalClaims'], errors='coerce')
data['TotalPremium'] = pd.to_numeric(data['TotalPremium'], errors='coerce')

print("Total rows:", data.shape[0])
print("Rows with non-zero TotalClaims:", (data['TotalClaims'] > 0).sum())
print("Rows with TotalPremium > 0:", (data['TotalPremium'] > 0).sum())

print(data['TotalClaims'].unique()[:20])  # preview first 20 unique values
print(data['TotalClaims'].dtype)

Total rows: 1000098
Rows with non-zero TotalClaims: 2788
Rows with TotalPremium > 0: 618176
[    0.          2294.09649123  2040.47368421 46492.21175439
 26516.85964912  6140.35087719  4385.96491228 25438.59649123
 16715.         74967.01754386 14867.5         2192.98245614
  1450.           921.05263158 16804.46491228  2889.70175439
   877.19298246 15262.60526316 72445.03508772  5263.15789474]
float64


In [None]:
# STEP 1: Filter only claim records
claims_df = data[(data['TotalClaims'] > 0) & (data['TotalPremium'] > 0)].copy()



✅ After cleanup: 632 rows
✅ Ready for modeling: 632 records, 304 features


In [None]:
# STEP 2: Drop irrelevant/high-cardinality columns
drop_cols = [
    'UnderwrittenCoverID', 'PolicyID', 'TransactionMonth', 'VehicleIntroDate',
    'Model', 'Make', 'MainCrestaZone', 'SubCrestaZone',
    'Product', 'Section', 'StatutoryClass', 'StatutoryRiskType'
]
claims_df.drop(columns=drop_cols, inplace=True, errors='ignore')

In [None]:
# STEP 3: Encode categorical features using one-hot encoding
categorical_cols = claims_df.select_dtypes(include='object').columns
claims_df = pd.get_dummies(claims_df, columns=categorical_cols, drop_first=True)

In [None]:
# STEP 4: Drop remaining NaNs just to be safe
# Only drop rows where target or numeric inputs are invalid
required_cols = ['TotalPremium', 'TotalClaims', 'SumInsured', 'CalculatedPremiumPerTerm', 'CustomValueEstimate']
claims_df = claims_df.dropna(subset=required_cols)

print(f"✅ After cleanup: {claims_df.shape[0]} rows")


In [None]:
# STEP 5: Split features and target
X = claims_df.drop(columns=['TotalClaims'])
y = claims_df['TotalClaims']

print(f"✅ Ready for modeling: {X.shape[0]} records, {X.shape[1]} features")

✅ Features shape: (0, 305), Target shape: (0,)
