In [22]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn

In [23]:
df = pd.read_csv("datasets/car_insurance_claim.csv")

for col in ["INCOME","HOME_VAL","BLUEBOOK","OLDCLAIM", "CLM_AMT",]:
    df[col] = df[col].replace("[^.0-9]", "", regex=True).astype(float).fillna(0.0)

for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].str.upper().replace("Z_", "", regex=True).replace("[^A-Z<]", "", regex=True)

df.drop(labels=["ID","BIRTH"], axis=1, inplace=True)

df["OCCUPATION"].fillna("OTHER", inplace=True)
for col in ["AGE","YOJ","CAR_AGE"]:
    df[col].fillna(df[col].mean(), inplace=True)

for col in df.select_dtypes(include=[float]):
    df[col] = df[col].astype(int)

df["URBANICITY"] = df["URBANICITY"].map({"HIGHLYURBANURBAN":"URBAN", "HIGHLYRURALRURAL":"RURAL"})
df.rename(columns={"URBANICITY": "AREA"}, inplace=True)

categorical_features = ["EDUCATION", "KIDSDRIV", "HOMEKIDS", "CAR_TYPE", "OCCUPATION", "MVR_PTS"]
boolean_features = ["CAR_USE", "REVOKED", "RED_CAR", "GENDER", "MSTATUS", "AREA", "PARENT1", "CLAIM_FLAG"]
numerical_features = ["AGE", "YOJ", "INCOME", "HOME_VAL", "TRAVTIME", "BLUEBOOK", "TIF", "OLDCLAIM", "CLM_FREQ", "CLM_AMT", "CAR_AGE"]

df

Unnamed: 0,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,GENDER,EDUCATION,...,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG,AREA
0,0,60,0,11,67349,NO,0,NO,M,PHD,...,MINIVAN,YES,4461,2,NO,3,0,18,0,URBAN
1,0,43,0,11,91449,NO,257252,NO,M,HIGHSCHOOL,...,MINIVAN,YES,0,0,NO,0,0,1,0,URBAN
2,0,48,0,11,52881,NO,0,NO,M,BACHELORS,...,VAN,YES,0,0,NO,2,0,10,0,URBAN
3,0,35,1,10,16039,NO,124191,YES,F,HIGHSCHOOL,...,SUV,NO,38690,2,NO,3,0,10,0,URBAN
4,0,51,0,14,0,NO,306251,YES,M,<HIGHSCHOOL,...,MINIVAN,YES,0,0,NO,0,0,6,0,URBAN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10297,1,45,2,9,164669,NO,386273,YES,M,PHD,...,MINIVAN,NO,0,0,NO,2,0,17,0,URBAN
10298,0,46,0,9,107204,NO,332591,YES,M,MASTERS,...,PANELTRUCK,NO,0,0,NO,0,0,1,0,URBAN
10299,0,48,0,15,39837,NO,170611,YES,F,<HIGHSCHOOL,...,SUV,NO,0,0,NO,0,0,1,0,URBAN
10300,0,50,0,7,43445,NO,149248,YES,F,BACHELORS,...,MINIVAN,NO,0,0,NO,0,0,11,0,URBAN


In [24]:
df_cat = df[categorical_features].apply(LabelEncoder().fit_transform)
df_bool = df[boolean_features].apply(LabelEncoder().fit_transform)
df_cont = df[numerical_features]

In [25]:
class CarInsuranceDataset(Dataset):
    def __init__(self, df_cat, df_bool, df_cont):
        self.cat = df_cat.to_numpy()
        self.bool = df_bool.to_numpy()
        self.cont = df_cont.to_numpy()

    def __getitem__(self, idx):
        cat_row = torch.tensor(self.cat[idx], dtype=torch.float)
        bool_row = torch.tensor(self.bool[idx], dtype=torch.float)
        cont_row = torch.tensor(self.cont[idx], dtype=torch.float)
        return cat_row, bool_row, cont_row

    def __len__(self):
        return np.size(self.cat, 0)

In [26]:
dataset = CarInsuranceDataset(df_cat, df_bool, df_cont)
dataloader = DataLoader(dataset, shuffle=True)
len(dataset)

10302

In [27]:
torch.cuda.is_available()

False

In [28]:
class MixedTypeClusteringModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(25, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 8),
            torch.nn.ReLU(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(8, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 25),
            torch.nn.Sigmoid()
        )


    def forward(self, x_cat, x_bool, x_cont):
        x = torch.cat((x_cat, x_bool, x_cont), 1)
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 10
lr = 0.001

model = MixedTypeClusteringModel()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    model.train()
    loss = 0

    for cat_tensor, bool_tensor, cont_tensor in dataloader:
        optimizer.zero_grad()
        outputs = model(cat_tensor, bool_tensor, cont_tensor)
        train_loss = criterion(outputs, torch.cat((cat_tensor, bool_tensor, cont_tensor), 1))
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch : {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch : 1/10, loss = 1777714573.463919
epoch : 2/10, loss = 1777714368.791812
epoch : 3/10, loss = 1777714319.964037
epoch : 4/10, loss = 1777714202.149171
epoch : 5/10, loss = 1777714201.497190
epoch : 6/10, loss = 1777714201.443922
epoch : 7/10, loss = 1777714201.443922
epoch : 8/10, loss = 1777714201.443922
epoch : 9/10, loss = 1777714202.145921
epoch : 10/10, loss = 1777714201.443922
