In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from torch.utils.data import dataloader, Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim

In [14]:
df = pd.read_csv("datasets/car_insurance_claim.csv")

for col in ["INCOME","HOME_VAL","BLUEBOOK","OLDCLAIM", "CLM_AMT",]:
    df[col] = df[col].replace("[^.0-9]", "", regex=True).astype(float).fillna(0.0)

for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].str.upper().replace("Z_", "", regex=True).replace("[^A-Z<]", "", regex=True)

df.drop(labels=["ID","BIRTH"], axis=1, inplace=True)

df["OCCUPATION"].fillna("OTHER", inplace=True)
for col in ["AGE","YOJ","CAR_AGE"]:
    df[col].fillna(df[col].mean(), inplace=True)

for col in df.select_dtypes(include=[float]):
    df[col] = df[col].astype(int)

df["URBANICITY"] = df["URBANICITY"].map({"HIGHLYURBANURBAN":"URBAN", "HIGHLYRURALRURAL":"RURAL"})
df.rename(columns={"URBANICITY": "AREA"}, inplace=True)

categorical_features = ["CAR_USE", "REVOKED", "RED_CAR", "GENDER", "MSTATUS", "AREA", "PARENT1", "EDUCATION", "KIDSDRIV", "HOMEKIDS", "CAR_TYPE", "OCCUPATION", "CAR_USE", "MVR_PTS"]
numerical_features = ["AGE", "YOJ", "INCOME", "HOME_VAL", "TRAVTIME", "BLUEBOOK", "TIF", "OLDCLAIM", "CLM_FREQ", "CLM_AMT", "CAR_AGE"]

df

Unnamed: 0,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,GENDER,EDUCATION,...,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG,AREA
0,0,60,0,11,67349,NO,0,NO,M,PHD,...,MINIVAN,YES,4461,2,NO,3,0,18,0,URBAN
1,0,43,0,11,91449,NO,257252,NO,M,HIGHSCHOOL,...,MINIVAN,YES,0,0,NO,0,0,1,0,URBAN
2,0,48,0,11,52881,NO,0,NO,M,BACHELORS,...,VAN,YES,0,0,NO,2,0,10,0,URBAN
3,0,35,1,10,16039,NO,124191,YES,F,HIGHSCHOOL,...,SUV,NO,38690,2,NO,3,0,10,0,URBAN
4,0,51,0,14,0,NO,306251,YES,M,<HIGHSCHOOL,...,MINIVAN,YES,0,0,NO,0,0,6,0,URBAN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10297,1,45,2,9,164669,NO,386273,YES,M,PHD,...,MINIVAN,NO,0,0,NO,2,0,17,0,URBAN
10298,0,46,0,9,107204,NO,332591,YES,M,MASTERS,...,PANELTRUCK,NO,0,0,NO,0,0,1,0,URBAN
10299,0,48,0,15,39837,NO,170611,YES,F,<HIGHSCHOOL,...,SUV,NO,0,0,NO,0,0,1,0,URBAN
10300,0,50,0,7,43445,NO,149248,YES,F,BACHELORS,...,MINIVAN,NO,0,0,NO,0,0,11,0,URBAN


In [16]:
class MixedDataset(Dataset):
    def __init__(self, data_paths, transform=None):
        self.data_paths = data_paths
        self.transform = transform

    def __len__(self):
        return len(self.data_paths)

    def __getitem__(self, idx):
        # Load and preprocess the data
        sample = self.load_data(self.data_paths[idx])

        if self.transform:
            sample = self.transform(sample)

        return sample

    def load_data(self, data_path):
        # Implement loading logic based on the file type or other criteria
        # Return the loaded data
        pass

class AttentionClusteringNet(nn.Module):
    def __init__(self, num_numeric_features, num_categorical_features, embedding_dim, hidden_dim, num_clusters):
        super(AttentionClusteringNet, self).__init__()

        # Numeric feature encoding
        self.numeric_encoder = nn.Sequential(
            nn.Linear(num_numeric_features, hidden_dim),
            nn.ReLU()
        )

        # Categorical feature encoding
        self.embedding = nn.Embedding(num_categorical_features, embedding_dim)

        # Attention mechanism
        self.attention = nn.Sequential(
            nn.Linear(hidden_dim + embedding_dim, 1),
            nn.Softmax(dim=1)
        )

        # Fusion layer
        self.fusion = nn.Linear(hidden_dim + embedding_dim, hidden_dim)

        # Clustering layer
        self.clustering = nn.Linear(hidden_dim, num_clusters)

    def forward(self, numeric_features, categorical_features):
        # Numeric feature encoding
        encoded_numeric = self.numeric_encoder(numeric_features)

        # Categorical feature encoding
        embedded_categorical = self.embedding(categorical_features)

        # Attention mechanism
        attention_input = torch.cat((encoded_numeric, embedded_categorical), dim=1)
        attention_weights = self.attention(attention_input)

        # Weighted fusion of variable representations
        fused_representation = torch.mul(attention_weights, attention_input)
        fused_representation = self.fusion(fused_representation)

        # Clustering
        logits = self.clustering(fused_representation)

        return logits, attention_weights

# Prepare the data
# Assuming you have your own dataset and data loading/preprocessing code

# Define hyperparameters
num_epochs = 10
batch_size = 32
lr = 0.001
num_clusters = 5
hidden_dim = 64
embedding_dim = 32

# Create an instance of the model
model = AttentionClusteringNet(len(numerical_features), len(categorical_features), embedding_dim, hidden_dim, num_clusters)

# Define the clustering loss function
loss_fn = nn.MSELoss()

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=lr)

dataset = MixedDataset(df)

# Create a data loader
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0

    for batch_data in dataloader:  # Iterate over the batches of data
        numeric_features, categorical_features, targets = batch_data

        optimizer.zero_grad()  # Zero the gradients

        logits, attention_weights = model(numeric_features, categorical_features)

        # Compute the clustering loss
        loss = loss_fn(logits, targets)

        loss.backward()  # Backpropagation
        optimizer.step()  # Update the model parameters

        running_loss += loss.item()

    # Print the average loss for the epoch
    avg_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {avg_loss:.4f}")

KeyError: 6346