In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from torch.utils.data import dataloader, Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import torch
from torch.nn import Embedding
import torch
import torch.nn as nn
import torch.nn.functional as F

In [7]:
df = pd.read_csv("datasets/car_insurance_claim.csv")

for col in ["INCOME","HOME_VAL","BLUEBOOK","OLDCLAIM", "CLM_AMT",]:
    df[col] = df[col].replace("[^.0-9]", "", regex=True).astype(float).fillna(0.0)

for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].str.upper().replace("Z_", "", regex=True).replace("[^A-Z<]", "", regex=True)

df.drop(labels=["ID","BIRTH"], axis=1, inplace=True)

df["OCCUPATION"].fillna("OTHER", inplace=True)
for col in ["AGE","YOJ","CAR_AGE"]:
    df[col].fillna(df[col].mean(), inplace=True)

for col in df.select_dtypes(include=[float]):
    df[col] = df[col].astype(int)

df["URBANICITY"] = df["URBANICITY"].map({"HIGHLYURBANURBAN":"URBAN", "HIGHLYRURALRURAL":"RURAL"})
df.rename(columns={"URBANICITY": "AREA"}, inplace=True)

categorical_features = ["CAR_USE", "REVOKED", "RED_CAR", "GENDER", "MSTATUS", "AREA", "PARENT1", "EDUCATION", "KIDSDRIV", "HOMEKIDS", "CAR_TYPE", "OCCUPATION", "MVR_PTS", "CLAIM_FLAG"]
numerical_features = ["AGE", "YOJ", "INCOME", "HOME_VAL", "TRAVTIME", "BLUEBOOK", "TIF", "OLDCLAIM", "CLM_FREQ", "CLM_AMT", "CAR_AGE"]

df

Unnamed: 0,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,GENDER,EDUCATION,...,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG,AREA
0,0,60,0,11,67349,NO,0,NO,M,PHD,...,MINIVAN,YES,4461,2,NO,3,0,18,0,URBAN
1,0,43,0,11,91449,NO,257252,NO,M,HIGHSCHOOL,...,MINIVAN,YES,0,0,NO,0,0,1,0,URBAN
2,0,48,0,11,52881,NO,0,NO,M,BACHELORS,...,VAN,YES,0,0,NO,2,0,10,0,URBAN
3,0,35,1,10,16039,NO,124191,YES,F,HIGHSCHOOL,...,SUV,NO,38690,2,NO,3,0,10,0,URBAN
4,0,51,0,14,0,NO,306251,YES,M,<HIGHSCHOOL,...,MINIVAN,YES,0,0,NO,0,0,6,0,URBAN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10297,1,45,2,9,164669,NO,386273,YES,M,PHD,...,MINIVAN,NO,0,0,NO,2,0,17,0,URBAN
10298,0,46,0,9,107204,NO,332591,YES,M,MASTERS,...,PANELTRUCK,NO,0,0,NO,0,0,1,0,URBAN
10299,0,48,0,15,39837,NO,170611,YES,F,<HIGHSCHOOL,...,SUV,NO,0,0,NO,0,0,1,0,URBAN
10300,0,50,0,7,43445,NO,149248,YES,F,BACHELORS,...,MINIVAN,NO,0,0,NO,0,0,11,0,URBAN


In [8]:
df_cat = df[categorical_features].apply(LabelEncoder().fit_transform)
df_cont = df[numerical_features]

x_cat_tensor = torch.tensor(df_cat.values, dtype=torch.long)
x_cont_tensor = torch.tensor(df_cont.values, dtype=torch.float)

In [11]:
class SelfAttention(nn.Module):
    def __init__(self, input_dim, attn_dim):
        super(SelfAttention, self).__init__()
        self.input_dim = input_dim
        self.attn_dim = attn_dim

        self.query = nn.Linear(input_dim, attn_dim)
        self.key = nn.Linear(input_dim, attn_dim)
        self.value = nn.Linear(input_dim, attn_dim)

    def forward(self, x):
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)

        attn_weights = F.softmax(q @ k.transpose(-2, -1) / self.attn_dim**0.5, dim=-1)
        output = attn_weights @ v

        return output, attn_weights


class MixedTypeClusteringModel(nn.Module):
    def __init__(self, n_cat, emb_dim, n_cont, attn_dim, n_clusters):
        super(MixedTypeClusteringModel, self).__init__()

        self.n_cat = n_cat
        self.emb_dim = emb_dim
        self.n_cont = n_cont
        self.attn_dim = attn_dim
        self.n_clusters = n_clusters

        self.embedding = Embedding(n_cat, emb_dim)
        self.self_attention = SelfAttention(emb_dim + n_cont, attn_dim)

    def forward(self, x_cat, x_cont):
        # Embed categorical variables
        x_cat_emb = self.embedding(x_cat)

        # Concatenate with continuous variables
        x = torch.cat([x_cat_emb, x_cont], dim=1)

        # Apply self-attention
        x_transformed, attn_weights = self.self_attention(x)

        return x_transformed, attn_weights

    def fit(self, x_cat, x_cont):
        # Embed categorical variables
        x_cat_emb = self.embedding(x_cat)

        # Concatenate with continuous variables
        x = torch.cat([x_cat_emb, x_cont], dim=1)

        # Apply self-attention
        x_transformed, attn_weights = self.self_attention(x)

        # Perform KMeans clustering
        kmeans = KMeans(n_init="auto",n_clusters=2, random_state=0).fit(df)

        return kmeans

# Hyperparameters
n_cat = ...    # Number of unique values in categorical data
emb_dim = ...  # Dimension of embeddings
n_cont = ...   # Number of continuous variables
attn_dim = ... # Dimension of attention
n_clusters = ... # Number of clusters
epochs = ...   # Number of epochs
lr = ...       # Learning rate

# Create model
model = MixedTypeClusteringModel(n_cat, emb_dim, n_cont, attn_dim, n_clusters)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    model.train()

    # Forward pass
    x_transformed, attn_weights = model(x_cat_tensor, x_cont_tensor)

    # Compute loss
    cluster_ids_x, cluster_centers = model.fit(x_cat_tensor, x_cont_tensor)
    loss = criterion(x_transformed, cluster_ids_x)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print loss for every epoch
    print(f'Epoch {epoch+1}, Loss {loss.item()}')

TypeError: empty() received an invalid combination of arguments - got (tuple, dtype=NoneType, device=NoneType), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.memory_format memory_format, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of ints size, *, torch.memory_format memory_format, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
