In [None]:
import pandas as pd

In [None]:
# Read the training split into memory and preview it as the cell output.
df = pd.read_csv(filepath_or_buffer="./playground-series-s4e7/train.csv")

df

In [None]:
# One-hot encode the non-numeric columns, then list the resulting columns
# and their dtypes.
dfdum = pd.get_dummies(data=df)
dfdum.info()

In [None]:
# Distribution of the region codes.
df["Region_Code"].hist()

In [None]:
# Distinct values present in the Driving_License column.
df["Driving_License"].unique()

In [None]:
# Distinct values present in the Gender column.
df["Gender"].unique()

In [None]:
import torch

# Pick the compute backend: CUDA first, then Apple's MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device}")

In [None]:
from torch.utils.data import Dataset
import numpy as np


class InsuranceDataset(Dataset):
    """Map-style dataset over one CSV split of the insurance data.

    The CSV is read once, its non-numeric columns are one-hot encoded with
    ``pd.get_dummies`` and the columns listed in ``NORMALIZED_COLS`` are
    standardised to zero mean and unit (sample) standard deviation.

    All per-sample work is done up front: ids, features and labels are
    converted to arrays/tensors in ``__init__`` so that ``__getitem__`` is a
    plain index lookup instead of building and trimming a pandas row for
    every sample.

    NOTE(review): the mean/std come from the file being loaded, so a test
    split is scaled with its own statistics rather than those of the training
    split -- confirm this is intended.

    Args:
        csv_path: Path of the CSV file to load. It must contain an ``id``
            column, every column in ``NORMALIZED_COLS`` and, when
            ``contains_labels`` is true, a ``Response`` column.
        contains_labels: Whether ``Response`` is split off and returned as
            the label of each sample.
    """

    # Columns that are standardised after one-hot encoding.
    NORMALIZED_COLS = (
        "Age",
        "Region_Code",
        "Annual_Premium",
        "Policy_Sales_Channel",
        "Vintage",
    )

    def __init__(self, csv_path, contains_labels):
        self.df = pd.get_dummies(pd.read_csv(csv_path))

        for col in self.NORMALIZED_COLS:
            self.df[col] = (self.df[col] - self.df[col].mean()) / self.df[col].std()

        self.contains_labels = contains_labels

        # Ids are taken straight from their column so they keep that column's
        # dtype instead of whatever dtype a mixed-type row would be cast to.
        self.ids = self.df["id"].to_numpy()

        # Everything except the id (and the label, when present) is a feature;
        # column order is the order produced by get_dummies.
        non_feature_cols = ["id", "Response"] if contains_labels else ["id"]
        self.features = torch.tensor(
            self.df.drop(columns=non_feature_cols).to_numpy(dtype=np.float32),
            dtype=torch.float,
        )

        if contains_labels:
            self.labels = torch.tensor(
                self.df["Response"].to_numpy(dtype=np.float32), dtype=torch.float
            )
        else:
            self.labels = None

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(id, features, label)``, or ``(id, features)`` without labels.

        ``features`` is a 1-D float32 tensor and ``label`` a 0-d float32
        tensor. An out-of-range ``idx`` raises ``IndexError``.
        """
        sample_id = self.ids[idx]
        data = self.features[idx]

        if self.contains_labels:
            return sample_id, data, self.labels[idx]
        return sample_id, data

In [None]:
# Build the labelled training dataset and inspect one sample as a sanity check.
train_set = InsuranceDataset(
    csv_path="./playground-series-s4e7/train.csv",
    contains_labels=True,
)

train_set[3]

In [None]:
import torch.nn as nn


class InsuranceModel(nn.Module):
    """Logistic-regression style classifier: one linear layer plus a sigmoid.

    Args:
        in_features: Width of each input feature vector. Defaults to 14, the
            value that was previously hard-coded.
    """

    def __init__(self, in_features=14):
        super().__init__()
        # A single linear layer is all that forward() uses. A second,
        # never-called nn.Linear used to be registered here; it only added
        # parameters that could not receive gradients, so it was removed.
        self.model = nn.Sequential(
            nn.Linear(in_features, 1),
            nn.Sigmoid(),
        )

    def forward(self, input):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` values in (0, 1)."""
        return self.model(input)

In [None]:
from torch.utils.data import DataLoader
from tqdm.auto import tqdm


# Mini-batch training of the classifier on the labelled split.
train_loader = DataLoader(train_set, batch_size=2048, shuffle=True)

model = InsuranceModel().to(device)
model.train()

loss = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

for epoch in range(1):
    progress = tqdm(enumerate(train_loader), total=len(train_loader))
    for step, (batch_ids, features, labels) in progress:
        features = features.to(device)
        # BCELoss needs targets shaped like the (batch, 1) model output.
        labels = labels.to(device).unsqueeze(1)

        optimizer.zero_grad()
        loss_val = loss(model(features), labels)

        # Report the running loss every 100 batches.
        if step % 100 == 0:
            print(f"Train loss: {loss_val.item()}")

        loss_val.backward()
        optimizer.step()

In [None]:
# Score the unlabelled test split and write the submission file.
test_set = InsuranceDataset("./playground-series-s4e7/test.csv", contains_labels=False)
# Shuffling serves no purpose at inference time; keep the file's row order.
test_loader = DataLoader(test_set, batch_size=2048, shuffle=False)

model.eval()

# Collect per-batch results and assemble the frame once at the end, instead
# of re-copying an ever-growing DataFrame with pd.concat on every batch.
id_batches = []
prediction_batches = []

# No gradients are needed for prediction, so skip autograd bookkeeping.
with torch.no_grad():
    for batch_ids, data in tqdm(test_loader):
        outputs = model(data.to(device))

        id_batches.append(torch.as_tensor(batch_ids).cpu())
        # reshape(-1) keeps a 1-D result even when the last batch holds a
        # single row; a bare squeeze() would collapse it to a 0-d value.
        prediction_batches.append(outputs.reshape(-1).cpu())

if id_batches:
    sub = pd.DataFrame(
        {
            "id": torch.cat(id_batches).numpy(),
            "Response": torch.cat(prediction_batches).numpy(),
        }
    )
else:
    # Empty test set: still emit a file with the expected header.
    sub = pd.DataFrame(columns=["id", "Response"])

sub.to_csv("submission.csv", index=False)