In [38]:
import torch
import numpy as np
import pandas as pd

In [39]:
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

In [79]:
# Read the raw table; the first CSV column becomes the row index.
df_0 = pd.read_csv("data.csv", index_col=0)

# Positional split taken from the end of the file: the final `test_size` rows
# form the test set, the `test_size` rows before them the dev set, and
# everything earlier is the training set.
test_size = 4000
df_train, df_dev, df_test = (
    df_0.iloc[: -2 * test_size],
    df_0.iloc[-2 * test_size : -test_size],
    df_0.iloc[-test_size:],
)
df_train.head()

Unnamed: 0_level_0,author,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,Thompson,tube,7.0,3770.0,0.1754,,10.8,432.0,3.6
1,Thompson,tube,,6049.0,-0.0416,10.3,10.3,762.0,6.2
2,Thompson,,13.79,2034.0,0.0335,7.7,7.7,457.0,2.5
3,Beus,annulus,13.79,3679.0,-0.0279,5.6,15.2,2134.0,3.0
4,,tube,13.79,686.0,,11.1,11.1,457.0,2.8


In [78]:
# Per-column value ranges of the training split.
# `numeric_only=True` restricts the reduction to the numeric columns. The
# string columns (author, geometry) also contain NaN; older pandas dropped
# such columns silently with a warning, pandas 2.x no longer does.
print(df_train.min(numeric_only=True))
print(df_train.max(numeric_only=True))

pressure [MPa]          0.1000
mass_flux [kg/m2-s]     0.0000
x_e_out [-]            -0.8667
D_e [mm]                1.0000
D_h [mm]                1.0000
length [mm]            10.0000
chf_exp [MW/m2]         0.8000
dtype: float64
pressure [MPa]           20.680
mass_flux [kg/m2-s]    7975.000
x_e_out [-]               0.232
D_e [mm]                 37.500
D_h [mm]                120.000
length [mm]            3048.000
chf_exp [MW/m2]          19.300
dtype: float64


  after removing the cwd from sys.path.
  """


In [42]:
# One-hot encode `geometry` for the first rows while keeping missing values
# missing: a row whose geometry is NaN gets NaN in every indicator column
# instead of an all-zero row.
# `dtype=float` makes the indicator columns able to hold NaN; newer pandas
# otherwise returns boolean columns.
df1 = pd.get_dummies(df_train["geometry"], dtype=float).head().copy()
df1[df_train["geometry"].head().isnull()] = np.nan
df1

Unnamed: 0_level_0,annulus,plate,tube
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,,,
3,1.0,0.0,0.0
4,0.0,0.0,1.0


In [43]:
from torch.utils.data import Dataset, DataLoader

In [50]:
class MaskingDataset(Dataset):
    """Row-wise dataset that randomly hides features for masked reconstruction.

    Every item is a pair ``(x_in, x_out)`` of float32 tensors built from one
    row of the (one-hot expanded) frame:

    * ``x_in``  - the row with randomly masked and missing entries set to 0;
    * ``x_out`` - the row itself, with NaN wherever the value is missing.

    Categorical columns are replaced by one-hot blocks appended after the
    remaining columns, in the order given by ``categorical_cols``. A missing
    category becomes NaN across its whole block. A categorical feature is
    masked as a unit: all columns of its block are hidden together.

    Args:
        df: source DataFrame; all non-categorical columns must be numeric.
        p_mask: probability of hiding each original feature in ``x_in``.
        categorical_cols: names of the columns to one-hot encode, or ``None``
            when every column is continuous.

    Note:
        The one-hot columns are derived from the frame passed in, so two
        datasets built from different frames only share a layout if the same
        categories occur in both.
    """

    def __init__(self, df, p_mask=0.2, categorical_cols=None):
        # Number of *original* features: one mask draw is made per feature,
        # not per one-hot column.
        self.features = len(df.columns)
        self.p_mask = p_mask

        self._categorical_cols = []
        if categorical_cols is not None:
            for col in categorical_cols:
                missing = df[col].isnull()
                # Float indicators so that the block can carry NaN for rows
                # whose category is missing (boolean dummies cannot).
                one_hot = pd.get_dummies(df[col], dtype=float)
                one_hot.loc[missing, :] = np.nan

                self._categorical_cols.append(one_hot.columns)
                df = pd.concat([df.drop(col, axis=1), one_hot], axis=1)

        self.df = df

    def __len__(self):
        return len(self.df)

    def onehot_categorical_mask(self, column_mask):
        """Expand one flag per categorical feature to one flag per one-hot column.

        Returns an empty boolean array when the dataset has no categorical
        features.
        """
        if not self._categorical_cols:
            return np.zeros(0, dtype=bool)
        return np.concatenate(
            [
                np.full(len(cols), flag)
                for flag, cols in zip(column_mask, self._categorical_cols)
            ]
        )

    def __getitem__(self, idx):
        values = self.df.iloc[idx].to_numpy(dtype=np.float64)
        observed = ~np.isnan(values)

        # True = keep the feature visible in x_in.
        keep = ~np.random.binomial(1, self.p_mask, size=self.features).astype(bool)

        # Continuous features come first and are masked individually; the
        # trailing flags belong to the categorical features and are expanded
        # to cover every column of the corresponding one-hot block.
        n_categorical = len(self._categorical_cols)
        n_continuous = self.features - n_categorical
        full_input_mask = (
            np.concatenate(
                [
                    keep[:n_continuous],
                    self.onehot_categorical_mask(keep[n_continuous:]),
                ]
            )
            & observed
        )

        # Hidden and missing entries are presented to the model as 0.
        x_in = np.where(full_input_mask, values, 0.0)
        # Missing entries stay NaN in the target.
        x_out = values

        return (
            torch.as_tensor(x_in, dtype=torch.float32),
            torch.as_tensor(x_out, dtype=torch.float32),
        )

In [73]:
# Same masking configuration for all three splits.
# NOTE(review): each split is one-hot encoded from its own frame
# (MaskingDataset calls pd.get_dummies on the frame it is given), so the
# splits only share a column layout if every category occurs in all of them.
categorical_cols = ["author", "geometry"]
p_mask = 0.5
ds_train, ds_dev, ds_test = (
    MaskingDataset(split, p_mask=p_mask, categorical_cols=categorical_cols)
    for split in (df_train, df_dev, df_test)
)

In [74]:
import torch.nn as nn


class FFModel(nn.Module):
    """Feed-forward network: two ReLU-activated hidden layers and a linear head.

    Args:
        input_size: width of the input vectors.
        hidden_size: width of both hidden layers.
        output_size: width of the output vectors.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.fc3 = nn.Linear(in_features=hidden_size, out_features=output_size)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Map ``x`` through fc1 -> ReLU -> fc2 -> ReLU -> fc3 (no final activation)."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        return self.fc3(hidden)


def separate_slices(categorical_slices, output_size):
    """Split the column range ``[0, output_size)`` into consecutive slices.

    Args:
        categorical_slices: iterable of ``(start, end)`` pairs, one per
            categorical block, given in column order.
        output_size: total number of columns.

    Returns:
        ``(slices, slice_is_categorical)``: a list of ``slice`` objects and a
        parallel list of booleans. Every ``(start, end)`` pair yields a slice
        flagged ``True``; whenever a pair does not start where the previous
        one ended (or the last one does not end at ``output_size``), a slice
        flagged ``False`` is inserted to bridge the gap.
    """
    tagged = []  # (slice, is_categorical) pairs, in column order
    cursor = 0
    for cat_start, cat_end in categorical_slices:
        if cat_start != cursor:
            tagged.append((slice(cursor, cat_start), False))
        tagged.append((slice(cat_start, cat_end), True))
        cursor = cat_end

    if cursor != output_size:
        tagged.append((slice(cursor, output_size), False))

    slices = [piece for piece, _ in tagged]
    slice_is_categorical = [flag for _, flag in tagged]
    return slices, slice_is_categorical


class MixedContCatModel(nn.Module):
    """Wrap a model whose output mixes continuous columns and categorical blocks.

    The wrapped model produces one value per output column. Columns inside a
    categorical block are raw class scores (logits); all other columns are
    continuous predictions and are always returned unchanged.

    * In training mode (``model.train()``) the raw output is returned, so the
      categorical blocks stay logits. ``nn.CrossEntropyLoss`` expects
      unnormalised logits and applies log-softmax itself; feeding it softmax
      output would normalise twice.
    * In eval mode (``model.eval()``) each categorical block is passed through
      a softmax so that it holds class probabilities.

    Args:
        model: the wrapped module, mapping ``(batch, n_in)`` to
            ``(batch, output_size)``.
        categorical_slices: ``(start, end)`` column ranges of the categorical
            blocks, in column order.
        output_size: total number of output columns.
    """

    def __init__(self, model, categorical_slices, output_size) -> None:
        super().__init__()

        self.model = model

        self.softmax = nn.Softmax(dim=1)

        self.slices, self.slice_is_categorical = separate_slices(
            categorical_slices, output_size
        )

    def forward(self, x, return_logits=None):
        """Run the wrapped model and optionally normalise the categorical blocks.

        Args:
            x: input batch for the wrapped model.
            return_logits: ``True`` returns the raw output, ``False`` applies
                the per-block softmax. The default ``None`` follows the module
                mode: logits while training, probabilities in eval mode.
        """
        raw = self.model(x)
        if return_logits is None:
            return_logits = self.training
        if return_logits:
            return raw

        # Write into a copy: overwriting `raw` in place would modify a tensor
        # owned by the wrapped model (possibly the caller's own input) and can
        # invalidate values autograd saved for the backward pass.
        out = raw.clone()
        for slice_, is_categorical in zip(self.slices, self.slice_is_categorical):
            if is_categorical:
                out[:, slice_] = self.softmax(raw[:, slice_])
        return out

In [75]:
class ContCatLoss(nn.Module):
    """Separate losses for the continuous columns and each categorical block.

    Target entries that are NaN are treated as missing: both the prediction
    and the target are replaced by 0 at those positions before the losses are
    evaluated. With a mean-reducing loss such entries therefore add zero error
    but are still counted in the average.

    Args:
        categorical_slices: ``(start, end)`` column ranges of the categorical
            blocks, in column order.
        output_size: total number of output columns.
        cont_loss: loss applied to all continuous columns concatenated.
        cat_loss: loss applied to every categorical block separately. It
            receives the block of predictions and the matching block of
            targets as given (same shape).
    """

    def __init__(
        self,
        categorical_slices,
        output_size,
        cont_loss=nn.MSELoss(),
        cat_loss=nn.CrossEntropyLoss(),
    ) -> None:
        super().__init__()

        self.n_cats = len(categorical_slices)
        slices, slice_is_categorical = separate_slices(categorical_slices, output_size)
        self.cont_slices = [s for s, is_cat in zip(slices, slice_is_categorical) if not is_cat]
        self.cat_slices = [s for s, is_cat in zip(slices, slice_is_categorical) if is_cat]

        self.cont_loss = cont_loss
        self.cat_loss = cat_loss

    def forward(self, y_pred, y_true):
        """Return ``(loss_cont, losses_cat)``.

        ``loss_cont`` is a scalar tensor (zero when there are no continuous
        columns); ``losses_cat`` is a list with one loss per categorical block.
        """
        observed = ~torch.isnan(y_true)
        # Select rather than multiply by the mask: `inf * 0` and `nan * 0` are
        # NaN, so one non-finite prediction at a missing position would turn
        # the whole loss into NaN. `zeros_like` also keeps the fill value on
        # the same device and dtype as the data.
        y_pred = torch.where(observed, y_pred, torch.zeros_like(y_pred))
        y_true = torch.where(observed, y_true, torch.zeros_like(y_true))

        if self.cont_slices:
            y_pred_cont = torch.cat([y_pred[:, s] for s in self.cont_slices], dim=1)
            y_true_cont = torch.cat([y_true[:, s] for s in self.cont_slices], dim=1)
            loss_cont = self.cont_loss(y_pred_cont, y_true_cont)
        else:
            # Nothing to concatenate: report a zero continuous loss.
            loss_cont = y_pred.new_zeros(())

        losses_cat = [self.cat_loss(y_pred[:, s], y_true[:, s]) for s in self.cat_slices]

        return loss_cont, losses_cat

class MixedContCatLoss(nn.Module):
    """Blend a continuous loss and a list of categorical losses into one value.

    Args:
        contcatloss: module called as ``contcatloss(y_pred, y_true)`` and
            returning ``(loss_cont, losses_cat)``.
        ratio: weight of the continuous loss; the summed categorical losses
            are weighted by ``1 - ratio``.
    """

    def __init__(self, contcatloss, ratio):
        super().__init__()

        self.ratio = ratio
        self.contcatloss = contcatloss

    def forward(self, y_pred, y_true):
        """Return ``ratio * loss_cont + (1 - ratio) * sum(losses_cat)``."""
        continuous_term, categorical_terms = self.contcatloss(y_pred, y_true)
        categorical_total = sum(categorical_terms)
        return self.ratio * continuous_term + (1 - self.ratio) * categorical_total

In [76]:
import torch.optim as optim

train_dataloader = DataLoader(ds_train, batch_size=32, shuffle=True)

x_size = ds_train[0][0].shape[0]

# Column ranges of the one-hot blocks, derived from the dataset layout instead
# of being hard-coded: the continuous columns come first, followed by one
# block per categorical feature in the order the dataset stored them.
n_continuous = ds_train.features - len(ds_train._categorical_cols)
cat_slices = []
block_start = n_continuous
for block_cols in ds_train._categorical_cols:
    cat_slices.append([block_start, block_start + len(block_cols)])
    block_start += len(block_cols)
assert block_start == x_size, "one-hot blocks do not add up to the feature width"

inner_model = FFModel(x_size, 128, x_size)
model = MixedContCatModel(inner_model, cat_slices, x_size)

ccloss = ContCatLoss(cat_slices, x_size)
criterion = MixedContCatLoss(ccloss, 0.5)
optimizer = optim.Adam(model.parameters(), lr=0.001)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(10):
    # Make the mode explicit so the loop stays correct if an evaluation pass
    # that calls model.eval() is added between epochs.
    model.train()
    running_loss = 0.0
    for inputs, labels in train_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    print(f"Epoch {epoch} loss: {running_loss / len(train_dataloader)}")

Epoch 0 loss: 274720.2014388109
Epoch 1 loss: 174807.4745274442
Epoch 2 loss: 152527.1680295374
Epoch 3 loss: 132950.34780425407
Epoch 4 loss: 125254.51258827385
Epoch 5 loss: 118209.06767485623
Epoch 6 loss: 111672.27736753637
Epoch 7 loss: 110039.57336772665
Epoch 8 loss: 109395.12018193927


KeyboardInterrupt: 

In [62]:
# Scratch check: multiplying by a boolean "is not NaN" mask zeroes the
# positions that were NaN in the first tensor.
torch.isnan(torch.tensor([1, torch.nan, 3])).logical_not() * torch.tensor([1, 2, 3])

tensor([1, 0, 3])