In [52]:
import torch
import numpy as np
import pandas as pd

In [53]:
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

In [54]:
df_0 = pd.read_csv("data.csv", index_col=0)
features = df_0.drop("x_e_out [-]",axis=1).columns
df_0[features] = df_0[features].fillna(df_0[features].mean())

# split into train and test
test_size = 4000
df_train = df_0.iloc[:-2*test_size]
df_dev = df_0.iloc[-2*test_size:-test_size]
df_test = df_0.iloc[-test_size:]
df_train.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,author,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,Thompson,tube,7.0,3770.0,0.1754,8.629255,10.8,432.0,3.6
1,Thompson,tube,10.640747,6049.0,-0.0416,10.3,10.3,762.0,6.2
2,Thompson,,13.79,2034.0,0.0335,7.7,7.7,457.0,2.5
3,Beus,annulus,13.79,3679.0,-0.0279,5.6,15.2,2134.0,3.0
4,,tube,13.79,686.0,,11.1,11.1,457.0,2.8


In [55]:
# mask of missing data
# mask = df_0.isnull()
# mask.head()
df_train.describe()

Unnamed: 0,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
count,23644.0,23644.0,15845.0,23644.0,23644.0,23644.0,23644.0
mean,10.630301,3061.910861,6e-06,8.622508,14.147334,829.987506,3.79174
std,4.01972,1638.126607,0.100673,4.696631,18.376478,619.062075,1.970519
min,0.1,0.0,-0.8667,1.0,1.0,10.0,0.8
25%,6.89,1655.0,-0.0465,5.6,7.7,432.0,2.4
50%,10.640747,3068.011023,0.0049,8.629255,10.3,762.0,3.4
75%,13.79,4028.0,0.0649,10.3,14.17433,864.0,4.6
max,20.68,7975.0,0.232,37.5,120.0,3048.0,19.3


In [56]:
from torch.utils.data import Dataset, DataLoader

In [57]:
class MissingFeaturesDataset(Dataset):
    def __init__(self, df, y_col, p_mask=0.2, categorical_cols=None):
        self.features = len(df.columns) - 1
        # self.p_mask = p_mask
        self.mask_val = 0

        self._categorical_cols = []
        if categorical_cols is not None:
            for col in categorical_cols:
                one_hot = pd.get_dummies(df[col])
                one_hot[df[col].isnull()] = np.nan

                self._categorical_cols.append(one_hot.columns)
                df = df.drop(col, axis=1)
                df = pd.concat([df, one_hot], axis=1)

        # drop y_col = NaN
        df.dropna(subset=[y_col], inplace=True)

        self.df = df.drop(y_col, axis=1)
        self.ys = df[y_col]

    def __len__(self):
        return len(self.df)

    def onehot_categorical_mask(self, column_mask):
        return np.concatenate(
            [np.full(len(cs), m) for m, cs in zip(column_mask, self._categorical_cols)]
        )

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        output_mask = torch.tensor(~record.isnull().values)

        # input_mask = ~np.random.binomial(1, self.p_mask, size=self.features).astype(
        #     bool
        # )

        # # mask continuous columns normally
        # # mask categorical columns with all their columns

        # full_input_mask = (
        #     torch.tensor(
        #         np.concatenate(
        #             [
        #                 input_mask[: -len(self._categorical_cols)],
        #                 self.onehot_categorical_mask(
        #                     input_mask[-len(self._categorical_cols) :]
        #                 ),
        #             ]
        #         )
        #     )
        #     & output_mask
        # )

        # x_in = record.values.copy()
        # x_in[~full_input_mask] = 0

        x = record.values.copy()
        x[~output_mask] = self.mask_val

        # x = torch.tensor(record.values)
        # return masked_tensor(x, full_input_mask), masked_tensor(x, output_mask)
        assert not np.isnan(x).any(), x
        assert not np.isnan(self.ys.iloc[idx]), self.ys.iloc[idx]
        return (
            torch.as_tensor(x, dtype=torch.float32),
            torch.as_tensor(self.ys.iloc[idx], dtype=torch.float32),
        )

In [58]:
categorical_cols = ["author", "geometry"]
p_mask = 0.2
ds_train = MissingFeaturesDataset(
    df_train, y_col="x_e_out [-]", p_mask=p_mask, categorical_cols=categorical_cols
)
ds_dev = MissingFeaturesDataset(
    df_dev, y_col="x_e_out [-]", p_mask=p_mask, categorical_cols=categorical_cols
)
ds_test = MissingFeaturesDataset(
    df_test, y_col="x_e_out [-]", p_mask=p_mask, categorical_cols=categorical_cols
)

In [59]:
from torch import nn
from typing import List

device = "cuda" if torch.cuda.is_available() else "cpu"
class MLP(nn.Module):
    def __init__(self, layer_sizes: List[int], activation=nn.ReLU()):
        super().__init__()
        if len(layer_sizes) < 2:
            raise ValueError("At least 2 layers are required")

        layers = []
        for size_from, size_to in zip(layer_sizes, layer_sizes[1:-1]):
            layers.append(nn.Linear(size_from, size_to))
            layers.append(activation)
        layers.append(nn.Linear(layer_sizes[-2], layer_sizes[-1]))

        self.linear_stack = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.linear_stack(x)
        return logits.squeeze(1)


class Trainer:
    def __init__(self, model) -> None:
        self.model = model.to(device)
        self.loss_fn = nn.MSELoss()
        self.optimizer = torch.optim.Adam(model.parameters(), lr=2e-3)
        # self.optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
        self.batch_size = 64

    def train(self, dataloader):
        size = len(dataloader.dataset)
        self.model.train()
        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)

            # Compute prediction error
            pred = self.model(X)
            loss = self.loss_fn(pred, y)

            # Backpropagation
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # if batch % 100 == 0:
            #     loss, current = loss.item(), (batch + 1) * len(X)
            #     print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

    def eval(self, dataloader):
        size = len(dataloader.dataset)
        num_batches = len(dataloader)
        self.model.eval()
        test_loss, correct = 0, 0
        with torch.no_grad():
            for X, y in dataloader:
                X, y = X.to(device), y.to(device)
                pred = self.model(X)
                test_loss += self.loss_fn(pred, y).item()
                correct += (pred.argmax() == y).type(torch.float).sum().item()
        test_loss /= num_batches
        correct /= size
        return correct, test_loss
        # print(
        #     f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n"
        # )
    
    def predict(self, dataloader):
        size = len(dataloader.dataset)
        num_batches = len(dataloader)
        self.model.eval()
        preds = []
        with torch.no_grad():
            for X, y in dataloader:
                X, y = X.to(device), y.to(device)
                pred = self.model(X)
                preds.extend(pred.numpy())
        return preds
        # print(
        #     f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n"
        # )


    def train_loop(self, dataset_train, dataset_dev, epochs=5):
        dataloader_train = torch.utils.data.DataLoader(dataset_train, batch_size=self.batch_size)
        dataloader_dev = torch.utils.data.DataLoader(dataset_dev, batch_size=self.batch_size)

        for t in range(epochs):
            # print(f"Epoch {t+1}\n-------------------------------")
            self.train(dataloader_train)
            # save model
            torch.save(self.model.state_dict(), f"model_{t}.pt")
            yield self.eval(dataloader_dev)
        print("Done!")


In [63]:
x_size = len(ds_train[0][0])
model = MLP([x_size, 128, 64, 1])
# model = MLP([x_size, 128, 64, 1], nn.Sigmoid())
sigmoid = 0.00875367448177366
trainer = Trainer(model)
for epoch_r in trainer.train_loop(ds_train, ds_dev, epochs=20):
    print(epoch_r)

(0.0, 0.1158370908704542)
(0.0, 0.05727974422985599)
(0.0, 0.03613453899465856)
(0.0, 0.03143297406356959)
(0.0, 0.10570263330425535)
(0.0, 0.019639033380718457)
(0.0, 0.016550269887028707)


In [39]:
# load model and run on dataset
sub_df = df_0.where(df_0["x_e_out [-]"].isnull()).dropna(how="all")
assert sub_df["x_e_out [-]"].isnull().all()
sub_df["x_e_out [-]"] = 0.0

sub_ds = MissingFeaturesDataset(sub_df, y_col="x_e_out [-]", categorical_cols=categorical_cols)

model = MLP([x_size, 128, 64, 1], nn.Sigmoid())
model.load_state_dict(torch.load("sigmoid.pt"))
model.eval()
model.to(device)

trainer = Trainer(model)
preds = trainer.predict(torch.utils.data.DataLoader(sub_ds, batch_size=64, shuffle=False))

preds_df = pd.Series(preds, index=sub_df.index)

In [41]:
preds_df.to_csv("submission.csv")