In [1]:
# to do
# data loader
# kbinsdiscretizer
# embeddings
# transformer

In [44]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm


device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using mps device


In [65]:
n_embed = 10
n_heads = 4
d_model = 64
head_size = d_model//n_heads
dropout = 0.3
batch_size = 256

In [4]:
input = pd.read_csv('./data/train.csv', index_col=0)
target = 'FloodProbability'
features = [col for col in input.columns if col != target]

X_train, X_val, y_train, y_val = train_test_split(
    input[features],
    input[target],
    test_size=0.2,
    random_state=42
    )

In [19]:
disc = KBinsDiscretizer(
    n_bins=n_embed,
    encode='ordinal',
    strategy='uniform',
    subsample=None
)

train_disc = disc.fit_transform(X_train)
train_tensor = torch.tensor(train_disc, dtype=torch.int32)
val_disc = disc.transform(X_val)
val_tensor = torch.tensor(val_disc, dtype=torch.int32)

xs = {
    'train':train_tensor,
    'val':val_tensor
}

ys = {
    'train':torch.tensor(y_train.values, dtype=torch.float32),
   
    'val':torch.tensor(y_val.values, dtype=torch.float32)
}

def get_batch(split):
    assert split in ['train', 'val']
    idx = torch.randint(len(xs[split]), (batch_size,))
    x = xs[split][idx]
    y = ys[split][idx]
    x, y = x.to(device), y.to(device)
    return x, y

In [67]:
class Head(nn.Module):
    def __init__(self, head_size, dropout):
        super().__init__()
        self.key = nn.Linear(d_model, head_size)
        self.query = nn.Linear(d_model, head_size)
        self.value = nn.Linear(d_model, head_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x) # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)
        v = self.value(x) # (B, T, head_size)
        w = k @ q.transpose(-2, -1) * C**-0.5 # (B, T, T), multiply with C**-0.5 to ensure unit gaussian outputs
        w = F.softmax(w, dim=-1) # (B, T, T)
        w = self.dropout(w)
        out = w @ v # (B, T, T) @ (B, T, C) = (B, T, C)
        return out
    
class MultiHeadAttention(nn.Module):
    def __init__(self, head_size, n_heads, d_model, dropout):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size, dropout) for _ in range(n_heads)])
        self.proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([head(x) for head in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out
    
class FeedForward(nn.Module):
    def __init__(self, d_model, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, 4*d_model),
            nn.ReLU(),
            nn.Linear(4*d_model, d_model),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        out = self.net(x)
        return out
    
class Block(nn.Module):
    def __init__(self, head_size, d_model, n_heads, dropout):
        super().__init__()
        self.attention = MultiHeadAttention(head_size, n_heads, d_model, dropout)
        self.ff = FeedForward(d_model, dropout)
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x):
        x = x + self.attention(self.ln1(x))
        x = x + self.ff(self.ln2(x))
        return x

In [73]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(len(features), d_model)
        self.blocks = nn.Sequential(
            Block(head_size, d_model, n_heads, dropout),
            Block(head_size, d_model, n_heads, dropout),
            Block(head_size, d_model, n_heads, dropout)
        )
        self.linear = nn.Linear(d_model*len(features), 1)

    def forward(self, x, y=None):
        out = self.embed(x)
        out = self.blocks(out).view(-1, d_model*len(features))
        out = self.linear(out).squeeze()

        if y == None:
            loss = None
        else:
            loss = F.huber_loss(out, y)
        return out, loss

In [74]:
x, y = get_batch('train')
m = Model().to(device)
m.train()
optimizer = optim.AdamW(m.parameters(), lr=1e-2)

In [75]:
pred, loss = m(x, y)

In [76]:
losses = []
for i in tqdm(range(10000)):
    x, y = get_batch('train')
    logits, loss = m(x, y)
    optimizer.zero_grad()
    loss.backward()
    losses.append(loss.item())
    optimizer.step()
    if i%1000==0:
        tqdm.write(f"step {i+1}: loss {np.mean(losses[-10000:]):.4f}")

  0%|          | 0/10000 [00:00<?, ?it/s]

step 1: loss 0.2738
step 1001: loss 0.0916
step 2001: loss 0.0460
step 3001: loss 0.0309
step 4001: loss 0.0233
step 5001: loss 0.0187
step 6001: loss 0.0157
step 7001: loss 0.0135
step 8001: loss 0.0118
step 9001: loss 0.0105
