In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import time
import warnings

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch import optim

warnings.filterwarnings("ignore")

In [2]:
SEED = 22
NFOLDS = 5

np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fd43c059c90>

In [3]:
train = pd.read_csv("../input/lish-moa/train_features.csv")
targets = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
test = pd.read_csv("../input/lish-moa/test_features.csv")
sub = pd.read_csv("../input/lish-moa/sample_submission.csv")

In [4]:
def encode(df):
    df[["cp_type", "cp_dose"]] = df[["cp_type", "cp_dose"]].replace([
        "trt_cp", "ctl_vehicle", "D1", "D2"], [0, 1, 0, 1])
    return df

train = encode(train)
test = encode(test)

X = train.iloc[:, 1:].to_numpy()
X_test = test.iloc[:, 1:].to_numpy()
y = targets.iloc[:, 1:].to_numpy()

In [5]:
INPUT_FEATURES = X.shape[1]
OUTPUT_FEATURES = y.shape[1]

In [6]:
class MoaDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = X
        self.y = y
        
    def __getitem__(self, idx):
        return torch.FloatTensor(self.X[idx]), torch.FloatTensor(self.y[idx])
    
    def __len__(self):
        return len(self.X)

In [7]:
X = MoaDataset(X, y)
# X_test = MoaDataset(X_test, train_data=False)

X_loader = DataLoader(X, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)
# X_test_loader = DataLoader(X_test, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)

In [8]:
class MoaModel(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.model = nn.Sequential(
            nn.Linear(INPUT_FEATURES, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            
            nn.Linear(512, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            
            nn.Linear(1024, OUTPUT_FEATURES),
            
            nn.Flatten(),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        output = self.model(x)
        return output

In [9]:
EPOCHS = 5

model = MoaModel()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(EPOCHS):
    losses = []
    
    for i, data in enumerate(X_loader, 0):
        inputs, labels = data
        
        optimizer.zero_grad()
        
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        losses.append(loss.item())
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}, avg loss: {np.mean(losses)}")

Epoch: 1, Loss: 0.014184613712131977, avg loss: 0.02464403728667357
Epoch: 2, Loss: 0.0051403590478003025, avg loss: 0.015277753532901606
Epoch: 3, Loss: 0.0014829108258709311, avg loss: 0.013428877735443054
Epoch: 4, Loss: 0.0008213979308493435, avg loss: 0.011508861382650436
Epoch: 5, Loss: 0.0006249843281693757, avg loss: 0.009403806213001114


In [10]:
model.eval()

X_test_preds = torch.from_numpy(X_test).float()
X_test_loader = DataLoader(X_test_preds, batch_size=64, num_workers=4, pin_memory=True)
preds = torch.FloatTensor()
with torch.no_grad():
    for i, test_data in enumerate(X_test_loader):
        output = model(test_data)
    
        preds = torch.cat((preds, output), dim=0)

In [11]:
sub.iloc[:, 1:] = preds.numpy()
sub.to_csv("submission.csv", index=False)