In this notebook we will train a deep learning model using all the data available !
* preprocessing : I encoded the smiles of all the train & test set and saved it [here](https://www.kaggle.com/datasets/ahmedelfazouan/belka-enc-dataset) , this may take up to 1 hour on TPU.
* Training & Inference : I used a simple 1dcnn model trained on 20 epochs.

How to improve :
* Try a different architecture : I'm able to get an LB score of 0.604 with minor changes on this architecture.
* Try another model like Transformer, or LSTM.
* Train for more epochs.
* Add more features like a one hot encoding of bb2 or bb3.
* And of course ensembling with GBDT models.

In [95]:
!pip install fastparquet -q

[0m

In [96]:
import gc
import os
import pickle
import random
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score as APS
import polars 

In [97]:
import gc
import os
import pickle
import random
import joblib
import pandas as pd
# import polars as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
import numpy as np


In [98]:
class Config:
    PREPROCESS = False
    KAGGLE_NOTEBOOK = False
    DEBUG = False
    
    SEED = 42
    EPOCHS = 30
    BATCH_SIZE = 4096
    LR = 1e-3
    WD = 0.05
    PATIENCE = 5
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    NBR_FOLDS = 15
    SELECTED_FOLDS = [0,1,2]
    
    
    
    


In [99]:
if Config.KAGGLE_NOTEBOOK:
    RAW_DIR = "/kaggle/input/leash-BELKA/"
    PROCESSED_DIR = "/kaggle/input/belka-enc-dataset"
    OUTPUT_DIR = ""
    MODEL_DIR = ""
else:
    RAW_DIR = "../data/raw/"
    PROCESSED_DIR = "../data/processed/"
    OUTPUT_DIR = "../data/result/"
    MODEL_DIR = "../models/"

In [100]:
def set_seeds(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

set_seeds(seed=Config.SEED)

# Preprocessing

In [101]:
if Config.PREPROCESS:
    enc = {'l': 1, 'y': 2, '@': 3, '3': 4, 'H': 5, 'S': 6, 'F': 7, 'C': 8, 'r': 9, 's': 10, '/': 11, 'c': 12, 'o': 13,
           '+': 14, 'I': 15, '5': 16, '(': 17, '2': 18, ')': 19, '9': 20, 'i': 21, '#': 22, '6': 23, '8': 24, '4': 25, '=': 26,
           '1': 27, 'O': 28, '[': 29, 'D': 30, 'B': 31, ']': 32, 'N': 33, '7': 34, 'n': 35, '-': 36}
    train_raw = pd.read_parquet('/kaggle/input/leash-BELKA/train.parquet')
    smiles = train_raw[train_raw['protein_name']=='BRD4']['molecule_smiles'].values
    assert (smiles!=train_raw[train_raw['protein_name']=='HSA']['molecule_smiles'].values).sum() == 0
    assert (smiles!=train_raw[train_raw['protein_name']=='sEH']['molecule_smiles'].values).sum() == 0
    def encode_smile(smile):
        tmp = [enc[i] for i in smile]
        tmp = tmp + [0]*(142-len(tmp))
        return np.array(tmp).astype(np.uint8)

    smiles_enc = joblib.Parallel(n_jobs=96)(joblib.delayed(encode_smile)(smile) for smile in tqdm(smiles))
    smiles_enc = np.stack(smiles_enc)
    train = pd.DataFrame(smiles_enc, columns = [f'enc{i}' for i in range(142)])
    train['bind1'] = train_raw[train_raw['protein_name']=='BRD4']['binds'].values
    train['bind2'] = train_raw[train_raw['protein_name']=='HSA']['binds'].values
    train['bind3'] = train_raw[train_raw['protein_name']=='sEH']['binds'].values
    train.to_parquet('train_enc.parquet')

    test_raw = pd.read_parquet('/kaggle/input/leash-BELKA/test.parquet')
    smiles = test_raw['molecule_smiles'].values

    smiles_enc = joblib.Parallel(n_jobs=96)(joblib.delayed(encode_smile)(smile) for smile in tqdm(smiles))
    smiles_enc = np.stack(smiles_enc)
    test = pd.DataFrame(smiles_enc, columns = [f'enc{i}' for i in range(142)])
    test.to_parquet('test_enc.parquet')

else:
    if Config.DEBUG:
        train = polars.read_parquet(os.path.join(PROCESSED_DIR, 'train_enc.parquet'), n_rows=1000)
        test = polars.read_parquet(os.path.join(PROCESSED_DIR, 'test_enc.parquet'))
    else:
        train = polars.read_parquet(os.path.join(PROCESSED_DIR, 'train_enc.parquet'), n_rows=10000)
        test = polars.read_parquet(os.path.join(PROCESSED_DIR, 'test_enc.parquet'))
        
    train = train.to_pandas()
    test = test.to_pandas()

In [102]:
def prepare_data(train, train_idx, valid_idx, features, targets, device):
    """
    データの準備を行う関数
    """
    X_train = torch.tensor(train.loc[train_idx, features].values, dtype=torch.float32).to(device)
    y_train = torch.tensor(train.loc[train_idx, targets].values, dtype=torch.float32).to(device)
    X_val = torch.tensor(train.loc[valid_idx, features].values, dtype=torch.float32).to(device)
    y_val = torch.tensor(train.loc[valid_idx, targets].values, dtype=torch.float32).to(device)
    
    train_dataset = TensorDataset(X_train, y_train)
    valid_dataset = TensorDataset(X_val, y_val)
    
    train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=Config.BATCH_SIZE, shuffle=False)
    
    return train_loader, valid_loader, X_val, y_val



In [103]:
train

Unnamed: 0,enc0,enc1,enc2,enc3,enc4,enc5,enc6,enc7,enc8,enc9,...,enc135,enc136,enc137,enc138,enc139,enc140,enc141,bind1,bind2,bind3
0,8,22,8,8,28,12,27,12,12,12,...,0,0,0,0,0,0,0,0,0,0
1,8,22,8,8,28,12,27,12,12,12,...,0,0,0,0,0,0,0,0,0,0
2,8,22,8,8,28,12,27,12,12,12,...,0,0,0,0,0,0,0,0,0,0
3,8,22,8,8,28,12,27,12,12,12,...,0,0,0,0,0,0,0,0,0,0
4,8,22,8,8,28,12,27,12,12,12,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,8,22,8,8,29,8,3,3,5,32,...,0,0,0,0,0,0,0,0,0,0
9996,8,22,8,8,29,8,3,3,5,32,...,0,0,0,0,0,0,0,0,0,0
9997,8,22,8,8,29,8,3,3,5,32,...,0,0,0,0,0,0,0,0,0,0
9998,8,22,8,8,29,8,3,3,5,32,...,0,0,0,0,0,0,0,0,0,0


In [104]:


class Trainer:
    def __init__(self, model, criterion, optimizer, device, patience):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.device = device
        self.patience = patience

    def train_epoch(self, train_loader):
        self.model.train()
        running_loss = 0.0

        for inputs, targets in train_loader:
            self.optimizer.zero_grad()
            outputs = self.model(inputs)
            loss = self.criterion(outputs, targets)
            loss.backward()
            self.optimizer.step()
            running_loss += loss.item() * inputs.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        return epoch_loss

    def validate(self, valid_loader):
        self.model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, targets in valid_loader:
                outputs = self.model(inputs)
                loss = self.criterion(outputs, targets)
                val_loss += loss.item() * inputs.size(0)

        val_loss /= len(valid_loader.dataset)
        return val_loss

    def train(self, train_loader, valid_loader, epochs):
        best_val_loss = float('inf')
        patience_counter = 0

        for epoch in range(epochs):
            epoch_loss = self.train_epoch(train_loader)
            val_loss = self.validate(valid_loader)

            print(f'Epoch {epoch+1}/{epochs}, Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}')

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(self.model.state_dict(), os.path.join(MODEL_DIR, 'best_model.pt'))
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= self.patience:
                    print('Early stopping')
                    break

        return best_val_loss

    # 1行ずつ予測(メモリ節約)
    def predict(self, data_loader):
        self.model.eval()
        predictions = []
        with torch.no_grad():
            for inputs in data_loader:
                inputs = inputs.to(self.device)
                outputs = self.model(inputs)
                predictions.append(outputs.cpu().numpy())
        return np.concatenate(predictions)

In [105]:
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        # モデルの層を定義します
        self.layer = nn.Sequential(
            nn.Linear(142, 128),
            nn.ReLU(),
            nn.Linear(128, 3)
        )
    
    def forward(self, x):
        return self.layer(x)

class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 17, 128)  # Correct input size after pooling
        self.fc2 = nn.Linear(128, 3)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension
        x = self.pool(self.relu(self.conv1(x)))  # Output shape: [batch_size, 16, 71]
        x = self.pool(self.relu(self.conv2(x)))  # Output shape: [batch_size, 32, 35]
        x = self.pool(self.relu(self.conv3(x)))  # Output shape: [batch_size, 64, 17]
        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [111]:


# 定数やモデルの定義は適宜修正してください
FEATURES = [f'enc{i}' for i in range(142)]
TARGETS = ['bind1', 'bind2', 'bind3']


# cross entoropy
criterion = nn.CrossEntropyLoss()

# StratifiedKFoldの設定
skf = StratifiedKFold(n_splits=Config.NBR_FOLDS, shuffle=True, random_state=42)
all_preds = []

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train[TARGETS].sum(1))):
    
    if fold not in Config.SELECTED_FOLDS:
        continue

    # データの準備
    train_loader, valid_loader, X_val, y_val = prepare_data(train, train_idx, valid_idx, FEATURES, TARGETS, Config.DEVICE)

    model = CNNModel().to(Config.DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=Config.LR, weight_decay=Config.WD)

    trainer = Trainer(model, criterion, optimizer, Config.DEVICE, Config.PATIENCE)
    trainer.train(train_loader, valid_loader, Config.EPOCHS)

    # 最良のモデルをロードして予測を行う
    model.load_state_dict(torch.load(os.path.join(MODEL_DIR, 'best_model.pt')))
    oof = model(X_val)
    print('fold :', fold, 'CV score =', APS(y_val.cpu().numpy(), oof.detach().cpu().numpy(), average='micro'))
    
    test_tensor = torch.tensor(test.values, dtype=torch.float32).to(Config.DEVICE)
    test_loader = DataLoader(test_tensor, batch_size=Config.BATCH_SIZE, shuffle=False)
    preds = trainer.predict(test_loader)
    all_preds.append(preds)
    


# CVのアンサンブル
preds = np.mean(all_preds, axis=0)




Epoch 1/30, Train Loss: 0.0093, Val Loss: 0.0084
Epoch 2/30, Train Loss: 0.0088, Val Loss: 0.0091
Epoch 3/30, Train Loss: 0.0089, Val Loss: 0.0087
Epoch 4/30, Train Loss: 0.0088, Val Loss: 0.0080
Epoch 5/30, Train Loss: 0.0087, Val Loss: 0.0081
Epoch 6/30, Train Loss: 0.0088, Val Loss: 0.0084
Epoch 7/30, Train Loss: 0.0090, Val Loss: 0.0086
Epoch 8/30, Train Loss: 0.0090, Val Loss: 0.0087
Epoch 9/30, Train Loss: 0.0090, Val Loss: 0.0086
Early stopping
fold : 0 CV score = 0.05506999850376996
Epoch 1/30, Train Loss: 0.0094, Val Loss: 0.0066
Epoch 2/30, Train Loss: 0.0086, Val Loss: 0.0073
Epoch 3/30, Train Loss: 0.0088, Val Loss: 0.0074
Epoch 4/30, Train Loss: 0.0088, Val Loss: 0.0074
Epoch 5/30, Train Loss: 0.0088, Val Loss: 0.0072
Epoch 6/30, Train Loss: 0.0088, Val Loss: 0.0070
Early stopping
fold : 1 CV score = 0.018332792542700326
Epoch 1/30, Train Loss: 0.0099, Val Loss: 0.0082
Epoch 2/30, Train Loss: 0.0093, Val Loss: 0.0075
Epoch 3/30, Train Loss: 0.0084, Val Loss: 0.0078
Epoch 4

# Submission

In [108]:

# テストデータの読み込み
tst = pd.read_parquet(os.path.join(RAW_DIR, "test.parquet"))

# 'binds'列を追加して初期化
tst['binds'] = 0

# ブールマスクの作成
mask_BRD4 = (tst['protein_name'] == 'BRD4').values
mask_HSA = (tst['protein_name'] == 'HSA').values
mask_sEH = (tst['protein_name'] == 'sEH').values

# 各マスクに対応する予測値を代入
tst.loc[mask_BRD4, 'binds'] = preds[mask_BRD4][:, 0]
tst.loc[mask_HSA, 'binds'] = preds[mask_HSA][:, 1]
tst.loc[mask_sEH, 'binds'] = preds[mask_sEH][:, 2]

# 0以下の要素を0にし，1以上の要素を1にする
tst['binds'] = tst['binds'].clip(0, 1)


# 'id'と'binds'列をCSVに出力
tst[['id', 'binds']].to_csv(os.path.join(OUTPUT_DIR,'submission.csv'), index=False)


  0.00293384]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  tst.loc[mask_BRD4, 'binds'] = preds[mask_BRD4][:, 0]
