In this notebook we will train a deep learning model using all the data available !
* preprocessing : I encoded the smiles of all the train & test set and saved it [here](https://www.kaggle.com/datasets/ahmedelfazouan/belka-enc-dataset) , this may take up to 1 hour on TPU.
* Training & Inference : I used a simple 1dcnn model trained on 20 epochs.

How to improve :
* Try a different architecture : I'm able to get an LB score of 0.604 with minor changes on this architecture.
* Try another model like Transformer, or LSTM.
* Train for more epochs.
* Add more features like a one hot encoding of bb2 or bb3.
* And of course ensembling with GBDT models.

In [1]:
!pip install fastparquet -q

[0m

In [1]:
import gc
import os
import pickle
import random
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score as APS
import polars 

In [2]:
import gc
import os
import pickle
import random
import joblib
import pandas as pd
# import polars as pd
from tqdm import tqdm

import numpy as np
import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from sklearn.model_selection import StratifiedKFold
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor

In [3]:
class CFG:

    PREPROCESS = False
    EPOCHS = 2
    BATCH_SIZE = 4096
    LR = 1e-3
    WD = 0.05

    NBR_FOLDS = 15
    SELECTED_FOLDS = [0]

    SEED = 2024



In [4]:
KAGGLE_NOTEBOOK = False

if KAGGLE_NOTEBOOK:
    RAW_DIR = "/kaggle/input/leash-BELKA/"
    PROCCESSED_DIR = "/kaggle/input/belka-enc-dataset"
    OUTPUT_DIR = ""
    MODEL_DIR = ""
else:
    RAW_DIR = "../data/raw/"
    PROCESSED_DIR = "../data/processed/"
    OUTPUT_DIR = "../data/result/"
    MODEL_DIR = "../models/"

In [5]:
import tensorflow as tf
def set_seeds(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)

set_seeds(seed=CFG.SEED)

2024-06-03 12:11:20.155623: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-03 12:11:20.178629: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-03 12:11:20.178656: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-03 12:11:20.179311: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-03 12:11:20.183235: I tensorflow/core/platform/cpu_feature_guar

# Preprocessing

In [6]:
if CFG.PREPROCESS:
    enc = {'l': 1, 'y': 2, '@': 3, '3': 4, 'H': 5, 'S': 6, 'F': 7, 'C': 8, 'r': 9, 's': 10, '/': 11, 'c': 12, 'o': 13,
           '+': 14, 'I': 15, '5': 16, '(': 17, '2': 18, ')': 19, '9': 20, 'i': 21, '#': 22, '6': 23, '8': 24, '4': 25, '=': 26,
           '1': 27, 'O': 28, '[': 29, 'D': 30, 'B': 31, ']': 32, 'N': 33, '7': 34, 'n': 35, '-': 36}
    train_raw = pd.read_parquet('/kaggle/input/leash-BELKA/train.parquet')
    smiles = train_raw[train_raw['protein_name']=='BRD4']['molecule_smiles'].values
    assert (smiles!=train_raw[train_raw['protein_name']=='HSA']['molecule_smiles'].values).sum() == 0
    assert (smiles!=train_raw[train_raw['protein_name']=='sEH']['molecule_smiles'].values).sum() == 0
    def encode_smile(smile):
        tmp = [enc[i] for i in smile]
        tmp = tmp + [0]*(142-len(tmp))
        return np.array(tmp).astype(np.uint8)

    smiles_enc = joblib.Parallel(n_jobs=96)(joblib.delayed(encode_smile)(smile) for smile in tqdm(smiles))
    smiles_enc = np.stack(smiles_enc)
    train = pd.DataFrame(smiles_enc, columns = [f'enc{i}' for i in range(142)])
    train['bind1'] = train_raw[train_raw['protein_name']=='BRD4']['binds'].values
    train['bind2'] = train_raw[train_raw['protein_name']=='HSA']['binds'].values
    train['bind3'] = train_raw[train_raw['protein_name']=='sEH']['binds'].values
    train.to_parquet('train_enc.parquet')

    test_raw = pd.read_parquet('/kaggle/input/leash-BELKA/test.parquet')
    smiles = test_raw['molecule_smiles'].values

    smiles_enc = joblib.Parallel(n_jobs=96)(joblib.delayed(encode_smile)(smile) for smile in tqdm(smiles))
    smiles_enc = np.stack(smiles_enc)
    test = pd.DataFrame(smiles_enc, columns = [f'enc{i}' for i in range(142)])
    test.to_parquet('test_enc.parquet')

else:
    train = polars.read_parquet(os.path.join(PROCESSED_DIR, 'train_enc.parquet'), n_rows=1000)
    test = polars.read_parquet(os.path.join(PROCESSED_DIR, 'test_enc.parquet'))
    
    train = train.to_pandas()
    test = test.to_pandas()

# Modeling

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
import numpy as np

# 定数やモデルの定義は適宜修正してください
FEATURES = [f'enc{i}' for i in range(142)]
TARGETS = ['bind1', 'bind2', 'bind3']
CFG = {
    'NBR_FOLDS': 5,
    'SELECTED_FOLDS': [0, 1, 2, 3, 4],
    'BATCH_SIZE': 32,
    'LR': 0.001,
    'WD': 1e-4,
    'EPOCHS': 5,
    'PATIENCE': 5,
    'DEVICE': 'cuda' if torch.cuda.is_available() else 'cpu'
}

# データセットの読み込み
# train, test を適宜読み込みます

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        # モデルの層を定義します
        self.layer = nn.Sequential(
            nn.Linear(142, 128),
            nn.ReLU(),
            nn.Linear(128, 3)
        )
    
    def forward(self, x):
        return self.layer(x)

# 損失関数
criterion = nn.MSELoss()

# StratifiedKFoldの設定
skf = StratifiedKFold(n_splits=CFG['NBR_FOLDS'], shuffle=True, random_state=42)
all_preds = []

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train[TARGETS].sum(1))):
    if fold not in CFG['SELECTED_FOLDS']:
        continue

    # データの準備
    X_train = torch.tensor(train.loc[train_idx, FEATURES].values, dtype=torch.float32).to(CFG['DEVICE'])
    y_train = torch.tensor(train.loc[train_idx, TARGETS].values, dtype=torch.float32).to(CFG['DEVICE'])
    X_val = torch.tensor(train.loc[valid_idx, FEATURES].values, dtype=torch.float32).to(CFG['DEVICE'])
    y_val = torch.tensor(train.loc[valid_idx, TARGETS].values, dtype=torch.float32).to(CFG['DEVICE'])
    
    train_dataset = TensorDataset(X_train, y_train)
    valid_dataset = TensorDataset(X_val, y_val)
    
    train_loader = DataLoader(train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=CFG['BATCH_SIZE'])

    model = MyModel().to(CFG['DEVICE'])
    optimizer = optim.Adam(model.parameters(), lr=CFG['LR'], weight_decay=CFG['WD'])

    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(CFG['EPOCHS']):
        model.train()
        running_loss = 0.0

        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
        
        epoch_loss = running_loss / len(train_loader.dataset)

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, targets in valid_loader:
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                val_loss += loss.item() * inputs.size(0)
        
        val_loss /= len(valid_loader.dataset)
        print(f'Epoch {epoch+1}/{CFG["EPOCHS"]}, Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), os.path.join(MODEL_DIR ,f'model-{fold}.pt'))
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= CFG['PATIENCE']:
                print('Early stopping')
                break

    # 最良のモデルをロードして予測を行う
    model.load_state_dict(torch.load(os.path.join(MODEL_DIR,f'model-{fold}.pt')))
    model.eval()
    oof = model(X_val)
    print('fold :', fold, 'CV score =', APS(y_val.cpu().numpy(), oof.detach().cpu().numpy(), average='micro'))  
    test_tensor = torch.tensor(test.values, dtype=torch.float32).to(CFG['DEVICE'])
    with torch.no_grad():
        preds = model(test_tensor).cpu().numpy()
    all_preds.append(preds)

preds = np.mean(all_preds, axis=0)


NVIDIA GeForce RTX 3080 Ti with CUDA capability sm_86 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_60 sm_70 sm_75 compute_70 compute_75.
If you want to use the NVIDIA GeForce RTX 3080 Ti GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



Epoch 1/5, Train Loss: 1.9067, Val Loss: 0.3946
Epoch 2/5, Train Loss: 0.3231, Val Loss: 0.1939
Epoch 3/5, Train Loss: 0.1650, Val Loss: 0.1289
Epoch 4/5, Train Loss: 0.1047, Val Loss: 0.1005
Epoch 5/5, Train Loss: 0.0794, Val Loss: 0.0739
fold : 0 CV score = 0.0




Epoch 1/5, Train Loss: 3.1028, Val Loss: 0.8209
Epoch 2/5, Train Loss: 0.3676, Val Loss: 0.2541
Epoch 3/5, Train Loss: 0.1834, Val Loss: 0.1840
Epoch 4/5, Train Loss: 0.1232, Val Loss: 0.1395
Epoch 5/5, Train Loss: 0.0865, Val Loss: 0.1117
fold : 1 CV score = 0.004946975174740902
Epoch 1/5, Train Loss: 1.3081, Val Loss: 0.3654
Epoch 2/5, Train Loss: 0.2307, Val Loss: 0.1865
Epoch 3/5, Train Loss: 0.1157, Val Loss: 0.1117
Epoch 4/5, Train Loss: 0.0827, Val Loss: 0.0968
Epoch 5/5, Train Loss: 0.0578, Val Loss: 0.0643
fold : 2 CV score = 0.0018832391713747645
Epoch 1/5, Train Loss: 1.1544, Val Loss: 0.4339
Epoch 2/5, Train Loss: 0.2336, Val Loss: 0.1866
Epoch 3/5, Train Loss: 0.1174, Val Loss: 0.1109
Epoch 4/5, Train Loss: 0.0717, Val Loss: 0.0973
Epoch 5/5, Train Loss: 0.0558, Val Loss: 0.0773
fold : 3 CV score = 0.0022624434389140274
Epoch 1/5, Train Loss: 2.6051, Val Loss: 0.5349
Epoch 2/5, Train Loss: 0.4727, Val Loss: 0.2401
Epoch 3/5, Train Loss: 0.1902, Val Loss: 0.1614
Epoch 4/5, 

# Submission

In [21]:

# テストデータの読み込み
tst = pd.read_parquet(os.path.join(RAW_DIR, "test.parquet"))

# 'binds'列を追加して初期化
tst['binds'] = 0

# ブールマスクの作成
mask_BRD4 = (tst['protein_name'] == 'BRD4').values
mask_HSA = (tst['protein_name'] == 'HSA').values
mask_sEH = (tst['protein_name'] == 'sEH').values

# 各マスクに対応する予測値を代入
tst.loc[mask_BRD4, 'binds'] = preds[mask_BRD4][:, 0]
tst.loc[mask_HSA, 'binds'] = preds[mask_HSA][:, 1]
tst.loc[mask_sEH, 'binds'] = preds[mask_sEH][:, 2]

# 0以下の要素を0にし，1以上の要素を1にする
tst['binds'] = tst['binds'].clip(0, 1)


# 'id'と'binds'列をCSVに出力
tst[['id', 'binds']].to_csv(os.path.join(OUTPUT_DIR,'submission.csv'), index=False)


  0.7633616 ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  tst.loc[mask_BRD4, 'binds'] = preds[mask_BRD4][:, 0]
