In [1]:
import os

from google.colab import drive, userdata

# Mount Google Drive; config.py places BASE_DIR under /content/drive.
drive.mount('/content/drive')

# Copy the Kaggle credentials from the Colab secret store into environment
# variables of the same name (presumably read by the kaggle CLI used below).
for secret_name in ('KAGGLE_KEY', 'KAGGLE_USERNAME'):
    os.environ[secret_name] = userdata.get(secret_name)

Mounted at /content/drive


In [2]:
# Install the Kaggle CLI, then download the preprocessed BELKA TFRecord archive
# and unpack it into the current working directory.
!pip install kaggle
!kaggle datasets download -d kacky355/belka-train-valid-tfrecords-1d-preprocessed
!unzip belka-train-valid-tfrecords-1d-preprocessed.zip

Dataset URL: https://www.kaggle.com/datasets/kacky355/belka-train-valid-tfrecords-1d-preprocessed
License(s): unknown
Downloading belka-train-valid-tfrecords-1d-preprocessed.zip to /content
 99% 1.96G/1.98G [00:27<00:00, 107MB/s]
100% 1.98G/1.98G [00:27<00:00, 77.7MB/s]
Archive:  belka-train-valid-tfrecords-1d-preprocessed.zip
  inflating: logs/main.log           
  inflating: tf_idx/train_00.idx     
  inflating: tf_idx/train_01.idx     
  inflating: tf_idx/train_02.idx     
  inflating: tf_idx/train_03.idx     
  inflating: tf_idx/train_04.idx     
  inflating: tf_idx/train_05.idx     
  inflating: tf_idx/train_06.idx     
  inflating: tf_idx/train_07.idx     
  inflating: tf_idx/train_08.idx     
  inflating: tf_idx/train_09.idx     
  inflating: tf_idx/train_10.idx     
  inflating: tf_idx/train_11.idx     
  inflating: tf_idx/train_12.idx     
  inflating: tf_idx/train_13.idx     
  inflating: tf_idx/train_14.idx     
  inflating: tf_idx/train_15.idx     
  inflating: tf_idx/train

In [3]:
# General third-party dependencies.
!pip install rdkit
!pip install lightning
!pip install polars

# NVIDIA DALI (cuda120 build) and its TensorFlow plugin, resolved via NVIDIA's package index.
!pip install --extra-index-url https://pypi.nvidia.com --upgrade nvidia-dali-cuda120
!pip install --extra-index-url https://pypi.nvidia.com --upgrade nvidia-dali-tf-plugin-cuda120

# Personal helper library installed straight from GitHub
# (presumably the source of `logger.mylogger` imported later — confirm).
!pip install git+https://github.com/kacky355/my_libraries.git

Collecting rdkit
  Downloading rdkit-2024.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.1/35.1 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2024.3.1
Collecting lightning
  Downloading lightning-2.3.1-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities<2.0,>=0.10.0 (from lightning)
  Downloading lightning_utilities-0.11.3.post0-py3-none-any.whl (26 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.4.0.post0-py3-none-any.whl (868 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.3.1-py3-none-any.whl (812 kB)
[2K     

In [4]:
!pip install causal-conv1d>=1.4.0
!pip install mamba-ssm

Collecting mamba-ssm
  Downloading mamba_ssm-2.2.1.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting einops (from mamba-ssm)
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: mamba-ssm
  Building wheel for mamba-ssm (setup.py) ... [?25l[?25hdone
  Created wheel for mamba-ssm: filename=mamba_ssm-2.2.1-cp310-cp310-linux_x86_64.whl size=323803693 sha256=e1f518a2f4a14a81be070dbbd6b0711b983af6eac62b98265d790e8df42a8078
  Stored in directory: /root/.cache/pip/wheels/31/44/37/2bd1f5a2ad0219a3aa5ae653b942d1af4645f43f14c3ec961e
Successfully built mamba-ssm
Installing collected packages: einops, mamba-ssm
Successfully installed einops-0.8.0 mamba-ssm-2.2.1


In [5]:
import random
import os
import glob

import matplotlib.pyplot as plt
import gc
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
import polars as pl

from nvidia.dali import pipeline_def, Pipeline
import nvidia.dali.fn as fn
import nvidia.dali.types as types
import nvidia.dali.tfrecord as tfrec
from nvidia.dali.plugin.pytorch import DALIGenericIterator, LastBatchPolicy

import math
import torch
from torch import nn, Tensor
import torch.nn.functional as F
import torch.optim as optim
from torchmetrics import AveragePrecision
import lightning as L
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor,TQDMProgressBar

from sklearn.metrics import average_precision_score as APS

import tensorflow as tf

from logger.mylogger import get_my_logger

In [59]:
%%writefile config.py
import os
import glob

class CFG:
    """Static configuration shared by the generated training scripts.

    Every value is a class attribute evaluated once, when this module is
    imported; that includes the ``glob`` scans of ``DATA_SOURCE``.
    """

    DEBUG = False  # True: fewer epochs and only the first four training shards
    MODEL_NAME = 'mamba'

    EPOCHS = 8
    BATCH_SIZE = 4096
    NBR_FOLDS = 15
    NUM_TRAINS = 91_854_569
    NUM_VALIDS = 6_561_041
    # Ceiling division: batches needed to cover every sample once.
    STEPS_PER_EPOCH_TRAIN = (NUM_TRAINS - 1) // BATCH_SIZE + 1
    STEPS_PER_EPOCH_VALID = (NUM_VALIDS - 1) // BATCH_SIZE + 1

    SELECTED_FOLDS = [0]

    BASE_DIR = '/content/drive/MyDrive/BELKA_model/kaggle/working'
    DATA_SOURCE = '/content'
    # Data files and index files are sorted independently; pairing the i-th
    # data file with the i-th index assumes matching file names — TODO confirm.
    TRAINS = sorted(glob.glob(os.path.join(DATA_SOURCE, 'train/*')))
    TRAIN_IDX = sorted(glob.glob(os.path.join(DATA_SOURCE, 'tf_idx', 'train_*.idx')))
    VALIDS = sorted(glob.glob(os.path.join(DATA_SOURCE, 'valid/*')))
    VALID_IDX = sorted(glob.glob(os.path.join(DATA_SOURCE, 'tf_idx', 'valid_*.idx')))
    # Misspelled historical name, kept as an alias because DALILmodels.py reads it.
    VARID_IDX = VALID_IDX

    SEED = 2024

    NUM_CLASSES = 3
    SEQ_LENGTH = 142

    # One feature column per sequence position.
    FEATURES = [f'enc{i}' for i in range(SEQ_LENGTH)]
    TARGETS = ['bind1', 'bind2', 'bind3']
    COLUMNS = FEATURES + TARGETS

    # Keyword arguments passed to the model constructor in main.py.
    MODEL_PARAM = {
        'batch': BATCH_SIZE,
        'input_dim': SEQ_LENGTH,
        'hidden_dim': 128,
        'input_dim_embedding': 37,
        'dropout': 0.1,
        'num_heads': 4,
        'num_layers': 3,
        'out_dim': NUM_CLASSES,
    }

    if DEBUG:
        EPOCHS = 3
        TRAINS = TRAINS[:4]
        TRAIN_IDX = TRAIN_IDX[:4]


Overwriting config.py


In [47]:
# %%writefile modules.py

# from config import CFG


# import torch
# from torch import nn, Tensor
# import torch.nn.functional as F
# import torch.optim as optim

# class Mamba(nn.Module):
#     def __init__(self, ):
#         super().__init__()


#     def forward(self, x):

#         return x

In [53]:
%%writefile models.py
from config import CFG

import math

import torch
from torch import nn, Tensor
import torch.nn.functional as F
import torch.optim as optim
import lightning as L

from sklearn.metrics import average_precision_score as APS

from mamba_ssm import Mamba




class LMmamba(L.LightningModule):
    """Embedding -> Conv1d -> BatchNorm -> stacked Mamba blocks -> linear head.

    Trained as a multi-label classifier with binary cross-entropy on logits.

    Args:
        batch: Only recorded in ``hparams``; not used to build the network.
        input_dim: Only recorded in ``hparams``; not used to build the network.
        input_dim_embedding: Number of rows of the token embedding (index 0 is padding).
        hidden_dim: Channel width of the embedding, convolution and Mamba blocks.
        num_heads: Only recorded in ``hparams``; not used to build the network.
        num_layers: Number of Mamba blocks.
        dropout: Dropout probability applied after every Mamba block.
        out_dim: Number of logits produced per sample.
    """

    def __init__(self, batch, input_dim, input_dim_embedding, hidden_dim, num_heads, num_layers, dropout, out_dim):
        super().__init__()
        # Makes the constructor arguments available as self.hparams (read in forward()).
        self.save_hyperparameters()
        # Per-batch validation outputs, gathered for the epoch-level APS score.
        self.val_preds = []
        self.val_y = []

        self.embedding = nn.Embedding(num_embeddings=input_dim_embedding, embedding_dim=hidden_dim, padding_idx=0)
        self.conv1 = nn.Conv1d(in_channels=hidden_dim, out_channels=hidden_dim, kernel_size=3, stride=1, padding=0)
        self.batch_norm = nn.BatchNorm1d(hidden_dim)
        self.mamba_blocks = nn.ModuleList()
        for _ in range(num_layers):
            self.mamba_blocks.append(Mamba(
                d_model=hidden_dim,
                d_state=16,
                d_conv=3,
                expand=2,
            ))

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, out_dim),
        )

    def forward(self, x):
        """Map integer token ids ``x`` (batch, seq) to logits (batch, out_dim)."""
        x = self.embedding(x)
        x = x.permute(0, 2, 1)  # channels first for Conv1d / BatchNorm1d
        x = self.conv1(x)       # kernel 3 without padding: sequence shrinks by 2
        x = self.batch_norm(x)
        x = x.permute(0, 2, 1)  # back to (batch, seq, hidden) for the Mamba blocks

        for block in self.mamba_blocks:
            x = block(x)
            # F.dropout defaults to training=True; tie it to the module mode so
            # validation and inference run without dropout.
            x = F.dropout(x, p=self.hparams.dropout, training=self.training)

        # NOTE(review): only sequence position 0 feeds the head. If the Mamba
        # blocks are causal (left-to-right), that position has seen just the
        # first convolution window — confirm this pooling is intended.
        return self.mlp_head(x[:, 0, :])

    def training_step(self, batch, batch_idx):
        """Return the BCE-with-logits loss for one training batch and log it as 'train_loss'."""
        x, y = self.process_batch(batch)
        loss = F.binary_cross_entropy_with_logits(self(x), y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log 'val_loss' and stash probabilities/targets for on_validation_epoch_end."""
        x, y = self.process_batch(batch)
        logits = self(x)
        preds = torch.sigmoid(logits)
        loss = F.binary_cross_entropy_with_logits(logits, y)
        self.log('val_loss', loss)
        self.val_preds.append(preds)
        self.val_y.append(y)
        return loss

    def on_validation_epoch_end(self):
        """Log the micro-averaged average-precision score over the collected batches."""
        preds = torch.cat(self.val_preds, 0).to('cpu').detach().numpy()
        y_eval = torch.cat(self.val_y, 0).to('cpu').detach().numpy()
        self.log('validation APS  CV score =', APS(y_eval, preds, average='micro'))
        # Reset the buffers so the next validation run starts empty.
        self.val_preds.clear()
        self.val_y.clear()

    def test_step(self, batch, batch_idx):
        """Return the BCE-with-logits loss for one test batch and log it as 'test_loss'."""
        # Go through process_batch like the other steps so inputs are cast to long.
        x, y = self.process_batch(batch)
        loss = F.binary_cross_entropy_with_logits(self(x), y)
        self.log('test_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters with lr=1e-3."""
        return torch.optim.AdamW(self.parameters(), lr=1e-3)

    def process_batch(self, batch):
        """Return copies of ``batch[0]`` (cast to int64) and ``batch[1]``.

        The copies are presumably taken because the loader reuses its output
        buffers — TODO confirm.
        """
        X, y = batch[0].clone().long(), batch[1].clone()
        return X, y





class DemoModel(L.LightningModule):
    """Baseline classifier: embedding, three 1D convolutions, global max pooling
    and a three-layer dense head with dropout, trained with BCE on logits."""

    def __init__(self, input_dim=142, input_dim_embedding=37, hidden_dim=128, num_filters=32, output_dim=3, lr=1e-3, weight_decay=1e-6):
        super().__init__()
        self.save_hyperparameters()
        hp = self.hparams
        width = hp.num_filters

        self.embedding = nn.Embedding(hp.input_dim_embedding, hp.hidden_dim, padding_idx=0)
        # Unpadded kernel-3 convolutions; the channel count grows width -> 2*width -> 3*width.
        self.conv1 = nn.Conv1d(hp.hidden_dim, width, kernel_size=3)
        self.conv2 = nn.Conv1d(width, width * 2, kernel_size=3)
        self.conv3 = nn.Conv1d(width * 2, width * 3, kernel_size=3)
        self.global_max_pool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(width * 3, 1024)
        self.dropout = nn.Dropout(0.1)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, 512)
        self.output = nn.Linear(512, hp.output_dim)

    def forward(self, x):
        """Return one logit per output class for a batch of token-id sequences."""
        x = self.embedding(x).permute(0, 2, 1)  # channels first for Conv1d
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.relu(conv(x))
        x = self.global_max_pool(x).squeeze(2)  # drop the pooled length-1 axis
        for dense in (self.fc1, self.fc2, self.fc3):
            x = self.dropout(F.relu(dense(x)))
        return self.output(x)

    def _bce_from_batch(self, batch):
        """Unpack ``(inputs, targets)`` and return the BCE-with-logits loss."""
        inputs, targets = batch
        return F.binary_cross_entropy_with_logits(self(inputs), targets)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss as 'train_loss'."""
        loss = self._bce_from_batch(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss as 'val_loss'."""
        loss = self._bce_from_batch(batch)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return Adam configured from the ``lr`` and ``weight_decay`` hyperparameters."""
        return torch.optim.Adam(
            self.parameters(),
            lr=self.hparams.lr,
            weight_decay=self.hparams.weight_decay,
        )

    def process_batch(self, batch):
        """Return cloned copies of the two tensors in ``batch``."""
        X, y = batch
        return X.clone(), y.clone()

Overwriting models.py


In [54]:
%%writefile DALILmodels.py
from config import CFG
from models import LMmamba, DemoModel

import tensorflow as tf

from nvidia.dali import pipeline_def, Pipeline
import nvidia.dali.fn as fn
import nvidia.dali.types as types
import nvidia.dali.tfrecord as tfrec
from nvidia.dali.plugin.pytorch import DALIGenericIterator, LastBatchPolicy


class DALILmamba(LMmamba):
    """LMmamba variant that builds its own DALI TFRecord loaders in ``setup``."""

    def __init__(self, batch, input_dim, input_dim_embedding, hidden_dim, num_heads, num_layers, dropout, out_dim):
        super().__init__(batch, input_dim, input_dim_embedding, hidden_dim, num_heads, num_layers, dropout, out_dim)

    def setup(self, stage=None):
        """Build the train/validation DALI pipelines and wrap them as iterators.

        The iterators are stored on ``self.train_loader`` and
        ``self.valid_loader``. ``stage`` is accepted but not used.
        """
        device_id = self.local_rank
        # Arguments shared by both pipelines.
        common = dict(
            batch_size=CFG.BATCH_SIZE,
            num_threads=4,
            device_id=device_id,
            device='cuda',
            shard_id=self.global_rank,
            num_shards=self.trainer.world_size,
        )

        train_pipe = belka_pipeline(
            paths=CFG.TRAINS,
            idxs=CFG.TRAIN_IDX,
            seed=CFG.SEED + 2 + device_id*2,
            **common,
        )
        valid_pipe = belka_pipeline(
            paths=CFG.VALIDS,
            idxs=CFG.VARID_IDX,
            seed=CFG.SEED-2,
            # belka_pipeline shuffles when is_train is True (its default);
            # validation data should be read in file order.
            is_train=False,
            **common,
        )

        class LightningWrapper(DALIGenericIterator):
            """Turn DALI's per-pipeline list of dicts into a plain ``[X, y]`` list."""

            def __next__(self):
                out = super().__next__()[0]  # one pipeline -> take its only entry
                return [out[k] for k in self.output_map]

        # auto_reset=True rewinds the iterator when an epoch ends; DALI iterators
        # do not reset themselves by default, and nothing here calls reset().
        self.train_loader = LightningWrapper(
            train_pipe, ['X', 'y'], reader_name='Reader',
            last_batch_policy=LastBatchPolicy.DROP, auto_reset=True,
        )
        self.valid_loader = LightningWrapper(
            valid_pipe, ['X', 'y'], reader_name='Reader',
            last_batch_policy=LastBatchPolicy.PARTIAL, auto_reset=True,
        )

    def train_dataloader(self):
        """Return the training iterator created in ``setup``."""
        return self.train_loader

    def val_dataloader(self):
        """Return the validation iterator created in ``setup``."""
        return self.valid_loader


class DALILDemoModel(DemoModel):
    """DemoModel variant that builds its own DALI TFRecord loaders in ``setup``."""

    def __init__(self, *kargs, **kwargs):
        super().__init__(*kargs, **kwargs)

    def setup(self, stage=None):
        """Build the train/validation DALI pipelines and wrap them as iterators.

        The iterators are stored on ``self.train_loader`` and
        ``self.valid_loader``. ``stage`` is accepted but not used.
        """
        device_id = self.local_rank
        # Arguments shared by both pipelines.
        common = dict(
            batch_size=CFG.BATCH_SIZE,
            num_threads=4,
            device_id=device_id,
            device='cuda',
            shard_id=self.global_rank,
            num_shards=self.trainer.world_size,
        )

        train_pipe = belka_pipeline(
            paths=CFG.TRAINS,
            idxs=CFG.TRAIN_IDX,
            seed=CFG.SEED + 2 + device_id*2,
            **common,
        )
        valid_pipe = belka_pipeline(
            paths=CFG.VALIDS,
            idxs=CFG.VARID_IDX,
            seed=CFG.SEED-2,
            # belka_pipeline shuffles when is_train is True (its default);
            # validation data should be read in file order.
            is_train=False,
            **common,
        )

        class LightningWrapper(DALIGenericIterator):
            """Turn DALI's per-pipeline list of dicts into a plain ``[X, y]`` list."""

            def __next__(self):
                out = super().__next__()[0]  # one pipeline -> take its only entry
                return [out[k] for k in self.output_map]

        # auto_reset=True rewinds the iterator when an epoch ends; DALI iterators
        # do not reset themselves by default, and nothing here calls reset().
        self.train_loader = LightningWrapper(
            train_pipe, ['X', 'y'], reader_name='Reader',
            last_batch_policy=LastBatchPolicy.DROP, auto_reset=True,
        )
        self.valid_loader = LightningWrapper(
            valid_pipe, ['X', 'y'], reader_name='Reader',
            last_batch_policy=LastBatchPolicy.PARTIAL, auto_reset=True,
        )

    def train_dataloader(self):
        """Return the training iterator created in ``setup``."""
        return self.train_loader

    def val_dataloader(self):
        """Return the validation iterator created in ``setup``."""
        return self.valid_loader




@pipeline_def
def belka_pipeline(device, paths, idxs, seed,shard_id=0, num_shards=1, is_train=True):
    """Define a DALI graph that reads ``(x, y)`` pairs from TFRecord files.

    Callers in this file also pass ``batch_size``, ``num_threads`` and
    ``device_id``; those are not parameters of this function and are consumed
    by the ``pipeline_def`` decorator.

    Args:
        device: ``'cuda'`` copies both outputs to the GPU; any other value
            leaves them as produced by the reader.
        paths: TFRecord files to read.
        idxs: Index files passed as ``index_path`` alongside ``paths``.
        seed: Seed handed to the reader.
        shard_id: Shard read by this pipeline instance.
        num_shards: Total number of shards the data is split into.
        is_train: When true, the reader shuffles samples.

    Returns:
        Tuple ``(x, y)``: ``x`` holds ``CFG.SEQ_LENGTH`` int64 values and ``y``
        holds ``CFG.NUM_CLASSES`` float32 values per sample.
    """
    inputs = fn.readers.tfrecord(
        path = paths,
        index_path = idxs,
        features={
            # Fixed-length features; the last argument is the default value.
            "x": tfrec.FixedLenFeature([CFG.SEQ_LENGTH], tfrec.int64, 0),
            "y": tfrec.FixedLenFeature([CFG.NUM_CLASSES], tfrec.float32, .0)
        },
        random_shuffle=is_train,
        num_shards=num_shards,
        shard_id=shard_id,
        initial_fill=CFG.BATCH_SIZE,
        seed=seed,
        # The iterators locate this reader by name (reader_name='Reader').
        name='Reader'
    )
    x, y = inputs['x'], inputs['y']
    if device == 'cuda':
        x, y = x.gpu(), y.gpu()
    return x, y

Overwriting DALILmodels.py


In [55]:
%%writefile main.py
from config import CFG
from DALILmodels import DALILmamba

from logger.mylogger import get_my_logger

import os
import numpy as np
import random
import time

import torch
from torch.utils.data import TensorDataset, DataLoader

import lightning as L
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor,TQDMProgressBar



def set_seeds(seed):
    """Seed Python's ``random``, NumPy and PyTorch, and export PYTHONHASHSEED."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

def set_logger(name):
    """Create a logger whose log-file name is ``<name>-<local timestamp>.log``.

    The logger is obtained from ``get_my_logger`` with ``CFG.BASE_DIR`` as its
    first argument.
    """
    timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
    return get_my_logger(CFG.BASE_DIR, f'{name}-{timestamp}.log')

def set_trainer(logger):
    """Build a Lightning ``Trainer`` with the project's standard callbacks.

    Args:
        logger: Object with an ``info`` method, used for progress messages.

    Returns:
        A ``Trainer`` limited to ``CFG.EPOCHS`` epochs with automatic
        accelerator and device selection.
    """
    callbacks = [
        # Stop once val_loss has not improved for 3 validation runs.
        EarlyStopping(monitor='val_loss', mode='min', patience=3, verbose=True),
        # Keep only the best checkpoint according to val_loss.
        ModelCheckpoint(
            dirpath=f'{CFG.BASE_DIR}/models/',
            filename='model-{val_loss}',
            monitor='val_loss',
            save_top_k=1,
            verbose=True,
        ),
        TQDMProgressBar(refresh_rate=1),
        LearningRateMonitor(logging_interval='epoch'),
    ]
    logger.info('set callbacks')

    # Drop the variable if present so it cannot affect device selection.
    os.environ.pop("PL_TRAINER_GPUS", None)

    trainer = L.Trainer(
        max_epochs=CFG.EPOCHS,
        callbacks=callbacks,
        accelerator='auto',
        enable_progress_bar=True,
        devices='auto',
        # strategy='ddp',
    )
    logger.info('trainer has made')
    return trainer

def calc_validation_APS(model, model_name, logger):
    """Score ``model`` on its validation loader and write the predictions to CSV.

    The micro-averaged average-precision score is logged and the flattened
    targets/outputs are saved to ``CFG.BASE_DIR/val_results_<model_name>.csv``.

    Args:
        model: Module exposing ``val_dataloader()``. The loader is created in
            the model's ``setup``, so that must have run already — TODO confirm
            for models restored from a checkpoint.
        model_name: Column name for the outputs and suffix of the CSV file.
        logger: Object with an ``info`` method.
    """
    # Local imports: this script's header imports neither pandas nor scikit-learn.
    import pandas as pd
    from sklearn.metrics import average_precision_score as APS

    logger.info('validation_APS start calucurate')
    valid_loader = model.val_dataloader()
    # Run on whichever device currently holds the weights.
    device = next(model.parameters()).device
    all_preds = []
    all_y = []
    model.eval()
    with torch.no_grad():
        for X, y in valid_loader:
            # Cast to long like the model's process_batch does for its embedding.
            oof = model(X.to(device).long())
            all_preds.append(oof.cpu())
            all_y.append(y.cpu())
    # scikit-learn and pandas need host-side NumPy arrays.
    # NOTE(review): `preds` are raw model outputs (no sigmoid applied here).
    preds = torch.cat(all_preds, 0).numpy()
    y_eval = torch.cat(all_y, 0).numpy()
    # logging formats lazily with `msg % args`, so the message needs a placeholder.
    logger.info('valid_results: CV score = %s', APS(y_eval, preds, average='micro'))

    val_results = pd.DataFrame({'y_eval': y_eval.reshape(-1),
                                model_name: preds.reshape(-1)})
    val_results.to_csv(os.path.join(CFG.BASE_DIR, f'val_results_{model_name}.csv'))
    logger.info(f'val_results write complite!\nfile_name: val_results_{model_name}.csv')

def make_submit(model, logger):
    """Predict on the test set and write ``submission.csv``.

    Column 0 of the model output is written for rows whose ``protein_name`` is
    'BRD4', column 1 for 'HSA' and column 2 for 'sEH'.

    Args:
        model: Callable module mapping an input batch to three outputs per row.
        logger: Object with an ``info`` method.
    """
    import pandas as pd  # not imported at module level in this script

    logger.info('load test data')
    test_frame = pd.read_csv('/kaggle/input/leash-BELKA/test.csv')
    # torch.tensor cannot consume a DataFrame directly; convert via NumPy.
    # NOTE(review): assumes test.csv holds only numeric model inputs, one row
    # per row of test.parquet — confirm against the actual file.
    test_data = TensorDataset(torch.tensor(test_frame.to_numpy()))
    # DataLoader defaults to batch_size=1; use the training batch size instead.
    test_loader = DataLoader(test_data, batch_size=CFG.BATCH_SIZE)
    logger.info('predict test data')
    test_preds = []
    device = next(model.parameters()).device
    model.eval()
    with torch.no_grad():
        for (X,) in test_loader:
            oof = model(X.to(device))
            # Move to host memory so the results can become a NumPy array.
            test_preds.append(oof.cpu())
    # NOTE(review): these are raw model outputs (no sigmoid applied here).
    preds = torch.cat(test_preds, 0).numpy()

    logger.info('writing submission.csv start')
    tst = pd.read_parquet('/kaggle/input/leash-BELKA/test.parquet')
    # Float default so the column can hold the predictions without a dtype change.
    tst['binds'] = 0.0
    for column, protein in enumerate(('BRD4', 'HSA', 'sEH')):
        rows = (tst['protein_name'] == protein).values
        tst.loc[rows, 'binds'] = preds[rows, column]
    tst[['id', 'binds']].to_csv('submission.csv', index = False)
    logger.info('writing complete')



if __name__ == '__main__':
    # Create the logger first so every later step can report progress.
    logger = set_logger(CFG.MODEL_NAME)

    set_seeds(seed=CFG.SEED)
    logger.info(f'set seed: {CFG.SEED}')

    model = DALILmamba(**CFG.MODEL_PARAM)
    logger.info(f'model has made.\n model:{model}')

    trainer = set_trainer(logger)
    logger.info('training begin')
    # The model supplies its own dataloaders, so fit() takes no loader arguments.
    trainer.fit(model)
    logger.info('training finish!')

    # Post-training steps, currently disabled.
    # NOTE(review): `checkpoint_callback` is a local of set_trainer and is not
    # in scope here; it must be exposed before the first line can be enabled.
    # model = model.load_from_checkpoint(checkpoint_callback.best_model_path)
    # calc_validation_APS(model, CFG.MODEL_NAME, logger)
    # make_submit(model, logger)

Overwriting main.py


In [60]:
# Run the training script (main.py, written above with %%writefile) in a separate Python process.
!python main.py

2024-07-03 04:52:57.105205: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-03 04:52:57.105252: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-03 04:52:57.106506: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-03 04:52:59,753 mamba-2024-07-03-04-52-59.log:27 get_my_logger [INFO]: logger has made. log_dir:/content/drive/MyDrive/BELKA_model/kaggle/working/logs
2024-07-03 04:52:59,755 mamba-2024-07-03-04-52-59.log:123 <module> [INFO]: set seed: 2024
2024-07-03 04:52:59,765 mamba-2024-07-03-04-52-59.log:126 <module> [INFO]: model has made.
 model:DALILmamba(
  (emb