In [1]:
from google.colab import userdata, drive
import os
# Mount Google Drive so results can be written under /content/drive.
drive.mount('/content/drive')

# Copy the Kaggle credentials from Colab's secret store into the environment
# variables of the same name.
for secret_name in ('KAGGLE_KEY', 'KAGGLE_USERNAME'):
    os.environ[secret_name] = userdata.get(secret_name)

Mounted at /content/drive


In [2]:
!pip install kaggle
!kaggle datasets download -d kacky355/belka-train-valid-tfrecords-1d-preprocessed
!unzip belka-train-valid-tfrecords-1d-preprocessed.zip

Dataset URL: https://www.kaggle.com/datasets/kacky355/belka-train-valid-tfrecords-1d-preprocessed
License(s): unknown
Downloading belka-train-valid-tfrecords-1d-preprocessed.zip to /content
 99% 1.97G/1.98G [00:24<00:00, 50.4MB/s]
100% 1.98G/1.98G [00:24<00:00, 88.3MB/s]
Archive:  belka-train-valid-tfrecords-1d-preprocessed.zip
  inflating: logs/main.log           
  inflating: tf_idx/train_00.idx     
  inflating: tf_idx/train_01.idx     
  inflating: tf_idx/train_02.idx     
  inflating: tf_idx/train_03.idx     
  inflating: tf_idx/train_04.idx     
  inflating: tf_idx/train_05.idx     
  inflating: tf_idx/train_06.idx     
  inflating: tf_idx/train_07.idx     
  inflating: tf_idx/train_08.idx     
  inflating: tf_idx/train_09.idx     
  inflating: tf_idx/train_10.idx     
  inflating: tf_idx/train_11.idx     
  inflating: tf_idx/train_12.idx     
  inflating: tf_idx/train_13.idx     
  inflating: tf_idx/train_14.idx     
  inflating: tf_idx/train_15.idx     
  inflating: tf_idx/trai

In [3]:
!pip install rdkit
!pip install lightning
!pip install polars

!pip install --extra-index-url https://pypi.nvidia.com --upgrade nvidia-dali-cuda120
!pip install --extra-index-url https://pypi.nvidia.com --upgrade nvidia-dali-tf-plugin-cuda120

!pip install git+https://github.com/kacky355/my_libraries.git

Collecting rdkit
  Downloading rdkit-2024.3.3-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2024.3.3
Collecting lightning
  Downloading lightning-2.3.2-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities<2.0,>=0.10.0 (from lightning)
  Downloading lightning_utilities-0.11.3.post0-py3-none-any.whl (26 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.4.0.post0-py3-none-any.whl (868 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.3.2-py3-none-any.whl (812 kB)
[2K     [90m━━━━━━━━━━━━━━━━

In [4]:
import random
import os
import glob

import matplotlib.pyplot as plt
import gc
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
import polars as pl

from nvidia.dali import pipeline_def, Pipeline
import nvidia.dali.fn as fn
import nvidia.dali.types as types
import nvidia.dali.tfrecord as tfrec
from nvidia.dali.plugin.pytorch import DALIGenericIterator, LastBatchPolicy

import math
import torch
from torch import nn, Tensor
import torch.nn.functional as F
import torch.optim as optim
from torchmetrics import AveragePrecision
import lightning as L
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor,TQDMProgressBar

from sklearn.metrics import average_precision_score as APS

import tensorflow as tf

from logger.mylogger import get_my_logger

In [5]:
class CFG:
    """Notebook-wide configuration constants.

    All attributes are evaluated once, when the class body runs. In
    particular the TFRecord / index file lists are globbed from
    ``DATA_SOURCE`` at that moment, so the dataset must already be extracted.
    """

    DEBUG = False
    MODEL_NAME = 'mamba'

    EPOCHS = 8
    BATCH_SIZE = 4096
    NBR_FOLDS = 15
    NUM_TRAINS = 91_854_569
    NUM_VALIDS = 6_561_041
    # Ceiling division: number of batches needed to cover every sample,
    # counting a final partial batch.
    STEPS_PER_EPOCH_TRAIN = (NUM_TRAINS - 1) // BATCH_SIZE + 1
    STEPS_PER_EPOCH_VALID = (NUM_VALIDS - 1) // BATCH_SIZE + 1

    SELECTED_FOLDS = [0]

    BASE_DIR = '/content/drive/MyDrive/BELKA_model/kaggle/working'
    DATA_SOURCE = '/content'

    # Record files and their index files are sorted so that the two lists
    # are in the same lexicographic order.
    TRAINS = sorted(glob.glob(os.path.join(DATA_SOURCE, 'train/*')))
    TRAIN_IDX = sorted(glob.glob(os.path.join(DATA_SOURCE, 'tf_idx', 'train_*.idx')))
    VALIDS = sorted(glob.glob(os.path.join(DATA_SOURCE, 'valid/*')))
    VALID_IDX = sorted(glob.glob(os.path.join(DATA_SOURCE, 'tf_idx', 'valid_*.idx')))
    # Backward-compatible alias for the original, misspelled attribute name.
    VARID_IDX = VALID_IDX

    SEED = 2024

    TARGETS = ['bind1', 'bind2', 'bind3']
    NUM_CLASSES = len(TARGETS)
    SEQ_LENGTH = 142

    # One feature column per sequence position; derived from SEQ_LENGTH so the
    # two cannot drift apart.
    FEATURES = [f'enc{i}' for i in range(SEQ_LENGTH)]
    COLUMNS = FEATURES + TARGETS

    MODEL_PARAM = {
        'batch': BATCH_SIZE,
        'input_dim': SEQ_LENGTH,
        'hidden_dim': 128,
        'input_dim_embedding': 37,
        'dropout': 0.1,
        'num_heads': 4,
        'num_layers': 3,
        'out_dim': NUM_CLASSES,
    }

    # Debug runs use fewer epochs and only the first four training shards.
    if DEBUG:
        EPOCHS = 3
        TRAINS = TRAINS[:4]
        TRAIN_IDX = TRAIN_IDX[:4]


In [6]:
@pipeline_def
def belka_pipeline(device, paths, idxs, seed,shard_id=0, num_shards=1, is_train=True):
    """DALI pipeline that reads (x, y) pairs from TFRecord files.

    Args:
        device: 'cuda' moves both outputs to the GPU; any other value leaves
            them where the reader produced them.
        paths: TFRecord file paths passed to the reader.
        idxs: Index file paths passed to the reader as ``index_path``.
        seed: Seed passed to the reader.
        shard_id: Index of the shard this pipeline reads.
        num_shards: Total number of shards the data is split into.
        is_train: Passed to the reader as ``random_shuffle``.

    Returns:
        Tuple ``(x, y)``: ``x`` is the int64 feature of length
        ``CFG.SEQ_LENGTH`` and ``y`` the float32 feature of length
        ``CFG.NUM_CLASSES``.
    """
    records = fn.readers.tfrecord(
        path=paths,
        index_path=idxs,
        features={
            "x": tfrec.FixedLenFeature([CFG.SEQ_LENGTH], tfrec.int64, 0),
            "y": tfrec.FixedLenFeature([CFG.NUM_CLASSES], tfrec.float32, .0)
        },
        random_shuffle=is_train,
        num_shards=num_shards,
        shard_id=shard_id,
        initial_fill=CFG.BATCH_SIZE,
        seed=seed,
        # The iterator wrapper looks the reader up by this name.
        name='Reader'
    )
    x = records['x']
    y = records['y']
    if device == 'cuda':
        x = x.gpu()
        y = y.gpu()
    return x, y
# Validation pipeline: CPU-only (no device id), unshuffled.
valid_pipe = belka_pipeline(
    # data source
    paths=CFG.VALIDS,
    idxs=CFG.VARID_IDX,
    is_train=False,
    seed=CFG.SEED-2,
    # execution settings
    device='cpu',
    device_id=None,
    num_threads=4,
    batch_size=CFG.BATCH_SIZE,
)

class LightningWrapper(DALIGenericIterator):
    """DALIGenericIterator variant that yields a flat list per batch.

    Each step takes the first element of the parent iterator's output and
    returns its entries as a list, ordered like ``self.output_map``.
    """

    def __init__(self, *args, **kwargs):
        # Pure pass-through to the parent constructor.
        super().__init__(*args, **kwargs)

    def __next__(self):
        first = super().__next__()[0]
        return [first[key] for key in self.output_map]


# Iterate the validation pipeline; outputs are exposed under the names 'X' and
# 'y', and the last, smaller batch is returned as a partial batch.
valid_loader = LightningWrapper(
    valid_pipe,
    ['X', 'y'],
    reader_name='Reader',
    last_batch_policy=LastBatchPolicy.PARTIAL,
)



In [7]:
from logger.mylogger import get_my_logger
import os
import time
import random
import numpy as np
import torch
def set_seeds(seed):
    """Seed Python's `random`, NumPy and PyTorch, and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)

def set_logger(name):
    """Create a logger whose log file name is `name` plus a local timestamp.

    Args:
        name: Prefix of the log file name.

    Returns:
        Whatever `get_my_logger` returns for `CFG.BASE_DIR` and the file name
        ``{name}-{YYYY-mm-dd-HH-MM-SS}.log``.
    """
    stamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
    return get_my_logger(CFG.BASE_DIR, f'{name}-{stamp}.log')
set_seeds(CFG.SEED)
name = 'val_y_make'
logger = set_logger(name)

# Collect the labels of every validation batch.
# Tensors returned by the DALI iterator remain owned by DALI and are only
# guaranteed valid until the next iterator call, so each batch is copied
# before it is stored. `.cpu()` keeps the later `.numpy()` call working even
# if the pipeline is switched to GPU outputs.
all_y = []
for X, y in valid_loader:
    all_y.append(y.detach().cpu().clone())
y_eval = torch.cat(all_y, 0)

# Save the labels in reader order; the column names come from CFG.TARGETS.
val_results = pd.DataFrame(y_eval.numpy(), columns=CFG.TARGETS)
val_results.to_csv(os.path.join(CFG.BASE_DIR, 'val_y.csv'))
logger.info('val_results write complite!\nfile_name: val_y.csv')


2024-07-06 19:30:38,579 val_y_make-2024-07-06-19-30-38.log:27 get_my_logger [INFO]: logger has made. log_dir:/content/drive/MyDrive/BELKA_model/kaggle/working/logs
2024-07-06 19:32:05,383 val_y_make-2024-07-06-19-30-38.log:31 <cell line: 31> [INFO]: val_results write complite!
file_name: val_y.csv


In [8]:
# Quick check of the saved label frame: row count, column dtypes, memory usage.
val_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6561041 entries, 0 to 6561040
Data columns (total 3 columns):
 #   Column  Dtype  
---  ------  -----  
 0   bind1   float32
 1   bind2   float32
 2   bind3   float32
dtypes: float32(3)
memory usage: 75.1 MB
