In this notebook we will train a deep learning model using all the data available !
* preprocessing : I encoded the smiles of all the train & test set and saved it [here](https://www.kaggle.com/datasets/ahmedelfazouan/belka-enc-dataset) , this may take up to 1 hour on TPU.
* Training & Inference : I used a simple 1dcnn model trained on 20 epochs.

How to improve :
* Try a different architecture : I'm able to get an LB score of 0.604 with minor changes on this architecture.
* Try another model like Transformer, or LSTM.
* Train for more epochs.
* Add more features like a one hot encoding of bb2 or bb3.
* And of course ensembling with GBDT models.

In [33]:
import gc
import os
import pickle
import random
import joblib
import pandas as pd
# import polars as pd
from tqdm import tqdm
from sklearn.metrics import average_precision_score as APS
import polars as pl
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
import numpy as np
import torch.nn.functional as F
import math

from module import network, dataset, util
from importlib import reload

import lightgbm as lgb

In [53]:
class Config:
    PREPROCESS = False
    KAGGLE_NOTEBOOK = False
    DEBUG = True
    
    SEED = 42
    EPOCHS = 10
    BATCH_SIZE = 4096
    LR = 1e-3
    WD = 1e-6
    PATIENCE = 10
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    NBR_FOLDS = 15
    SELECTED_FOLDS = [0]
    
    
if Config.DEBUG:
    n_rows = 10**5
else:
    n_rows = None
    


In [41]:
if Config.KAGGLE_NOTEBOOK:
    RAW_DIR = "/kaggle/input/leash-BELKA/"
    PROCESSED_DIR = "/kaggle/input/belka-enc-dataset"
    OUTPUT_DIR = ""
    MODEL_DIR = ""
else:
    RAW_DIR = "../data/raw/"
    PROCESSED_DIR = "../data/processed/"
    OUTPUT_DIR = "../data/result/"
    MODEL_DIR = "../models/"

TRAIN_DATA_NAME = "train_enc.parquet"

In [42]:
def set_seeds(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

set_seeds(seed=Config.SEED)

train_file_list = [f"../data/chuncked-dataset/local_train_enc_{i}.parquet" for i in range(10)]
train_file_list

['../data/chuncked-dataset/local_train_enc_0.parquet',
 '../data/chuncked-dataset/local_train_enc_1.parquet',
 '../data/chuncked-dataset/local_train_enc_2.parquet',
 '../data/chuncked-dataset/local_train_enc_3.parquet',
 '../data/chuncked-dataset/local_train_enc_4.parquet',
 '../data/chuncked-dataset/local_train_enc_5.parquet',
 '../data/chuncked-dataset/local_train_enc_6.parquet',
 '../data/chuncked-dataset/local_train_enc_7.parquet',
 '../data/chuncked-dataset/local_train_enc_8.parquet',
 '../data/chuncked-dataset/local_train_enc_9.parquet']

# Preprocessing

In [43]:
if Config.PREPROCESS:
    enc = {'l': 1, 'y': 2, '@': 3, '3': 4, 'H': 5, 'S': 6, 'F': 7, 'C': 8, 'r': 9, 's': 10, '/': 11, 'c': 12, 'o': 13,
           '+': 14, 'I': 15, '5': 16, '(': 17, '2': 18, ')': 19, '9': 20, 'i': 21, '#': 22, '6': 23, '8': 24, '4': 25, '=': 26,
           '1': 27, 'O': 28, '[': 29, 'D': 30, 'B': 31, ']': 32, 'N': 33, '7': 34, 'n': 35, '-': 36}
    train_raw = pd.read_parquet('/kaggle/input/leash-BELKA/train.parquet')
    smiles = train_raw[train_raw['protein_name']=='BRD4']['molecule_smiles'].values
    assert (smiles!=train_raw[train_raw['protein_name']=='HSA']['molecule_smiles'].values).sum() == 0
    assert (smiles!=train_raw[train_raw['protein_name']=='sEH']['molecule_smiles'].values).sum() == 0
    def encode_smile(smile):
        tmp = [enc[i] for i in smile]
        tmp = tmp + [0]*(142-len(tmp))
        return np.array(tmp).astype(np.uint8)

    smiles_enc = joblib.Parallel(n_jobs=96)(joblib.delayed(encode_smile)(smile) for smile in tqdm(smiles))
    smiles_enc = np.stack(smiles_enc)
    train = pd.DataFrame(smiles_enc, columns = [f'enc{i}' for i in range(142)])
    train['bind1'] = train_raw[train_raw['protein_name']=='BRD4']['binds'].values
    train['bind2'] = train_raw[train_raw['protein_name']=='HSA']['binds'].values
    train['bind3'] = train_raw[train_raw['protein_name']=='sEH']['binds'].values
    train.to_parquet('train_enc.parquet')

    test_raw = pd.read_parquet('/kaggle/input/leash-BELKA/test.parquet')
    smiles = test_raw['molecule_smiles'].values

    smiles_enc = joblib.Parallel(n_jobs=96)(joblib.delayed(encode_smile)(smile) for smile in tqdm(smiles))
    smiles_enc = np.stack(smiles_enc)
    test = pd.DataFrame(smiles_enc, columns = [f'enc{i}' for i in range(142)])
    test.to_parquet('test_enc.parquet')

else:
    # train = pl.read_parquet(os.path.join(PROCESSED_DIR, TRAIN_DATA_NAME), n_rows=n_rows)
    test = pl.read_parquet(os.path.join(PROCESSED_DIR, 'test_enc.parquet'), n_rows=n_rows)
    # train = train.to_pandas()
    test = test.to_pandas()

In [44]:

    
def prepare_data(train, train_idx, valid_idx, features, targets, device):
    """
    データの準備を行う関数
    """
    X_train = torch.tensor(train.loc[train_idx, features].values, dtype=torch.float32).to(device)
    y_train = torch.tensor(train.loc[train_idx, targets].values, dtype=torch.float32).to(device)
    X_val = torch.tensor(train.loc[valid_idx, features].values, dtype=torch.float32).to(device)
    y_val = torch.tensor(train.loc[valid_idx, targets].values, dtype=torch.float32).to(device)
    
    train_dataset = TensorDataset(X_train, y_train)
    valid_dataset = TensorDataset(X_val, y_val)
    
    train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=Config.BATCH_SIZE, shuffle=False)
    
    return train_loader, valid_loader, X_val, y_val


def prepare_dataloader(train, val, features, targets, device):
    X_train = torch.tensor(train.loc[:, features].values, dtype=torch.float32).to(device)
    y_train = torch.tensor(train.loc[:, targets].values, dtype=torch.float32).to(device)
    X_val = torch.tensor(val.loc[:, features].values, dtype=torch.float32).to(device)
    y_val = torch.tensor(val.loc[:, targets].values, dtype=torch.float32).to(device)
    
    train_dataset = TensorDataset(X_train, y_train)
    valid_dataset = TensorDataset(X_val, y_val)

    train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=Config.BATCH_SIZE, shuffle=False)
    
    return train_loader, valid_loader, X_val, y_val

In [57]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import average_precision_score as APS



def train_model_per_target(train_file_list, features, targets, params, epochs, n_rows):
    models = {}
    best_scores = {}
    
    # 各ターゲットに対するモデルを訓練
    for target in targets:
        print(f'Training model for target: {target}')
        # バリデーションデータを読み込む
        val = pl.read_parquet(train_file_list[9], n_rows=n_rows).to_pandas()
        X_val = val[features]
        y_val = val[target]
        d_valid = lgb.Dataset(X_val, label=y_val, free_raw_data=False)
        models[target] = []
        best_scores[target] = []

        for epoch in range(epochs):
            print(f'Epoch {epoch+1}/{epochs}')
            train = pl.read_parquet(train_file_list[epoch % 9], n_rows=n_rows).to_pandas()
            X_train = train[features]
            y_train = train[target]
            d_train = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
            callbacks = [
                lgb.log_evaluation(period=50)
            ]

            model = lgb.train(params, d_train, num_boost_round=400, valid_sets=[d_valid], callbacks=callbacks)
            models[target].append(model)
            
            # 最新のスコアを保存
            valid_pred = model.predict(X_val)
            score = APS(y_val, valid_pred)
            best_scores[target].append(score)
            print(f'Best Score for {target} in epoch {epoch+1}: {score:.4f}')
    
    return models, best_scores

params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'average_precision',
    'num_leaves': 31,
    'learning_rate': 0.05,
    "scale_pos_weight": 150,
    "verbose" : 1,
}

# 特徴量とターゲット
FEATURES = [f'enc{i}' for i in range(142)]
TARGETS = ['bind1', 'bind2', 'bind3']

# モデルの訓練
models, best_scores = train_model_per_target(train_file_list, FEATURES, TARGETS, params, epochs=1, n_rows=n_rows)
val = pl.read_parquet(train_file_list[9], n_rows=n_rows).to_pandas()

# 各ターゲットのモデルを評価
for target in TARGETS:
    X_val = val[FEATURES]
    y_val = val[target]
    val_predictions = [model.predict(X_val[FEATURES]) for model in models[target]]
    average_prediction = np.mean(val_predictions, axis=0)
    val_auc = APS(y_val, average_prediction)
    print(f'Validation AUC for {target}: {val_auc}')


Training model for target: bind1
Epoch 1/1
[LightGBM] [Info] Number of positive: 419, number of negative: 99581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.368553 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3302
[LightGBM] [Info] Number of data points in the train set: 100000, number of used features: 117
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.004190 -> initscore=-5.470856
[LightGBM] [Info] Start training from score -5.470856
[50]	valid_0's average_precision: 0.0122421
[100]	valid_0's average_precision: 0.0154834
[150]	valid_0's average_precision: 0.0188752
[200]	valid_0's average_precision: 0.0213878
[250]	valid_0's average_precision: 0.0240629
[300]	valid_0's average_precision: 0.0276314
[350]	valid_0's average_precision: 0.0315796
[400]	valid_0's average_precision: 0.0346035
Best Score for bind1 in epoch 1: 0.0346


In [58]:
# 各ターゲットのモデルを評価
train = pl.read_parquet(train_file_list[0], n_rows=n_rows).to_pandas()

for target in TARGETS:
    X_train = train[FEATURES]
    y_train = train[target]
    train_predictions = [model.predict(X_train[FEATURES]) for model in models[target]]
    average_prediction = np.mean(train_predictions, axis=0)
    print(f'Train score for {target}: {APS(y_train, average_prediction)}')

Train score for bind1: 0.9956698864129331
Train score for bind2: 0.9892017215621385
Train score for bind3: 0.9695519432983993


In [59]:
# local testの予測と結果
local_test = pl.read_parquet(os.path.join(PROCESSED_DIR, 'local_test_enc.parquet'), n_rows=n_rows).to_pandas()

for target in TARGETS:
    X_local_test = local_test[FEATURES]
    y_local_test = local_test[target]
    local_test_predictions = [model.predict(X_local_test[FEATURES]) for model in models[target]]
    average_prediction = np.mean(local_test_predictions, axis=0)
    print(f'Local_test score for {target}: {APS(y_local_test, average_prediction)}')



Local_test score for bind1: 0.020546628333132456
Local_test score for bind2: 0.02493482378749972
Local_test score for bind3: 0.2675674310457085


# Submission

In [None]:

# テストデータの読み込み
tst = pl.read_parquet(os.path.join(RAW_DIR, "test.parquet"), n_rows=None).to_pandas()

# 'binds'列を追加して初期化
tst['binds'] = 0

# ブールマスクの作成
mask_BRD4 = (tst['protein_name'] == 'BRD4').values
mask_HSA = (tst['protein_name'] == 'HSA').values
mask_sEH = (tst['protein_name'] == 'sEH').values

# 各マスクに対応する予測値を代入
tst.loc[mask_BRD4, 'binds'] = preds[mask_BRD4][:, 0]
tst.loc[mask_HSA, 'binds'] = preds[mask_HSA][:, 1]
tst.loc[mask_sEH, 'binds'] = preds[mask_sEH][:, 2]



submission = tst[['id', 'binds']].copy()
# 'id'と'binds'列をCSVに出力
submission.to_csv(os.path.join(OUTPUT_DIR,'submission.csv'), index=False)


IndexError: boolean index did not match indexed array along dimension 0; dimension is 100000 but corresponding boolean dimension is 1674896

In [None]:
submission

Unnamed: 0,id,binds
0,295246830,0.507840
1,295246831,0.490291
2,295246832,0.284896
3,295246833,0.516321
4,295246834,0.482399
...,...,...
1674891,296921721,0.393290
1674892,296921722,0.377067
1674893,296921723,0.465928
1674894,296921724,0.552328


In [None]:
submission[submission['binds'] < 0.1]

Unnamed: 0,id,binds


In [None]:
model.load_state_dict(torch.load(os.path.join(MODEL_DIR, 'best_model_lstm_44.pt')))
# local testの予測と結果
local_test = pl.read_parquet(os.path.join(PROCESSED_DIR, 'local_test_enc.parquet'))
local_test = local_test.to_pandas()

target = local_test[TARGETS].values
local_test_tensor = torch.tensor(local_test[FEATURES].values, dtype=torch.float32).to(Config.DEVICE)
local_preds = predict_in_batches(model, local_test_tensor, Config.BATCH_SIZE)

# calculate score
score = APS(target, local_preds.detach().cpu().numpy(), average="micro")
print('local test score =', score)


local test score = 0.6196472752183542


In [None]:
print("kaggle score = ", get_score(target, local_preds))

kaggle score =  0.5581163764912928
