<a href="https://colab.research.google.com/github/kimjaehwankimjaehwan/python_/blob/main/seyonec_PubChem10M_SMILES_BPE_450k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers torch rdkit pandas scikit-learn


Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.5


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Configuration
CFG = dict(
    SEED=42,  # shared by seed_everything, train_test_split and TrainingArguments
    MODEL_NAME='seyonec/PubChem10M_SMILES_BPE_450k',  # checkpoint id given to from_pretrained
    BATCH_SIZE=16,  # per-device batch size for training and evaluation
    EPOCHS=3,  # num_train_epochs
    LR=5e-5,  # learning_rate
)

def seed_everything(seed):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU, plus
    every CUDA device when CUDA is available).

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: the cell's import block does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Seed first: the statements below run in this order on purpose, because the
# freshly initialised classifier head draws from the RNG state set here.
seed_everything(CFG['SEED'])

# Load the data
chembl_data = pd.read_csv('train.csv')  # example file name
# Hold out 20% of the rows for validation; the split is fixed by the seed.
train, val = train_test_split(chembl_data, test_size=0.2, random_state=CFG['SEED'])

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(CFG['MODEL_NAME'])
# num_labels=1 requests a single-output head, used here to predict pIC50.
model = AutoModelForSequenceClassification.from_pretrained(CFG['MODEL_NAME'], num_labels=1)

# Dataset definition
class SMILESDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one SMILES string per DataFrame row.

    Args:
        data: pandas DataFrame with a 'Smiles' column and, when ``has_target``
            is True, a 'pIC50' column. Rows are addressed positionally.
        tokenizer: Callable invoked as ``tokenizer(smiles, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``;
            its result is indexed by 'input_ids' and 'attention_mask'.
        max_len: Length passed to the tokenizer as ``max_length``.
        has_target: When False (e.g. for the test set) no 'labels' entry is
            produced.
    """

    def __init__(self, data, tokenizer, max_len=128, has_target=True):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.has_target = has_target

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the model inputs (and label, if any) for the row at ``index``."""
        # Fetch the row once instead of materialising it for every column read.
        row = self.data.iloc[index]

        encoding = self.tokenizer(
            row['Smiles'],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        # The tokenizer is expected to return (1, seq_len) tensors for a single
        # string. squeeze(0) drops only that leading batch axis; a bare
        # squeeze() would also collapse the sequence axis when seq_len == 1.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
        if self.has_target:
            item['labels'] = torch.tensor(row['pIC50'], dtype=torch.float)
        return item

# Wrap the train/validation splits; both carry the 'pIC50' target.
train_dataset = SMILESDataset(train, tokenizer)
val_dataset = SMILESDataset(val, tokenizer)

# Configure TrainingArguments and the Trainer
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch', #Evaluation strategy matches save strategy
    learning_rate=CFG['LR'],
    per_device_train_batch_size=CFG['BATCH_SIZE'],
    per_device_eval_batch_size=CFG['BATCH_SIZE'],
    num_train_epochs=CFG['EPOCHS'],
    seed=CFG['SEED'],
    load_best_model_at_end=True,
    logging_dir='./logs',
    save_strategy = 'epoch' # Changed to epoch to match evaluation strategy

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Report RMSE between the labels and the flattened predictions at every evaluation.
    compute_metrics=lambda p: {'rmse': np.sqrt(mean_squared_error(p.label_ids, p.predictions.flatten()))}
)

# Train the model
trainer.train()

# Predict on the validation split and evaluate
val_preds = trainer.predict(val_dataset)
# NOTE(review): assumes predictions come back in the same row order as `val` — confirm.
val_rmse = np.sqrt(mean_squared_error(val['pIC50'], val_preds.predictions.flatten()))
print(f'Validation RMSE: {val_rmse}')

# Predict on the test data
test = pd.read_csv('./test.csv')
test_dataset = SMILESDataset(test, tokenizer, has_target=False) # Set has_target to False for the test dataset
test_preds = trainer.predict(test_dataset)

# Convert pIC50 to IC50
def pIC50_to_IC50(pic50_values):
    """Map pIC50 value(s) to IC50 by computing ``10 ** (9 - pIC50)``.

    Args:
        pic50_values: Scalar or array-like supporting subtraction from 9.

    Returns:
        The converted value(s); callers store them in an 'IC50_nM' column.
    """
    exponent = 9 - pic50_values
    return 10 ** exponent

# Convert the flattened test predictions (pIC50) into the 'IC50_nM' column.
test['IC50_nM'] = pIC50_to_IC50(test_preds.predictions.flatten())

# Save the submission file
submit = pd.read_csv('./sample_submission.csv')
# NOTE(review): assumes test.csv and sample_submission.csv list the same rows in the same order — confirm.
submit['IC50_nM'] = test['IC50_nM']
submit.to_csv('./transformer_baseline_submit.csv', index=False)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Rmse
1,No log,0.653618,0.808466
2,No log,0.471863,0.686923
3,No log,0.480935,0.693495


Validation RMSE: 0.6869229707557296


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Configuration
CFG = dict(
    SEED=42,  # shared by seed_everything, train_test_split and TrainingArguments
    MODEL_NAME='seyonec/PubChem10M_SMILES_BPE_450k',  # checkpoint id given to from_pretrained
    BATCH_SIZE=8,  # per-device batch size (smaller than the baseline cell's 16)
    EPOCHS=5,  # more epochs than the baseline cell's 3
    LR=2e-5,  # lower learning rate than the baseline cell's 5e-5
    WARMUP_RATIO=0.1,  # fraction of scheduler steps spent warming up
)

def seed_everything(seed):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU, plus
    every CUDA device when CUDA is available).

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: the cell's import block does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Seed first: the statements below run in this order on purpose, because the
# freshly initialised classifier head draws from the RNG state set here.
seed_everything(CFG['SEED'])

# Load the data
chembl_data = pd.read_csv('train.csv')  # example file name
# Hold out 20% of the rows for validation; the split is fixed by the seed.
train, val = train_test_split(chembl_data, test_size=0.2, random_state=CFG['SEED'])

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(CFG['MODEL_NAME'])
# num_labels=1 requests a single-output head, used here to predict pIC50.
model = AutoModelForSequenceClassification.from_pretrained(CFG['MODEL_NAME'], num_labels=1)

# Dataset definition
class SMILESDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one SMILES string per DataFrame row.

    Args:
        data: pandas DataFrame with a 'Smiles' column and, when ``has_target``
            is True, a 'pIC50' column. Rows are addressed positionally.
        tokenizer: Callable invoked as ``tokenizer(smiles, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``;
            its result is indexed by 'input_ids' and 'attention_mask'.
        max_len: Length passed to the tokenizer as ``max_length``.
        has_target: When False (e.g. for the test set) no 'labels' entry is
            produced.
    """

    def __init__(self, data, tokenizer, max_len=128, has_target=True):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.has_target = has_target

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the model inputs (and label, if any) for the row at ``index``."""
        # Fetch the row once instead of materialising it for every column read.
        row = self.data.iloc[index]

        encoding = self.tokenizer(
            row['Smiles'],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        # The tokenizer is expected to return (1, seq_len) tensors for a single
        # string. squeeze(0) drops only that leading batch axis; a bare
        # squeeze() would also collapse the sequence axis when seq_len == 1.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
        if self.has_target:
            item['labels'] = torch.tensor(row['pIC50'], dtype=torch.float)
        return item

# Wrap the train/validation splits; both carry the 'pIC50' target.
train_dataset = SMILESDataset(train, tokenizer)
val_dataset = SMILESDataset(val, tokenizer)

# Optimizer and scheduler setup
# Number of batches accumulated per optimizer update (effective batch size =
# BATCH_SIZE * GRAD_ACCUM_STEPS).
GRAD_ACCUM_STEPS = 2

optimizer = torch.optim.AdamW(model.parameters(), lr=CFG['LR'])
# The scheduler advances once per optimizer update, not once per batch, so the
# step budget has to be divided by the accumulation factor. Ceil division keeps
# the final partial update of each epoch.
# NOTE(review): assumes a single training device; the Trainer's own rounding of
# the last partial update may differ by one step per epoch between versions.
samples_per_update = CFG['BATCH_SIZE'] * GRAD_ACCUM_STEPS
updates_per_epoch = max((len(train_dataset) + samples_per_update - 1) // samples_per_update, 1)
total_steps = updates_per_epoch * CFG['EPOCHS']
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(CFG['WARMUP_RATIO'] * total_steps),
    num_training_steps=total_steps
)

# Configure TrainingArguments and the Trainer
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=CFG['LR'],
    per_device_train_batch_size=CFG['BATCH_SIZE'],
    per_device_eval_batch_size=CFG['BATCH_SIZE'],
    num_train_epochs=CFG['EPOCHS'],
    seed=CFG['SEED'],
    load_best_model_at_end=True,
    logging_dir='./logs',
    save_strategy='epoch',
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,  # To simulate a larger batch size
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Report RMSE between the labels and the flattened predictions at every evaluation.
    compute_metrics=lambda p: {'rmse': np.sqrt(mean_squared_error(p.label_ids, p.predictions.flatten()))},
    # The custom optimizer/scheduler are a Trainer argument (not a
    # TrainingArguments one); without this the warmup schedule built above is
    # never used.
    optimizers=(optimizer, scheduler)
)

# Train the model
trainer.train()

# Predict on the validation split and evaluate
val_preds = trainer.predict(val_dataset)
val_rmse = np.sqrt(mean_squared_error(val['pIC50'], val_preds.predictions.flatten()))
print(f'Validation RMSE: {val_rmse}')

# Predict on the test data
test = pd.read_csv('./test.csv')
test_dataset = SMILESDataset(test, tokenizer, has_target=False)
test_preds = trainer.predict(test_dataset)

# Convert pIC50 to IC50
def pIC50_to_IC50(pic50_values):
    """Map pIC50 value(s) to IC50 by computing ``10 ** (9 - pIC50)``.

    Args:
        pic50_values: Scalar or array-like supporting subtraction from 9.

    Returns:
        The converted value(s); callers store them in an 'IC50_nM' column.
    """
    exponent = 9 - pic50_values
    return 10 ** exponent

# Convert the flattened test predictions (pIC50) into the 'IC50_nM' column.
test['IC50_nM'] = pIC50_to_IC50(test_preds.predictions.flatten())

# Save the submission file
submit = pd.read_csv('./sample_submission.csv')
# NOTE(review): assumes test.csv and sample_submission.csv list the same rows in the same order — confirm.
submit['IC50_nM'] = test['IC50_nM']
submit.to_csv('./transformer_tuned_submit.csv', index=False)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Rmse
1,No log,0.698729,0.8359
2,No log,0.521701,0.722289
3,No log,0.624996,0.790567
4,No log,0.534792,0.731295
5,No log,0.556339,0.745881


Validation RMSE: 0.7222886035452358


1. 학습률(LR): 기본 학습률을 2e-5로 줄였습니다. 더 낮은 학습률은 학습의 안정성을 높여줄 수 있습니다.
2. 에포크 수 증가: 에포크 수를 5로 늘려서 모델이 충분히 학습할 수 있게 했습니다.
3. 배치 크기 감소: 배치 크기를 8로 줄여서 메모리 사용을 줄이고, 세밀한 학습이 가능하도록 했습니다.
4. Warmup 단계 추가: WARMUP_RATIO를 사용하여 초기 몇 단계 동안 학습률을 점진적으로 증가시키는 warmup 단계를 추가했습니다.
5. Gradient Accumulation: 배치 크기를 8로 줄인 대신 gradient_accumulation_steps를 2로 설정하여 실질적인 배치 크기를 16으로 시뮬레이션했습니다.
6. Custom Optimizer and Scheduler: 옵티마이저와 학습률 스케줄러를 직접 설정하여 더 정밀하게 학습을 제어할 수 있도록 했습니다.

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sun Sep  1 15:31:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0              46W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
from psutil import virtual_memory
# Total memory reported by psutil, converted from bytes to decimal gigabytes.
total_bytes = virtual_memory().total
ram_gb = total_bytes / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this cell uses to label the runtime as high-RAM.
is_high_ram = ram_gb >= 20
print('You are using a high-RAM runtime!' if is_high_ram else 'Not using a high-RAM runtime')

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from transformers import AdamW

# Configuration
CFG = dict(
    SEED=42,  # shared by seed_everything, KFold and TrainingArguments
    MODEL_NAME='seyonec/PubChem10M_SMILES_BPE_450k',  # checkpoint id given to from_pretrained
    BATCH_SIZE=8,  # per-device batch size for training and evaluation
    EPOCHS=10,  # num_train_epochs for every fold
    LR=1e-5,  # learning rate handed to AdamW
    WARMUP_RATIO=0.2,  # fraction of scheduler steps spent warming up
    ENSEMBLE_MODELS=3,  # NOTE(review): not read anywhere in this cell
    K_FOLDS=5,  # number of cross-validation folds (one model per fold)
)

def seed_everything(seed):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU, plus
    every CUDA device when CUDA is available).

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: the cell's import block does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Seed once up front, before any data or model is created.
seed_everything(CFG['SEED'])

# Load the data
chembl_data = pd.read_csv('train.csv')  # example file name

# Load the tokenizer (the model itself is created per fold in train_and_evaluate)
tokenizer = AutoTokenizer.from_pretrained(CFG['MODEL_NAME'])

# Dataset definition
class SMILESDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one SMILES string per DataFrame row.

    Args:
        data: pandas DataFrame with a 'Smiles' column and, when ``has_target``
            is True, a 'pIC50' column. Rows are addressed positionally.
        tokenizer: Callable invoked as ``tokenizer(smiles, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``;
            its result is indexed by 'input_ids' and 'attention_mask'.
        max_len: Length passed to the tokenizer as ``max_length``.
        has_target: When False (e.g. for the test set) no 'labels' entry is
            produced.
    """

    def __init__(self, data, tokenizer, max_len=128, has_target=True):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.has_target = has_target

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the model inputs (and label, if any) for the row at ``index``."""
        # Fetch the row once instead of materialising it for every column read.
        row = self.data.iloc[index]

        encoding = self.tokenizer(
            row['Smiles'],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        # The tokenizer is expected to return (1, seq_len) tensors for a single
        # string. squeeze(0) drops only that leading batch axis; a bare
        # squeeze() would also collapse the sequence axis when seq_len == 1.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
        if self.has_target:
            item['labels'] = torch.tensor(row['pIC50'], dtype=torch.float)
        return item

def create_optimizer_and_scheduler(model, total_steps):
    """Build the AdamW optimizer and its cosine-with-warmup LR schedule.

    Args:
        model: Module whose ``parameters()`` are optimized at ``CFG['LR']``.
        total_steps: Number of scheduler steps the schedule spans; the first
            ``CFG['WARMUP_RATIO']`` fraction of them (truncated to an int) is
            warmup.

    Returns:
        Tuple ``(optimizer, scheduler)``.
    """
    warmup_steps = int(CFG['WARMUP_RATIO'] * total_steps)
    optimizer = AdamW(model.parameters(), lr=CFG['LR'])
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
    )
    return optimizer, scheduler

def train_and_evaluate(train_df, val_df, fold_idx):
    """Fine-tune a fresh model on one fold and predict its validation split.

    Reads the module-level ``tokenizer`` and ``CFG``. Checkpoints and the final
    model are written to ``./results_fold_{fold_idx}``; logs go to
    ``./logs_fold_{fold_idx}``.

    Args:
        train_df: DataFrame of training rows ('Smiles' and 'pIC50' columns).
        val_df: DataFrame of validation rows (same columns).
        fold_idx: Fold number, used only to name the output directories.

    Returns:
        1-D NumPy array with one prediction per row of ``val_df``, as returned
        by ``trainer.predict`` and flattened.
    """
    # Batches accumulated per optimizer update (simulates a larger batch size).
    grad_accum_steps = 2

    train_dataset = SMILESDataset(train_df, tokenizer)
    val_dataset = SMILESDataset(val_df, tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(CFG['MODEL_NAME'], num_labels=1)

    # The scheduler is stepped once per optimizer update, not once per batch.
    # Counting batches here would make the schedule twice as long as training:
    # warmup would cover double the intended share and the cosine decay would
    # never finish. Ceil division keeps each epoch's final partial update.
    # NOTE(review): assumes a single training device; the Trainer's rounding of
    # the last partial update may differ by one step per epoch between versions.
    samples_per_update = CFG['BATCH_SIZE'] * grad_accum_steps
    updates_per_epoch = max((len(train_dataset) + samples_per_update - 1) // samples_per_update, 1)
    total_steps = updates_per_epoch * CFG['EPOCHS']
    optimizer, scheduler = create_optimizer_and_scheduler(model, total_steps)

    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold_idx}',
        evaluation_strategy='epoch',
        per_device_train_batch_size=CFG['BATCH_SIZE'],
        per_device_eval_batch_size=CFG['BATCH_SIZE'],
        num_train_epochs=CFG['EPOCHS'],
        seed=CFG['SEED'],
        load_best_model_at_end=True,
        logging_dir=f'./logs_fold_{fold_idx}',
        save_strategy='epoch',
        gradient_accumulation_steps=grad_accum_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        # Report RMSE between the labels and the flattened predictions at every evaluation.
        compute_metrics=lambda p: {'rmse': np.sqrt(mean_squared_error(p.label_ids, p.predictions.flatten()))},
        optimizers=(optimizer, scheduler)
    )

    trainer.train()

    # Save the model so the test-time loop can reload it from this directory.
    trainer.save_model(f'./results_fold_{fold_idx}')

    # Predict on the validation split
    val_preds = trainer.predict(val_dataset)
    return val_preds.predictions.flatten()

# Cross-validation based ensemble
kf = KFold(n_splits=CFG['K_FOLDS'], shuffle=True, random_state=CFG['SEED'])
ensemble_preds = []  # per-fold validation predictions, kept for inspection

# Out-of-fold predictions: slot i holds the prediction for row i of chembl_data,
# produced by the one fold model that did not train on that row. Each fold
# validates on different rows, so the fold outputs must be scattered back by
# index; averaging them elementwise would mix predictions for unrelated
# molecules.
oof_preds = np.full(len(chembl_data), np.nan)

for fold_idx, (train_index, val_index) in enumerate(kf.split(chembl_data)):
    print(f"Training fold {fold_idx+1}/{CFG['K_FOLDS']}")
    train_df, val_df = chembl_data.iloc[train_index], chembl_data.iloc[val_index]
    fold_preds = train_and_evaluate(train_df, val_df, fold_idx)
    ensemble_preds.append(fold_preds)
    oof_preds[val_index] = fold_preds

# KFold puts every row in exactly one validation fold, so no slot may stay empty.
assert not np.isnan(oof_preds).any(), 'some rows never received an out-of-fold prediction'
final_ensemble_preds = oof_preds

# Score every training row against the prediction of the model that never saw
# it (out-of-fold RMSE over the full training set).
val_rmse = np.sqrt(mean_squared_error(chembl_data['pIC50'], final_ensemble_preds))
print(f'Ensemble Validation RMSE: {val_rmse}')

# Predict on the test data
test = pd.read_csv('./test.csv')
test_dataset = SMILESDataset(test, tokenizer, has_target=False)

ensemble_test_preds = []

# Every fold model predicts the same test rows, so these predictions can be
# averaged elementwise.
for fold_idx in range(CFG['K_FOLDS']):
    model = AutoModelForSequenceClassification.from_pretrained(f'./results_fold_{fold_idx}', num_labels=1)
    trainer = Trainer(model=model)
    test_preds = trainer.predict(test_dataset)
    ensemble_test_preds.append(test_preds.predictions.flatten())

# Final ensemble prediction on the test set
final_test_preds = np.mean(ensemble_test_preds, axis=0)

# Convert pIC50 to IC50
def pIC50_to_IC50(pic50_values):
    """Map pIC50 value(s) to IC50 by computing ``10 ** (9 - pIC50)``.

    Args:
        pic50_values: Scalar or array-like supporting subtraction from 9.

    Returns:
        The converted value(s); callers store them in an 'IC50_nM' column.
    """
    exponent = 9 - pic50_values
    return 10 ** exponent

# Convert the averaged test predictions (pIC50) into the 'IC50_nM' column.
test['IC50_nM'] = pIC50_to_IC50(final_test_preds)

# Save the submission file
submit = pd.read_csv('./sample_submission.csv')
# NOTE(review): assumes test.csv and sample_submission.csv list the same rows in the same order — confirm.
submit['IC50_nM'] = test['IC50_nM']
submit.to_csv('./transformer_ensemble_submit.csv', index=False)


Training fold 1/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Rmse
1,No log,31.124796,5.57896
2,No log,1.220578,1.104797
3,No log,0.844613,0.919029
4,No log,0.696253,0.834418
5,No log,0.558804,0.747532
6,13.039800,0.887116,0.941868
7,13.039800,0.633695,0.796049
8,13.039800,0.548072,0.740319
9,13.039800,0.690251,0.830814
10,13.039800,0.739088,0.859702


Training fold 2/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Rmse
1,No log,29.958324,5.47342
2,No log,1.061877,1.030474
3,No log,0.830433,0.911281
4,No log,0.718589,0.847696
5,No log,0.738732,0.859495
6,12.529500,0.832161,0.912229
7,12.529500,0.595247,0.771523
8,12.529500,0.680827,0.825123
9,12.529500,0.580379,0.761826
10,12.529500,0.648351,0.805203


Training fold 3/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Rmse
1,No log,31.337585,5.597998
2,No log,1.08972,1.043897
3,No log,0.575194,0.758415
4,No log,0.83146,0.911844
5,No log,0.650155,0.806322
6,12.419100,0.860149,0.927442
7,12.419100,0.618002,0.786131
8,12.419100,0.613324,0.78315
9,12.419100,0.689739,0.830505
10,12.419100,0.56626,0.752503


Training fold 4/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Rmse
1,No log,30.598385,5.531581
2,No log,0.953274,0.976358
3,No log,0.544108,0.737637
4,No log,0.847588,0.920645
5,No log,0.591387,0.769017
6,12.488600,0.851984,0.92303
7,12.488600,0.652355,0.807685
8,12.488600,0.662631,0.814021
9,12.488600,0.593564,0.770431
10,12.488600,0.593447,0.770355


Training fold 5/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Rmse
1,No log,29.266417,5.409844
2,No log,1.034485,1.017096
3,No log,0.736941,0.858453
4,No log,0.78832,0.887874
5,No log,0.763934,0.874033
6,12.568500,0.76936,0.877132
7,12.568500,0.813334,0.90185
8,12.568500,0.856134,0.925275
9,12.568500,0.720677,0.848927
10,12.568500,0.797771,0.89318


Ensemble Validation RMSE: 0.5112974297320854


1. Cross-Validation을 이용한 앙상블:
여러 모델을 학습하는 대신 교차 검증(Cross-Validation) 기반으로 여러 분할에서 모델을 훈련하고 그 결과를 앙상블할 수 있습니다. 이 방법은 모델의 일반화 성능을 높일 수 있습니다.

2. Learning Rate를 단계적으로 조절:
Learning Rate Scheduler를 조정하여 학습 중후반부에 좀 더 작은 Learning Rate를 사용할 수 있습니다. get_linear_schedule_with_warmup 같은 방법 대신 get_cosine_schedule_with_warmup을 사용하는 것이 한 가지 방법입니다.

3. Data Augmentation:
SMILES 표현을 랜덤하게 뒤섞는 등의 방법으로 데이터를 증강할 수 있습니다. 이렇게 하면 모델이 다양한 입력에 대해 좀 더 강건하게 학습될 수 있습니다.

4. Hyperparameter Tuning:
Hyperparameter를 최적화하는 방법으로 optuna와 같은 라이브러리를 사용할 수 있습니다. 다양한 하이퍼파라미터 조합을 자동으로 탐색하여 최적의 파라미터를 찾을 수 있습니다.

5. Feature Engineering:
SMILES 외에 분자 구조에 대한 추가적인 화학적 정보를 모델에 포함시켜 성능을 향상시킬 수 있습니다. 예를 들어, 분자량, 극성 표면적 등의 물리화학적 특성을 포함하는 방법이 있습니다.

6. Dropout 및 Regularization:
모델에 Dropout을 추가하거나 가중치 감쇠(Weight Decay)와 같은 정규화를 더 강하게 설정할 수 있습니다.

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from transformers import AdamW

# Configuration
CFG = dict(
    SEED=42,  # shared by seed_everything, KFold and TrainingArguments
    MODEL_NAME='seyonec/PubChem10M_SMILES_BPE_450k',  # checkpoint id given to from_pretrained
    BATCH_SIZE=8,  # per-device batch size for training and evaluation
    EPOCHS=10,  # num_train_epochs for every fold
    LR=1e-5,  # learning rate handed to AdamW
    WARMUP_RATIO=0.2,  # fraction of scheduler steps spent warming up
    ENSEMBLE_MODELS=3,  # NOTE(review): not read anywhere in this cell
    K_FOLDS=5,  # number of cross-validation folds (one model per fold)
)

def seed_everything(seed):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU, plus
    every CUDA device when CUDA is available).

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: the cell's import block does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Seed once up front, before any data or model is created.
seed_everything(CFG['SEED'])

# Load the data
chembl_data = pd.read_csv('train.csv')  # example file name

# Load the tokenizer (the model itself is created per fold in train_and_evaluate)
tokenizer = AutoTokenizer.from_pretrained(CFG['MODEL_NAME'])

# Dataset definition
class SMILESDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one SMILES string per DataFrame row.

    Args:
        data: pandas DataFrame with a 'Smiles' column and, when ``has_target``
            is True, a 'pIC50' column. Rows are addressed positionally.
        tokenizer: Callable invoked as ``tokenizer(smiles, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``;
            its result is indexed by 'input_ids' and 'attention_mask'.
        max_len: Length passed to the tokenizer as ``max_length``.
        has_target: When False (e.g. for the test set) no 'labels' entry is
            produced.
    """

    def __init__(self, data, tokenizer, max_len=128, has_target=True):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.has_target = has_target

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the model inputs (and label, if any) for the row at ``index``."""
        # Fetch the row once instead of materialising it for every column read.
        row = self.data.iloc[index]

        encoding = self.tokenizer(
            row['Smiles'],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        # The tokenizer is expected to return (1, seq_len) tensors for a single
        # string. squeeze(0) drops only that leading batch axis; a bare
        # squeeze() would also collapse the sequence axis when seq_len == 1.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
        if self.has_target:
            item['labels'] = torch.tensor(row['pIC50'], dtype=torch.float)
        return item

def create_optimizer_and_scheduler(model, total_steps):
    """Build the AdamW optimizer and its cosine-with-warmup LR schedule.

    Args:
        model: Module whose ``parameters()`` are optimized at ``CFG['LR']``.
        total_steps: Number of scheduler steps the schedule spans; the first
            ``CFG['WARMUP_RATIO']`` fraction of them (truncated to an int) is
            warmup.

    Returns:
        Tuple ``(optimizer, scheduler)``.
    """
    warmup_steps = int(CFG['WARMUP_RATIO'] * total_steps)
    optimizer = AdamW(model.parameters(), lr=CFG['LR'])
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
    )
    return optimizer, scheduler

def train_and_evaluate(train_df, val_df, fold_idx):
    """Fine-tune a fresh model on one fold and predict its validation split.

    Reads the module-level ``tokenizer`` and ``CFG``. Checkpoints and the final
    model are written to ``./results_fold_{fold_idx}``; logs go to
    ``./logs_fold_{fold_idx}``.

    Args:
        train_df: DataFrame of training rows ('Smiles' and 'pIC50' columns).
        val_df: DataFrame of validation rows (same columns).
        fold_idx: Fold number, used only to name the output directories.

    Returns:
        1-D NumPy array with one prediction per row of ``val_df``, as returned
        by ``trainer.predict`` and flattened.
    """
    # Batches accumulated per optimizer update (simulates a larger batch size).
    grad_accum_steps = 2

    train_dataset = SMILESDataset(train_df, tokenizer)
    val_dataset = SMILESDataset(val_df, tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(CFG['MODEL_NAME'], num_labels=1)

    # The scheduler is stepped once per optimizer update, not once per batch.
    # Counting batches here would make the schedule twice as long as training:
    # warmup would cover double the intended share and the cosine decay would
    # never finish. Ceil division keeps each epoch's final partial update.
    # NOTE(review): assumes a single training device; the Trainer's rounding of
    # the last partial update may differ by one step per epoch between versions.
    samples_per_update = CFG['BATCH_SIZE'] * grad_accum_steps
    updates_per_epoch = max((len(train_dataset) + samples_per_update - 1) // samples_per_update, 1)
    total_steps = updates_per_epoch * CFG['EPOCHS']
    optimizer, scheduler = create_optimizer_and_scheduler(model, total_steps)

    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold_idx}',
        evaluation_strategy='epoch',
        per_device_train_batch_size=CFG['BATCH_SIZE'],
        per_device_eval_batch_size=CFG['BATCH_SIZE'],
        num_train_epochs=CFG['EPOCHS'],
        seed=CFG['SEED'],
        load_best_model_at_end=True,
        logging_dir=f'./logs_fold_{fold_idx}',
        save_strategy='epoch',
        gradient_accumulation_steps=grad_accum_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        # Report RMSE between the labels and the flattened predictions at every evaluation.
        compute_metrics=lambda p: {'rmse': np.sqrt(mean_squared_error(p.label_ids, p.predictions.flatten()))},
        optimizers=(optimizer, scheduler)
    )

    trainer.train()

    # Save the model so the test-time loop can reload it from this directory.
    trainer.save_model(f'./results_fold_{fold_idx}')

    # Predict on the validation split
    val_preds = trainer.predict(val_dataset)
    return val_preds.predictions.flatten()

# Cross-validation based ensemble
kf = KFold(n_splits=CFG['K_FOLDS'], shuffle=True, random_state=CFG['SEED'])
ensemble_preds = []  # per-fold validation predictions, kept for inspection

# Out-of-fold predictions: slot i holds the prediction for row i of chembl_data,
# produced by the one fold model that did not train on that row. Each fold
# validates on different rows, so the fold outputs must be scattered back by
# index; averaging them elementwise would mix predictions for unrelated
# molecules.
oof_preds = np.full(len(chembl_data), np.nan)

for fold_idx, (train_index, val_index) in enumerate(kf.split(chembl_data)):
    print(f"Training fold {fold_idx+1}/{CFG['K_FOLDS']}")
    train_df, val_df = chembl_data.iloc[train_index], chembl_data.iloc[val_index]
    fold_preds = train_and_evaluate(train_df, val_df, fold_idx)
    ensemble_preds.append(fold_preds)
    oof_preds[val_index] = fold_preds

# KFold puts every row in exactly one validation fold, so no slot may stay empty.
assert not np.isnan(oof_preds).any(), 'some rows never received an out-of-fold prediction'
final_ensemble_preds = oof_preds

# Score every training row against the prediction of the model that never saw
# it (out-of-fold RMSE over the full training set).
val_rmse = np.sqrt(mean_squared_error(chembl_data['pIC50'], final_ensemble_preds))
print(f'Ensemble Validation RMSE: {val_rmse}')

# Predict on the test data
test = pd.read_csv('./test.csv')
test_dataset = SMILESDataset(test, tokenizer, has_target=False)

ensemble_test_preds = []

# Every fold model predicts the same test rows, so these predictions can be
# averaged elementwise.
for fold_idx in range(CFG['K_FOLDS']):
    model = AutoModelForSequenceClassification.from_pretrained(f'./results_fold_{fold_idx}', num_labels=1)
    trainer = Trainer(model=model)
    test_preds = trainer.predict(test_dataset)
    ensemble_test_preds.append(test_preds.predictions.flatten())

# Final ensemble prediction on the test set
final_test_preds = np.mean(ensemble_test_preds, axis=0)

# Convert pIC50 to IC50
def pIC50_to_IC50(pic50_values):
    """Map pIC50 value(s) to IC50 by computing ``10 ** (9 - pIC50)``.

    Args:
        pic50_values: Scalar or array-like supporting subtraction from 9.

    Returns:
        The converted value(s); callers store them in an 'IC50_nM' column.
    """
    exponent = 9 - pic50_values
    return 10 ** exponent

# Convert the averaged test predictions (pIC50) into the 'IC50_nM' column.
test['IC50_nM'] = pIC50_to_IC50(final_test_preds)

# Save the submission file
submit = pd.read_csv('./sample_submission.csv')
# NOTE(review): assumes test.csv and sample_submission.csv list the same rows in the same order — confirm.
submit['IC50_nM'] = test['IC50_nM']
submit.to_csv('./transformer_ensemble_submit.csv', index=False)


Training fold 1/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Rmse
1,No log,31.124796,5.57896
2,No log,1.220578,1.104797
3,No log,0.844613,0.919029
4,No log,0.696253,0.834418
5,No log,0.558804,0.747532
6,13.039800,0.887116,0.941868
7,13.039800,0.633695,0.796049
8,13.039800,0.548072,0.740319
9,13.039800,0.690251,0.830814
10,13.039800,0.739088,0.859702


Training fold 2/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Rmse
1,No log,29.958324,5.47342
2,No log,1.061877,1.030474
3,No log,0.830433,0.911281
4,No log,0.718589,0.847696
5,No log,0.738732,0.859495
6,12.529500,0.832161,0.912229
7,12.529500,0.595247,0.771523
8,12.529500,0.680827,0.825123
9,12.529500,0.580379,0.761826
10,12.529500,0.648351,0.805203


Training fold 3/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Rmse
1,No log,31.337585,5.597998
2,No log,1.08972,1.043897
3,No log,0.575194,0.758415
4,No log,0.83146,0.911844
5,No log,0.650155,0.806322
6,12.419100,0.860149,0.927442
7,12.419100,0.618002,0.786131
8,12.419100,0.613324,0.78315
9,12.419100,0.689739,0.830505
10,12.419100,0.56626,0.752503


Training fold 4/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Rmse
1,No log,30.598385,5.531581
2,No log,0.953274,0.976358
3,No log,0.544108,0.737637
4,No log,0.847588,0.920645
5,No log,0.591387,0.769017
6,12.488600,0.851984,0.92303
7,12.488600,0.652355,0.807685
8,12.488600,0.662631,0.814021
9,12.488600,0.593564,0.770431
10,12.488600,0.593447,0.770355


Training fold 5/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Rmse
1,No log,29.266417,5.409844
2,No log,1.034485,1.017096
3,No log,0.736941,0.858453
4,No log,0.78832,0.887874
5,No log,0.763934,0.874033
6,12.568500,0.76936,0.877132
7,12.568500,0.813334,0.90185
8,12.568500,0.856134,0.925275
9,12.568500,0.720677,0.848927
10,12.568500,0.797771,0.89318


Ensemble Validation RMSE: 0.5112974297320854
