<a href="https://colab.research.google.com/github/kimjaehwankimjaehwan/Dacon/blob/main/seyonec_PubChem10M_SMILES_BPE_450k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers torch rdkit pandas scikit-learn


Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.5


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Configuration shared by the whole cell.
CFG = dict(
    SEED=42,  # seed for the RNGs, the train/val split and the Trainer
    MODEL_NAME='seyonec/PubChem10M_SMILES_BPE_450k',  # checkpoint for tokenizer and model
    BATCH_SIZE=16,  # per-device batch size for both training and evaluation
    EPOCHS=3,  # number of training epochs
    LR=5e-5,  # learning rate handed to TrainingArguments
)

def seed_everything(seed):
    """Seed every random number generator this notebook can touch.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, plus all CUDA devices when CUDA is available).

    Args:
        seed: Integer seed applied to each generator.
    """
    # Local import: ``random`` is not among the notebook's top-level imports.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Seed the RNGs first: the classification head loaded below is newly
# initialized, so its starting weights depend on the RNG state.
seed_everything(CFG['SEED'])

# Load the data
chembl_data = pd.read_csv('train.csv')  # example file name
# Hold out 20% of the rows for validation; random_state makes the split repeatable.
train, val = train_test_split(chembl_data, test_size=0.2, random_state=CFG['SEED'])

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(CFG['MODEL_NAME'])
# num_labels=1 gives a single-output head; its output is used as the pIC50 prediction.
model = AutoModelForSequenceClassification.from_pretrained(CFG['MODEL_NAME'], num_labels=1)

# Dataset definition
class SMILESDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one SMILES string per access.

    Args:
        data: DataFrame with a 'Smiles' column and, when ``has_target`` is
            true, a 'pIC50' column. Rows are addressed by position.
        tokenizer: Callable invoked as ``tokenizer(smiles, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``
            and expected to return a mapping with 'input_ids' and
            'attention_mask' tensors (assumed shape ``(1, max_len)``).
        max_len: Length every sequence is padded/truncated to.
        has_target: When false, items carry no 'labels' entry (test data).
    """

    def __init__(self, data, tokenizer, max_len=128, has_target=True):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.has_target = has_target

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the model inputs (and label, if any) for row ``index``.

        Returns:
            Dict with 'input_ids' and 'attention_mask', plus 'labels'
            (a float32 scalar tensor) when ``has_target`` is true.
        """
        # Column-first lookup avoids building a full row Series per item.
        smiles = self.data['Smiles'].iloc[index]

        encoding = self.tokenizer(
            smiles,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        # squeeze(0) removes only the leading batch dimension; a bare
        # squeeze() would also collapse the sequence axis when max_len == 1.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
        if self.has_target:
            target = self.data['pIC50'].iloc[index]
            item['labels'] = torch.tensor(target, dtype=torch.float)
        return item

# Wrap both splits; targets (pIC50) are included by default.
train_dataset = SMILESDataset(train, tokenizer)
val_dataset = SMILESDataset(val, tokenizer)

# Configure TrainingArguments and Trainer
training_args = TrainingArguments(
    output_dir='./results',
    # Evaluate and save on the same schedule (per epoch), which
    # load_best_model_at_end requires.
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # Log once per epoch; with the default step-based interval the run ends
    # before the first log, so the training-loss column only shows "No log".
    logging_strategy='epoch',
    learning_rate=CFG['LR'],
    per_device_train_batch_size=CFG['BATCH_SIZE'],
    per_device_eval_batch_size=CFG['BATCH_SIZE'],
    num_train_epochs=CFG['EPOCHS'],
    seed=CFG['SEED'],
    load_best_model_at_end=True,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Report RMSE between the labels and the flattened predictions.
    compute_metrics=lambda p: {'rmse': np.sqrt(mean_squared_error(p.label_ids, p.predictions.flatten()))}
)

# Train the model
trainer.train()

# Predict on the validation split and score it
val_preds = trainer.predict(val_dataset)
val_rmse = np.sqrt(mean_squared_error(val['pIC50'], val_preds.predictions.flatten()))
print(f'Validation RMSE: {val_rmse}')

# Predict on the test data (no targets available)
test = pd.read_csv('./test.csv')
test_dataset = SMILESDataset(test, tokenizer, has_target=False)
test_preds = trainer.predict(test_dataset)

# Convert pIC50 to IC50
def pIC50_to_IC50(pic50_values):
    """Return ``10 ** (9 - pic50_values)``.

    The caller stores the result in the 'IC50_nM' column. Works on scalars
    and on anything supporting element-wise arithmetic (e.g. NumPy arrays).
    """
    exponent = 9 - pic50_values
    return 10 ** exponent

# Convert the predicted pIC50 values and attach them to the test frame.
test['IC50_nM'] = pIC50_to_IC50(test_preds.predictions.flatten())

# Save the submission file
submit = pd.read_csv('./sample_submission.csv')
# Assign by position, not by index label: a Series assignment aligns on the
# index and silently produces NaN for any mismatch, whereas an array
# assignment raises if the row counts differ.
submit['IC50_nM'] = test['IC50_nM'].to_numpy()
submit.to_csv('./transformer_baseline_submit.csv', index=False)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Rmse
1,No log,0.653618,0.808466
2,No log,0.471863,0.686923
3,No log,0.480935,0.693495


Validation RMSE: 0.6869229707557296


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Configuration for the tuned run.
CFG = dict(
    SEED=42,  # seed for the RNGs, the train/val split and the Trainer
    MODEL_NAME='seyonec/PubChem10M_SMILES_BPE_450k',  # checkpoint for tokenizer and model
    BATCH_SIZE=8,  # batch size reduced for finer gradients
    EPOCHS=5,  # increased number of epochs
    LR=2e-5,  # reduced learning rate for more stable training
    WARMUP_RATIO=0.1,  # fraction of training steps used for learning-rate warmup
)

def seed_everything(seed):
    """Make runs repeatable by seeding all random number generators in use.

    Covers Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, and every CUDA device when CUDA is available).

    Args:
        seed: Integer seed applied to each generator.
    """
    # Imported here because the cell's import list does not include ``random``.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Seed the RNGs first: the classification head loaded below is newly
# initialized, so its starting weights depend on the RNG state.
seed_everything(CFG['SEED'])

# Load the data
chembl_data = pd.read_csv('train.csv')  # example file name
# Hold out 20% of the rows for validation; random_state makes the split repeatable.
train, val = train_test_split(chembl_data, test_size=0.2, random_state=CFG['SEED'])

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(CFG['MODEL_NAME'])
# num_labels=1 gives a single-output head; its output is used as the pIC50 prediction.
model = AutoModelForSequenceClassification.from_pretrained(CFG['MODEL_NAME'], num_labels=1)

# Dataset definition
class SMILESDataset(torch.utils.data.Dataset):
    """Tokenize SMILES strings from a DataFrame lazily, one row per item.

    Args:
        data: DataFrame holding a 'Smiles' column and, if ``has_target`` is
            true, a 'pIC50' column. Rows are read by position.
        tokenizer: Callable invoked as ``tokenizer(smiles, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``;
            its result must expose 'input_ids' and 'attention_mask' tensors
            (assumed shape ``(1, max_len)``).
        max_len: Fixed sequence length after padding/truncation.
        has_target: Set to false for unlabeled data; items then omit 'labels'.
    """

    def __init__(self, data, tokenizer, max_len=128, has_target=True):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.has_target = has_target

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Build the model inputs (and label, if any) for row ``index``.

        Returns:
            Dict with 'input_ids' and 'attention_mask', plus 'labels'
            (a float32 scalar tensor) when ``has_target`` is true.
        """
        # Select the column before the row so no full row Series is built.
        smiles = self.data['Smiles'].iloc[index]

        encoding = self.tokenizer(
            smiles,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        # Drop only the batch axis; an argument-less squeeze() would also
        # remove the sequence axis whenever max_len == 1.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
        if self.has_target:
            target = self.data['pIC50'].iloc[index]
            item['labels'] = torch.tensor(target, dtype=torch.float)
        return item

# Wrap both splits; targets (pIC50) are included by default.
train_dataset = SMILESDataset(train, tokenizer)
val_dataset = SMILESDataset(val, tokenizer)

# Custom optimizer. It is handed to the Trainer below; previously it was
# created but never used, so the Trainer silently fell back to its defaults.
optimizer = torch.optim.AdamW(model.parameters(), lr=CFG['LR'])

# Configure TrainingArguments and Trainer
training_args = TrainingArguments(
    output_dir='./results',
    # Evaluate and save on the same schedule (per epoch), which
    # load_best_model_at_end requires.
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # Log once per epoch so the training-loss column is populated instead of
    # showing "No log".
    logging_strategy='epoch',
    learning_rate=CFG['LR'],
    per_device_train_batch_size=CFG['BATCH_SIZE'],
    per_device_eval_batch_size=CFG['BATCH_SIZE'],
    num_train_epochs=CFG['EPOCHS'],
    seed=CFG['SEED'],
    load_best_model_at_end=True,
    logging_dir='./logs',
    gradient_accumulation_steps=2,  # to simulate a larger batch size
    # Let the Trainer build the linear warmup schedule from its own step
    # count. The hand-computed total ignored gradient accumulation and
    # floored the number of batches, so it overstated the optimizer steps.
    warmup_ratio=CFG['WARMUP_RATIO'],
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Report RMSE between the labels and the flattened predictions.
    compute_metrics=lambda p: {'rmse': np.sqrt(mean_squared_error(p.label_ids, p.predictions.flatten()))},
    # (optimizer, None): use the custom optimizer and have the Trainer create
    # the scheduler for it from training_args (linear with warmup by default).
    optimizers=(optimizer, None),
)

# Train the model
trainer.train()

# Predict on the validation split and score it
val_preds = trainer.predict(val_dataset)
val_rmse = np.sqrt(mean_squared_error(val['pIC50'], val_preds.predictions.flatten()))
print(f'Validation RMSE: {val_rmse}')

# Predict on the test data (no targets available)
test = pd.read_csv('./test.csv')
test_dataset = SMILESDataset(test, tokenizer, has_target=False)
test_preds = trainer.predict(test_dataset)

# Convert pIC50 to IC50
def pIC50_to_IC50(pic50_values):
    """Compute ``10 ** (9 - pic50_values)`` element-wise.

    Accepts a scalar or any object supporting arithmetic with ints (such as
    a NumPy array); the caller writes the result to the 'IC50_nM' column.
    """
    shifted_exponent = 9 - pic50_values
    return 10 ** shifted_exponent

# Convert the predicted pIC50 values and attach them to the test frame.
test['IC50_nM'] = pIC50_to_IC50(test_preds.predictions.flatten())

# Save the submission file
submit = pd.read_csv('./sample_submission.csv')
# Assign by position, not by index label: a Series assignment aligns on the
# index and silently produces NaN for any mismatch, whereas an array
# assignment raises if the row counts differ.
submit['IC50_nM'] = test['IC50_nM'].to_numpy()
submit.to_csv('./transformer_tuned_submit.csv', index=False)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Rmse
1,No log,0.698729,0.8359
2,No log,0.521701,0.722289
3,No log,0.624996,0.790567
4,No log,0.534792,0.731295
5,No log,0.556339,0.745881


Validation RMSE: 0.7222886035452358


1. 학습률(LR): 기본 학습률을 2e-5로 줄였습니다. 더 낮은 학습률은 학습의 안정성을 높여줄 수 있습니다.
2. 에포크 수 증가: 에포크 수를 5로 늘려서 모델이 충분히 학습할 수 있게 했습니다.
3. 배치 크기 감소: 배치 크기를 8로 줄여서 메모리 사용을 줄이고, 세밀한 학습이 가능하도록 했습니다.
4. Warmup 단계 추가: WARMUP_RATIO를 사용하여 초기 몇 단계 동안 학습률을 점진적으로 증가시키는 warmup 단계를 추가했습니다.
5. Gradient Accumulation: 작은 배치 크기를 사용하는 대신, gradient_accumulation_steps를 2로 설정하여 실질적으로 배치 크기를 16으로 시뮬레이션했습니다.
6. Custom Optimizer and Scheduler: 옵티마이저와 학습률 스케줄러를 직접 설정하여 더 정밀하게 학습을 제어할 수 있도록 했습니다.