# for testing codes

In [2]:
import os
import csv 
import yaml
import pandas as pd
import numpy as np
import torch

In [3]:
with open('../config/filepath.yml', 'r') as f:
    path_config = yaml.safe_load(f)

dude_dir = os.path.join('..', path_config['data']['DUD-E'])
alphafold_dir = os.path.join('..', path_config['data']['alphafold'])
smiles_dir = os.path.join('..', path_config['data']['smiles'])
output_dir = os.path.join('..', path_config['data']['output'])
hist_dir = os.path.join('..', path_config['data']['hist'])
preprocessed_dir = os.path.join('..', path_config['data']['preprocessed'])
model_dir = os.path.join('..', path_config['data']['docking'])

In [8]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-77M-MLM")
model = AutoModel.from_pretrained("DeepChem/ChemBERTa-77M-MLM")  

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def smiles_to_token_embeddings(smiles_list):
    """
    SMILES のトークンごとの埋め込みを取得する関数。

    Parameters:
    - smiles_list: list of str
        SMILES のリスト

    Returns:
    - token_embeddings: torch.Tensor
        各トークンの埋め込み (Shape: (batch_size, seq_len, hidden_size))
    - attention_mask: torch.Tensor
        アテンションマスク (Shape: (batch_size, seq_len))
    """
    # トークナイズ
    encoded_inputs = tokenizer(smiles_list, padding=True, truncation=True, return_tensors="pt")
    
    # デバイスに転送
    input_ids = encoded_inputs["input_ids"].to(device)
    attention_mask = encoded_inputs["attention_mask"].to(device)
    
    # モデルに入力
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    # トークンごとの埋め込みを取得
    token_embeddings = outputs.last_hidden_state  # Shape: (batch_size, seq_len, hidden_size)
    
    return token_embeddings, attention_mask

# 使用例
smiles = ["C[C@@H]1C[C@@H]([C@@H]([C@@H](O1)O[C@@H](C(C)C)[C@@H](C)[C@@H]([C@@H](C)C(=O)O[C@H](C)[C@H](C)[C@@H]([C@@H](C)C(=O)[C@@]2(CO2)C)O)O[C@@H]3C[C@@H]([C@@H]([C@@H](O3)C)O)OC)O)[NH+](C)C", "CCN", "CCC"]  # SMILES 文字列のリスト
token_embeddings, attention_mask = smiles_to_token_embeddings(smiles)

print(f"SMILES_len = {len(smiles[0])}")
print(token_embeddings.shape)  # (batch_size, seq_len, hidden_size)
print(attention_mask.shape)    # (batch_size, seq_len)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SMILES_len = 181
torch.Size([3, 97, 384])
torch.Size([3, 97])


In [8]:
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), '../src'))
sys.path.append(os.path.join(os.path.dirname(__file__), '../scripts'))
from scripts.utils import load_config
from torch.utils.data import DataLoader
from src.dataset import SmilesProteinDataset

NameError: name '__file__' is not defined

In [None]:
from src.score_prediction_models import 