In [1]:
import os
import time
import datetime

import pandas as pd
import seaborn as sns
import numpy as np
import random

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.nn.parallel import DataParallel
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
torch.manual_seed(42)

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import AutoTokenizer, T5Config, T5ForConditionalGeneration
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

import warnings
warnings.filterwarnings("ignore")

import nltk
# nltk.download('punkt')
# 类

class GPT2Dataset(Dataset):

  def __init__(self, propmt_list, answer_list, tokenizer,
               max_length_propmt=1024, max_length_answer=10):

    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []
    self.answer_ids = []

    # 设置填充参数为右填充
    tokenizer.padding_side = "right"

    encodings_list1 = [tokenizer('<s>' + txt, truncation=True, max_length=max_length_propmt, padding="max_length") for txt in propmt_list]
    self.input_ids = [torch.tensor(encodings_dict['input_ids']) for encodings_dict in encodings_list1]
    self.attn_masks = [torch.tensor(encodings_dict['attention_mask']) for encodings_dict in encodings_list1]
        
    encodings_list2 = [tokenizer('<s>' + str(ans), truncation=True, max_length=max_length_answer, padding="max_length") for ans in answer_list]
    self.answer_ids = [torch.tensor(encodings_dict['input_ids']) for encodings_dict in encodings_list2]
        
  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.attn_masks[idx], self.answer_ids[idx]
  

  from .autonotebook import tqdm as notebook_tqdm
2024-04-29 13:33:55.808239: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-29 13:33:55.866443: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
path = '/home/hkqiu/work/PolyGPT/Polymer-Generation/model_save/finetuning-ae-100epoch-1e-5'
tokenizer = AutoTokenizer.from_pretrained(path, pad_token='[PAD]', padding_side='right') 
configuration = T5Config.from_pretrained(path, output_hidden_states=False)
model = T5ForConditionalGeneration.from_pretrained(path, config=configuration)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32130, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32130, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [17]:
# Try1
prompt = "-6.19,236,14,0,1,0,0,0,0,2,2,4,0,6,3,2"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

generated = generated.to(device)  # 在GPU上生成会快一倍

# sample_outputs = model.module.generate(
sample_outputs = model.generate(
                                generated, 
                                do_sample=True,   # 是否采样，True表示采样搜索，可以设置返回结果数目。False表示贪婪搜索，只能返回一个结果
                                top_k=100, 
                                max_length = 300,
                                top_p=0.999, 
                                num_return_sequences=5
                                )

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i+1, tokenizer.decode(sample_output, skip_special_tokens=True)))

1: <pad> [*]CC(S1)=CC=C1C(S2)=CC=C2C(=S)O[*]<pad><pad><pad><pad>


2: <pad> [*]OC(S1)=CC=C1C(S2)=CC=C2C(=S)[*]<pad><pad><pad><pad><pad>


3: <pad> [*]CC(S1)=CC=C1C(=O)C(S2)=CC=C2C(=S)[*]


4: <pad> [*]C(S1)=CC=C1C(=O)C(S2)=CC=C2C(=S)[*]


5: <pad> [*]C(S1)=CC=C1C(S2)=CC=C2OC(=S)[*]<pad><pad><pad><pad><pad>




In [16]:
# Try2
prompt = "-6.19,236,14,0,1,0,0,0,0,2,2,4,0,6,3,2"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

generated = generated.to(device)  # 在GPU上生成会快一倍

# sample_outputs = model.module.generate(
sample_outputs = model.generate(
                                generated, 
                                do_sample=True,   # 是否采样，True表示采样搜索，可以设置返回结果数目。False表示贪婪搜索，只能返回一个结果
                                top_k=100, 
                                max_length = 300,
                                top_p=0.999, 
                                num_return_sequences=5
                                )

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i+1, tokenizer.decode(sample_output, skip_special_tokens=True)))

1: <pad> [*]C(S1)=CC=C1C(S2)=CC=C2C(=S)O[*]<pad><pad><pad><pad><pad><pad><pad><pad><pad>


2: <pad> [*]CC(=S)C(S1)=CC=C1C(=S)C(=O)C(S2)=CC=C2[*]


3: <pad> [*]C(=S)C(S1)=CC=C1C(S2)=CC=C2C(=S)O[*]<pad><pad><pad><pad>


4: <pad> [*]CC(S1)=CC=C1C(S2)=CC=C2OC(=S)[*]<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


5: <pad> [*]C(S1)=CC=C1C(=O)C(S2)=CC=C2C(=S)[*]<pad><pad><pad><pad><pad>




In [15]:
# Try3
prompt = "-6.19,236,14,0,1,0,0,0,0,2,2,4,0,6,3,2"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

generated = generated.to(device)  # 在GPU上生成会快一倍

# sample_outputs = model.module.generate(
sample_outputs = model.generate(
                                generated, 
                                do_sample=True,   # 是否采样，True表示采样搜索，可以设置返回结果数目。False表示贪婪搜索，只能返回一个结果
                                top_k=100, 
                                max_length = 300,
                                top_p=0.999, 
                                num_return_sequences=5
                                )

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i+1, tokenizer.decode(sample_output, skip_special_tokens=True)))

1: <pad> [*]C(S1)=CC=C1C(S2)=CC=C2OC(=S)[*]<pad><pad><pad><pad><pad>


2: <pad> [*]CC(S1)=CC=C1OC(S2)=CC=C2C(=S)[*]<pad><pad><pad><pad><pad>


3: <pad> [*]CC(S1)=CC=C1C(S2)=CC=C2C(=S)O[*]<pad><pad><pad><pad>


4: <pad> [*]C(C=C1)=CC=C1C(S2)=CC=C2C(=S)O[*]<pad><pad>


5: <pad> [*]C(=O)C(S1)=CC=C1C(S2)=CC=C2C(=S)[*]




In [25]:
from rdkit import Chem
from rdkit.Chem import Descriptors
# 计算描述符并添加到DataFrame中
can_smiles = ['[*]Nc1cc(C)c(Cl)c(Cl)c1O[Na]']
df = pd.DataFrame()

num_processed = 0
print_frequency = 5

custom_des = ['MolWt','HeavyAtomCount','NHOHCount','NOCount',
                'NumAliphaticCarbocycles','NumAliphaticHeterocycles',
                'NumAliphaticRings','NumAromaticCarbocycles','NumAromaticHeterocycles',
                'NumAromaticRings','NumHAcceptors','NumHDonors','NumHeteroatoms',
                'NumRotatableBonds','RingCount',]

for desc_name, desc_func in Descriptors.descList:
    if desc_name in custom_des:
        descriptors = [desc_func(Chem.MolFromSmiles(smiles)) if Chem.MolFromSmiles(smiles) else None for smiles in can_smiles]
        df[desc_name] = descriptors

        # 打印处理进度
        num_processed += 1
        if num_processed % print_frequency == 0:
            print(f"已计算 {num_processed} 个描述符")

df

已计算 5 个描述符
已计算 10 个描述符
已计算 15 个描述符


Unnamed: 0,MolWt,HeavyAtomCount,NHOHCount,NOCount,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticCarbocycles,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,RingCount
0,213.019,12,1,2,0,0,0,1,0,1,2,1,6,2,1


In [32]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem

def calculate_tanimoto_similarity(smiles1, smiles2):
    # 通过SMILES字符串创建分子对象
    mol1 = Chem.MolFromSmiles(smiles1)
    mol2 = Chem.MolFromSmiles(smiles2)
    
    # 生成分子的指纹（Fingerprint）
    fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 2, nBits=1024)
    fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 2, nBits=1024)
    
    # 计算Tanomoto相似性
    similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
    
    return similarity

# 示例用法
smiles1 = '[*]C(=O)C(S1)=CC=C1C(=S)C(S2)=CC=C2[*]'
smiles2 = '[*]C(=O)C(S1)=CC=C1C(S2)=CC=C2C(=S)[*]'
similarity = calculate_tanimoto_similarity(smiles1, smiles2)
print('Tanomoto相似性:', similarity)

Tanomoto相似性: 0.5151515151515151
