<a href="https://colab.research.google.com/github/lightyisu/colab_collection/blob/main/notebook240606.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch.nn as nn
from transformers import AutoModel

from memory_profiler import profile
class BERTModel(nn.Module):
    """Per-token classifier: pretrained encoder -> dropout -> linear layer.

    Args:
        bert_path: Path or identifier handed to ``AutoModel.from_pretrained``.
        label_count: Number of classes predicted for every token.
    """

    def __init__(self, bert_path, label_count):
        super().__init__()
        # Load the checkpoint the caller asked for; the path used to be
        # hard-coded here, which silently ignored ``bert_path``.
        self.bert = AutoModel.from_pretrained(bert_path, trust_remote_code=True)
        self.num_labels = label_count
        self.dropout = nn.Dropout(0.1)
        self.loss_func = nn.CrossEntropyLoss()
        # Prefer the encoder's own width; fall back to the previous constant.
        hidden_size = getattr(self.bert.config, 'hidden_size', 768)
        self.linear = nn.Linear(hidden_size, label_count)

    def forward(self, input_ids=None, label_ids=None, mask=None):
        """Run the encoder and the classification layer.

        Args:
            input_ids: Token ids fed to the encoder.
            label_ids: Optional gold labels, one per token position. When
                given, a cross-entropy loss is prepended to the result.
            mask: Optional attention mask (1 = real token, 0 = padding). It is
                forwarded to the encoder and, when labels are given, limits
                the loss to positions where it equals 1.

        Returns:
            ``(logits, *extra)`` without labels, ``(loss, logits, *extra)``
            with labels, where ``extra`` is whatever the encoder returned
            from index 2 onwards.
        """
        # Forward the mask so padding positions are not attended to.
        outputs = self.bert(input_ids=input_ids, attention_mask=mask)

        sequence_output = self.dropout(outputs[0])
        logits = self.linear(sequence_output)
        outputs = (logits,) + tuple(outputs[2:])

        if label_ids is not None:
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = label_ids.view(-1)
            if mask is not None:
                # Score only the positions the mask marks as real tokens.
                active_loss = mask.view(-1) == 1
                flat_logits = flat_logits[active_loss]
                flat_labels = flat_labels[active_loss]
            # Without a mask every position contributes to the loss.
            loss = self.loss_func(flat_logits, flat_labels)
            outputs = (loss,) + outputs

        return outputs



In [None]:
# Notebook-level placeholder initialised to None; no code visible in this
# notebook reads this global.
input_temp=None

In [None]:
import pandas as pd
import numpy as np
import os
import torch
from torch.utils.data import Dataset,DataLoader






class DataProcess():
    """Read ``<data_path>/<data_type>.npz`` and expose it as (words, labels) pairs.

    Args:
        data_path: Directory that contains the archive.
        data_type: Archive name without the ``.npz`` suffix (e.g. ``"train"``).
    """

    def __init__(self, data_path, data_type):
        self.data_dir = os.path.join(data_path, data_type + '.npz')

    def process(self):
        """Return a list of ``(words, labels)`` tuples, one per stored sample.

        The archive must provide the arrays ``words`` and ``labels``. Rows in
        which either value is missing are dropped.
        """
        # NOTE(security): allow_pickle=True unpickles object arrays, which can
        # execute arbitrary code. Only load archives from a trusted source.
        # The context manager closes the archive's file handle once both
        # arrays have been read into memory.
        with np.load(self.data_dir, allow_pickle=True) as data:
            words = data['words']
            labels = data['labels']

        data_df = pd.concat([pd.DataFrame(words, columns=['words']),
                             pd.DataFrame(labels, columns=['labels'])], axis=1)
        data_df = data_df.dropna()
        # Pair the two columns directly instead of iterating row by row.
        return list(zip(data_df['words'], data_df['labels']))



class CluenerDataset(Dataset):
    """Dataset that turns ``(words, labels)`` pairs into fixed-length tensors.

    Args:
        corpus: Sequence of ``(words, labels)`` pairs.
        tokenizer: Object providing ``tokenize(word)`` and
            ``convert_tokens_to_ids(tokens)``.
        seq_len: Length every sample is truncated or padded to.
    """

    def __init__(self, corpus, tokenizer=None, seq_len=50):
        super().__init__()
        self.corpus = corpus
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.len = len(corpus)

    def _tokenize_extend_labels(self, sentence, labels=None):
        """Tokenize ``sentence`` word by word, optionally extending labels.

        Args:
            sentence: Iterable of words.
            labels: Optional per-word labels. When given, each word's label is
                repeated once for every sub-token the word produces, so tokens
                and labels stay aligned. Words without a label get 0.

        Returns:
            The token list when ``labels`` is None, otherwise
            ``(tokens, extended_labels)``.
        """
        tokens = []
        if labels is None:
            for word in sentence:
                tokens.extend(self.tokenizer.tokenize(word))
            return tokens

        extended_labels = []
        for index, word in enumerate(sentence):
            pieces = self.tokenizer.tokenize(word)
            tokens.extend(pieces)
            label = labels[index] if index < len(labels) else 0
            extended_labels.extend([label] * len(pieces))
        return tokens, extended_labels

    def __getitem__(self, item):
        """Return ``input_ids``, ``label_ids`` and ``attn_mask`` for one sample.

        ``[CLS]`` and ``[SEP]`` are added around the tokens and carry label 0.
        ``[PAD]`` fills the sample up to ``seq_len`` with label 0 and mask 0.
        """
        sentence, label_ids = self.corpus[item]

        # Build a plain list of ints first: with a numpy array,
        # ``[0] + label_ids`` would add element-wise instead of concatenating.
        word_labels = [int(label) for label in label_ids]
        tokens, token_labels = self._tokenize_extend_labels(sentence, word_labels)

        tokens = ['[CLS]'] + tokens + ['[SEP]']
        token_labels = [0] + token_labels + [0]

        # Truncate to seq_len, then pad whatever is still missing.
        tokens = tokens[:self.seq_len]
        token_labels = token_labels[:self.seq_len]
        pad_count = self.seq_len - len(tokens)
        tokens += ['[PAD]'] * pad_count
        token_labels += [0] * pad_count

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        attn_mask = [1 if token != '[PAD]' else 0 for token in tokens]
        assert len(input_ids) == len(token_labels) == len(attn_mask)
        return {"input_ids": torch.tensor(input_ids, dtype=torch.long),
                "label_ids": torch.tensor(token_labels, dtype=torch.long),
                "attn_mask": torch.tensor(attn_mask, dtype=torch.long)}

    def __len__(self):
        return self.len


def build_loader(data_path, data_type, tokenizer=None, seq_len=50,
                 batch_size=8, shuffle=False):
    """Build a ``DataLoader`` over ``<data_path>/<data_type>.npz``.

    Args:
        data_path: Directory containing the ``.npz`` archive.
        data_type: Archive name without the suffix (e.g. ``"train"``).
        tokenizer: Tokenizer handed to ``CluenerDataset``.
        seq_len: Fixed length of every sample.
        batch_size: Samples per batch. Defaults to the previous fixed value 8.
        shuffle: Whether the loader reshuffles each epoch. Defaults to the
            previous fixed value False.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding dict batches.
    """
    corpus = DataProcess(data_path, data_type).process()
    dataset = CluenerDataset(corpus, tokenizer, seq_len)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:

import random
import numpy as np
import torch
from tqdm import tqdm
from pathlib import Path
from tqdm import trange
import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
from transformers import AutoTokenizer

from torch.utils.tensorboard import SummaryWriter





def train_model(model, epochs, train_loader,save_path,log_dir, device):
    """Train ``model`` with Adam (lr=1e-5) and log per-epoch metrics.

    Args:
        model: Module called as ``model(input_ids, label_ids, mask)`` whose
            result holds the loss at index 0 and the logits at index 1.
        epochs: Number of passes over ``train_loader``.
        train_loader: Iterable of dict batches with the keys ``input_ids``,
            ``label_ids`` and ``attn_mask``.
        save_path: Directory for checkpoints. It is created if missing, and a
            ``<epoch>.pt`` state dict is written every 10th epoch.
        log_dir: Directory for the TensorBoard event file.
        device: Device the batch tensors are moved to.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    # torch.save does not create missing directories.
    os.makedirs(save_path, exist_ok=True)
    # One writer for the whole run: every epoch lands on the same curve and
    # the writer is closed exactly once, even when training is interrupted
    # or ``epochs`` is 0.
    writer = SummaryWriter(log_dir=log_dir)
    try:
        for epoch in trange(epochs):
            tr_loss, n_steps, correct_preds, total_preds = 0.0, 0, 0, 0
            model.train()
            progress_bar = tqdm(train_loader, total=len(train_loader), leave=False)
            for batch in progress_bar:
                input_ids = batch['input_ids'].to(device)
                label_ids = batch['label_ids'].to(device)
                mask = batch['attn_mask'].to(device)

                optimizer.zero_grad()
                output = model(input_ids, label_ids, mask)
                loss, logits = output[0], output[1]
                loss.backward()
                optimizer.step()

                tr_loss += loss.item()
                n_steps += 1

                # Token accuracy over the positions the mask marks as real.
                # Assumes logits are (batch, seq_len, num_labels) — TODO confirm.
                with torch.no_grad():
                    active = mask.view(-1) == 1
                    predictions = logits.argmax(dim=-1).view(-1)[active]
                    targets = label_ids.view(-1)[active]
                    correct_preds += int((predictions == targets).sum().item())
                    total_preds += int(active.sum().item())

            # Guard against an empty loader instead of dividing by zero.
            epoch_loss = tr_loss / n_steps if n_steps else 0.0
            epoch_acc = correct_preds / total_preds if total_preds else 0.0
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')
            writer.add_scalar('Train/Loss', epoch_loss, epoch + 1)
            writer.add_scalar('Train/Accuracy', epoch_acc, epoch + 1)
            if (epoch + 1) % 10 == 0:
                torch.save(model.state_dict(), f'{save_path}/{epoch + 1}.pt')
    finally:
        writer.close()

def main():
    """Train a 2-label ``BERTModel`` for 100 epochs on the ``train`` archive.

    All locations are fixed Kaggle working-directory paths; checkpoints and
    TensorBoard logs share the same directory.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    pretrained_dir = "/kaggle/working/DNABERT-2-117M"
    dna_tokenizer = AutoTokenizer.from_pretrained(pretrained_dir,trust_remote_code=True)

    loader = build_loader(
        data_path="/kaggle/working",
        data_type="train",
        tokenizer=dna_tokenizer,
        seq_len=320,
    )

    classifier = BERTModel(bert_path=pretrained_dir, label_count=2).to(device)

    checkpoint_dir = '/kaggle/working/dnaseq'
    tensorboard_dir = '/kaggle/working/dnaseq'
    train_model(classifier, 100, loader, checkpoint_dir, tensorboard_dir, device)


main()

Some weights of BertModel were not initialized from the model checkpoint at /kaggle/working/DNABERT-2-117M and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/100 [00:00<?, ?it/s]
  0%|          | 0/6660 [00:00<?, ?it/s][A
  0%|          | 1/6660 [00:00<31:54,  3.48it/s][A
  0%|          | 2/6660 [00:00<35:32,  3.12it/s][A
  0%|          | 3/6660 [00:01<38:25,  2.89it/s][A
  0%|          | 4/6660 [00:01<39:42,  2.79it/s][A
  0%|          | 5/6660 [00:01<40:27,  2.74it/s][A
  0%|          | 6/6660 [00:02<40:52,  2.71it/s][A
  0%|          | 7/6660 [00:02<41:08,  2.70it/s][A
  0%|          | 8/6660 [00:02<41:17,  2.69it/s][A
  0%|          | 9/6660 [00:03<41:22,  2.68it/s][A
  0%|          | 10/6660 [00:03<41:29,  2.67it/s][A
  0%|          | 11/6660 [00:04<41:31,  2.67it/s][A
  0%|          | 12/6660 [00:04<41:32,  2.

KeyboardInterrupt: 

In [None]:
!wget https://pan.tenire.com/down.php/e6f91824c1a792446bd9d8afb45bcdee.npz -O train.npz

In [None]:
!git lfs clone https://huggingface.co/zhihan1996/DNABERT-2-117M

In [None]:
!pip install einops

  pid, fd = os.forkpty()


Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: einops
Successfully installed einops-0.8.0


In [None]:
!git lfs clone https://huggingface.co/zhihan1996/DNABERT-2-117M

In [None]:
from transformers import AutoTokenizer, AutoModel
# Load the tokenizer and the encoder from the local DNABERT-2 checkout. Both
# become notebook globals that the tokenization scratch cell below relies on.
# NOTE(review): trust_remote_code=True presumably lets transformers execute the
# model code shipped in that directory — confirm the checkout is trusted.
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/DNABERT-2-117M",trust_remote_code=True)
model = AutoModel.from_pretrained("/kaggle/working/DNABERT-2-117M",trust_remote_code=True)


In [None]:
import torch
def _tokenize_extend_labels(sentence):
    """Tokenize every element of ``sentence`` with the global ``tokenizer``.

    Returns the concatenation of all per-element token lists.
    """
    tokens = []
    for word in sentence:
        tokens.extend(tokenizer.tokenize(word))
    return tokens


# One nucleotide per element. Named ``sequence`` rather than ``str`` so the
# builtin stays usable in later cells.
sequence = ['A', 'C', 'G', 'T', 'A', 'G', 'C', 'A', 'T', 'C', 'G', 'A', 'C', 'A', 'C', 'T', 'T', 'G', 'G', 'T', 'T', 'A', 'T', 'C', 'G', 'A', 'T', 'A', 'G', 'C']

tokens = _tokenize_extend_labels(sequence)
tokens = ['[CLS]'] + tokens + ['[SEP]']
print(tokens)

# Per the original note, calling the tokenizer directly on the list encodes
# every element as its own [CLS] x [SEP] triple (one row of 3 ids per element),
# which is why the ids are assembled by hand here.
inputs_id = tokenizer.convert_tokens_to_ids(tokens)
inputs_id = torch.tensor(inputs_id, dtype=torch.long)
# Add the batch dimension in front: [sequence_length] -> [1, sequence_length].
# unsqueeze(1) would instead yield sequence_length separate one-token inputs.
inputs_id = inputs_id.unsqueeze(0)
# Inference only, so skip building the autograd graph.
with torch.no_grad():
    hidden_states = model(inputs_id)[0]  # [1, sequence_length, 768]
print('hidden_state', hidden_states)