<a href="https://colab.research.google.com/github/lightyisu/colab_collection/blob/main/%E2%80%9CCHINESE_SEMEN_ipynb%E2%80%9D%E7%9A%84%E5%89%AF%E6%9C%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Remove any earlier checkout so the clone below starts from a clean directory.
!rm -rf '/content/ner'
# Clone the project repository (code, dataset and weights paths used later live under /content/ner).
# NOTE(review): the recorded output reports that 'git lfs clone' is superseded by plain
# 'git clone', and that the LFS fetch failed because the repository is over its data quota —
# TODO confirm the dataset files are actually present before training.
!git lfs clone  https://github.com/ZhengHongChuang/ner

          with new flags from 'git clone'

'git clone' has been updated in upstream Git to have comparable
speeds to 'git lfs clone'.
Cloning into 'ner'...
remote: Enumerating objects: 87, done.[K
remote: Counting objects: 100% (87/87), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 87 (delta 30), reused 74 (delta 17), pack-reused 0[K
Receiving objects: 100% (87/87), 7.29 MiB | 9.78 MiB/s, done.
Resolving deltas: 100% (30/30), done.
batch response: This repository is over its data quota. Account responsible for LFS bandwidth should purchase more data packs to restore access.
error: failed to fetch some objects from 'https://github.com/ZhengHongChuang/ner.git/info/lfs'


In [None]:
import torch.nn as nn
from transformers import BertModel

class BERTModel(nn.Module):
    """BERT encoder followed by dropout and a per-token linear classifier.

    Args:
        bert_path: Checkpoint name or local path passed to
            ``BertModel.from_pretrained``. A falsy value falls back to
            ``'google-bert/bert-base-chinese'``.
        label_count: Number of classes predicted for every token.
    """

    def __init__(self, bert_path, label_count):
        super().__init__()
        # Use the checkpoint the caller asked for; previously this argument was
        # accepted but silently ignored in favour of a hard-coded name.
        self.bert = BertModel.from_pretrained(bert_path or 'google-bert/bert-base-chinese')
        self.num_labels = label_count
        self.dropout = nn.Dropout(0.1)
        self.loss_func = nn.CrossEntropyLoss()
        # Size the classifier from the loaded encoder instead of assuming 768.
        self.linear = nn.Linear(self.bert.config.hidden_size, label_count)

    def forward(self, input_ids=None, label_ids=None, mask=None):
        """Run the encoder and classifier, optionally computing the loss.

        Args:
            input_ids: Token ids fed to the encoder.
            label_ids: Optional gold label ids, same leading shape as
                ``input_ids``. When given, the loss is prepended to the result.
            mask: Optional attention mask. Positions where ``mask == 1`` are the
                only ones that contribute to the loss; without a mask every
                position contributes.

        Returns:
            ``(logits, *extra)`` without labels, ``(loss, logits, *extra)`` with
            labels, where ``extra`` is whatever the encoder returned from index
            2 onwards.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=mask)
        sequence_output = self.dropout(encoded[0])
        logits = self.linear(sequence_output)
        outputs = (logits,) + encoded[2:]

        if label_ids is not None:
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = label_ids.view(-1)
            if mask is not None:
                # Drop padded positions so they do not influence the loss.
                active_loss = mask.view(-1) == 1
                flat_logits = flat_logits[active_loss]
                flat_labels = flat_labels[active_loss]
            loss = self.loss_func(flat_logits, flat_labels)
            outputs = (loss,) + outputs

        return outputs



In [None]:
import pandas as pd
import numpy as np
import os
import torch
from torch.utils.data import Dataset,DataLoader
# Tag vocabulary for the tagger: "O" maps to 0, followed by the B-, I- and S-
# variants of every entity type, numbered consecutively
# (B-* -> 1..10, I-* -> 11..20, S-* -> 21..30).
label2id = {"O": 0}
label2id.update(
    (f"{prefix}-{entity}", index)
    for index, (prefix, entity) in enumerate(
        (
            (prefix, entity)
            for prefix in ("B", "I", "S")
            for entity in (
                "address",
                "book",
                "company",
                "game",
                "government",
                "movie",
                "name",
                "organization",
                "position",
                "scene",
            )
        ),
        start=1,
    )
)




class DataProcess():
    """Load one ``<data_type>.npz`` split and turn it into a training corpus.

    Args:
        data_path: Directory that contains the ``.npz`` file.
        data_type: File stem of the split (the file read is
            ``<data_path>/<data_type>.npz``).
    """

    def __init__(self, data_path, data_type):
        self.data_dir = os.path.join(data_path, data_type+'.npz')

    def process(self):
        """Read the archive and return a list of ``(words, label_ids)`` tuples.

        The archive must provide the arrays ``words`` and ``labels``. Rows with
        a missing value in either column are dropped, and every label sequence
        is converted to ids through ``trans``.
        """
        # NOTE: allow_pickle=True unpickles arbitrary objects; only load files
        # from a trusted source.
        # The context manager closes the archive once both arrays are read.
        with np.load(self.data_dir, allow_pickle=True) as data:
            data_df = pd.concat([pd.DataFrame(data['words'], columns=['words']),
                                 pd.DataFrame(data['labels'], columns=['labels'])], axis=1)
        data_df = data_df.dropna()
        data_df['labels'] = data_df['labels'].map(self.trans)
        # Pair the two columns directly instead of copying row by row.
        return list(zip(data_df['words'], data_df['labels']))

    def trans(self, labels):
        """Map an iterable of label names to their ids via ``label2id``."""
        return [label2id[label] for label in labels]



class CluenerDataset(Dataset):
    """Dataset that turns ``(words, label_ids)`` pairs into fixed-length tensors.

    Args:
        corpus: Sequence of ``(words, label_ids)`` pairs.
        tokenizer: Object providing ``tokenize(word)`` and
            ``convert_tokens_to_ids(tokens)``.
        seq_len: Length every returned sequence is truncated or padded to.
    """

    def __init__(self, corpus, tokenizer=None, seq_len=50):
        super().__init__()
        self.corpus = corpus
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.len = len(corpus)

    def _tokenize_extend_labels(self, sentence):
        """Tokenize every word of ``sentence`` and return the flat token list."""
        tokens = []
        for word in sentence:
            tokens.extend(self.tokenizer.tokenize(word))
        return tokens

    def _tokenize_with_labels(self, sentence, label_ids):
        """Tokenize ``sentence`` while keeping one label per produced token.

        A word for which the tokenizer returns nothing is represented by
        ``'[UNK]'`` so its label keeps its position; a word split into several
        pieces has its label repeated for each piece.
        """
        tokens, labels = [], []
        for word, label in zip(sentence, label_ids):
            pieces = self.tokenizer.tokenize(word) or ['[UNK]']
            tokens.extend(pieces)
            labels.extend([label] * len(pieces))
        return tokens, labels

    def __getitem__(self, item):
        """Return ``input_ids``, ``label_ids`` and ``attn_mask`` tensors of length ``seq_len``."""
        sentence, label_ids = self.corpus[item]
        tokens, labels = self._tokenize_with_labels(sentence, label_ids)

        # Reserve two slots for [CLS] and [SEP] so truncation never cuts off
        # the closing [SEP]; both special tokens carry label 0.
        body_len = max(self.seq_len - 2, 0)
        tokens = (['[CLS]'] + tokens[:body_len] + ['[SEP]'])[:self.seq_len]
        labels = ([0] + labels[:body_len] + [0])[:self.seq_len]

        real_len = len(tokens)
        pad_len = self.seq_len - real_len
        tokens += ['[PAD]'] * pad_len
        labels += [0] * pad_len

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        attn_mask = [1] * real_len + [0] * pad_len
        assert len(input_ids) == len(labels) == len(attn_mask)
        return {"input_ids": torch.tensor(input_ids, dtype=torch.long),
                "label_ids": torch.tensor(labels, dtype=torch.long),
                "attn_mask": torch.tensor(attn_mask, dtype=torch.long)}

    def __len__(self):
        return self.len


def build_loader(data_path, data_type, tokenizer=None, seq_len=50, batch_size=144, shuffle=False):
    """Build a ``DataLoader`` over one ``.npz`` split.

    Args:
        data_path: Directory containing the split file.
        data_type: File stem of the split (``<data_type>.npz``).
        tokenizer: Tokenizer handed to ``CluenerDataset``.
        seq_len: Fixed sequence length handed to ``CluenerDataset``.
        batch_size: Number of samples per batch (default 144, the value that
            was previously hard-coded).
        shuffle: Whether the loader reshuffles every epoch (default ``False``,
            matching the previous behaviour).

    Returns:
        A ``DataLoader`` yielding the dictionaries produced by ``CluenerDataset``.
    """
    corpus = DataProcess(data_path, data_type).process()
    dataset = CluenerDataset(corpus, tokenizer, seq_len)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:

import random
import numpy as np
import torch
from tqdm import tqdm
from pathlib import Path
from tqdm import trange

from transformers import BertTokenizer

from torch.utils.tensorboard import SummaryWriter

def set_random_seed(seed=2023):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and CUDA) generators.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # torch.manual_seed covers the CPU generator, which the CUDA-only calls
    # below leave untouched (dropout on CPU, DataLoader shuffling, ...).
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def train_model(model, epochs, train_loader,save_path,log_dir, device):
    """Fine-tune ``model`` with Adam and log the per-epoch training loss.

    Args:
        model: Module called as ``model(input_ids, label_ids, mask)`` that
            returns a sequence whose first two items are the loss and the logits.
        epochs: Number of passes over ``train_loader``.
        train_loader: Iterable of batches with the keys ``input_ids``,
            ``label_ids`` and ``attn_mask``.
        save_path: Directory that receives a ``<epoch>.pt`` state dict every
            tenth epoch; it is created on demand.
        log_dir: Base directory for TensorBoard logs; epoch ``e`` writes to
            ``<log_dir>/run_<e>``.
        device: Device the batch tensors are moved to.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    for epoch in trange(epochs):
        tr_loss, n_steps, correct_preds, total_preds = 0.0, 0, 0, 0
        model.train()
        writer = SummaryWriter(log_dir=log_dir + f'/run_{epoch}')
        try:
            progress_bar = tqdm(train_loader, total=len(train_loader), leave=False)
            for batch in progress_bar:
                input_ids = batch['input_ids'].to(device)
                label_ids = batch['label_ids'].to(device)
                mask = batch['attn_mask'].to(device)

                output = model(input_ids, label_ids, mask)
                loss = output[0]
                logits = output[1]

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                tr_loss += loss.item()
                n_steps += 1

                # Token accuracy over real (unmasked) positions only; padded
                # positions are excluded from the loss, so their predictions
                # are meaningless and must not enter the metric.
                _, predicted = torch.max(logits, 2)
                active = mask == 1
                total_preds += active.sum().item()
                correct_preds += ((predicted == label_ids) & active).sum().item()

            # An empty loader yields NaN instead of a ZeroDivisionError.
            epoch_loss = tr_loss / n_steps if n_steps else float('nan')
            epoch_acc = correct_preds / total_preds if total_preds else float('nan')
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')
            writer.add_scalar('Train/Loss', epoch_loss, epoch + 1)
        finally:
            # Each epoch opens its own writer, so each one has to be closed.
            writer.close()
        if (epoch + 1) % 10 == 0:
            # torch.save does not create missing directories.
            Path(save_path).mkdir(parents=True, exist_ok=True)
            torch.save(model.state_dict(), f'{save_path}/{epoch + 1}.pt')

def main():
    """Train the BERT tagger on the CLUENER ``train`` split with fixed settings."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    checkpoint = "google-bert/bert-base-chinese"
    tokenizer = BertTokenizer.from_pretrained(checkpoint, do_lower_case=False)

    # Sequences are truncated/padded to 20 tokens for this run.
    train_loader = build_loader(
        data_path="/content/ner/dataset/cluener",
        data_type="train",
        tokenizer=tokenizer,
        seq_len=20,
    )

    model = BERTModel(bert_path=checkpoint, label_count=31)
    model = model.to(device)

    train_model(
        model,
        200,
        train_loader,
        '/content/ner/weights',
        '/content/ner/tensorboard',
        device,
    )


# Entry point: fix the random seeds first, then run the training pipeline.
if __name__ == "__main__":
    set_random_seed(2023)
    main()




  0%|          | 0/200 [00:00<?, ?it/s]
  0%|          | 0/75 [00:00<?, ?it/s][A

sentence: ['浙', '商', '银', '行', '企', '业', '信', '贷', '部', '叶', '老', '桂', '博', '士', '则', '从', '另', '一', '个', '角', '度', '对', '五', '道', '门', '槛', '进', '行', '了', '解', '读', '。', '叶', '老', '桂', '认', '为', '，', '对', '目', '前', '国', '内', '商', '业', '银', '行', '而', '言', '，']
tokens: ['[CLS]', '浙', '商', '银', '行', '企', '业', '信', '贷', '部', '叶', '老', '桂', '博', '士', '则', '从', '另', '一', '个']
ipd [101, 3851, 1555, 7213, 6121, 821, 689, 928, 6587, 6956, 1383, 5439, 3424, 1300, 1894, 1156, 794, 1369, 671, 702]
sentence: ['生', '生', '不', '息', 'C', 'S', 'O', 'L', '生', '化', '狂', '潮', '让', '你', '填', '弹', '狂', '扫']
tokens: ['[CLS]', '生', '生', '不', '息', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '生', '化', '狂', '潮', '让', '你', '填', '弹', '狂', '扫', '[SEP]']
ipd [101, 4495, 4495, 679, 2622, 100, 100, 100, 100, 4495, 1265, 4312, 4060, 6375, 872, 1856, 2486, 4312, 2812, 102]
sentence: ['那', '不', '勒', '斯', 'v', 's', '锡', '耶', '纳', '以', '及', '桑', '普', 'v', 's', '热', '那', '亚', '之', '上', '呢', '？']
tokens: ['[CLS]', '那', '不', '勒', '斯'


  0%|          | 0/200 [00:00<?, ?it/s]


ipd [101, 704, 1744, 1744, 7354, 1814, 2356, 1265, 1355, 2245, 2773, 4526, 4777, 4955, 1999, 1447, 833, 712, 1215, 8024]
sentence: ['1', '9', '9', '5', '年', '底', '，', '由', '于', '邮', '政', '局', '整', '顿', '集', '邮', '市', '场', '，', '李', '永', '春', '和', '其', '他', '经', '营', '业', '户', '重', '新', '搬', '回', '到', '马', '路', '上', '摆', '摊', '经', '营', '。']
tokens: ['[CLS]', '1', '9', '9', '5', '年', '底', '，', '由', '于', '邮', '政', '局', '整', '顿', '集', '邮', '市', '场', '，']
ipd [101, 122, 130, 130, 126, 2399, 2419, 8024, 4507, 754, 6934, 3124, 2229, 3146, 7561, 7415, 6934, 2356, 1767, 8024]
sentence: ['主', '队', '强', '调', '整', '体', '，', '客', '队', '则', '依', '靠', '球', '星', '一', '锤', '定', '音', '。', '当', '然', '努', '曼', '西', '亚', '的', '主', '场', '优', '势', '不', '容', '小', '视', '。', '而', '马', '竞', '的', '客', '场', '成', '绩', '有', '待', '于', '提', '高', '。']
tokens: ['[CLS]', '主', '队', '强', '调', '整', '体', '，', '客', '队', '则', '依', '靠', '球', '星', '一', '锤', '定', '音', '。']
ipd [101, 712, 7339, 2487, 6444, 3146, 860, 8024, 2145, 




ZeroDivisionError: division by zero

In [None]:
# Scratch check: count the positions at which two independently drawn
# (144, 20) random tensors hold exactly the same value.
a = torch.randn(144, 20)
b = torch.randn(144, 20)
print(int((a == b).sum()))

0


In [None]:
# NOTE(review): looks like a leftover scratch cell that only prints a greeting — TODO confirm it can be removed.
print('nihao')

nihao
