<a href="https://colab.research.google.com/github/lhua0420/MedNER/blob/main/MedNER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import the required Python libraries
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import classification_report
from collections import Counter
from itertools import chain

# Seed PyTorch's random number generator so that experiment results are repeatable
torch.manual_seed(1)


<torch._C.Generator at 0x7d2cb01a9750>

In [3]:
def read_file(file_path):
    """
    Read and parse a character-level NER data file.

    Every non-blank line must hold exactly two whitespace-separated fields,
    a character and its label. Blank lines separate sentences; consecutive
    blank lines do not produce empty sentences.

    Args:
        file_path: Path of the UTF-8 encoded data file.

    Returns:
        A list of sentences, each a list of (char, label) tuples.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
            The message names the file, the line number and the line text.
    """
    sentences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate the file lazily instead of loading every line up front.
        for line_number, line in enumerate(file, start=1):
            fields = line.split()
            if not fields:
                # Blank line: close the current sentence, if any.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            if len(fields) != 2:
                # Report where the malformed line is rather than failing
                # with a bare tuple-unpacking error.
                raise ValueError(
                    f"{file_path}: line {line_number}: expected "
                    f"'<char> <label>', got {line!r}"
                )
            char, label = fields
            sentence.append((char, label))

    # The file may not end with a blank line; keep the trailing sentence.
    if sentence:
        sentences.append(sentence)

    # Report how many sentences were read and show the first few of them.
    print(f"Total sentences read: {len(sentences)}")
    print("Example sentences:")
    for example in sentences[:3]:
        print(example)

    return sentences


def build_vocab(sentences):
    """
    Count the characters and labels that occur in the given sentences.

    Args:
        sentences: Iterable of sentences, each a sequence of (char, label) pairs.

    Returns:
        A (vocab, tagset) pair of Counter objects: character frequencies and
        label frequencies. Keys appear in order of first occurrence.
    """
    vocab = Counter()
    tagset = Counter()
    for sentence in sentences:
        for char, label in sentence:
            vocab[char] += 1
            tagset[label] += 1

    # Report the number of distinct characters and labels.
    print(f"Vocabulary size: {len(vocab)}")
    print(f"Tag set size: {len(tagset)}")

    return vocab, tagset


def collate_fn(batch):
    """
    Collate function for DataLoader that pads the sequences of one batch.

    Args:
        batch: Sequence of (chars, labels) tensor pairs.

    Returns:
        A (chars_padded, labels_padded) pair of batch-first tensors. Character
        sequences are padded with 0; label sequences are padded with the index
        of the 'O' label, read from the module-level ``label2idx`` mapping.
    """
    char_seqs, label_seqs = zip(*batch)

    padded_chars = pad_sequence(char_seqs, batch_first=True, padding_value=0)
    o_index = label2idx['O']
    padded_labels = pad_sequence(label_seqs, batch_first=True, padding_value=o_index)

    return padded_chars, padded_labels


class NERDataset(Dataset):
    """
    Dataset wrapper for NER sentences.

    Each item is a sentence converted into a tensor of character indices and
    a tensor of label indices.
    """
    def __init__(self, sentences, char2idx, label2idx):
        # sentences: list of sentences, each a list of (char, label) pairs
        # char2idx / label2idx: lookup tables from character / label to index
        self.sentences = sentences
        self.char2idx = char2idx
        self.label2idx = label2idx

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        chars, tags = zip(*self.sentences[index])

        # Characters missing from the vocabulary map to the '<UNK>' index.
        unk_index = self.char2idx['<UNK>']
        char_ids = [self.char2idx.get(ch, unk_index) for ch in chars]
        tag_ids = [self.label2idx[tag] for tag in tags]

        char_tensor = torch.tensor(char_ids, dtype=torch.long)
        tag_tensor = torch.tensor(tag_ids, dtype=torch.long)
        return char_tensor, tag_tensor



In [4]:
class NERModelRNN(nn.Module):
    """
    RNN-based tagger for NER.

    Stacks an embedding layer, a unidirectional RNN layer and a fully
    connected layer that produces one score per tag at every position.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_tags):
        super().__init__()
        # The embedding table has vocab_size + 1 rows.
        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_tags)

    def forward(self, x):
        # Only the per-step outputs are used; the final hidden state is dropped.
        step_outputs = self.rnn(self.embedding(x))[0]
        return self.fc(step_outputs)


In [5]:
class NERModelBiLSTM(nn.Module):
    """
    BiLSTM-based tagger for NER.

    Stacks an embedding layer, a bidirectional LSTM layer and a fully
    connected layer that produces one score per tag at every position.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_tags):
        super().__init__()
        # The embedding table has vocab_size + 1 rows.
        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        # Forward and backward outputs are concatenated, hence hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim * 2, num_tags)

    def forward(self, x):
        # Only the per-step outputs are used; the final (h, c) state is dropped.
        step_outputs = self.lstm(self.embedding(x))[0]
        return self.fc(step_outputs)


In [2]:
# Hyperparameters
embedding_dim = 128
hidden_dim = 256
num_epochs = 10
learning_rate = 0.001
batch_size = 32

# Load the train / dev / test splits
train_sentences = read_file('ResumeNER/train.char.bmes')
dev_sentences = read_file('ResumeNER/dev.char.bmes')
test_sentences = read_file('ResumeNER/test.char.bmes')

# Build the character vocabulary and the tag set from the training split
vocab, tagset = build_vocab(train_sentences)
char2idx = {char: idx + 1 for idx, char in enumerate(vocab)}  # index 0 is reserved for padding
label2idx = {label: idx for idx, label in enumerate(tagset)}
idx2label = {idx: label for label, idx in label2idx.items()}
# Characters occupy indices 1..len(vocab), so <UNK> must take the next free
# index. Using len(char2idx) here would reuse the index of the last
# vocabulary character and make the two indistinguishable.
char2idx['<UNK>'] = len(char2idx) + 1

# Create the datasets and data loaders
train_dataset = NERDataset(train_sentences, char2idx, label2idx)
dev_dataset = NERDataset(dev_sentences, char2idx, label2idx)
test_dataset = NERDataset(test_sentences, char2idx, label2idx)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Instantiate the model. len(char2idx) counts the characters plus <UNK>; the
# model allocates one extra embedding row, so indices 0..len(char2idx) are valid.
model = NERModelBiLSTM(len(char2idx), embedding_dim, hidden_dim, len(tagset))

# Use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


NameError: name 'read_file' is not defined