# 標題


In [74]:
import os
from functools import partial
from typing import Union, Tuple

from torchtext._internal.module_utils import is_module_available
from torchtext.data.datasets_utils import (
    _wrap_split_argument,
    _create_dataset_directory,
)

if is_module_available("torchdata"):
    from torchdata.datapipes.iter import FileOpener, IterableWrapper
    from torchtext._download_hooks import HttpReader

URL = {
    "train": "http://localhost:8000/openid_train.tsv.gz",
    "valid": "http://localhost:8000/openid_dev.gz",
}

# MD5 = {
#     "train": "6969c2903a1f19a83569db643e43dcc8",
#     "test": "a916e1c2d83eb3004b38fc6fcd628939",
# }

# NUM_LINES = {
#     "train": 8936,
#     "test": 2012,
# }

_EXTRACTED_FILES = {"train": "train.txt", "valid": "valid.txt"}

DATASET_NAME = "OpenID"


def _filepath_fn(root, split, _=None):
    return os.path.join(root, os.path.basename(URL[split]))


def _extracted_filepath_fn(root, split, _=None):
    return os.path.join(root, _EXTRACTED_FILES[split])


#[docs]@_create_dataset_directory(dataset_name=DATASET_NAME)
@_wrap_split_argument(("train", "valid"))
def OpenDeidAICUP2023(root: str, split: Union[Tuple[str], str]):
    """CoNLL2000Chunking Dataset

    .. warning::

        using datapipes is still currently subject to a few caveats. if you wish
        to use this dataset with shuffling, multi-processing, or distributed
        learning, please see :ref:`this note <datapipes_warnings>` for further
        instructions.

    For additional details refer to https://www.clips.uantwerpen.be/conll2000/chunking/

    Number of lines per split:
        - train: 8936
        - test: 2012

    Args:
        root: Directory where the datasets are saved. Default: os.path.expanduser('~/.torchtext/cache')
        split: split or splits to be returned. Can be a string or tuple of strings. Default: (`train`, `test`)

    :returns: DataPipe that yields list of words along with corresponding Parts-of-speech tag and chunk tag
    :rtype: [list(str), list(str), list(str)]
    """

    if not is_module_available("torchdata"):
        raise ModuleNotFoundError(
            "Package `torchdata` not found. Please install following instructions at https://github.com/pytorch/data"
        )

    url_dp = IterableWrapper([URL[split]])

    # Cache and check HTTP response
    cache_compressed_dp = url_dp.on_disk_cache(
        filepath_fn=partial(_filepath_fn, root, split),
        #hash_dict={_filepath_fn(root, split): MD5[split]},
        hash_type="md5",
    )
    cache_compressed_dp = HttpReader(cache_compressed_dp).end_caching(mode="wb", same_filepath_fn=True)

    # Cache and check the gzip extraction for relevant split
    cache_decompressed_dp = cache_compressed_dp.on_disk_cache(filepath_fn=partial(_extracted_filepath_fn, root, split))
    cache_decompressed_dp = FileOpener(cache_decompressed_dp, mode="b").extract(file_type="gzip")
    cache_decompressed_dp = cache_decompressed_dp.end_caching(mode="wb", same_filepath_fn=True)

    data_dp = FileOpener(cache_decompressed_dp, encoding="utf-8")
    return data_dp.readlines().read_iob(sep="\t").shuffle().set_shuffle(False).sharding_filter()


In [75]:
train_data,valid_data = OpenDeidAICUP2023(root="C:/Users/User//",split=("train", "valid"))

In [76]:
print(f"Number of training examples:{len(list(train_data))}") #f是Python中的格式化字符串，它可以用來將特定的值插入到字符串中

Number of training examples:50565


In [77]:
print(f"Number of training examples:{len(list(valid_data))}")

Number of training examples:50155


In [78]:
train_iter = iter(train_data)
print(next(train_iter))
print(next(train_iter))

[['3437088.RAN'], ['B-MEDICALRECORD']]
[['GADNEY', ',', 'JOAN'], ['B-PATIENT', 'I-PATIENT', 'I-PATIENT']]


In [79]:
next(train_iter)

[['12G01449'], ['B-IDNUM']]

In [80]:
from collections import Counter, OrderedDict
from torchtext.vocab import vocab
import sys
MIN_FREQ=2
voc_counter = Counter()

for(token,label) in train_data:
    voc_counter.update(token)
token_voc = vocab(voc_counter,min_freq = MIN_FREQ,specials = ("<unk>","<pad>"))
token_voc.set_default_index(0)

In [81]:
INPUT_DIM = len(token_voc)
print("Unique token in the corpus is",INPUT_DIM)
PAD_IDX = token_voc.get_stoi()["<pad>"]
print("The index of <pad>",PAD_IDX)
print("The index of <unk>",token_voc.get_stoi()["<unk>"])
 
#token_voc.get_stoi() # get_stoi()會回傳一個字典(dictionary)，將詞彙表中的詞對應到整數(index)
token_voc.get_itos() #get_itos()則是將get_stoi()回傳的字典做反轉，將整數對應到詞彙

Unique token in the corpus is 7958
The index of <pad> 1
The index of <unk> 0


['<unk>',
 '<pad>',
 ',',
 'JOAN',
 'Last',
 'edited',
 ':',
 'Page',
 '2',
 'CLINICAL',
 'SCC',
 'vulva',
 '.',
 'Right',
 'suture',
 'anteriorly',
 'MACROSCOPIC',
 'A',
 'Specimen',
 'labelled',
 '"',
 'vulvectomy',
 'stich',
 'anterior',
 'consists',
 'of',
 'a',
 'right',
 'with',
 'one',
 'stitch',
 'taken',
 'as',
 '12',
 "o'clock",
 'The',
 'specimen',
 'measures',
 '120mm',
 'from',
 'to',
 '6',
 'and',
 '40mm',
 '3',
 '9',
 'There',
 'are',
 'two',
 'lesions',
 'on',
 'the',
 'mucosal',
 'surface',
 'main',
 'lesion',
 'being',
 'ulcerated',
 '25',
 'x',
 '20mm',
 'is',
 'located',
 '3mm',
 'margin',
 'second',
 'flat',
 'pigmented',
 '15mm',
 'inked',
 'blue',
 '(',
 ')',
 'black',
 'serially',
 'sectioned',
 'into',
 '21',
 'blocks',
 'Representative',
 'sections',
 'according',
 'diagram',
 'attached',
 'Blocks',
 '1',
 '-',
 'longitudinal',
 'section',
 'slice',
 ';',
 'four',
 '&',
 '4',
 'five',
 '5',
 'six',
 '7',
 '8',
 'seven',
 '10',
 'eight',
 '11',
 'ten',
 '13',
 

In [82]:
from collections import Counter, OrderedDict
from torchtext.vocab import vocab
import sys
MIN_FREQ=2
voc_counter = Counter()

for(token,label) in train_data:
    voc_counter.update(label)
label_voc = vocab(voc_counter,min_freq = MIN_FREQ,specials = ("<unk>","<pad>"))
label_voc.set_default_index(0)

In [83]:
#篩選(MIN_FREQ=2)後剩下的label
OUPUT_DIM = len(label_voc)
print("Unique label in the corpus is",OUPUT_DIM)
LABEL_PAD_IDX = label_voc.get_stoi()["<pad>"]
print("The index of <pad> for label is",LABEL_PAD_IDX)
print("The index of <unk> for label is",label_voc.get_stoi()["<unk>"])
print(label_voc.get_itos())


Unique label in the corpus is 26
The index of <pad> for label is 1
The index of <unk> for label is 0
['<unk>', '<pad>', 'B-MEDICALRECORD', 'B-PATIENT', 'I-PATIENT', 'B-IDNUM', 'O', 'B-DATE', 'B-DOCTOR', 'I-DOCTOR', 'B-STREET', 'I-STREET', 'B-CITY', 'B-STATE', 'B-ZIP', 'B-DEPARTMENT', 'I-DEPARTMENT', 'B-HOSPITAL', 'I-CITY', 'I-HOSPITAL', 'B-AGE', 'I-DATE', 'I-IDNUM', 'I-ROOM', 'B-LOCATION-OTHER', 'I-LOCATION-OTHER']


In [84]:
#label_voc.get_stoi()

In [85]:
type(label_voc.get_itos())

list

In [86]:
#全部label各自的數量
from collections import defaultdict

label_counts = defaultdict(int)#用於建立一個預設值為0的字典，這樣可以避免在使用普通字典時，當尋找不存在的鍵時產生KeyError異常
total_label_count = 0

for _, labels in train_data:
    for label in labels:
        label_counts[label] += 1
        total_label_count += 1

print("各個label的數量:", dict(label_counts))#將label_counts轉成字典型態
print("各個label的數量相加:", total_label_count)


各個label的數量: {'B-MEDICALRECORD': 693, 'B-PATIENT': 696, 'I-PATIENT': 1620, 'B-IDNUM': 2203, 'O': 479633, 'B-DATE': 2566, 'B-DOCTOR': 3172, 'I-DOCTOR': 2312, 'B-STREET': 515, 'I-STREET': 1084, 'B-CITY': 535, 'B-STATE': 491, 'B-ZIP': 515, 'B-DEPARTMENT': 620, 'I-DEPARTMENT': 757, 'B-HOSPITAL': 335, 'I-CITY': 108, 'I-HOSPITAL': 174, 'B-AGE': 40, 'I-DATE': 34, 'I-IDNUM': 7, 'B-ROOM': 1, 'I-ROOM': 3, 'B-PHONE': 1, 'B-ORGANIZATION': 1, 'I-ORGANIZATION': 1, 'B-BIOID': 1, 'B-LOCATION-OTHER': 2, 'I-LOCATION-OTHER': 4}
各個label的數量相加: 498124


In [87]:
# 把篩選(MIN_FREQ=2)過後label各自的數量標出來
filter_label_counts_dict = {}
for label in label_voc.get_itos():
    if label in label_counts:   #把篩選(MIN_FREQ=2)過後label從全部的label挑出來顯示
        filter_label_counts_dict[label] = label_counts[label]

if 'O' in filter_label_counts_dict: #把"O"刪掉
    del filter_label_counts_dict['O']

print(filter_label_counts_dict)

{'B-MEDICALRECORD': 693, 'B-PATIENT': 696, 'I-PATIENT': 1620, 'B-IDNUM': 2203, 'B-DATE': 2566, 'B-DOCTOR': 3172, 'I-DOCTOR': 2312, 'B-STREET': 515, 'I-STREET': 1084, 'B-CITY': 535, 'B-STATE': 491, 'B-ZIP': 515, 'B-DEPARTMENT': 620, 'I-DEPARTMENT': 757, 'B-HOSPITAL': 335, 'I-CITY': 108, 'I-HOSPITAL': 174, 'B-AGE': 40, 'I-DATE': 34, 'I-IDNUM': 7, 'I-ROOM': 3, 'B-LOCATION-OTHER': 2, 'I-LOCATION-OTHER': 4}


In [88]:
##由大排到小
#sorted() 函数透過 filter_label_counts_dict.get 用於獲取每個標籤對應的計數，並將其用於排序。reverse=True 表示按降序排列。
sorted_labels = sorted(filter_label_counts_dict, key=filter_label_counts_dict.get, reverse=True)


for label in sorted_labels:
    print(f"{label}: {filter_label_counts_dict[label]}")

B-DOCTOR: 3172
B-DATE: 2566
I-DOCTOR: 2312
B-IDNUM: 2203
I-PATIENT: 1620
I-STREET: 1084
I-DEPARTMENT: 757
B-PATIENT: 696
B-MEDICALRECORD: 693
B-DEPARTMENT: 620
B-CITY: 535
B-STREET: 515
B-ZIP: 515
B-STATE: 491
B-HOSPITAL: 335
I-HOSPITAL: 174
I-CITY: 108
B-AGE: 40
I-DATE: 34
I-IDNUM: 7
I-LOCATION-OTHER: 4
I-ROOM: 3
B-LOCATION-OTHER: 2


In [89]:
#算出篩選後label的總數
total_counts = 0
for label in filter_label_counts_dict:
    total_counts += filter_label_counts_dict[label]
    
print(total_counts)

18486


In [90]:
#算出各個label的百分比
for label in filter_label_counts_dict:
    percent = filter_label_counts_dict[label] / total_counts * 100
    print(f"{label}:{percent:.2f}%")#{percent:.2f}可以顯示小數點後兩位的percent

B-MEDICALRECORD:3.75%
B-PATIENT:3.77%
I-PATIENT:8.76%
B-IDNUM:11.92%
B-DATE:13.88%
B-DOCTOR:17.16%
I-DOCTOR:12.51%
B-STREET:2.79%
I-STREET:5.86%
B-CITY:2.89%
B-STATE:2.66%
B-ZIP:2.79%
B-DEPARTMENT:3.35%
I-DEPARTMENT:4.09%
B-HOSPITAL:1.81%
I-CITY:0.58%
I-HOSPITAL:0.94%
B-AGE:0.22%
I-DATE:0.18%
I-IDNUM:0.04%
I-ROOM:0.02%
B-LOCATION-OTHER:0.01%
I-LOCATION-OTHER:0.02%


In [91]:
#算出各個label的百分比，由大排到小
#sorted() 函数透過 filter_label_counts_dict.get 用於獲取每個標籤對應的計數，並將其用於排序。reverse=True 表示按降序排列。
sorted_labels = sorted(filter_label_counts_dict, key=filter_label_counts_dict.get, reverse=True)

for label in sorted_labels:
    percent = filter_label_counts_dict[label] / total_counts * 100
    print(f"{label}: {percent:.2f}%")

B-DOCTOR: 17.16%
B-DATE: 13.88%
I-DOCTOR: 12.51%
B-IDNUM: 11.92%
I-PATIENT: 8.76%
I-STREET: 5.86%
I-DEPARTMENT: 4.09%
B-PATIENT: 3.77%
B-MEDICALRECORD: 3.75%
B-DEPARTMENT: 3.35%
B-CITY: 2.89%
B-STREET: 2.79%
B-ZIP: 2.79%
B-STATE: 2.66%
B-HOSPITAL: 1.81%
I-HOSPITAL: 0.94%
I-CITY: 0.58%
B-AGE: 0.22%
I-DATE: 0.18%
I-IDNUM: 0.04%
I-LOCATION-OTHER: 0.02%
I-ROOM: 0.02%
B-LOCATION-OTHER: 0.01%


In [92]:
import pandas as pd

# 創建一個包含三個欄位的表格
df = pd.DataFrame(columns=['Tag', 'Count', 'Percentage'])

# 計算總label和percent
for label in sorted_labels:
    counts = filter_label_counts_dict[label]
    percent = counts / total_counts * 100
    df = df.append({'Tag': label, 'Count': counts, 'Percentage': f'{percent:.2f}%'}, ignore_index=True)


# 打印表格
print(df)


                 Tag Count Percentage
0           B-DOCTOR  3172     17.16%
1             B-DATE  2566     13.88%
2           I-DOCTOR  2312     12.51%
3            B-IDNUM  2203     11.92%
4          I-PATIENT  1620      8.76%
5           I-STREET  1084      5.86%
6       I-DEPARTMENT   757      4.09%
7          B-PATIENT   696      3.77%
8    B-MEDICALRECORD   693      3.75%
9       B-DEPARTMENT   620      3.35%
10            B-CITY   535      2.89%
11          B-STREET   515      2.79%
12             B-ZIP   515      2.79%
13           B-STATE   491      2.66%
14        B-HOSPITAL   335      1.81%
15        I-HOSPITAL   174      0.94%
16            I-CITY   108      0.58%
17             B-AGE    40      0.22%
18            I-DATE    34      0.18%
19           I-IDNUM     7      0.04%
20  I-LOCATION-OTHER     4      0.02%
21            I-ROOM     3      0.02%
22  B-LOCATION-OTHER     2      0.01%


lab2

In [93]:
from collections import Counter, OrderedDict
from torchtext.vocab import vocab
import sys
MIN_FREQ=2
voc_counter = Counter()

for(token,label) in train_data:
    voc_counter.update(token)
token_voc = vocab(voc_counter,min_freq = MIN_FREQ,specials = ("<unk>","<pad>"))
token_voc.set_default_index(0)

In [94]:
INPUT_DIM = len(token_voc)
print("Unique token in the corpus is",INPUT_DIM)
PAD_IDX = token_voc.get_stoi()["<pad>"]
print("The index of <pad>",PAD_IDX)
print("The index of <unk>",token_voc.get_stoi()["<unk>"])
 
#token_voc.get_stoi() # get_stoi()會回傳一個字典(dictionary)，將詞彙表中的詞對應到整數(index)
token_voc.get_itos() #get_itos()則是將get_stoi()回傳的字典做反轉，將整數對應到詞彙

Unique token in the corpus is 7958
The index of <pad> 1
The index of <unk> 0


['<unk>',
 '<pad>',
 ',',
 'JOAN',
 'Last',
 'edited',
 ':',
 'Page',
 '2',
 'CLINICAL',
 'SCC',
 'vulva',
 '.',
 'Right',
 'suture',
 'anteriorly',
 'MACROSCOPIC',
 'A',
 'Specimen',
 'labelled',
 '"',
 'vulvectomy',
 'stich',
 'anterior',
 'consists',
 'of',
 'a',
 'right',
 'with',
 'one',
 'stitch',
 'taken',
 'as',
 '12',
 "o'clock",
 'The',
 'specimen',
 'measures',
 '120mm',
 'from',
 'to',
 '6',
 'and',
 '40mm',
 '3',
 '9',
 'There',
 'are',
 'two',
 'lesions',
 'on',
 'the',
 'mucosal',
 'surface',
 'main',
 'lesion',
 'being',
 'ulcerated',
 '25',
 'x',
 '20mm',
 'is',
 'located',
 '3mm',
 'margin',
 'second',
 'flat',
 'pigmented',
 '15mm',
 'inked',
 'blue',
 '(',
 ')',
 'black',
 'serially',
 'sectioned',
 'into',
 '21',
 'blocks',
 'Representative',
 'sections',
 'according',
 'diagram',
 'attached',
 'Blocks',
 '1',
 '-',
 'longitudinal',
 'section',
 'slice',
 ';',
 'four',
 '&',
 '4',
 'five',
 '5',
 'six',
 '7',
 '8',
 'seven',
 '10',
 'eight',
 '11',
 'ten',
 '13',
 

In [95]:
from collections import Counter, OrderedDict
from torchtext.vocab import vocab
import sys
MIN_FREQ=2
voc_counter = Counter()

for(token,label) in train_data:
    voc_counter.update(label)
label_voc = vocab(voc_counter,min_freq = MIN_FREQ,specials = ("<unk>","<pad>"))
label_voc.set_default_index(0)

In [96]:
#篩選(MIN_FREQ=2)後剩下的label
OUTPUT_DIM = len(label_voc)
print("Unique label in the corpus is",OUTPUT_DIM)
LABEL_PAD_IDX = label_voc.get_stoi()["<pad>"]
print("The index of <pad> for label is",LABEL_PAD_IDX)
print("The index of <unk> for label is",label_voc.get_stoi()["<unk>"])
print(label_voc.get_itos())


Unique label in the corpus is 26
The index of <pad> for label is 1
The index of <unk> for label is 0
['<unk>', '<pad>', 'B-MEDICALRECORD', 'B-PATIENT', 'I-PATIENT', 'B-IDNUM', 'O', 'B-DATE', 'B-DOCTOR', 'I-DOCTOR', 'B-STREET', 'I-STREET', 'B-CITY', 'B-STATE', 'B-ZIP', 'B-DEPARTMENT', 'I-DEPARTMENT', 'B-HOSPITAL', 'I-CITY', 'I-HOSPITAL', 'B-AGE', 'I-DATE', 'I-IDNUM', 'I-ROOM', 'B-LOCATION-OTHER', 'I-LOCATION-OTHER']


# Generate batches of vectors

In [143]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch

def collate_batch(batch):
    labels_list, tokens_list = [], []
    for(tokens, labels) in batch:
        labels_list.append(torch.tensor(label_voc.lookup_indices(labels)))
        tokens_list.append(torch.tensor(token_voc.lookup_indices(tokens)))
    return pad_sequence(tokens_list, padding_value = PAD_IDX, batch_first = False),\
           pad_sequence(labels_list, padding_value = LABEL_PAD_IDX, batch_first = False)
train_dataloader = DataLoader(list(train_data), batch_size = 8, shuffle = False, collate_fn = collate_batch)
next(iter(train_dataloader))

(tensor([[ 0,  0,  0,  4,  9, 10, 13, 16],
         [ 1,  2,  1,  5,  6, 11, 11,  6],
         [ 1,  3,  1,  6,  1, 12, 14,  1],
         [ 1,  1,  1,  0,  1,  1, 15,  1],
         [ 1,  1,  1,  7,  1,  1, 12,  1],
         [ 1,  1,  1,  6,  1,  1,  1,  1],
         [ 1,  1,  1,  8,  1,  1,  1,  1]]),
 tensor([[2, 3, 5, 6, 6, 6, 6, 6],
         [1, 4, 1, 6, 6, 6, 6, 6],
         [1, 4, 1, 6, 1, 6, 6, 1],
         [1, 1, 1, 7, 1, 1, 6, 1],
         [1, 1, 1, 6, 1, 1, 6, 1],
         [1, 1, 1, 6, 1, 1, 1, 1],
         [1, 1, 1, 6, 1, 1, 1, 1]]))

In [144]:
import random
BATCH_SIZE = 1024
class BatchSampler():
    def __init__(self, data, batch_size):
        self.pooled_indices = []
        self.data = data
        self.batch_size = batch_size
        self.len = len(list(data))
        
    def __iter__(self):
        self.pooled_indices = []
        indices = [(index, len(data[0])) for index, data in enumerate(self.data)]
        random.shuffle(indices)
        #create pool of indices with similar lengths
        for i in range(0, len(indices), BATCH_SIZE * 100):
            self.pooled_indices.extend(sorted(indices[i:i + BATCH_SIZE * 100], key = lambda x:x[1],\
                                             reverse = True))
        self.pooled_indices = [x[0] for x in self.pooled_indices]
        #yield indices gor current batch
        for i in range(0, len(self.pooled_indices),BATCH_SIZE):
            yield self.pooled_indices[i:i + BATCH_SIZE]
    
    def __len__(self):
        return(self.len + self.batch_size -1) // self.batch_size
    
train_list = list(train_data)
bucket_train_dataloader = DataLoader(train_list, batch_sampler = BatchSampler(train_list, BATCH_SIZE), \
                                     collate_fn = collate_batch, pin_memory = True)

for idx, batch in enumerate(bucket_train_dataloader):
    print(batch)
    print(batch[0].shape)
    print(batch[1].shape)
    break

(tensor([[  84,   84,   71,  ...,   35,   18, 3822],
        [   6,    6,   85,  ..., 1988,   19,    6],
        [  85,   85, 1325,  ..., 1569,   20,    0],
        ...,
        [  40,   90,    1,  ...,    1,    1,    1],
        [  27,    1,    1,  ...,    1,    1,    1],
        [  12,    1,    1,  ...,    1,    1,    1]]), tensor([[6, 6, 6,  ..., 6, 6, 6],
        [6, 6, 6,  ..., 6, 6, 6],
        [6, 6, 6,  ..., 6, 6, 6],
        ...,
        [6, 6, 1,  ..., 1, 1, 1],
        [6, 1, 1,  ..., 1, 1, 1],
        [6, 1, 1,  ..., 1, 1, 1]]))
torch.Size([270, 1024])
torch.Size([270, 1024])


# Create a network model with embeddings

In [145]:
from torch import nn

class BiLSTMTagger(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim,
                 output_dim, n_layers, bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim,
                                      padding_idx = pad_idx)
        self.lstm = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers = n_layers,
                           bidirectional = bidirectional,
                           dropout = dropout if n_layers > 1
                           else 0)
        
        self.fc = nn.Linear(hidden_dim *2 if bidirectional else 
                           hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        outputs, (hidden, cell) = self.lstm(embedded)
        
        #output = [sent len, batch size, hid dim*n directions ]
        predictions = self.fc(self.dropout(outputs))
        
        #predictions = [sent len, batch size, output dim]
        return predictions

In [146]:
EMBEDDING_DIM = 300
HIDDEN_DIM = 256
DROPOUT = 0.25
N_LAYERS = 1
BIDIRECTION = True
model = BiLSTMTagger(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM,
                    OUTPUT_DIM, N_LAYERS, BIDIRECTION, DROPOUT, PAD_IDX)
model

BiLSTMTagger(
  (embedding): Embedding(7958, 300, padding_idx=1)
  (lstm): LSTM(300, 256, bidirectional=True)
  (fc): Linear(in_features=512, out_features=26, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

# Train the model

In [152]:
import torch.optim as optim
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index = LABEL_PAD_IDX)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

In [157]:
def train(model, iterator,optimizer, criterion, tag_pad_idx):
    epoch_loss = 0
    epoch_f_score = 0
    model.train()
    y_preds = []
    y_golds = []
    
    for idx, (text,tags) in enumerate(iterator):
        text = text.to(device)
        tags = tags.to(device)
        
        optimizer.zero_grad()
        predictions = model(text)
        
        y_pred, y_gold = collect_f_score_info(predictions, tags, tag_pad_idx)
        predictions = predictions.view(-1, predictions.shape[-1])
        tags = tags.view(-1)
        
        loss = criterion(predictions, tags)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        y_preds.extend(y_pred)
        y_golds.extend(y_gold)
    f_score = f1_score(y_golds, y_preds, zero_division = 0)
    
    return epoch_loss / len(iterator), f_score

In [158]:
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report

def collect_f_score_info(preds, ys, tag_pad_idx):
    max_preds = preds.argmax(dim = 2, keepdim = True) #get the index of the max probability
    max_preds = torch.transpose(max_preds, 0, 1)
    y_pred = []
    y_gold = []
    ys = torch.transpose(ys, 0, 1)
    max_preds = list(max_preds)
    for idx, y in enumerate(ys):
        s_g = []
        s_p = []
        py = list(max_preds[idx])
        for idx, g in enumerate(y):
            g = g.item()
            p = py[idx].item()
            if g != tag_pad_idx:
                s_g.append(label_voc.lookup_token(g))
                s_p.append(label_voc.lookup_token(p))
                y_gold.append(s_g)
                y_pred.append(s_p)
                
        return y_pred, y_gold
        
    

In [159]:
def epoch_time(star_time, end_time):
    elapsed_time = end_time = start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [162]:
import os

directory = "./models/lstm"
if not os.path.exists(directory):
    os.makedirs(directory)

In [165]:
import time
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss, train_f = train(model, bucket_train_dataloader, optimizer, criterion, LABEL_PAD_IDX)
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    torch.save(model.state_dict(), "./models/lstm/lstm.pt")
    
    print(f"Epoch:{epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss:{train_loss:.3f} | Train F-score:{train_f*100:.2f}%")

Epoch:01 | Epoch Time: 27975382m 44s
	Train Loss:0.097 | Train F-score:44.90%
Epoch:02 | Epoch Time: 27975384m 8s
	Train Loss:0.077 | Train F-score:100.00%
Epoch:03 | Epoch Time: 27975385m 49s
	Train Loss:0.065 | Train F-score:48.05%
Epoch:04 | Epoch Time: 27975387m 46s
	Train Loss:0.056 | Train F-score:80.95%
Epoch:05 | Epoch Time: 27975389m 22s
	Train Loss:0.052 | Train F-score:63.58%
Epoch:06 | Epoch Time: 27975390m 49s
	Train Loss:0.046 | Train F-score:100.00%
Epoch:07 | Epoch Time: 27975392m 17s
	Train Loss:0.041 | Train F-score:94.39%
Epoch:08 | Epoch Time: 27975393m 54s
	Train Loss:0.040 | Train F-score:83.81%
Epoch:09 | Epoch Time: 27975395m 33s
	Train Loss:0.037 | Train F-score:81.74%
Epoch:10 | Epoch Time: 27975397m 9s
	Train Loss:0.035 | Train F-score:98.11%


In [168]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    epoch_loss = 0
    epoch_f_score = 0
    model.eval()
    y_preds = []
    y_golds = []
    with torch.no_grad():
          for idx, (text,tags) in enumerate(iterator):
                text = text.to(device)
                tags = tags.to(device)

                optimizer.zero_grad()
                predictions = model(text)

                y_pred, y_gold = collect_f_score_info(predictions, tags, tag_pad_idx)
                predictions = predictions.view(-1, predictions.shape[-1])
                tags = tags.view(-1)
                loss = criterion(predictions, tags)      

                epoch_loss += loss.item()
                y_preds.extend(y_pred)
                y_golds.extend(y_gold)
                
    print(classification_report(y_golds, y_preds, zero_division = 0))
    f_score = f1_score(y_golds, y_preds, zero_division = 0)
    return epoch_loss / len(iterator), f_score

In [169]:
valid_list = list(valid_data)
bucket_valid_dataloader = DataLoader(valid_list, batch_sampler = BatchSampler(valid_list,BATCH_SIZE), collate_fn = collate_batch, pin_memory = True)
valid_loss, valid_f =  evaluate(model, bucket_valid_dataloader, criterion, LABEL_PAD_IDX)
print(f"\Validation Loss:{valid_loss:.3f} | Validation F-score:{ valid_f * 100:.2f}%")


               precision    recall  f1-score   support

         CITY       1.00      1.00      1.00         3
   DEPARTMENT       1.00      1.00      1.00        12
       DOCTOR       1.00      1.00      1.00        13
     HOSPITAL       1.00      1.00      1.00         6
        IDNUM       1.00      1.00      1.00         4
MEDICALRECORD       1.00      1.00      1.00         1
      PATIENT       1.00      1.00      1.00         3
        STATE       1.00      1.00      1.00         3
       STREET       1.00      1.00      1.00         3
          ZIP       1.00      1.00      1.00         3

    micro avg       1.00      1.00      1.00        51
    macro avg       1.00      1.00      1.00        51
 weighted avg       1.00      1.00      1.00        51

\Validation Loss:0.037 | Validation F-score:100.00%
