In [None]:
# Mount Google Drive into the Colab filesystem; a later cell reads the dataset
# archive from /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!pip install vocab

Collecting vocab
  Downloading vocab-0.0.5-py3-none-any.whl (7.6 kB)
Installing collected packages: vocab
Successfully installed vocab-0.0.5


In [None]:
!unzip /content/gdrive/MyDrive/multicardioner_train+dev_240429.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: multicardioner_train+dev_240429/track2/drugtemist_train/en/brat/es-S0465-546X2009000300008-1.txt  
  inflating: multicardioner_train+dev_240429/track2/drugtemist_train/en/brat/es-S1699-695X2016000200009-1.ann  
  inflating: multicardioner_train+dev_240429/track2/drugtemist_train/en/brat/es-S0210-56912009000800006-3.ann  
  inflating: multicardioner_train+dev_240429/track2/drugtemist_train/en/brat/es-S1887-85712013000200013-1.ann  
  inflating: multicardioner_train+dev_240429/track2/drugtemist_train/en/brat/es-S0365-66912011000400005-2.ann  
  inflating: multicardioner_train+dev_240429/track2/drugtemist_train/en/brat/es-S0376-78922014000200011-1.txt  
  inflating: multicardioner_train+dev_240429/track2/drugtemist_train/en/brat/es-S0365-66912006000400010-1.txt  
  inflating: multicardioner_train+dev_240429/track2/drugtemist_train/en/brat/es-S1134-80462015000100006-1.ann  
  inflating: multicardioner_train+dev_2

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import os

class BiLSTMCRF(nn.Module):
    """Sequence tagger: embedding -> bidirectional LSTM -> linear emission scores,
    combined with a learned tag-to-tag transition matrix (CRF-style scoring).

    Conventions used by the code below (K = number of tags, b = batch size):
      * Token id 0 is treated as padding when building the mask.
      * `transition[p, c]` is the score of moving from previous tag `p` to
        current tag `c` (see the indexing in `cal_loss`).
      * The recursions in `cal_loss` and `predict` update only the first
        `n_unfinished` rows of the batch at each time step, which is only
        correct when the batch is ordered from longest to shortest sentence.
    """
    def __init__(self, sent_vocab_size, tag_vocab_size, dropout_rate=0.5, embed_size=256, hidden_size=256):
        """Build the layers.

        Args:
            sent_vocab_size: number of rows in the token embedding table.
            tag_vocab_size: number of tags K (size of the emission output and
                of each side of the transition matrix).
            dropout_rate: dropout probability applied to the emission scores.
            embed_size: token embedding dimension.
            hidden_size: LSTM hidden size per direction.
        """
        super(BiLSTMCRF, self).__init__()
        self.dropout_rate = dropout_rate
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(sent_vocab_size, embed_size)
        self.dropout = nn.Dropout(dropout_rate)
        # batch_first is left at its default (False): the LSTM consumes (len, b, e).
        self.encoder = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, bidirectional=True)
        # Bidirectional output concatenates both directions, hence hidden_size * 2.
        self.hidden2emit_score = nn.Linear(hidden_size * 2, tag_vocab_size)
        # Randomly initialised (K, K) transition scores, learned with the rest of the model.
        self.transition = nn.Parameter(torch.randn(tag_vocab_size, tag_vocab_size))

    def forward(self, sentences, tags, sen_lengths):
        """Compute the negative log-likelihood of `tags` for each sentence.

        Args:
            sentences: (b, len) tensor of token ids; id 0 marks padding.
            tags: (b, len) tensor of gold tag ids.
            sen_lengths: per-sentence lengths, passed to `pack_padded_sequence`.

        Returns:
            Tensor of shape (b,) with one loss value per sentence (not reduced
            to a scalar).
        """
        mask = (sentences != 0).to(self.device)  # (b, len), True on real tokens
        sentences = sentences.transpose(0, 1)  # (len, b)
        sentences = self.embedding(sentences)  # (len, b, e)
        emit_score = self.encode(sentences, sen_lengths)  # (b, len, K)
        loss = self.cal_loss(tags, mask, emit_score)  # (b,)
        return loss

    def encode(self, sentences, sent_lengths):
        """Run the BiLSTM over embedded sentences and project to emission scores.

        Args:
            sentences: (len, b, e) embedded tokens (time-major).
            sent_lengths: per-sentence lengths. `pack_padded_sequence` is called
                with its defaults, which require lengths in decreasing order.

        Returns:
            (b, len, K) emission scores, with dropout applied to the scores.
        """
        padded_sentences = pack_padded_sequence(sentences, sent_lengths)
        hidden_states, _ = self.encoder(padded_sentences)
        # Unpack back to a padded, batch-first tensor: (b, len, 2 * hidden_size).
        hidden_states, _ = pad_packed_sequence(hidden_states, batch_first=True)
        emit_score = self.hidden2emit_score(hidden_states)  # (b, len, K)
        emit_score = self.dropout(emit_score)
        return emit_score

    def cal_loss(self, tags, mask, emit_score):
        """Negative log-likelihood: log-partition minus the gold path score.

        Args:
            tags: (b, len) gold tag ids.
            mask: (b, len) mask that is true on non-padding positions.
            emit_score: (b, len, K) emission scores.

        Returns:
            Tensor of shape (b,) holding the loss of each sentence.
        """
        batch_size, sent_len = tags.shape
        # Emission score of the gold tag at every position: (b, len).
        score = torch.gather(emit_score, dim=2, index=tags.unsqueeze(dim=2)).squeeze(dim=2)
        # Add the transition score from the previous gold tag to the current one.
        score[:, 1:] += self.transition[tags[:, :-1], tags[:, 1:]]
        # Padding positions contribute nothing to the gold path score.
        total_score = (score * mask.type(torch.float)).sum(dim=1)  # (b,)
        # d[b, 0, k]: log-sum-exp score of all partial paths ending in tag k.
        d = torch.unsqueeze(emit_score[:, 0], dim=1)  # (b, 1, K)
        for i in range(1, sent_len):
            # Number of sentences that still have a real token at position i;
            # the slicing below takes them to be the first rows of the batch.
            n_unfinished = mask[:, i].sum()
            d_uf = d[: n_unfinished]  # (uf, 1, K)
            # [b, prev, cur] = emission[b, cur] + transition[prev, cur]
            emit_and_transition = emit_score[: n_unfinished, i].unsqueeze(dim=1) + self.transition  # (uf, K, K)
            log_sum = d_uf.transpose(1, 2) + emit_and_transition  # (uf, K, K)
            # Subtract the per-column maximum before exponentiating.
            max_v = log_sum.max(dim=1)[0].unsqueeze(dim=1)  # (uf, 1, K)
            log_sum = log_sum - max_v
            # Reduce over the previous-tag axis.
            d_uf = max_v + torch.logsumexp(log_sum, dim=1).unsqueeze(dim=1)  # (uf, 1, K)
            # Finished sentences keep the value from their own last position.
            d = torch.cat((d_uf, d[n_unfinished:]), dim=0)
        d = d.squeeze(dim=1)  # (b, K)
        max_d = d.max(dim=-1)[0]  # (b,)
        # Log-partition value per sentence: log-sum-exp over the final tag.
        d = max_d + torch.logsumexp(d - max_d.unsqueeze(dim=1), dim=1)  # (b,)
        llk = total_score - d
        loss = -llk
        return loss

    def predict(self, sentences, sen_lengths):
        """Decode the highest-scoring tag sequence for each sentence.

        Args:
            sentences: (b, len) tensor of token ids; id 0 marks padding.
            sen_lengths: per-sentence lengths; the decoding loop runs up to
                `sen_lengths[0]`, so the first sentence is taken as the longest.

        Returns:
            A list with one list of tag ids per sentence.
        """
        batch_size = sentences.shape[0]
        mask = (sentences != 0)
        sentences = sentences.transpose(0, 1)
        sentences = self.embedding(sentences)
        emit_score = self.encode(sentences, sen_lengths)  # (b, len, K)
        # tags[b][k] is the best path found so far that ends in tag k. The outer
        # list repeats one inner list object; entries are only ever replaced
        # with new lists below, never mutated in place.
        tags = [[[i] for i in range(self.hidden2emit_score.out_features)]] * batch_size
        d = torch.unsqueeze(emit_score[:, 0], dim=1)  # (b, 1, K)
        for i in range(1, sen_lengths[0]):
            n_unfinished = mask[:, i].sum()
            d_uf = d[: n_unfinished]  # (uf, 1, K)
            emit_and_transition = self.transition + emit_score[: n_unfinished, i].unsqueeze(dim=1)  # (uf, K, K)
            new_d_uf = d_uf.transpose(1, 2) + emit_and_transition  # (uf, K, K), indexed [b, prev, cur]
            # Best previous tag for every current tag.
            d_uf, max_idx = torch.max(new_d_uf, dim=1)  # both (uf, K)
            max_idx = max_idx.tolist()
            # Extend the best path ending in previous tag k with current tag j.
            tags[: n_unfinished] = [[tags[b][k] + [j] for j, k in enumerate(max_idx[b])] for b in range(n_unfinished)]
            d = torch.cat((torch.unsqueeze(d_uf, dim=1), d[n_unfinished:]), dim=0)
        d = d.squeeze(dim=1)  # (b, K)
        # Pick the final tag with the highest score and return its path.
        _, max_idx = torch.max(d, dim=1)
        max_idx = max_idx.tolist()
        tags = [tags[b][k] for b, k in enumerate(max_idx)]
        return tags

    @property
    def device(self):
        """Device on which the embedding weights (and so the model) live."""
        return self.embedding.weight.device


def parse_annotations(ann_file_path, entity_label='FARMACO'):
    """Read a BRAT standoff ``.ann`` file and map annotated tokens to their label.

    Each text-bound annotation line has the layout
    ``ID <TAB> LABEL START END <TAB> MENTION``. Every whitespace-separated
    token of the mention is registered, so a multi-word mention such as
    "acetylsalicylic acid" marks both words (previously only the last word
    was kept).

    Args:
        ann_file_path: path to the ``.ann`` file.
        entity_label: annotation label to keep; lines with any other label
            are ignored.

    Returns:
        dict mapping each mention token to ``entity_label``.
    """
    annotations = {}
    with open(ann_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) >= 3:
                # Tab-separated BRAT line: the middle field carries the label
                # and offsets, the last field the mention text.
                header = fields[1].split()
                mention_tokens = fields[2].split()
            else:
                # Fallback for files that use plain spaces instead of tabs:
                # ID LABEL START END MENTION...
                parts = line.split()
                if len(parts) < 5:
                    continue
                header = parts[1:2]
                mention_tokens = parts[4:]
            if not header or header[0] != entity_label:
                continue
            for token in mention_tokens:
                annotations[token] = entity_label
    return annotations


def parse_text_with_annotations(txt_file_path, annotations):
    """Split a text file into sentences of whitespace tokens and tag each token.

    A token is tagged with ``annotations[token]`` when it is a key of
    ``annotations``; otherwise the lookup is retried with surrounding
    punctuation removed (so "aspirin," or "aspirin." still match "aspirin"),
    and the token falls back to the tag "O". Tokens themselves are kept
    exactly as they appear in the text.

    A sentence ends at any token whose last character is '.', '!' or '?'.
    Text after the final terminator is returned as a last sentence instead of
    being discarded.

    Args:
        txt_file_path: path to the UTF-8 text file.
        annotations: dict mapping annotated tokens to their tag.

    Returns:
        (sentences, tags): two parallel lists; ``sentences[i]`` is a list of
        tokens and ``tags[i]`` the list of their tags.
    """
    # Punctuation that commonly clings to a whitespace-delimited token.
    edge_punctuation = '.,;:!?()[]{}"\''

    with open(txt_file_path, 'r', encoding='utf-8') as file:
        words = file.read().split()

    sentences = []
    tags = []
    current_sentence = []
    current_tags = []
    for word in words:
        tag = annotations.get(word)
        if tag is None:
            tag = annotations.get(word.strip(edge_punctuation), "O")
        current_sentence.append(word)
        current_tags.append(tag)
        if word.endswith(('.', '!', '?')):
            sentences.append(current_sentence)
            tags.append(current_tags)
            current_sentence = []
            current_tags = []

    # Flush a trailing sentence that has no terminating punctuation.
    if current_sentence:
        sentences.append(current_sentence)
        tags.append(current_tags)
    return sentences, tags


def load_and_preprocess_data(data_folder):
    """Load every ``.txt`` document in a folder together with its ``.ann`` file.

    Files are visited in sorted order so the result is reproducible
    (``os.listdir`` order is unspecified). The annotation path is derived by
    swapping the extension only, and a document without an ``.ann`` file is
    loaded with no annotations rather than aborting the whole load.

    Args:
        data_folder: directory containing paired ``.txt`` / ``.ann`` files.

    Returns:
        list of ``(sentences, tags)`` tuples, one per document, as produced by
        ``parse_text_with_annotations``.
    """
    data = []
    for file_name in sorted(os.listdir(data_folder)):
        stem, extension = os.path.splitext(file_name)
        if extension != '.txt':
            continue
        txt_file_path = os.path.join(data_folder, file_name)
        ann_file_path = os.path.join(data_folder, stem + '.ann')
        if os.path.isfile(ann_file_path):
            annotations = parse_annotations(ann_file_path)
        else:
            annotations = {}
        sentences, tags = parse_text_with_annotations(txt_file_path, annotations)
        data.append((sentences, tags))
    return data


def main(data_folder='/content/multicardioner_train+dev_240429/track2/cardioccc_dev/en/brat',
         num_epochs=10, batch_size=32, learning_rate=0.001):
    """Train a BiLSTMCRF tagger on the BRAT documents found in ``data_folder``.

    Steps: load the documents, flatten them into sentences, build token and
    tag vocabularies, pad the sentences into batches, and minimise the
    model's mean negative log-likelihood with Adam.

    Args:
        data_folder: directory of paired ``.txt`` / ``.ann`` files.
        num_epochs: number of passes over the training sentences.
        batch_size: maximum number of sentences per batch.
        learning_rate: Adam learning rate.

    Returns:
        The trained ``BiLSTMCRF`` model.

    Raises:
        ValueError: if no sentence could be loaded from ``data_folder``.
    """
    # Load and preprocess the data
    documents = load_and_preprocess_data(data_folder)

    # Flatten documents into individual (tokens, tags) sentence pairs.
    examples = [
        (words, labels)
        for sentences, tags in documents
        for words, labels in zip(sentences, tags)
        if words
    ]
    if not examples:
        raise ValueError(f'No sentences found in {data_folder!r}')

    # Build vocabularies. Token id 0 is reserved for padding because
    # BiLSTMCRF builds its mask with `sentences != 0`; giving id 0 to a real
    # word would make the model ignore that word.
    pad_id = 0
    sent_vocab = {'<PAD>': pad_id}
    # One entry per individual tag (not per tag sequence).
    tag_vocab = {'O': 0}
    for words, labels in examples:
        for word in words:
            sent_vocab.setdefault(word, len(sent_vocab))
        for label in labels:
            tag_vocab.setdefault(label, len(tag_vocab))

    # BiLSTMCRF expects each batch ordered from longest to shortest sentence
    # (pack_padded_sequence default, and the row slicing in its recursions).
    # Sorting globally satisfies that for every batch and minimises padding.
    examples.sort(key=lambda pair: len(pair[0]), reverse=True)

    # Pad every batch to a rectangular tensor once, up front; ragged lists
    # cannot be turned into a tensor directly.
    batches = []
    for start in range(0, len(examples), batch_size):
        batch = examples[start:start + batch_size]
        lengths = [len(words) for words, _ in batch]
        max_len = lengths[0]
        token_ids = torch.full((len(batch), max_len), pad_id, dtype=torch.long)
        # Padded tag positions need any valid tag id; the mask removes their
        # contribution to the loss.
        tag_ids = torch.full((len(batch), max_len), tag_vocab['O'], dtype=torch.long)
        for row, (words, labels) in enumerate(batch):
            token_ids[row, :len(words)] = torch.tensor(
                [sent_vocab[word] for word in words], dtype=torch.long)
            tag_ids[row, :len(labels)] = torch.tensor(
                [tag_vocab[label] for label in labels], dtype=torch.long)
        batches.append((token_ids, tag_ids, lengths))

    # Initialize the model
    device = torch.device('cpu')
    model = BiLSTMCRF(len(sent_vocab), len(tag_vocab))
    model.to(device)

    # Define optimizer (the model computes its own CRF loss, so no separate
    # criterion is needed)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        for token_ids, tag_ids, lengths in batches:
            # Zero gradients
            optimizer.zero_grad()

            # Forward pass: the model returns one loss per sentence, so reduce
            # to a scalar before back-propagating.
            loss = model(token_ids.to(device), tag_ids.to(device), lengths).mean()

            # Backward pass
            loss.backward()

            # Update weights
            optimizer.step()

            total_loss += loss.item()

        # Calculate average loss for the epoch
        avg_loss = total_loss / len(batches)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

    return model

if __name__ == '__main__':
    main()


ValueError: expected sequence of length 17 at dim 1 (got 5)