In [None]:
# Mount Google Drive inside the Colab VM at /content/gdrive so files stored
# on Drive can be read through the local filesystem.
# NOTE(review): paths used later must be rooted at this mount point
# (/content/gdrive/...) — confirm that every cell agrees with it.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!unzip /content/drive/MyDrive/multicardioner_train+dev_240429.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: multicardioner_train+dev_240429/track2/drugtemist_train/en/brat/es-S0465-546X2009000300008-1.txt  
  inflating: multicardioner_train+dev_240429/track2/drugtemist_train/en/brat/es-S1699-695X2016000200009-1.ann  
  inflating: multicardioner_train+dev_240429/track2/drugtemist_train/en/brat/es-S0210-56912009000800006-3.ann  
  inflating: multicardioner_train+dev_240429/track2/drugtemist_train/en/brat/es-S1887-85712013000200013-1.ann  
  inflating: multicardioner_train+dev_240429/track2/drugtemist_train/en/brat/es-S0365-66912011000400005-2.ann  
  inflating: multicardioner_train+dev_240429/track2/drugtemist_train/en/brat/es-S0376-78922014000200011-1.txt  
  inflating: multicardioner_train+dev_240429/track2/drugtemist_train/en/brat/es-S0365-66912006000400010-1.txt  
  inflating: multicardioner_train+dev_240429/track2/drugtemist_train/en/brat/es-S1134-80462015000100006-1.ann  
  inflating: multicardioner_train+dev_2

In [None]:
import os
import re

import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import FastText
from gensim.utils import simple_preprocess

In [None]:
# Function to read text from a file
def read_text_file(file_path, encoding='utf-8'):
    """Return the entire contents of a text file as a single string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale settings.

    Returns:
        The decoded file contents.
    """
    # An explicit encoding keeps accented characters intact regardless of the
    # interpreter's default locale.
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

# Function to read annotations from a file
def read_annotations_file(file_path):
    """Collect the character offsets covered by entity annotations in a file.

    Two line layouts are understood:

    * BRAT standoff text-bound annotations,
      ``T1<TAB>LABEL start end[;start end ...]<TAB>text`` (discontinuous spans
      are separated by ``;``).
    * A five-column tab-separated layout ``col0<TAB>col1<TAB>start<TAB>end<TAB>text``.

    Lines in any other shape (notes, blank lines, malformed offsets) are skipped.

    Args:
        file_path: Path of the annotation file.

    Returns:
        dict mapping every character offset in ``[start, end)`` of each span to
        the string ``"FARMACO"``.
    """
    annotations = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            fields = line.strip().split('\t')

            if len(fields) >= 2 and fields[0].startswith('T') and ' ' in fields[1]:
                # BRAT: the second tab field is "LABEL start end[;start end ...]".
                _, _, offsets = fields[1].partition(' ')
                spans = [fragment.split() for fragment in offsets.split(';')]
            elif len(fields) >= 5:
                # Five-column layout: offsets live in the third and fourth columns.
                spans = [[fields[2], fields[3]]]
            else:
                continue

            for span in spans:
                # Ignore anything that is not a clean "start end" integer pair
                # (for example a header row).
                if len(span) != 2 or not (span[0].isdigit() and span[1].isdigit()):
                    continue
                for i in range(int(span[0]), int(span[1])):
                    annotations[i] = "FARMACO"  # Store character offsets with entity type "FARMACO"
    return annotations

In [None]:
# Define BiLSTM model
class BiLSTM(nn.Module):
    """Bidirectional LSTM whose per-step outputs feed a linear scoring layer.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Hidden units per LSTM direction.
        output_size: Number of scores produced for every step.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        # batch_first is left at its default, so the sequence axis comes first.
        self.lstm = nn.LSTM(input_size, hidden_size, bidirectional=True)
        # Forward and backward states are concatenated, hence twice the width.
        self.fc = nn.Linear(2 * hidden_size, output_size)

    def forward(self, input):
        """Run the LSTM over ``input`` and score every step with the linear layer."""
        hidden_states, _unused_state = self.lstm(input)
        return self.fc(hidden_states)


In [None]:
# Folder whose .txt/.ann files are passed to process_text_files by main().
# NOTE(review): the variable is named "train" but the path points at the
# cardioccc_dev directory — confirm which split is intended.
train_folder_path = "/content/multicardioner_train+dev_240429/track2/cardioccc_dev/en/brat"

In [None]:
# Function to process text files and generate word embeddings
def _label_tokens(text, annotations):
    """Split text into lowercase word tokens and tag each one as entity (1) or not (0)."""
    tokens = []
    labels = []
    for match in re.finditer(r"\w+", text):
        tokens.append(match.group().lower())
        # A token counts as an entity when any of its characters falls inside
        # an annotated span; annotations are keyed by character offset.
        inside_entity = any(
            annotations.get(position) == "FARMACO"
            for position in range(match.start(), match.end())
        )
        labels.append(1 if inside_entity else 0)
    return tokens, labels


def process_text_files(folder_path):
    """Build token-aligned embeddings and labels from a folder of .txt/.ann pairs.

    Every ``.txt`` file is tokenized with character offsets, and each token is
    labelled using the annotations of that same document. A FastText model is
    trained with one sentence per document, and every token occurrence is then
    mapped to its vector, so row ``i`` of the embedding tensor and entry ``i``
    of the label tensor describe the same token.

    Args:
        folder_path: Directory containing ``name.txt`` files and, optionally,
            matching ``name.ann`` annotation files.

    Returns:
        Tuple ``(word_embeddings_tensor, annotations_tensor, word_to_ix)``:
        a float tensor with one 100-dimensional row per token, an integer
        tensor with one 0/1 label per token (1 = FARMACO), and a dict mapping
        each vocabulary word to its index in the FastText vocabulary.

    Raises:
        ValueError: If the folder yields no tokens at all.
    """
    sentences = []   # one token list per document
    all_labels = []  # flat 0/1 labels, in the same order as the flattened tokens

    # Sorted so the token order does not depend on the filesystem's listing order.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".txt"):
            continue

        # Get corresponding annotation file
        annotation_file_path = os.path.join(folder_path, file_name[:-len(".txt")] + ".ann")

        # Read text from the text file
        text = read_text_file(os.path.join(folder_path, file_name))

        # A document without an annotation file simply has no entities.
        if os.path.exists(annotation_file_path):
            annotations = read_annotations_file(annotation_file_path)
        else:
            annotations = {}

        # Offsets are relative to this document, so labelling must happen
        # per file, before documents are concatenated.
        doc_tokens, doc_labels = _label_tokens(text, annotations)
        if doc_tokens:
            sentences.append(doc_tokens)
            all_labels.extend(doc_labels)

    if not sentences:
        raise ValueError("No tokens found in .txt files under {!r}".format(folder_path))

    # Train FastText model (one sentence per document)
    model = FastText(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)

    # Create word-to-index mapping
    word_to_ix = {word: idx for idx, word in enumerate(model.wv.index_to_key)}

    # Look up one vector per token occurrence so embeddings and labels line up.
    all_tokens = [token for sentence in sentences for token in sentence]
    word_embeddings_tensor = torch.tensor(model.wv[all_tokens])
    annotations_tensor = torch.tensor(all_labels)

    return word_embeddings_tensor, annotations_tensor, word_to_ix


In [None]:
# Function to train BiLSTM model
def train_bilstm_model(word_embeddings_tensor, annotations_tensor, word_to_ix):
    """Train a BiLSTM tagger on the given embeddings and 0/1 labels.

    The rows are consumed in consecutive slices of up to 32; each slice is fed
    to the model as-is and its outputs are scored against the matching slice
    of labels with cross-entropy.

    Args:
        word_embeddings_tensor: 2-D float tensor; its second dimension is used
            as the model's input size.
        annotations_tensor: Integer class labels (0 or 1), one per row of
            ``word_embeddings_tensor``.
        word_to_ix: Vocabulary mapping. Accepted for interface compatibility;
            not used during training.

    Returns:
        The trained ``BiLSTM`` model.

    Raises:
        ValueError: If ``word_embeddings_tensor`` has no rows.
    """
    num_samples = len(word_embeddings_tensor)
    if num_samples == 0:
        raise ValueError("word_embeddings_tensor is empty; nothing to train on")

    # Define hyperparameters
    input_size = word_embeddings_tensor.size(1)  # Embedding dimension
    hidden_size = 128
    output_size = 2  # Binary classification: FARMACO or not

    # Instantiate BiLSTM model
    model = BiLSTM(input_size, hidden_size, output_size)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training loop
    num_epochs = 10
    batch_size = 32
    # Ceiling division: the final, shorter slice is trained on too, and inputs
    # smaller than one batch no longer produce zero batches.
    num_batches = (num_samples + batch_size - 1) // batch_size

    model.train()  # training mode only needs to be set once
    for epoch in range(num_epochs):
        total_loss = 0
        for start in range(0, num_samples, batch_size):
            # Same bounds for both tensors so the slices always have equal length.
            end = min(start + batch_size, num_samples)
            optimizer.zero_grad()

            # Get current batch
            batch_embeddings = word_embeddings_tensor[start:end]
            batch_annotations = annotations_tensor[start:end]

            # Forward pass
            outputs = model(batch_embeddings)

            # Calculate loss
            loss = criterion(outputs.view(-1, output_size), batch_annotations)
            total_loss += loss.item()

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, total_loss / num_batches))

    # Hand the trained model back so it can be evaluated or saved.
    return model


In [None]:
# Main function
def main():
    """Build the tensors from ``train_folder_path`` and train the BiLSTM on them."""
    # process_text_files returns (embeddings, labels, vocabulary) in exactly
    # the positional order train_bilstm_model expects.
    prepared = process_text_files(train_folder_path)
    train_bilstm_model(*prepared)

In [None]:
# Run the whole pipeline (data preparation followed by training) when this
# cell executes as the main module.
if __name__ == "__main__":
    main()

Epoch [1/10], Loss: 0.0158
Epoch [2/10], Loss: 0.0000
Epoch [3/10], Loss: 0.0000
Epoch [4/10], Loss: 0.0000
Epoch [5/10], Loss: 0.0000
Epoch [6/10], Loss: 0.0000
Epoch [7/10], Loss: 0.0000
Epoch [8/10], Loss: 0.0000
Epoch [9/10], Loss: 0.0000
Epoch [10/10], Loss: 0.0000
