In [1]:
import pandas as pd
from transformers import BertModel, AutoTokenizer
import torch
import os
from tqdm import tqdm

# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

mode = 'train'

# Read the CSV file
data = pd.read_csv(f'semeval-{mode}-spans.csv')

# Initialize the BERT tokenizer and model and move to GPU
model_name = 'SpanBERT/spanbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name, force_download=True)
model = BertModel.from_pretrained(model_name, add_pooling_layer=False).to(device)

# Function to process text and generate embeddings based on text spans
def process_text_and_save_embeddings(text_path, text_span, index, model_name, mode):
    # Load text from file using the path
    with open(text_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Tokenize text and span
    tokens = tokenizer.tokenize(text)
    span_tokens = tokenizer.tokenize(text_span)

    # Convert tokens to token IDs
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    span_token_ids = tokenizer.convert_tokens_to_ids(span_tokens)

    # Find the start and end token indexes of the span in the text tokens
    span_start, span_end = 0, 0
    for i in range(len(token_ids) - len(span_token_ids) + 1):
        if token_ids[i:i + len(span_token_ids)] == span_token_ids:
            span_start, span_end = i, i + len(span_token_ids) - 1
            break

    # print(span_start, span_end)

    # Split tokens into chunks with specified length and overlap
    chunk_length = 510
    overlap = 100
    token_chunks = [[101] + token_ids[i:i + chunk_length] + [102] for i in range(0, len(token_ids), chunk_length - overlap)]

    # Process each chunk and get embeddings for the specified span
    embeddings = []
    for i, chunk in enumerate(token_chunks):
        input_ids = torch.tensor(chunk).unsqueeze(0).to(device)

        # Get embeddings
        with torch.no_grad():
            outputs = model(input_ids=input_ids)
            last_hidden_states = outputs.last_hidden_state

        # Get CLS embedding
        if i == 0:
            cls_embedding = last_hidden_states[:, 0, :]

        # Remove special tokens and concatenate remaining embeddings
        trimmed_embeddings = last_hidden_states[:, 1:len(chunk) + 1, :]
        if i < len(token_chunks) - 1:
            trimmed_embeddings = trimmed_embeddings[:, overlap:, :]

        embeddings.append(trimmed_embeddings)

    # Concatenate embeddings from all chunks into one tensor
    concatenated_embeddings = torch.cat(embeddings, dim=1)
    concatenated_embeddings = concatenated_embeddings[:, span_start:span_end + 1, :]
    concatenated_embeddings = torch.cat((cls_embedding.unsqueeze(1), concatenated_embeddings), dim=1)

    # Save the tensor to a file in the 'embeddings' folder
    embeddings_folder = f'{model_name}_{mode}_embeddings/'
    os.makedirs(embeddings_folder, exist_ok=True)  # Create 'embeddings' folder if it doesn't exist
    embeddings_filename = os.path.basename(text_path).replace('.txt', f'_{index}_span_raw_with_cls.pt')
    embeddings_path = os.path.join(embeddings_folder, embeddings_filename)
    torch.save(concatenated_embeddings, embeddings_path)

    return embeddings_path

# Process unique paths and save span embeddings
embeddings_info = []
for index, row in tqdm(data.iterrows(), total=len(data), desc='Processing spans'):
    text_path = row['path']  # Update this with the actual folder path
    text_span = row['span']  # Assuming 'span' column contains text-based spans like 'start-end'
    embeddings_path = process_text_and_save_embeddings(text_path, text_span, index, model_name, mode)
    embeddings_info.append({'path_to_text': text_path, 'path_to_embeddings': embeddings_path, 'label': row['label']})

# Create DataFrame with paths to text and embeddings
embeddings_df = pd.DataFrame(embeddings_info)

# Save the DataFrame to a CSV file
embeddings_info_path = f'{model_name}_{mode}_embeddings_with_cls.csv'
embeddings_df.to_csv(embeddings_info_path, index=False)


mode = 'test'

# Read the CSV file
data = pd.read_csv(f'semeval-{mode}-spans.csv')

# Process unique paths and save span-max-pooled embeddings
embeddings_info = []
for index, row in tqdm(data.iterrows(), total=len(data), desc='Processing spans'):
    text_path = row['path']  # Update this with the actual folder path
    text_span = row['span']  # Assuming 'span' column contains text-based spans like 'start-end'
    embeddings_path = process_text_and_save_embeddings(text_path, text_span, index, model_name, mode)
    embeddings_info.append({'path_to_text': text_path, 'path_to_embeddings': embeddings_path, 'label': row['label']})

# Create DataFrame with paths to text and embeddings
embeddings_df = pd.DataFrame(embeddings_info)

# Save the DataFrame to a CSV file
embeddings_info_path = f'{model_name}_{mode}_embeddings_with_cls.csv'
embeddings_df.to_csv(embeddings_info_path, index=False)

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

onehot_encoder = OneHotEncoder(sparse=False)
data = pd.read_csv('SpanBERT/spanbert-base-cased_train_embeddings_with_cls.csv')['label'].values.reshape(-1, 1)
onehot_encoder.fit(data)

class CustomDataset(Dataset):
    def __init__(self, csv_file, onehot_encoder, proj_dim=256, padding_size=256):
        self.data = pd.read_csv(csv_file)
        self.labels = onehot_encoder.transform(self.data['label'].values.reshape(-1, 1))
        self.padding_size = padding_size
        self.proj_dim = proj_dim

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        file_path = self.data.iloc[idx]['path_to_embeddings']
        label = self.labels[idx]

        # Load embedding from .pt file
        embedding = torch.load(file_path).cpu()[:, 1:, :]

        # Get attention mask
        # attention_mask = torch.cat((
        #     torch.ones((self.proj_dim, embedding.shape[1])),
        #     torch.zeros((self.proj_dim, self.padding_size - embedding.shape[1]))
        # ), dim=1)

        padding = torch.zeros(1, self.padding_size - embedding.shape[1], embedding.shape[-1])
        embedding = torch.cat((embedding, padding), dim=1)

        return embedding, torch.FloatTensor(label)  # , torch.FloatTensor(attention_mask)
    
# Paths to train and test CSV files
train_csv_path = 'SpanBERT/spanbert-base-cased_train_embeddings_with_cls.csv'
test_csv_path = 'SpanBERT/spanbert-base-cased_test_embeddings_with_cls.csv'

# Create instances of CustomDataset for train and test
train_dataset = CustomDataset(train_csv_path, onehot_encoder)
test_dataset = CustomDataset(test_csv_path, onehot_encoder)

# Define batch sizes
train_batch_size = 64
test_batch_size = 64

# Create DataLoaders for train and test datasets
train_data_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False)



In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import precision_recall_fscore_support, classification_report
import matplotlib.pyplot as plt
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_validate_model(model, train_loader, val_loader, num_epochs=10, learning_rate=0.1, stepslr=10, gamma=0.9):
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.StepLR(optimizer, stepslr, gamma=gamma)

    train_losses = []
    val_losses = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for epoch in tqdm(range(num_epochs), total=num_epochs):
        model.train()
        running_train_loss = 0.0
        # for inputs, labels, attn_mask in tqdm(train_loader, desc=f'Training   {epoch+1}/{num_epochs}'):
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs.float())
            # print(outputs.shape, labels.shape)
            # print(outputs)
        #     break
        # break
            # print(outputs.squeeze(1), labels)
            loss = criterion(outputs.squeeze(1), labels)
            loss.backward()
            optimizer.step()
            running_train_loss += loss.item()
        
        # print(running_train_loss, len(train_loader))
        train_loss = running_train_loss / len(train_loader)
        train_losses.append(train_loss)

        if (epoch + 1) % 100 == 0:
            model.eval()
            running_val_loss = 0.0
            all_preds = []
            all_labels = []
            with torch.no_grad():
                for inputs, labels in val_loader:
                    inputs, labels = inputs.to(device), labels.to(device)
                    outputs = model(inputs.float())
                    loss = criterion(outputs.squeeze(1), labels)
                    running_val_loss += loss.item()

                    # print(outputs.shape, labels.shape)
                    # print(outputs, labels)
                    predicted = torch.argmax(outputs.squeeze(1), -1)
                    labels = torch.argmax(labels, 1)
                    # print(predicted.shape, labels.shape)
                    # print(predicted, labels)
                    # break
                    all_preds.extend(predicted.cpu().numpy())
                    all_labels.extend(labels.cpu().numpy())

            val_loss = running_val_loss / len(val_loader)
            val_losses.append(val_loss)

            # print(all_preds, all_labels)
            # break

            precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='macro')
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

            scheduler.step()

            print(f"Epoch [{epoch + 1}/{num_epochs}] "
                f"Train Loss: {train_loss:.4f} "
                f"Val Loss: {val_loss:.4f} "
                f"Precision: {precision:.4f} "
                f"Recall: {recall:.4f} "
                f"F1 Score: {f1:.4f}")
            print(classification_report(all_labels, all_preds))

    return train_losses, val_losses, precision_scores, recall_scores, f1_scores

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CustomAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim, proj_dim=256, num_layers=3, num_classes=20):
        super(CustomAttention, self).__init__()

        self.projection = nn.Linear(input_dim, proj_dim)

        self.attention_params = nn.Linear(proj_dim, 1)

        layers = []
        layers.append(nn.Linear(proj_dim, hidden_dim))
        layers.append(nn.ReLU())
        layers.append(nn.BatchNorm1d(1))
        for _ in range(num_layers - 2):
            layers.append(nn.Linear(hidden_dim, hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.BatchNorm1d(1))
        layers.append(nn.Linear(hidden_dim, num_classes))
        self.layers = nn.Sequential(*layers)
        
    def forward(self, embeddings):
        embeddings = self.projection(embeddings)

        attn_logits = self.attention_params(embeddings)  #  + attention_mask
        # print(attn_logits)
        attention_wts = nn.functional.softmax(attn_logits, dim=2)
        # print(attention_wts.shape, embeddings.shape)

        attention_term = torch.sum(attention_wts * embeddings, dim=-1)  #  * attention_mask
        # print(attention_term.shape)

        output = self.layers(attention_term)
        # print(output.shape)
        
        output = F.softmax(output, dim=0)
        # print(output.shape)
        
        return output

In [8]:
model = CustomAttention(input_dim=train_dataset[0][0].shape[-1], hidden_dim=256)
train_losses, val_losses, precision_scores, recall_scores, f1_scores = train_validate_model(model, train_data_loader, test_data_loader, num_epochs=300, learning_rate=3e-4, stepslr=50, gamma=0.9)

 33%|███▎      | 100/300 [19:56<1:04:55, 19.48s/it]

Epoch [100/300] Train Loss: 2.9090 Val Loss: 2.9956 Precision: 0.0615 Recall: 0.0540 F1 Score: 0.0324
              precision    recall  f1-score   support

           0       0.01      0.05      0.01        21
           1       0.05      0.05      0.05       133
           2       0.00      0.00      0.00         7
           3       0.01      0.11      0.02        35
           4       0.02      0.15      0.04        20
           5       0.02      0.12      0.03        26
           6       0.08      0.07      0.08       188
           7       0.01      0.01      0.01       121
           8       0.05      0.11      0.07        61
           9       0.02      0.03      0.02        95
          10       0.00      0.00      0.00         4
          11       0.39      0.08      0.13       569
          12       0.13      0.04      0.06       296
          13       0.37      0.04      0.07      1479
          14       0.00      0.00      0.00        13
          15       0.00      0.06

 67%|██████▋   | 200/300 [39:33<20:48, 12.49s/it]  

Epoch [200/300] Train Loss: 2.8997 Val Loss: 2.9961 Precision: 0.0623 Recall: 0.0462 F1 Score: 0.0329
              precision    recall  f1-score   support

           0       0.01      0.10      0.02        21
           1       0.04      0.05      0.04       133
           2       0.00      0.00      0.00         7
           3       0.01      0.06      0.02        35
           4       0.01      0.10      0.02        20
           5       0.01      0.08      0.02        26
           6       0.11      0.11      0.11       188
           7       0.03      0.02      0.03       121
           8       0.05      0.13      0.07        61
           9       0.02      0.02      0.02        95
          10       0.00      0.00      0.00         4
          11       0.30      0.04      0.07       569
          12       0.13      0.06      0.08       296
          13       0.45      0.06      0.10      1479
          14       0.01      0.08      0.01        13
          15       0.00      0.00

100%|██████████| 300/300 [58:59<00:00, 11.80s/it]

Epoch [300/300] Train Loss: 2.8946 Val Loss: 2.9978 Precision: 0.0590 Recall: 0.0434 F1 Score: 0.0317
              precision    recall  f1-score   support

           0       0.01      0.10      0.02        21
           1       0.06      0.05      0.06       133
           2       0.00      0.00      0.00         7
           3       0.01      0.06      0.02        35
           4       0.01      0.10      0.02        20
           5       0.01      0.08      0.02        26
           6       0.08      0.09      0.08       188
           7       0.04      0.05      0.04       121
           8       0.03      0.07      0.04        61
           9       0.01      0.01      0.01        95
          10       0.00      0.00      0.00         4
          11       0.27      0.04      0.06       569
          12       0.14      0.07      0.10       296
          13       0.45      0.06      0.11      1479
          14       0.01      0.08      0.02        13
          15       0.00      0.00


