In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('../../data/cordis-multilabel-telecoms.csv')

# Display the first few rows to understand the structure
df.head()


Unnamed: 0,text,topics
0,Title: METIS-II - Mobile and wireless communic...,"[""5G"", ""radio technology""]"
1,Title: interACT - Designing cooperative intera...,"[""radar"", ""mobile phones""]"
2,Title: 5GCITY - 5GCITY Abstract: Delivering on...,"[""5G""]"
3,Title: Light UP - Visible Light Ultrafast Phot...,"[""radio technology""]"
4,Title: 5G-DRIVE - 5G HarmoniseD Research and T...,"[""5G""]"


In [6]:
from transformers import BertTokenizer
import torch

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the abstracts
tokenized_data = tokenizer(list(df['text'].values), padding='max_length', truncation=True, max_length=512, return_tensors="pt")

# Build the label list
unique_labels = sorted(set(label for sublist in df['topics'].apply(eval).tolist() for label in sublist))
label_map = {label: i for i, label in enumerate(unique_labels)}

# Encode the labels
def encode_labels(labels):
    label_ids = [0] * len(label_map)
    for label in labels:
        if label in label_map:
            label_ids[label_map[label]] = 1
    return label_ids

encoded_labels = df['topics'].apply(lambda x: encode_labels(eval(x)))

# Convert to lists
input_ids = tokenized_data['input_ids']
attention_masks = tokenized_data['attention_mask']
labels = torch.tensor(encoded_labels.tolist()).float()

# Print shapes for verification
print(f"Input IDs shape: {input_ids.shape}")
print(f"Attention masks shape: {attention_masks.shape}")
print(f"Labels shape: {labels.shape}")





Input IDs shape: torch.Size([1044, 512])
Attention masks shape: torch.Size([1044, 512])
Labels shape: torch.Size([1044, 560])


In [7]:
from torch.utils.data import Dataset, DataLoader, random_split

class AbstractDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

# Create the dataset
dataset = AbstractDataset(input_ids, attention_masks, labels)

# Split the dataset into train and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)


In [10]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, precision_score, recall_score
import json

# Load the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_map), problem_type="multi_label_classification")

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Save model every epoch
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model='f1'  # Metric to use for choosing the best model
)

# Define metrics for evaluation
def compute_metrics(p):
    # Apply sigmoid to the predictions
    probs = torch.sigmoid(torch.tensor(p.predictions))
    
    # Apply threshold to get binary predictions
    preds = probs > 0.5
    
    # Convert predictions and labels to numpy arrays
    preds = preds.cpu().numpy()
    labels = p.label_ids
    
    # Compute metrics
    f1 = f1_score(labels, preds, average='micro')
    precision = precision_score(labels, preds, average='micro')
    recall = recall_score(labels, preds, average='micro')
    
    # Print a few sample predictions and labels for debugging
    for i in range(5):
        print(f"Sample {i}:")
        print(f"Predictions: {preds[i]}")
        print(f"Labels: {labels[i]}")
    
    return {'f1': f1, 'precision': precision, 'recall': recall}

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
metrics = trainer.evaluate()

# Save the model
model.save_pretrained('./model')

# Save the tokenizer
tokenizer.save_pretrained('./model')

# Save the metrics to a file
with open('metrics.json', 'w') as f:
    json.dump(metrics, f)

# Print the metrics
print(metrics)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


  0%|          | 0/315 [00:00<?, ?it/s]

{'loss': 0.6974, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.1}
{'loss': 0.6923, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.19}
{'loss': 0.6852, 'learning_rate': 3e-06, 'epoch': 0.29}
{'loss': 0.676, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.38}
{'loss': 0.6637, 'learning_rate': 5e-06, 'epoch': 0.48}
{'loss': 0.6462, 'learning_rate': 6e-06, 'epoch': 0.57}
{'loss': 0.6254, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.67}
{'loss': 0.6021, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.76}
{'loss': 0.5769, 'learning_rate': 9e-06, 'epoch': 0.86}
{'loss': 0.5509, 'learning_rate': 1e-05, 'epoch': 0.95}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5148403644561768, 'eval_f1': 0.008548150126886603, 'eval_precision': 0.004420194764831825, 'eval_recall': 0.1292929292929293, 'eval_runtime': 1.0151, 'eval_samples_per_second': 205.9, 'eval_steps_per_second': 26.599, 'epoch': 1.0}
{'loss': 0.521, 'learning_rate': 1.1000000000000001e-05, 'epoch': 1.05}
{'loss': 0.4901, 'learning_rate': 1.2e-05, 'epoch': 1.14}
{'loss': 0.4564, 'learning_rate': 1.3000000000000001e-05, 'epoch': 1.24}
{'loss': 0.4219, 'learning_rate': 1.4000000000000001e-05, 'epoch': 1.33}
{'loss': 0.3874, 'learning_rate': 1.5e-05, 'epoch': 1.43}
{'loss': 0.3523, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.52}
{'loss': 0.3205, 'learning_rate': 1.7000000000000003e-05, 'epoch': 1.62}
{'loss': 0.29, 'learning_rate': 1.8e-05, 'epoch': 1.71}
{'loss': 0.2618, 'learning_rate': 1.9e-05, 'epoch': 1.81}
{'loss': 0.2363, 'learning_rate': 2e-05, 'epoch': 1.9}
{'loss': 0.2147, 'learning_rate': 2.1e-05, 'epoch': 2.0}


  0%|          | 0/27 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.19787342846393585, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.9958, 'eval_samples_per_second': 209.871, 'eval_steps_per_second': 27.113, 'epoch': 2.0}
{'loss': 0.1919, 'learning_rate': 2.2000000000000003e-05, 'epoch': 2.1}
{'loss': 0.1744, 'learning_rate': 2.3000000000000003e-05, 'epoch': 2.19}
{'loss': 0.1573, 'learning_rate': 2.4e-05, 'epoch': 2.29}
{'loss': 0.1424, 'learning_rate': 2.5e-05, 'epoch': 2.38}
{'loss': 0.1312, 'learning_rate': 2.6000000000000002e-05, 'epoch': 2.48}
{'loss': 0.1205, 'learning_rate': 2.7000000000000002e-05, 'epoch': 2.57}
{'loss': 0.1089, 'learning_rate': 2.8000000000000003e-05, 'epoch': 2.67}
{'loss': 0.1029, 'learning_rate': 2.9e-05, 'epoch': 2.76}
{'loss': 0.0951, 'learning_rate': 3e-05, 'epoch': 2.86}
{'loss': 0.0876, 'learning_rate': 3.1e-05, 'epoch': 2.95}


  0%|          | 0/27 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.07934024930000305, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 1.032, 'eval_samples_per_second': 202.52, 'eval_steps_per_second': 26.163, 'epoch': 3.0}
{'train_runtime': 43.6087, 'train_samples_per_second': 57.443, 'train_steps_per_second': 7.223, 'train_loss': 0.37214191508671596, 'epoch': 3.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5148403644561768, 'eval_f1': 0.008548150126886603, 'eval_precision': 0.004420194764831825, 'eval_recall': 0.1292929292929293, 'eval_runtime': 1.0241, 'eval_samples_per_second': 204.089, 'eval_steps_per_second': 26.366, 'epoch': 3.0}
