In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import pandas as pd
import re
import torch
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaForTokenClassification, TrainingArguments, Trainer

model_id = "FacebookAI/roberta-base"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
cce_df = pd.read_csv("/raid/deallab/CCE_Data/raw_data/finecite/full_data.csv")


In [3]:
# Process the DataFrame
results = []
for index, row in cce_df.iterrows():
    # Clean the paragraph by replacing <ref> tags with '[TREF]'
    clean_paragraph = re.sub(r'<ref.*?>.*?</ref>', '[TREF]', row["paragraph"])

    # Split the cleaned paragraph into words using ';' as the delimiter
    words = clean_paragraph.split(';')

    # Process the context_location1 list
    context_location1 = eval(row["context_location1"])

    # Check if the lengths match, and map the context_location1 to the words
    if len(context_location1) == len(words):
        # Aggregate the mapped results for the current row
        mapped_result = list(zip(context_location1, words))
        
        # Separate the numbers and words into separate lists
        numbers = [str(item[0]) for item in mapped_result]  # Convert numbers to strings
        mapped_words = [item[1].strip() for item in mapped_result]  # Strip extra spaces from words
        
        results.append({
            "Paragraph": ' '.join(mapped_words),
            "Scope": numbers
        })
    else:
        results.append({
            "Paragraph": "Length of context_location1 and words don't match",
            "Scope": "Mismatch"
        })

# Convert results to DataFrame
df = pd.DataFrame(results)


In [5]:
# Convert the Scope column elements to lists of integers
def convert_scope_to_int(scope):
    if isinstance(scope, str):
        # Convert string representation of list to an actual list of integers
        scope = eval(scope)
    # Ensure all elements in the list are integers
    return [int(i) for i in scope]

# Apply the conversion to the Scope column
df["Scope"] = df["Scope"].apply(convert_scope_to_int)

In [10]:
# Load the tokenizer for RoBERTa
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Check if the tokenizer has a pad_token and set it
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize and prepare dataset
def tokenize_function(examples):
    # Tokenize the paragraphs
    tokens = tokenizer(examples["Paragraph"], padding="max_length", truncation=True, max_length=512)
    
    # Process labels (Scope)
    max_label_length = 512  # Same as max_length for consistency
    padded_labels = []
    
    for label_list in examples["Scope"]:
        # Truncate if necessary
        if len(label_list) > max_label_length:
            label_list = label_list[:max_label_length]
        
        # Pad with -100
        padded_label = label_list + [-100] * (max_label_length - len(label_list))
        padded_labels.append(padded_label)
    
    # Convert to tensor
    tokens["labels"] = torch.tensor(padded_labels, dtype=torch.long)
    
    return tokens

# Convert the DataFrame to a Dataset
dataset = Dataset.from_pandas(df)




In [11]:
# Apply the tokenization function to the dataset
tokenized_dataset = dataset.map(
    tokenize_function, 
    batched=True, 
    remove_columns=dataset.column_names  # Remove all original columns
)

Map: 100%|██████████| 1055/1055 [00:01<00:00, 1002.97 examples/s]


In [12]:
# Load the model for token classification
model = RobertaForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=4  # Adjust this number based on your specific task
)


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Train the model
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,1.1818,1.177531
2,1.1472,1.224751
3,1.1748,1.091165
4,1.0739,0.996247
5,0.9449,0.893089
6,0.9927,0.820209
7,0.6737,0.645233
8,0.7205,0.538606
9,0.5264,0.457363
10,0.4531,0.435886


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=2640, training_loss=0.9061096646568992, metrics={'train_runtime': 417.2796, 'train_samples_per_second': 25.283, 'train_steps_per_second': 6.327, 'total_flos': 2756730629529600.0, 'train_loss': 0.9061096646568992, 'epoch': 10.0})

In [14]:
model.save_pretrained('./trained_Roberta')
tokenizer.save_pretrained('./trained_Roberta')

('./trained_Roberta/tokenizer_config.json',
 './trained_Roberta/special_tokens_map.json',
 './trained_Roberta/vocab.json',
 './trained_Roberta/merges.txt',
 './trained_Roberta/added_tokens.json')

In [23]:
# Example inference on a new paragraph
def classify_paragraph(paragraph):
    # Clean and tokenize the input paragraph
    clean_paragraph = re.sub(r'<ref.*?>.*?</ref>', '[TREF]', paragraph)
    tokens = tokenizer(clean_paragraph, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
    tokens = {key: value.to(model.device) for key, value in tokens.items()}
    
    # Run inference
    outputs = model(**tokens)
    predictions = torch.argmax(outputs.logits, dim=-1)
    
    # Map predictions to scopes
    predicted_scope = predictions[0].cpu().numpy().tolist()
    
    return predicted_scope

In [24]:
def compare_scopes(true_scope, predicted_scope):
    # Ensure both lists have the same length
    max_length = max(len(true_scope), len(predicted_scope))
    
    # Pad shorter list with a placeholder for comparison
    true_scope_padded = true_scope + [-1] * (max_length - len(true_scope))
    predicted_scope_padded = predicted_scope + [-1] * (max_length - len(predicted_scope))
    
    # Compute the number of correct predictions
    correct_predictions = sum(t == p for t, p in zip(true_scope_padded, predicted_scope_padded) if t != -1)
    total_predictions = sum(1 for t in true_scope_padded if t != -1)
    
    # Calculate accuracy
    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0.0
    
    return correct_predictions, total_predictions, accuracy

In [25]:
# Example usage
new_paragraph = 'Neural Machine Translation (NMT) has opened several research directions to exploit as many and diverse data as possible. Massive multilingual NMT models, for instance, take advantage of several language-pair datasets in a single system [TREF] . This offers several advantages, such as a simple training process and enhanced performance of the language-pairs with little data (although sometimes detrimental to the high-resource language-pairs). However, massive models of dozens of languages are not necessarily the best outcome, as it is demonstrated that smaller clusters still offer the same benefits [TREF] .'

# Get the predicted scope for the new paragraph
predicted_scope = classify_paragraph(new_paragraph)
print("Predicted Scope:", predicted_scope)

# df["Scope"][0] is the true scope for comparison
true_scope = df["Scope"][0]
print("True Scope:", true_scope)

# Compare scopes
correct_predictions, total_predictions, accuracy = compare_scopes(true_scope, predicted_scope)
print(f"Correct Predictions: {correct_predictions}/{total_predictions}")
print(f"Accuracy: {accuracy:.2%}")


Predicted Scope: [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0