In [None]:
! pip install bert-for-sequence-classification

Collecting bert-for-sequence-classification
  Downloading bert_for_sequence_classification-0.1.1-py3-none-any.whl (17 kB)
Collecting wget (from bert-for-sequence-classification)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.7.1->bert-for-sequence-classification)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.7.1->bert-for-sequence-classification)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.7.1->bert-for-sequence-classification)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.7.1->bert-for-sequence-classification)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 

In [None]:

import json
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoModel,
    AutoTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
)


In [None]:
from sklearn.metrics import classification_report, accuracy_score

## Prepare dataset for testing

In [None]:
# Load the first corpus; a comma is read_csv's default delimiter, so it is not spelled out
df = pd.read_csv('sentence_full.csv')

In [None]:
# Preview the first five rows to check which columns were parsed
df.head(n=5)

Unnamed: 0,Sentence,Label,Components
0,"Once again, since the last briefing to the Cou...",claim,{'the situation regarding Ukraine has seriousl...
1,This is now the tenth time that the Council ha...,none,{}
2,The General Assembly also took up the matter o...,none,{}
3,"Following close to two weeks of relative calm,...",premise,{'Following close to two weeks of relative cal...
4,The individuals involved called for secession ...,premise,{'The individuals involved called for secessio...


In [None]:
# Collapse claim/premise into one argumentative class ("Arg") and rename "none" to "O"
df['Label'] = (
    df['Label']
    .str.replace('claim', 'Arg')
    .str.replace('premise', 'Arg')
    .str.replace('none', 'O')
)

df.head()

Unnamed: 0,Sentence,Label,Components
0,"Once again, since the last briefing to the Cou...",Arg,{'the situation regarding Ukraine has seriousl...
1,This is now the tenth time that the Council ha...,O,{}
2,The General Assembly also took up the matter o...,O,{}
3,"Following close to two weeks of relative calm,...",Arg,{'Following close to two weeks of relative cal...
4,The individuals involved called for secession ...,Arg,{'The individuals involved called for secessio...


In [None]:
# Keep a second name for the relabelled sentence_full frame: `df` is rebound to another
# dataset in the next cell. This is an alias, not a copy.
utest = df

In [None]:
# Load the second corpus; from here on `df` refers to this frame
df = pd.read_csv(filepath_or_buffer='sentence_db_candidate.csv')

In [None]:
# Preview the first five rows of the candidate-debate frame
df.head(n=5)

Unnamed: 0,Text,Part,Document,Order,Sentence,Start,End,Annotator,Tag,Component,Speech,Speaker,SpeakerType,Set,Date,Year,Name,MainTag
0,"CHENEY: Gwen, I want to thank you, and I want ...",1,30_2004,0,0,2101,2221,,"{""O"": 27}",O,"Gwen, I want to thank you, and I want to than...",CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,O
1,"It's a very important event, and they've done ...",1,30_2004,1,1,2221,2304,,"{""O"": 19}",O,"It's a very important event, and they've done ...",CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,O
2,It's important to look at all of our developme...,1,30_2004,2,2,2304,2418,,"{""O"": 23}",O,It's important to look at all of our developme...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,O
3,"And, after 9/11, it became clear that we had t...",1,30_2004,3,3,2418,2744,,"{""O"": 16, ""Claim"": 50}",Claim,"And, after 9/11, it became clear that we had t...",CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Claim
4,And we also then finally had to stand up democ...,1,30_2004,4,4,2744,2974,,"{""O"": 4, ""Claim"": 13, ""Premise"": 25}",Premise,And we also then finally had to stand up democ...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Mixed


In [None]:
# Merge Claim and Premise into a single "Arg" class; "O" is left untouched
df['Component'] = df['Component'].str.replace('Claim', 'Arg').str.replace('Premise', 'Arg')

# Keep only the rows that actually carry a component label
df = df[df['Component'].notna()]

In [None]:

# Split by the dataset's own 'Set' column (as the authors did) and keep only
# the text and label columns of each part
df_train, df_val, df_test = (
    df.loc[df['Set'] == split_name, ['Speech', 'Component']]
    for split_name in ('TRAIN', 'VALIDATION', 'TEST')
)


In [None]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler

# The sampler needs a 2-D feature matrix, so the text column becomes shape (n, 1)
X_train = df_train['Speech'].to_numpy()[:, None]
y_train = df_train['Component'].to_numpy()

# Seeded random oversampler, applied to the training split
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Rebuild a two-column frame from the resampled arrays
df_train_resampled = pd.DataFrame({
    'Speech': X_resampled.ravel(),
    'Component': y_resampled,
})

# Class counts after resampling, to confirm both classes are equally frequent
print(df_train_resampled['Component'].value_counts())



Component
O      10464
Arg    10464
Name: count, dtype: int64


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Re-split the whole labelled frame 80/20, stratified on the label column.
# NOTE(review): this rebinds df_train/df_test from the earlier 'Set'-based split while
# df_val is left as it was, so df_val may overlap the new df_train — confirm this is intended.
df_train, df_test = train_test_split(
    df,
    random_state=42,
    stratify=df['Component'],
    test_size=0.2,
)

# Function to balance a dataset using RandomOverSampler
def balance_dataset(df, text_column='Speech', target_column='Component', random_state=42):
    """Return a class-balanced copy of ``df`` produced by random oversampling.

    Args:
        df: DataFrame holding at least ``text_column`` and ``target_column``.
        text_column: name of the column with the input text.
        target_column: name of the column with the class label.
        random_state: seed handed to ``RandomOverSampler`` so the result is repeatable.

    Returns:
        A new DataFrame with exactly the two columns ``text_column`` and
        ``target_column``; ``df`` itself is not modified.
    """
    # fit_resample expects a 2-D feature matrix, hence the (n, 1) reshape of the text column
    X = df[text_column].values.reshape(-1, 1)
    y = df[target_column].values
    ros = RandomOverSampler(random_state=random_state)
    X_resampled, y_resampled = ros.fit_resample(X, y)
    return pd.DataFrame({
        text_column: X_resampled.flatten(),
        target_column: y_resampled,
    })

# Oversample both parts of the split.
# NOTE(review): the test part is oversampled too, so test metrics are computed on
# duplicated rows — confirm this is intended.
df_train_resampled = balance_dataset(df_train)
df_test_resampled = balance_dataset(df_test)

# Show the class counts of each balanced frame
print("Training set class distribution:", df_train_resampled['Component'].value_counts(), sep="\n")
print("Testing set class distribution:", df_test_resampled['Component'].value_counts(), sep="\n")


Training set class distribution:
Component
Arg    17824
O      17824
Name: count, dtype: int64
Testing set class distribution:
Component
Arg    4456
O      4456
Name: count, dtype: int64


In [None]:
# Experiment settings grouped by concern: model/tokenizer, data, and training loop.
# NOTE(review): df_val comes from the 'Set'-based split while df_train was later
# re-split at random, so the two frames may overlap — confirm before using them together.
config = {
    "transformer_model": {
        "model": "chkla/roberta-argument",
        "path_to_state_dict": False,
        "device": 'cuda',
        "dropout": 0.2,
        "learning_rate": 2e-5,
        "batch_size": 16,
        "shuffle": True,
        "maxlen": 128,
    },
    "data": {
        "train_data_path": df_train,
        "test_data_path": df_val,
        "text_column": "Speech",
        "target_column": "Component",
        "random_state": 52,
        "test_size": 0.3,
        "stratify": True,
    },
    "training": {
        # if False the model will be saved using torch.save()
        # and should be loaded like this: model = torch.load()
        # you will have to install the library to do so
        "save_state_dict": False,
        "early_stopping": True,
        "delta": 0.001,
        "patience": 7,
        "num_epochs": 2,
        "average_f1": 'macro',
        "other_metrics": ['micro', 'weighted'],
        "output_dir": "../results/",
        "class_weight": True,
    },
}


In [None]:

# No optimizer is built here: `model` is not defined until a later cell, so calling
# model.parameters() at this point raised NameError on a fresh kernel. The training cell
# constructs its own optimizer right before it is used.
# NOTE(review): NLLLoss expects log-probabilities, and no visible cell calls `criterion`
# — confirm whether it is still needed.
criterion = nn.NLLLoss()


In [None]:
# Build the label <-> id mapping first so the model head, the datasets and the report
# all share one definition. Sorting makes the ids independent of row order.
label2id = {label: i for i, label in enumerate(sorted(df_train_resampled['Component'].unique()))}
id2label = {i: label for label, i in label2id.items()}

# Load tokenizer and model; the mappings are stored in the model config so a saved
# checkpoint carries the class names along with the weights.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

class ArgumentDataset(Dataset):
    """Torch dataset that tokenizes one row of a Speech/Component frame per item.

    Args:
        dataframe: pandas DataFrame with a 'Speech' text column and a 'Component' label column.
        tokenizer: callable tokenizer whose result exposes 'input_ids' and 'attention_mask' tensors.
        max_length: every example is padded or truncated to this many tokens.
        label2id: mapping from the string labels found in 'Component' to integer class ids.
    """

    def __init__(self, dataframe, tokenizer, max_length, label2id):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label2id = label2id

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return input ids, attention mask and integer label for the row at position ``idx``.

        Raises:
            KeyError: if the row's label is not a key of ``label2id``.
        """
        row = self.data.iloc[idx]  # positional lookup, done once per item
        # Calling the tokenizer directly replaces the deprecated encode_plus()
        encoding = self.tokenizer(
            row['Speech'],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            # return_tensors='pt' adds a leading batch axis of size 1; flatten() drops it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.label2id[row['Component']], dtype=torch.long)  # Convert label to ID
        }

# Constants
MAX_LENGTH = 128  # token budget per example; every example is padded or truncated to it

# Wrap both balanced frames as torch datasets sharing one tokenizer and label mapping
train_dataset, test_dataset = (
    ArgumentDataset(frame, tokenizer, MAX_LENGTH, label2id)
    for frame in (df_train_resampled, df_test_resampled)
)

# Batch the datasets; only the training batches are shuffled
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the saved checkpoint for a quick single-sentence prediction.
# NOTE(review): this rebinds `tokenizer` and `model`, so any cell run after this one works
# with the checkpoint loaded here, and the directory must already exist — confirm the
# intended execution order.
tokenizer = RobertaTokenizer.from_pretrained('/content/fine_tuned_roberta_model')
model = RobertaForSequenceClassification.from_pretrained('/content/fine_tuned_roberta_model')

# Example input
input_text = "I hate everything"

# Pick the device once and put the model on it in inference mode
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

# Tokenize input text; truncation keeps over-long inputs within the model's maximum length
inputs = tokenizer(input_text, return_tensors='pt', truncation=True)

# Ensure inputs are on the same device as the model
inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

# Forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Get predicted label: argmax over the class axis rather than over the flattened logits
predicted_label = torch.argmax(outputs.logits, dim=-1).item()

# Print predicted label
print(f"Predicted Label: {predicted_label}")

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_scheduler
# Training parameters
num_epochs = 3
learning_rate = 2e-5

# Optimizer and scheduler
# transformers.AdamW is deprecated in favour of torch's implementation. eps and weight_decay
# are passed explicitly because torch's defaults (1e-8, 0.01) differ from the former
# transformers defaults (1e-6, 0.0).
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
num_training_steps = len(train_loader) * num_epochs  # one scheduler step per batch
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


for epoch in range(num_epochs):
    # Training
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # With labels supplied, the loss is read straight from the model output
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Evaluation on validation set (if needed)
    # For brevity, validation evaluation is omitted in this example

# Evaluation on test set
model.eval()
test_predictions = []
test_targets = []
with torch.no_grad():  # gradients are not needed for inference
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']  # only compared on the host, so not moved to the device

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        test_predictions.extend(predictions.cpu().tolist())
        test_targets.extend(labels.tolist())

# Calculate test metrics
test_accuracy = accuracy_score(test_targets, test_predictions)
# `labels` pins each class id to its name, so the report stays aligned even when a class
# is missing from the targets or the predictions.
test_classification_report = classification_report(
    test_targets,
    test_predictions,
    labels=list(label2id.values()),
    target_names=list(label2id.keys()),
)

print("Test Set Metrics:")
print(f"Test Accuracy: {test_accuracy}")
print(f"Test Classification Report:\n{test_classification_report}")

# Write the fine-tuned weights and the tokenizer files into the same directory
model_path = "./fine_tuned_roberta_model"
model.save_pretrained(save_directory=model_path)
tokenizer.save_pretrained(save_directory=model_path)



Test Set Metrics:
Test Accuracy: 0.7609964093357271
Test Classification Report:
              precision    recall  f1-score   support

         Arg       0.71      0.89      0.79      4456
           O       0.86      0.63      0.72      4456

    accuracy                           0.76      8912
   macro avg       0.78      0.76      0.76      8912
weighted avg       0.78      0.76      0.76      8912



('./fine_tuned_roberta_model/tokenizer_config.json',
 './fine_tuned_roberta_model/special_tokens_map.json',
 './fine_tuned_roberta_model/vocab.json',
 './fine_tuned_roberta_model/merges.txt',
 './fine_tuned_roberta_model/added_tokens.json')

In [None]:
!unzip /content/fine_tuned_roberta_model.zip -d /content/fine_tuned_roberta_model


Archive:  /content/fine_tuned_roberta_model.zip
   creating: /content/fine_tuned_roberta_model/content/fine_tuned_roberta_model/
  inflating: /content/fine_tuned_roberta_model/content/fine_tuned_roberta_model/config.json  
  inflating: /content/fine_tuned_roberta_model/content/fine_tuned_roberta_model/tokenizer_config.json  
  inflating: /content/fine_tuned_roberta_model/content/fine_tuned_roberta_model/model.safetensors  
  inflating: /content/fine_tuned_roberta_model/content/fine_tuned_roberta_model/special_tokens_map.json  
  inflating: /content/fine_tuned_roberta_model/content/fine_tuned_roberta_model/vocab.json  
  inflating: /content/fine_tuned_roberta_model/content/fine_tuned_roberta_model/merges.txt  


In [None]:
# AdamW is no longer imported from transformers: it is deprecated there and not used in this cell
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_scheduler
import torch
tokenizer = RobertaTokenizer.from_pretrained('/content/fine_tuned_roberta_model')
model = RobertaForSequenceClassification.from_pretrained('/content/fine_tuned_roberta_model')

# test input
input_text = "Taxonomists use Latin words to classify various animals into such categories as kingdom, phylum, class, order, family, genus, and species. Thus, some species of bear are Ursus americanus (American black bear), Ursus arctos (brown bear), and Ursus maritimus (polar bear).."

# Pick the device once and put the model on it in inference mode
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

# Tokenize input text; truncation keeps over-long inputs within the model's maximum length
inputs = tokenizer(input_text, return_tensors='pt', truncation=True)

# Ensure inputs are on the same device as the model
inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

# Forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Get predicted label: argmax over the class axis rather than over the flattened logits
predicted_label = torch.argmax(outputs.logits, dim=-1).item()

# Print predicted label
print(f"Predicted Label: {predicted_label}")

Predicted Label: 1
