In [1]:
import os

# Step up to the parent directory so the relative paths used in later cells
# (e.g. "dataset_prepare/...", "./AA_models/...") resolve from there.
os.chdir("../")

In [None]:
import argparse
import pandas as pd

import torch
from torch.utils.data import Dataset

import numpy as np
import evaluate  

from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

from sklearn.metrics import classification_report
from torch.nn.functional import softmax


def get_args():
    """Parse the command-line options for an AA training/evaluation run.

    The two dataset paths are mandatory; every other option has a default.
    Boolean-like switches (``--load_best_model_at_end``, ``--fp16`` and
    ``--resume_from_checkpoint``) are kept as strings, so converting them to
    real booleans is left to the caller.

    Returns:
        argparse.Namespace: The options parsed from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Train and evaluate an AA model.")

    # Mandatory dataset locations.
    for flag, help_text in (
        ("--training_df_fp", "Filepath for the training dataset"),
        ("--test_df_fp", "Filepath for the test dataset"),
    ):
        parser.add_argument(flag, type=str, required=True, help=help_text)

    # (flag, type, default, help) for each optional setting, listed in the
    # order they are registered with the parser.
    optional_settings = (
        ("--model_name", str, "allenai/longformer-base-4096", "Name of the model to be used"),
        ("--max_length", int, 2048, "Maximum length of the input sequences"),
        ("--num_train_epochs", int, 10, "Number of training epochs"),
        ("--train_batch_size", int, 8, "Batch size for training"),
        ("--eval_batch_size", int, 16, "Batch size for evaluation"),
        ("--gradient_accumulation_steps", int, 4, "Gradient accumulation steps"),
        ("--warmup_steps", int, 500, "Number of warmup steps for learning rate scheduler"),
        ("--weight_decay", float, 0.01, "Weight decay for optimizer"),
        ("--learning_rate", float, 2e-5, "Learning rate for optimizer"),
        ("--logging_steps", int, 100, "Logging steps"),
        ("--evaluation_strategy", str, "epoch", "Evaluation strategy"),
        ("--load_best_model_at_end", str, "True", "Load the best model at the end of training"),
        ("--fp16", str, "True", "Use mixed precision training"),
        ("--save_total_limit", int, 1, "Limit the total amount of checkpoints"),
        ("--resume_from_checkpoint", str, "True", "Resume training from checkpoint"),
    )
    for flag, value_type, default, help_text in optional_settings:
        parser.add_argument(flag, type=value_type, default=default, help=help_text)

    return parser.parse_args()


def get_author_map(df, author_col="author"):
    """Assign an integer id to every distinct author in a DataFrame.

    Ids follow the order in which ``df[author_col].unique()`` returns the
    authors, starting at 0.

    Args:
        df: DataFrame containing the author column.
        author_col: Name of the column holding the author identifiers.

    Returns:
        dict: Maps each unique author value to its integer id.
    """
    distinct_authors = df[author_col].unique()
    return dict(zip(distinct_authors, range(len(distinct_authors))))


class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            with one entry per example.
        labels: Sequence of labels, one per example. May be a list, an array
            or a pandas Series.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the tensors for the example at *position* ``idx``.

        Returns:
            dict: One tensor per encoding field plus a ``labels`` tensor.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        labels = self.labels
        # ``idx`` is a position. Plain ``[]`` on a pandas Series looks up by
        # index *label*, which raises or returns the wrong row whenever the
        # Series index is not exactly 0..n-1 (e.g. after a shuffle or split
        # without ``reset_index``). Use positional access when available.
        label = labels.iloc[idx] if hasattr(labels, "iloc") else labels[idx]
        item['labels'] = torch.tensor(label)
        return item

    def __len__(self):
        """Return the number of examples (the number of labels)."""
        return len(self.labels)


def bool_str_to_bool(value):
    """Interpret a textual flag as a boolean.

    Args:
        value: String to interpret; matching ignores letter case.

    Returns:
        bool: True when ``value`` is one of "true", "1", "yes", "y" or "t",
        False for any other string.
    """
    truthy_tokens = {'true', '1', 'yes', 'y', 't'}
    normalized = value.lower()
    return normalized in truthy_tokens


def compute_metrics(eval_pred):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class for each
            example is the arg-max of its logits along the last axis.

    Returns:
        The result of the ``f1`` metric's ``compute`` call with
        ``average="weighted"``.
    """
    logits, references = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    metric = evaluate.load("f1")
    # Passing the inputs straight to ``compute`` registers them as a batch
    # before the score is calculated.
    return metric.compute(
        predictions=predicted_classes,
        references=references,
        average="weighted",
    )


def main():
    """Fine-tune a sequence classifier for authorship attribution and score it.

    Steps: parse the CLI options, load the train/test CSVs, add an integer
    ``label`` column when the training CSV has none (writing both CSVs back),
    split off a stratified validation set, tokenize, train with ``Trainer``,
    print a classification report for the test set and write the predictions
    back into the test CSV.

    Raises:
        AssertionError: If the dataset-name prefixes of the two file names
            differ.
    """
    args = get_args()
    # Check that the dataset names match between the training and evaluation
    # files. The name is the part of the file name before the first "_".
    dataset = args.training_df_fp.split("/")[-1].split(".")[0].split("_")[0]
    dataset_ = args.test_df_fp.split("/")[-1].split(".")[0].split("_")[0]
    assert dataset == dataset_, f"Dataset name mismatch: {dataset} != {dataset_}"

    # Load the training and test datasets
    df = pd.read_csv(args.training_df_fp)
    test_df = pd.read_csv(args.test_df_fp)

    # Derive integer labels from the training authors when the training CSV
    # has no "label" column yet, and persist them to both CSVs.
    if "label" not in df.columns:
        author_map = get_author_map(df)
        df["label"] = df["author"].map(author_map)
        test_df["label"] = test_df["author"].map(author_map)
        df.to_csv(args.training_df_fp, index=False)
        test_df.to_csv(args.test_df_fp, index=False)
        print(f"Appended author labels to {args.training_df_fp} and {args.test_df_fp}")

    # Split the training data into train and validation sets
    train_df, valid_df = train_test_split(df, test_size=0.2,
                                          random_state=42,
                                          stratify=df["label"])
    train_df = train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)

    # Load the tokenizer and tokenize the datasets
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    train_encodings = tokenizer(list(train_df['text']),
                                truncation=True, padding="max_length",
                                max_length=args.max_length)
    valid_encodings = tokenizer(list(valid_df['text']),
                                truncation=True, padding="max_length",
                                max_length=args.max_length)
    test_encodings = tokenizer(list(test_df['text']),
                               truncation=True, padding="max_length",
                               max_length=args.max_length)
    train_dataset = CustomDataset(train_encodings, train_df['label'])
    valid_dataset = CustomDataset(valid_encodings, valid_df['label'])
    test_dataset = CustomDataset(test_encodings, test_df['label'])

    # Load the model and train it
    model_output_dir = f"./AA_models/{args.model_name.split('/')[-1]}/" + dataset
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name, device_map="auto",
                                                               num_labels=len(df["label"].unique()))

    training_args = TrainingArguments(
        output_dir=model_output_dir,  # output directory
        fp16=bool_str_to_bool(args.fp16),  # Use mixed precision training
        num_train_epochs=args.num_train_epochs,  # total # of training epochs
        per_device_train_batch_size=args.train_batch_size,  # batch size per device during training
        per_device_eval_batch_size=args.eval_batch_size,  # batch size for evaluation
        # number of updates steps to accumulate before performing a backward/update pass
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        warmup_steps=args.warmup_steps,  # Number of warmup steps for learning rate scheduler
        weight_decay=args.weight_decay,  # Strength of weight decay
        learning_rate=args.learning_rate,  # Initial learning rate
        save_total_limit=args.save_total_limit,  # Limit the total amount of checkpoints
        logging_steps=args.logging_steps,  # Log every X updates steps
        # NOTE(review): newer transformers releases name this argument
        # `eval_strategy` -- confirm against the installed version.
        evaluation_strategy=args.evaluation_strategy,  # evaluation strategy to adopt during training
        save_strategy=args.evaluation_strategy,  # save on the same schedule as evaluation
        # The CLI value is a string; pass a real bool so that "False" is not
        # treated as truthy.
        load_best_model_at_end=bool_str_to_bool(args.load_best_model_at_end),
        metric_for_best_model="eval_loss",  # compare checkpoints by validation loss
        greater_is_better=False,  # a lower validation loss is better
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics,
    )

    # Train the model, resuming only when requested and a checkpoint exists.
    # The output directory may not exist yet on a first run, so check for it
    # before listing its contents.
    resume_from_checkpoint = bool_str_to_bool(args.resume_from_checkpoint)
    has_checkpoint = os.path.isdir(model_output_dir) and any(
        "checkpoint" in entry for entry in os.listdir(model_output_dir)
    )
    if has_checkpoint and resume_from_checkpoint:
        print("Resuming from the latest checkpoint")
        trainer.train(resume_from_checkpoint=True)
    else:
        trainer.train(resume_from_checkpoint=False)

    # Evaluate the model
    predictions = trainer.predict(test_dataset)
    y_pred = predictions.predictions.argmax(-1)

    y_test = test_df.label.tolist()
    print(classification_report(y_test, y_pred))

    logits = predictions.predictions  # This contains the raw logits output
    # Convert logits to probabilities using softmax
    probabilities = softmax(torch.tensor(logits), dim=1).tolist()

    model_name = args.model_name.split('/')[-1]
    test_df[f"{model_name}-prediction"] = y_pred
    # NOTE(review): only the probability of class index 1 is stored per row;
    # confirm this is intended when there are more than two authors.
    test_df[f"{model_name}-probabilities"] = [prob[1] for prob in probabilities]
    test_df.to_csv(args.test_df_fp, index=False)
    print(f"Predictions and probabilities by {args.model_name} saved to {args.test_df_fp}")


# Run the pipeline only when this code is executed as the main module.
if __name__ == "__main__":
    main()

In [None]:
def get_author_map(df, author_col="author"):
    """Assign an integer id to every distinct author in a DataFrame.

    Ids follow the order in which ``df[author_col].unique()`` returns the
    authors, starting at 0.

    Args:
        df: DataFrame containing the author column.
        author_col: Name of the column holding the author identifiers.

    Returns:
        dict: Maps each unique author value to its integer id.
    """
    distinct_authors = df[author_col].unique()
    return dict(zip(distinct_authors, range(len(distinct_authors))))

In [None]:
import pandas as pd


# Load the blog train/test splits.
df = pd.read_csv("dataset_prepare/blog_train.csv")
test_df = pd.read_csv("dataset_prepare/blog_test.csv")

# Author ids are derived from the training split only; the same mapping is
# then applied to both frames.
author_map = get_author_map(df, author_col="author")
for _frame in (df, test_df):
    _frame["label"] = _frame["author"].map(author_map)

In [12]:
import torch
from torch.utils.data import Dataset


class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            with one entry per example.
        labels: Sequence of labels, one per example. May be a list, an array
            or a pandas Series.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the tensors for the example at *position* ``idx``.

        Returns:
            dict: One tensor per encoding field plus a ``labels`` tensor.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        labels = self.labels
        # ``idx`` is a position. Plain ``[]`` on a pandas Series looks up by
        # index *label*, which raises or returns the wrong row whenever the
        # Series index is not exactly 0..n-1 (e.g. after a shuffle or split
        # without ``reset_index``). Use positional access when available.
        label = labels.iloc[idx] if hasattr(labels, "iloc") else labels[idx]
        item['labels'] = torch.tensor(label)
        return item

    def __len__(self):
        """Return the number of examples (the number of labels)."""
        return len(self.labels)

In [13]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

model_name = "bert-base-cased"
max_length = 512

# Tokenizer and classifier head sized to the number of known authors.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(author_map),
    device_map="auto",
)

# Hold out 20% of the training frame for validation and give both parts a
# fresh 0..n-1 index.
# NOTE(review): unlike the split in main(), this one is not stratified on
# "label" -- confirm that is intended.
train_df, valid_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=42)
)


def _encode(texts):
    """Tokenize ``texts`` with truncation and padding to ``max_length``."""
    return tokenizer(list(texts), truncation=True, padding="max_length", max_length=max_length)


train_encodings = _encode(train_df['text'])
valid_encodings = _encode(valid_df['text'])
test_encodings = _encode(test_df['text'])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Pair each split's encodings with the "label" column of its frame.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(encodings, frame['label'])
    for encodings, frame in (
        (train_encodings, train_df),
        (valid_encodings, valid_df),
        (test_encodings, test_df),
    )
)

In [24]:
import numpy as np
import evaluate  # Make sure to import this
def compute_metrics(eval_pred):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class for each
            example is the arg-max of its logits along the last axis.

    Returns:
        The result of the ``f1`` metric's ``compute`` call with
        ``average="weighted"``.
    """
    logits, references = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    metric = evaluate.load("f1")
    # Passing the inputs straight to ``compute`` registers them as a batch
    # before the score is calculated.
    return metric.compute(
        predictions=predicted_classes,
        references=references,
        average="weighted",
    )



from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback

dataset = "blog"
model_output_dir = f"./AA_models/{model_name.split('/')[-1]}/" + dataset

# Hyper-parameters for this run, collected in one place.
training_config = dict(
    output_dir=model_output_dir,  # where checkpoints are written
    num_train_epochs=1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=100,  # learning-rate warmup
    weight_decay=0.01,
    learning_rate=2e-5,
    save_total_limit=1,  # cap on the number of checkpoints kept
    logging_steps=100,
    eval_strategy="steps",  # evaluate on a step schedule
    save_strategy="steps",  # save on a step schedule
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # checkpoints are ranked by validation loss
    greater_is_better=False,  # lower validation loss is better
)
training_args = TrainingArguments(**training_config)

# Early stopping with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Train the model
trainer.train()


Step,Training Loss,Validation Loss,F1
2,3.8062,3.822578,0.166283


KeyboardInterrupt: 

In [25]:
from sklearn.metrics import classification_report
from torch.nn.functional import softmax

# Run the trained model on the test set.
predictions = trainer.predict(test_dataset)
logits = predictions.predictions  # raw logits, one row per test example
y_pred = logits.argmax(-1)

# Per-class precision/recall/F1 against the true labels.
y_test = test_df["label"].tolist()
print(classification_report(y_test, y_pred))

# Softmax over dim 1 turns each row of logits into probabilities.
probabilities = softmax(torch.tensor(logits), dim=1).tolist()
test_df["prediction"] = y_pred
# NOTE(review): only the probability of class index 1 is stored per row;
# confirm this is intended when there are more than two authors.
test_df["probabilities"] = [row[1] for row in probabilities]

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       218
           1       0.00      0.00      0.00       216
           2       0.00      0.00      0.00       179
           3       1.00      0.01      0.02       221
           4       0.00      0.00      0.00       262
           5       0.50      0.01      0.01       273
           6       0.05      0.20      0.09       308
           7       0.14      0.11      0.12       170
           8       0.20      0.01      0.02       268
           9       0.12      0.54      0.20       375
          10       0.68      0.57      0.62       261
          11       0.09      0.42      0.14       406
          12       0.30      0.02      0.04       304
          13       0.27      0.01      0.02       256
          14       0.00      0.00      0.00       197
          15       0.16      0.02      0.03       322
          16       0.50      0.68      0.58       204
          17       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
# Display the test frame (bare expression at the end of the cell).
test_df

Unnamed: 0,author,text,topic,gender,age,sign,date,summary,training sample indices,prompt,label,prediction,probabilities
0,664485,so in comes the sexalicious alltel guy ...,indUnk,male,25,Taurus,"05,April,2003",The writer describes their monthly visits to t...,16746628612122972315564,"Given the following summary, your task is to g...",94,9,0.008105
1,449628,"urlLink 500,000 U.S. IT Jobs Projecte...",indUnk,male,34,Aries,"21,July,2003","According to a Gartner, Inc. report cited by I...",163851262523405182867342,"Given the following summary, your task is to g...",19,27,0.006858
2,1651222,"Yo and hello, y'all, Plez here. Hm...",indUnk,female,38,Virgo,"23,February,2003","Plez reflects on a recent nightclub fire, expr...",186984838196381713814149,"Given the following summary, your task is to g...",61,11,0.013973
3,1784456,I WILL STAND IN THEIR WAY! Ge...,Student,female,16,Aquarius,"09,October,2003",The speaker expresses strong opposition to the...,870310531445384020824,"Given the following summary, your task is to g...",29,11,0.010807
4,180519,Well! long time no post. Lots of things...,indUnk,female,23,Cancer,"29,May,2001",The writer describes a recent enjoyable trip t...,1110317517252031193717823,"Given the following summary, your task is to g...",92,6,0.009998
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25220,585884,Hizzallar! What! Man Horkins no questi...,indUnk,male,17,Sagittarius,"02,August,2004",The writer criticizes a top 500 music list for...,124517881120492340722690,"Given the following summary, your task is to g...",75,54,0.014114
25221,942828,"On July 2, 1871 Venerable J...",indUnk,female,34,Cancer,"02,July,2004","In his July 2, 1871 sermon, Venerable John Hen...",5688250934891443313449,"Given the following summary, your task is to g...",72,72,0.004994
25222,1417798,Hey Gals! It's gonna take me ...,indUnk,female,35,Scorpio,"17,August,2003",The writer is busy with work while their mom i...,37512592974673667875,"Given the following summary, your task is to g...",78,78,0.010120
25223,1107146,"To tell you the truth, and I ...",Student,female,16,Libra,"21,October,2003",The writer expresses indifference about the ou...,179901459413131484521511,"Given the following summary, your task is to g...",20,58,0.010056
