## Steps

1. Construct a dataset for each corpus for AV classifier training and testing
2. Training + Testing

In [None]:
# %%writefile ../scripts/create_AV_datasets.py
# import os
# import pandas as pd
# from random import sample
# from itertools import combinations, product
# from sklearn.model_selection import train_test_split


# def construct_AV_dataset(df, 
#                          author_col="author", 
#                          text_col="text", 
#                          num_of_samples=100000, 
#                          pos_ratio=0.4):
#     cols = ["author1", "author2", "text1", "text2", "label"]
#     pos_samples = []
#     neg_samples = []

#     # Group texts by author
#     author_groups = df.groupby(author_col)

#     # Generate all possible positive pairs (same author)
#     for author, group in author_groups:
#         indices = group.index.tolist()
#         if len(indices) >= 2:
#             pos_samples.extend(combinations(indices, 2))

#     # Generate all possible negative pairs (different authors)
#     authors = df[author_col].unique()
#     author_indices = {author: df[df[author_col] == author].index.tolist() for author in authors}
#     author_list = list(author_indices.keys())

#     for i in range(len(author_list)):
#         for j in range(i+1, len(author_list)):
#             idx1 = author_indices[author_list[i]]
#             idx2 = author_indices[author_list[j]]
#             neg_samples.extend(product(idx1, idx2))

#     # Shuffle and sample
#     pos_needed = int(num_of_samples * pos_ratio)
#     neg_needed = num_of_samples - pos_needed

#     pos_pairs = sample(pos_samples, min(pos_needed, len(pos_samples)))
#     neg_pairs = sample(neg_samples, min(neg_needed, len(neg_samples)))

#     # Construct output
#     out = []
#     for ix1, ix2 in pos_pairs:
#         out.append([df.at[ix1, author_col], df.at[ix2, author_col], df.at[ix1, text_col], df.at[ix2, text_col], 1])
#     for ix1, ix2 in neg_pairs:
#         out.append([df.at[ix1, author_col], df.at[ix2, author_col], df.at[ix1, text_col], df.at[ix2, text_col], 0])

#     out_df = pd.DataFrame(out, columns=cols)
#     out_df = out_df.sample(frac=1).reset_index(drop=True)
#     return out_df


# def create_AV_train_valid_test_sets(fp_train, 
#                                     save_dir,
#                                     fp_test=None,
#                                     author_col="author", 
#                                     text_col="text", 
#                                     num_of_samples_train=100000, 
#                                     num_of_samples_test=20000,
#                                     valid_set_ratio=0.2,
#                                     pos_ratio=0.4):
    
#     if os.path.exists(save_dir):
#         print(f"==> {save_dir} already exists. Please remove it to create a new dataset.")
#         return
    
#     print(f"===== Creating AV dataset for {save_dir} =====")

#     df_train = pd.read_csv(fp_train)

#     if fp_test is None:
#         df_test = pd.read_csv(fp_train.replace("train", "test"))
#     else:
#         df_test = pd.read_csv(fp_test)
    
#     train_df = construct_AV_dataset(df_train, author_col, text_col, 
#                                     num_of_samples_train, pos_ratio)
    
#     train_df, valid_df = train_test_split(train_df, test_size=valid_set_ratio, random_state=42)
#     test_df = construct_AV_dataset(df_test, author_col, text_col, 
#                                     num_of_samples_test, pos_ratio)

#     os.makedirs(save_dir, exist_ok=True)
#     train_df.to_csv(os.path.join(save_dir, "train.csv"), index=False)
#     valid_df.to_csv(os.path.join(save_dir, "valid.csv"), index=False)
#     test_df.to_csv(os.path.join(save_dir, "test.csv"), index=False)
#     print(f"==> Train set size: {len(train_df)}")
#     print("==> Train set label distribution:\n", train_df.label.value_counts(), "\n\n")

#     print(f"==> Validation set size: {len(valid_df)}")
#     print("==> Validation set label distribution:\n", valid_df.label.value_counts(), "\n\n")

#     print(f"==> Test set size: {len(test_df)}")
#     print("+=> Test set label distribution:\n", test_df.label.value_counts())


# if __name__ == "__main__":
#     create_AV_train_valid_test_sets("../dataset_prepare/blog_train.csv", "../dataset_prepare/blog_AV_datasets")
#     create_AV_train_valid_test_sets("../dataset_prepare/CCAT50_train.csv", "../dataset_prepare/CCAT50_AV_datasets")
#     create_AV_train_valid_test_sets("../dataset_prepare/enron_train.csv", "../dataset_prepare/enron_AV_datasets")
#     create_AV_train_valid_test_sets("../dataset_prepare/reddit_train.csv", "../dataset_prepare/reddit_AV_datasets")

Writing ../scripts/create_AV_datasets.py


## Training

In [None]:
%%writefile ../train_and_eval_an_AV_model.py
import os
import torch
import argparse
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import Dataset
import evaluate
import numpy as np
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import classification_report
from torch.nn.functional import softmax


def get_args(argv=None):
    """Parse the command-line options of the AV train-and-evaluate script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is the
            behavior callers without arguments already rely on.

    Returns:
        argparse.Namespace holding one attribute per option below.
        ``fp16`` and ``load_best_model_at_end`` are kept as strings; the
        caller converts them with ``bool_str_to_bool``.
    """
    parser = argparse.ArgumentParser(description="Train and evaluate an AV model")

    # Data and model selection.
    parser.add_argument("--data_dir", type=str, required=True, help="Directory containing the dataset")
    parser.add_argument("--model_name", type=str, default="allenai/longformer-base-4096", help="Name of the model to be used")
    parser.add_argument("--max_length", type=int, default=2048, help="Maximum length of the input sequences")

    # Optimization hyper-parameters.
    parser.add_argument("--num_train_epochs", type=int, default=1, help="Number of training epochs")
    parser.add_argument("--train_batch_size", type=int, default=8, help="Batch size for training")
    parser.add_argument("--eval_batch_size", type=int, default=16, help="Batch size for evaluation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=4, help="Gradient accumulation steps")
    parser.add_argument("--warmup_steps", type=int, default=500, help="Number of warmup steps for learning rate scheduler")
    parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay for optimizer")
    parser.add_argument("--learning_rate", type=float, default=2e-5, help="Learning rate for optimizer")

    # Logging, evaluation and checkpointing.
    parser.add_argument("--logging_steps", type=int, default=100, help="Logging steps")
    parser.add_argument("--evaluation_strategy", type=str, default="epoch", help="Evaluation strategy")
    # Boolean-like flags are passed as strings (e.g. "True"/"false").
    parser.add_argument("--load_best_model_at_end", type=str, default="True", help="Load the best model at the end of training")
    parser.add_argument("--fp16", type=str, default="True", help="Use mixed precision training")
    parser.add_argument("--save_total_limit", type=int, default=1, help="Limit the total amount of checkpoints")
    # NOTE(review): main() builds its own checkpoint directory from the model
    # and dataset names and does not read this option at the moment.
    parser.add_argument("--output_dir", type=str, default="./models", help="Output directory for model checkpoints")

    return parser.parse_args(argv)


def load_AV_dataset(data_dire):
    """Read the three CSV splits stored in ``data_dire``.

    Returns:
        A ``(train_df, valid_df, test_df)`` tuple of DataFrames read from
        ``train.csv``, ``valid.csv`` and ``test.csv`` respectively.
    """
    split_files = ("train.csv", "valid.csv", "test.csv")
    return tuple(pd.read_csv(os.path.join(data_dire, filename)) for filename in split_files)


def bool_str_to_bool(value):
    """Convert a boolean-like command-line string to a ``bool``.

    Args:
        value: Either an actual ``bool`` (returned unchanged) or a string.
            Strings are compared case-insensitively, ignoring surrounding
            whitespace.

    Returns:
        ``True`` for ``true``, ``1``, ``yes``, ``y`` or ``t``; ``False`` for
        any other string (including the empty string).
    """
    # Allow programmatic callers to pass a real boolean straight through.
    if isinstance(value, bool):
        return value
    return value.strip().lower() in ('true', '1', 'yes', 'y', 't')


class CustomDataset(Dataset):
    """Dataset that serves tokenizer encodings together with their labels.

    ``encodings`` must expose ``.items()`` yielding ``(name, values)`` pairs
    whose values are indexable per example; ``labels`` is an indexable
    sequence with one entry per example and defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field plus the label, as tensors.
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


def compute_metrics(eval_pred):
    """Compute the F1 score for a batch of evaluation predictions.

    Args:
        eval_pred: A ``(logits, labels)`` pair as handed over by the
            ``Trainer``. If the model returns several outputs the logits
            are expected to be the first element of the tuple.

    Returns:
        The dictionary produced by the ``f1`` metric's ``compute``.
    """
    # Load the metric only once and reuse it; loading it on every evaluation
    # call repeats the lookup for no benefit.
    f1_metric = getattr(compute_metrics, "_f1_metric", None)
    if f1_metric is None:
        f1_metric = evaluate.load("f1")
        compute_metrics._f1_metric = f1_metric

    logits, labels = eval_pred
    # Some models return a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    return f1_metric.compute(predictions=predictions, references=labels)


def _encode_text_pairs(tokenizer, df, max_length):
    """Tokenize the ``text1``/``text2`` columns of ``df`` as paired inputs.

    Every pair is truncated and padded to exactly ``max_length`` tokens.
    """
    return tokenizer(list(df['text1']), list(df['text2']),
                     truncation=True, padding="max_length", max_length=max_length)


def main():
    """Fine-tune a sequence-pair classifier on an AV dataset and score it.

    Steps: parse the CLI arguments, load the train/valid/test CSV splits
    from ``--data_dir``, tokenize the text pairs, train with the Hugging
    Face ``Trainer``, print a classification report for the test split and
    write the test rows plus ``prediction``/``probabilities`` columns to
    ``<data_dir>/test_results.csv``.
    """
    args = get_args()

    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
    torch.cuda.empty_cache()

    # Load the datasets
    train_df, valid_df, test_df = load_AV_dataset(args.data_dir)
    print(f"Loaded datasets from {args.data_dir}")

    # Load the tokenizer and tokenize the datasets
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    print(f"Loaded tokenizer from {args.model_name}")

    # Only the first 1000 rows of every split are used. Copy after slicing so
    # the columns added to test_df below are written to an independent frame.
    max_rows = 1000
    train_df = train_df.head(max_rows).copy()
    valid_df = valid_df.head(max_rows).copy()
    test_df = test_df.head(max_rows).copy()

    train_encodings = _encode_text_pairs(tokenizer, train_df, args.max_length)
    valid_encodings = _encode_text_pairs(tokenizer, valid_df, args.max_length)
    test_encodings = _encode_text_pairs(tokenizer, test_df, args.max_length)

    # Create datasets
    train_dataset = CustomDataset(train_encodings, train_df['label'].tolist())
    valid_dataset = CustomDataset(valid_encodings, valid_df['label'].tolist())
    test_dataset = CustomDataset(test_encodings, test_df['label'].tolist())

    # Load the model for sequence classification (two classes: label 0 / 1)
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name, num_labels=2)

    # Derive the checkpoint directory from the model and dataset names.
    # normpath() drops a trailing slash so "path/to/data/" still yields "data".
    # NOTE(review): args.output_dir is parsed but not used here.
    dataset = os.path.basename(os.path.normpath(args.data_dir))
    model_short_name = args.model_name.rstrip("/").split("/")[-1]
    model_output_dir = os.path.join(".", f"{model_short_name}_models", dataset)

    # fp16 defaults to "True"; fall back to full precision on machines
    # without CUDA instead of failing when the training arguments are built.
    use_fp16 = bool_str_to_bool(args.fp16)
    if use_fp16 and not torch.cuda.is_available():
        print("CUDA is not available; disabling fp16 mixed precision.")
        use_fp16 = False

    training_args = TrainingArguments(
        output_dir=model_output_dir,  # Output directory
        fp16=use_fp16,  # Use mixed precision training
        num_train_epochs=args.num_train_epochs,  # Total number of training epochs
        per_device_train_batch_size=args.train_batch_size,  # Batch size per device during training
        per_device_eval_batch_size=args.eval_batch_size,  # Batch size for evaluation
        gradient_accumulation_steps=args.gradient_accumulation_steps,  # Number of update steps to accumulate before an optimizer step
        warmup_steps=args.warmup_steps,  # Number of warmup steps for learning rate scheduler
        weight_decay=args.weight_decay,  # Strength of weight decay
        learning_rate=args.learning_rate,  # Initial learning rate
        save_total_limit=args.save_total_limit,  # Limit the total amount of checkpoints
        logging_steps=args.logging_steps,  # Log every X update steps
        eval_strategy=args.evaluation_strategy,  # Evaluation strategy to adopt during training
        save_strategy=args.evaluation_strategy,  # Kept equal to the evaluation strategy
        load_best_model_at_end=bool_str_to_bool(args.load_best_model_at_end),  # Load the best model at the end of training
    )

    # Create the Trainer object
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Evaluate the model on the test split
    predictions = trainer.predict(test_dataset)
    logits = predictions.predictions  # Raw logits, one row per test example
    y_pred = logits.argmax(-1)

    y_test = test_df['label'].tolist()
    print(classification_report(y_test, y_pred))

    # Convert logits to probabilities using softmax and keep the class-1 value
    probabilities = softmax(torch.tensor(logits), dim=1).tolist()
    test_df["prediction"] = y_pred
    test_df["probabilities"] = [prob[1] for prob in probabilities]
    test_df.to_csv(os.path.join(args.data_dir, "test_results.csv"), index=False)


# Run the training/evaluation pipeline only when executed as a script.
if __name__ == "__main__":
    main()

Writing ../train_and_eval_an_AV_model.py


In [1]:
import os
import torch
# Configure PyTorch's CUDA allocator through PYTORCH_CUDA_ALLOC_CONF
# (max_split_size_mb=128), then release any cached GPU memory.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
torch.cuda.empty_cache()

In [2]:
# Run configuration for this notebook session.
dataset = 'enron_email'  # dataset tag, used in the checkpoint directory name
col_1 = 'longformer_prediction'  # test_df column that receives the predicted label
col_2 = 'longformer_prob'  # test_df column that receives the class-1 probability
model_output_dir = "./AV_longformer_models//"+dataset  # passed to TrainingArguments(output_dir=...)

In [None]:
import os
import pandas as pd


def load_AV_dataset(data_dire):
    """Load the train, validation and test splits from ``data_dire``.

    Each split is a CSV file (``train.csv``, ``valid.csv``, ``test.csv``);
    the three DataFrames are returned in that order.
    """
    def _read_split(filename):
        return pd.read_csv(os.path.join(data_dire, filename))

    return _read_split("train.csv"), _read_split("valid.csv"), _read_split("test.csv")


# Load the enron AV splits and display the label counts of each split
# (train, valid, test) to check the class balance.
train_df, valid_df, test_df = load_AV_dataset("../dataset_prepare/enron_AV_datasets")
train_df.label.value_counts(), valid_df.label.value_counts(), test_df.label.value_counts()

(label
 0    48020
 1    31980
 Name: count, dtype: int64,
 label
 0    11980
 1     8020
 Name: count, dtype: int64,
 label
 0    12000
 1     8000
 Name: count, dtype: int64)

In [4]:
# Run configuration for this notebook session.
dataset = 'enron_email'  # dataset tag, used in the checkpoint directory name
col_1 = 'longformer_prediction'  # test_df column that receives the predicted label
col_2 = 'longformer_prob'  # test_df column that receives the class-1 probability
model_output_dir = "./AV_longformer_models//"+dataset  # passed to TrainingArguments(output_dir=...)

In [5]:
from transformers import AutoTokenizer
from torch.utils.data import Dataset


# Load the Longformer tokenizer
model_name = 'allenai/longformer-base-4096'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Keep only the first 1000 rows of each split for this run.
train_df = train_df.copy()[:1000]
valid_df = valid_df.copy()[:1000]
test_df = test_df.copy()[:1000]

# Tokenize each (text1, text2) pair, truncating and padding to max_length=2048
# (half of the 4096 named in the model checkpoint).
train_encodings = tokenizer(list(train_df['text1']), list(train_df['text2']), truncation=True, padding="max_length", max_length=2048)
valid_encodings = tokenizer(list(valid_df['text1']), list(valid_df['text2']), truncation=True, padding="max_length", max_length=2048)
test_encodings = tokenizer(list(test_df['text1']), list(test_df['text2']), truncation=True, padding="max_length", max_length=2048)

class CustomDataset(Dataset):
    """Torch dataset wrapping tokenizer encodings and their labels.

    Indexing returns a dict with one tensor per encoding field plus a
    ``labels`` tensor; the length is the number of labels.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert the idx-th entry of every encoding field into a tensor.
        field_tensors = ((key, torch.tensor(val[idx])) for key, val in self.encodings.items())
        item = dict(field_tensors)
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap each split's encodings and label list in a CustomDataset.
train_dataset = CustomDataset(train_encodings, train_df['label'].tolist())
valid_dataset = CustomDataset(valid_encodings, valid_df['label'].tolist())
test_dataset = CustomDataset(test_encodings, test_df['label'].tolist())

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import evaluate
import numpy as np
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification


def compute_metrics(eval_pred):
    """Return the F1 metric for a ``(logits, labels)`` evaluation pair.

    The predicted class is the argmax over the last axis of the logits.
    """
    metric = evaluate.load("f1")
    scores, gold_labels = eval_pred
    metric.add_batch(predictions=np.argmax(scores, axis=-1), references=gold_labels)
    return metric.compute()

# Load the Longformer model for sequence classification (2 output labels)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# NOTE(review): the training output of this run reports global_step=31, which is
# below both warmup_steps=500 and logging_steps=100 — the learning-rate warmup
# never completes and no training loss is logged ("No log").
training_args = TrainingArguments(
    output_dir=model_output_dir,  # Output directory
    fp16=True,  # Mixed precision training
    num_train_epochs=1,  # Total # of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,  # batch size for evaluation
    gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps (8 x 4 = 32 samples per device per optimizer update)
    warmup_steps=500,  # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # Strength of weight decay
    learning_rate=2e-5,  # Learning rate
    save_total_limit=1,  # Limit the total amount of checkpoints
    logging_steps=100,  # Log every 100 update steps
    evaluation_strategy="epoch",  # Evaluate on eval_dataset at the end of each epoch
)

# Create the Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Fine-tune the model on train_dataset with the arguments configured above.
trainer.train()

Initializing global attention on CLS token...


Epoch,Training Loss,Validation Loss,F1
0,No log,0.672696,0.0


TrainOutput(global_step=31, training_loss=0.6791960193264869, metrics={'train_runtime': 216.9593, 'train_samples_per_second': 4.609, 'train_steps_per_second': 0.143, 'total_flos': 1303193597509632.0, 'train_loss': 0.6791960193264869, 'epoch': 0.992})

In [9]:
# Evaluate on the Trainer's eval_dataset (the validation split) and show the metrics.
trainer.evaluate()

{'eval_loss': 0.6726962924003601,
 'eval_f1': 0.0,
 'eval_runtime': 88.3509,
 'eval_samples_per_second': 11.319,
 'eval_steps_per_second': 0.713,
 'epoch': 0.992}

In [None]:
from sklearn.metrics import classification_report

# Predict on the test split; the predicted label is the argmax over the last axis.
predictions = trainer.predict(test_dataset)
y_pred = predictions.predictions.argmax(-1)

# Compare the predictions with the true labels in a per-class report.
y_test = test_df.label.tolist()
print(classification_report(y_test,y_pred))

[[ 17 588]
 [  4 391]]
              precision    recall  f1-score   support

           0       0.81      0.03      0.05       605
           1       0.40      0.99      0.57       395

    accuracy                           0.41      1000
   macro avg       0.60      0.51      0.31      1000
weighted avg       0.65      0.41      0.26      1000



In [17]:
# Display the predicted test labels.
y_pred

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [18]:
# Extract logits from predictions
from torch.nn.functional import softmax
logits = predictions.predictions  # This contains the raw logits output
# Convert logits to per-class probabilities with a softmax over axis 1
probabilities = softmax(torch.tensor(logits), dim=1).tolist()
# Extract probabilities for class 1
class_1_probabilities = [prob[1] for prob in probabilities]
# Store the predicted label and the class-1 probability next to the test rows
test_df[col_1]=y_pred
test_df[col_2] = class_1_probabilities
# Disabled export of the annotated test set (note: `corpora` is not defined in this notebook):
# new_test_file_name =  'predictions//'+corpora+'_av_prediction_'+dataset+'.csv'
# test_df.to_csv(new_test_file_name,index=False,header=True, sep='\t')
test_df

Unnamed: 0,author1,author2,text1,text2,label,longformer_prediction,longformer_prob
0,Sally Beck,Sally Beck,I will be out of the office for most of the da...,"Mary Solmonson, one of my direct reports, has ...",1,1,0.517556
1,Sara Shackleton,Wordsmith,Tanya: There's no short answer with these peo...,punctilious (pungk-TIL-ee-uhs) adjective\n\n ...,0,1,0.527005
2,Bruno Gaillard,Bruno Gaillard,"SB 1712, Author, Polanco. Universal communica...",AB 2198 Telecommunications: local telephone se...,1,1,0.529499
3,Sally Beck,Sally Beck,Contact numbers are listed below for Greg Pipe...,I will be on vacation the week of March while ...,1,1,0.547146
4,Mike Grigsby,Marie Heard,"Dear John,\n\nI had a meeting with Phillip aft...","Hi, Tanya!\n\nJay said that he has passed AK S...",0,1,0.527708
...,...,...,...,...,...,...,...
995,Kimberly Watson,Michelle Cash,"Earl and Mansoor,\n\nI have received a copy of...","Dee,\n\nI left you a voice mail yesterday, but...",0,1,0.533542
996,Susan Bailey,Mark Greenberg,We have received an executed Master Agreement:...,Julia/Alan/Mark -\n\nMark asked me to work on ...,0,1,0.522552
997,Susan M Scott,Mary Cook,Fifth grade assignment\n\nThe teacher gave her...,1. I was not sure whose doc the referenced no...,0,1,0.511511
998,Pete Davis,Pete Davis,\n\nStart Date: 1/21/02; HourAhead hour: 16; ...,\n\nStart Date: 10/23/01; HourAhead hour: 21; ...,1,1,0.523344


In [19]:
# Fraction of test rows whose predicted label equals the true label.
(test_df.label == test_df[col_1]).mean()

np.float64(0.408)