In [7]:
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import numpy as np
from datasets import load_dataset, load_metric, Dataset, Value, ClassLabel, Features, DatasetDict
import torch

sns.set_theme(font='Liberation Serif',
              rc={'figure.figsize': (7.5,3.75),
                  'font.size': 11,
                  'figure.dpi': 300,
                 })

seed = 42

In [8]:
DATA = '../data/'
DATAFRAME = 'final_summaries_ai_aloe_fixed.csv'
summaries_df = pd.read_csv(DATA + DATAFRAME)
texts_to_remove = ['Q31', 'Q35', 'Q41', 'Q39', 'Q33', 'Red Blood Cells SE Task', 'GW_C', 'GW_A', 'GW_B', 
                   'GW_D', '14_ComputerVirus', '5_FireworksDanger', '23_InternetShopping', '6_Smoking', 
                   '10_Internet', 'Low Air Fares_Money']

In [9]:
def prepare_dataframe(summaries_df):
    # copy the data
    df_normalized = summaries_df.copy()  
    # apply normalization techniques
    df_normalized['content_pca'] = StandardScaler().fit_transform(np.array(df_normalized['content_pca']).reshape(-1,1))
    df_normalized['paraphrase_pca'] = StandardScaler().fit_transform(np.array(df_normalized['paraphrase_pca']).reshape(-1,1))
    # combine labels into a single vector
    df_normalized['labels'] = df_normalized.apply(lambda row: [row['content_pca'], row['paraphrase_pca']], axis=1)
    # combine source and summary
    df_normalized['text'] = df_normalized['text'] + '</s>' + df_normalized['source']
    # remove \r tokens
    df_normalized['text'] = df_normalized['text'].str.replace('\xa0', '')
    return df_normalized

In [10]:
def buildDataset(df):
    full_dataset = Dataset.from_pandas(df, preserve_index=False)
    # 70% train, 30% test
    train_valid = full_dataset.train_test_split(test_size=0.176, seed=seed)
    # gather everyone if you want to have a single DatasetDict
    final_dataset = DatasetDict({
        'train': train_valid['train'],
        'valid': train_valid['test']})
    return final_dataset

def prepare_dataset(df_normalized, texts_to_remove):
    test_df = df_normalized[df_normalized['source_text_filename_clean'].isin(texts_to_remove)][['text', 'labels']]
    test_df.columns = ['text', 'labels']
    train_df = df_normalized[df_normalized['source_text_filename_clean'].isin(texts_to_remove) == False][['text', 'labels']]
    test_df.columns = ['text', 'labels']    
    ds = buildDataset(train_df)
    ds['test'] = Dataset.from_pandas(test_df, preserve_index=False)
    return ds


In [11]:
ds = prepare_dataset(prepare_dataframe(summaries_df), texts_to_remove)

In [12]:
from transformers import LongformerTokenizer, LongformerForSequenceClassification, LongformerConfig, logging
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments, EarlyStoppingCallback
from transformers.modeling_outputs import SequenceClassifierOutput

logging.set_verbosity_error()
logging.set_verbosity_warning()

import torch
import torch.nn as nn
import transformers

model_name = 'allenai/longformer-base-4096'

tokenizer = LongformerTokenizer.from_pretrained(model_name, padding=True, truncation=True)
model = LongformerForSequenceClassification.from_pretrained(model_name)

device = torch.device("cpu")

if torch.cuda.is_available():
    device = torch.device("cuda")

print(device)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias'

cuda


In [13]:
def tokenize_inputs(example):
    return tokenizer(example['text'], truncation = True, padding='max_length')

def custom_global_attention(example):
    try:
        sep_index = example['input_ids'].index(2)
        global_attention_mask = [1]*(sep_index + 1) + [0]*(len(example['input_ids'])-(sep_index + 1))
        return {'global_attention_mask': global_attention_mask}
    except:
        print(example['input_ids'], example['text'])

def tokenize_dataset(ds):
    ds1 = ds.map(tokenize_inputs, batched=True)
    ds2 = ds1.map(custom_global_attention, batched=False)
    return ds2

In [14]:
ds_t = tokenize_dataset(ds)

                                                                

In [43]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()  
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class FeedBackModel(nn.Module):
    def __init__(self, model_name):
        super(FeedBackModel, self).__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.config.hidden_dropout_prob = 0
        self.config.attention_probs_dropout_prob = 0
        self.model = AutoModel.from_pretrained(model_name, config=self.config)
        self.drop = nn.Dropout(p=0.2)
        self.pooler = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, len(CONFIG['label_cols']))
        
    def forward(self, input_ids, attention_mask):
        out = self.model(input_ids=input_ids,
                         attention_mask=attention_mask, 
                         output_hidden_states=False)
        out = self.pooler(out.last_hidden_state, attention_mask)
        out = self.drop(out)
        outputs = self.fc(out)
        return SequenceClassifierOutput(logits=outputs)

In [57]:
class RMSELoss(nn.Module):
    """
    Code taken from Y Nakama's notebook (https://www.kaggle.com/code/yasufuminakama/fb3-deberta-v3-base-baseline-train)
    """
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, predictions, targets):
        loss = torch.sqrt(self.mse(predictions, targets) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(inputs['input_ids'], inputs['attention_mask'])
        loss_func = RMSELoss(reduction='mean')
        loss = loss_func(outputs.logits.float(), inputs['labels'].float())
        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    colwise_rmse = np.sqrt(np.mean((labels - predictions) ** 2, axis=0))
    res = {
        f"{analytic.upper()}_RMSE" : colwise_rmse[i]
        for i, analytic in enumerate(model.config.label_cols)
    }
    res["MCRMSE"] = np.mean(colwise_rmse)
    return res

In [61]:
learning_rate = 3e-05
batch_size = 8
seed = 42
num_epochs = 10

def model_init():
    model = LongformerForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
    model.config.attention_window = [256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256]
    model.config.label_cols = ['content', 'wording']
    model.config.loss_type = "rmse"
    print(model.config)
    return model

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, max_length=4096)

In [62]:
ds_t

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask', 'global_attention_mask'],
        num_rows: 3285
    })
    valid: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask', 'global_attention_mask'],
        num_rows: 702
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask', 'global_attention_mask'],
        num_rows: 703
    })
})

In [63]:
trainer = Trainer(
    model_init=model_init,
)

training_args = TrainingArguments(
    output_dir = f'./results/longformer_one-mod_checkpoints',
    optim = 'adamw_torch',
    num_train_epochs = num_epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    weight_decay = 0.01,
    learning_rate = learning_rate,
    gradient_accumulation_steps=4, 
    gradient_checkpointing=True,
    logging_dir = f'./logs/content',
    save_total_limit = 10,
    load_best_model_at_end = True,
    metric_for_best_model = 'eval_MCRMSE',
    evaluation_strategy = "epoch",
    save_strategy = "epoch", 
    greater_is_better = False,
    seed=seed,
    disable_tqdm = False, 
) 

    # Call the Trainer
trainer = CustomTrainer(
    model_init = model_init,
    args = training_args,
    data_collator=data_collator,
    train_dataset = ds_t['train'],
    eval_dataset = ds_t['valid'],
    compute_metrics = compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=4)]
)

# Train the model
trainer.train()

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias'

LongformerConfig {
  "_name_or_path": "allenai/longformer-base-4096",
  "attention_mode": "longformer",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256
  ],
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label_cols": [
    "content",
    "wording"
  ],
  "layer_norm_eps": 1e-05,
  "loss_type": "rmse",
  "max_position_embeddings": 4098,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "onnx_export": false,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "sep_token_id": 2,
  "transformers_version": "4.28.1",
  "type_vocab_size": 1,
  "vocab_size": 50265
}



Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias'

LongformerConfig {
  "_name_or_path": "allenai/longformer-base-4096",
  "attention_mode": "longformer",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256
  ],
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label_cols": [
    "content",
    "wording"
  ],
  "layer_norm_eps": 1e-05,
  "loss_type": "rmse",
  "max_position_embeddings": 4098,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "onnx_export": false,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "sep_token_id": 2,
  "transformers_version": "4.28.1",
  "type_vocab_size": 1,
  "vocab_size": 50265
}



Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias'

LongformerConfig {
  "_name_or_path": "allenai/longformer-base-4096",
  "attention_mode": "longformer",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256
  ],
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label_cols": [
    "content",
    "wording"
  ],
  "layer_norm_eps": 1e-05,
  "loss_type": "rmse",
  "max_position_embeddings": 4098,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "onnx_export": false,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "sep_token_id": 2,
  "transformers_version": "4.28.1",
  "type_vocab_size": 1,
  "vocab_size": 50265
}



Epoch,Training Loss,Validation Loss,Content Rmse,Wording Rmse,Mcrmse
0,No log,0.613957,0.817851,0.840009,0.82893
1,No log,0.443717,0.563365,0.623571,0.593468
2,No log,0.408237,0.504193,0.583107,0.54365
4,No log,0.407543,0.502683,0.593058,0.547871
4,0.474200,0.404073,0.494443,0.571877,0.53316
5,0.474200,0.377127,0.468664,0.536159,0.502411
6,0.474200,0.388321,0.483676,0.557724,0.5207
8,0.474200,0.376389,0.46883,0.546936,0.507883
8,0.474200,0.37761,0.467427,0.543436,0.505431
9,0.281200,0.375735,0.466987,0.543408,0.505198




TrainOutput(global_step=1020, training_loss=0.37509298838821115, metrics={'train_runtime': 23504.4792, 'train_samples_per_second': 1.398, 'train_steps_per_second': 0.043, 'total_flos': 8.56876064427049e+16, 'train_loss': 0.37509298838821115, 'epoch': 9.93})

In [64]:
trainer.save_model('../bin/one_model_longformer')

In [18]:
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForSequenceClassification.from_pretrained('../bin/one_model_longformer', num_labels=2)

In [20]:
model(ds_t['test'])

TypeError: zeros_like(): argument 'input' (position 1) must be Tensor, not Dataset