# Training

In [6]:
from transformers import (DataCollatorWithPadding, Trainer, TrainingArguments,
                          LongformerTokenizer, LongformerForSequenceClassification)

from datasets import Dataset, DatasetDict

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import torch
from torch.utils.data import DataLoader
assert torch.cuda.is_available(), 'GPU not found. You should fix this.'

import wandb
%env WANDB_ENTITY = langdon
%env WANDB_PROJECT = ellipse
%env WANDB_DIR = /home/jovyan/active-projects/ellipse-methods-showcase/bin

env: WANDB_ENTITY=langdon
env: WANDB_PROJECT=ellipse
env: WANDB_DIR=/home/jovyan/active-projects/ellipse-methods-showcase/bin


## Data

In [159]:
def get_datadict(score_to_predict):
    
    scores = {
        'Overall',
        'Cohesion',
        'Syntax',
        'Vocabulary',
        'Phraseology',
        'Grammar',
        'Conventions'
    }
    
    columns_to_remove = scores.symmetric_difference([score_to_predict])
    
    dd = (DatasetDict
          .load_from_disk('../data/ellipse.hf')
          .remove_columns(columns_to_remove)
          .rename_column(score_to_predict, 'label')
         )
    
    return dd

In [160]:
score_to_predict = 'Grammar'

dd = get_datadict(score_to_predict)
dd

DatasetDict({
    train: Dataset({
        features: ['text_id', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4537
    })
    dev: Dataset({
        features: ['text_id', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 972
    })
    test: Dataset({
        features: ['text_id', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 973
    })
})

## Helper Functions

In [None]:
def model_init():
    return (LongformerForSequenceClassification
            .from_pretrained(model_name,
                             num_labels=1)
            .cuda()
           )

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [82]:
def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    mse = mean_squared_error(labels, logits)
    rmse = mean_squared_error(labels, logits, squared=False)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)

    return {'mse': mse, 'rmse': rmse, 'mae': mae, 'r2': r2}

## Training Paramaters

In [None]:
learning_rate = 1e-05
batch_size = 16
num_epochs = 4


training_args = TrainingArguments(
    output_dir = '../bin',
    optim = 'adamw_torch',
    num_train_epochs = num_epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    gradient_accumulation_steps = 4, 
    gradient_checkpointing = True,
    weight_decay = 0.01,
    learning_rate = learning_rate,
    logging_dir = f'../logs/{score_to_predict}',
    save_total_limit = 10,
    load_best_model_at_end = True,
    metric_for_best_model = 'rmse',
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch', 
    greater_is_better = False,
    seed = SEED,
    log_level = 'error',
    disable_tqdm = False,
    report_to = 'wandb',
) 

## Run Train

In [None]:
trainer = Trainer(
    model_init = model_init,
    args = training_args,
    data_collator=data_collator,
    train_dataset = dd['train'],
    eval_dataset = dd['dev'],
    compute_metrics = compute_metrics_for_regression,
)

# Train the model
trainer.train()

## Unused

In [None]:
from transformers import BertTokenizer, BertModel
model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True,output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=True)
s = 'I want to sleep'
inputs = tokenizer.encode_plus(s,return_tensors='pt', add_special_tokens=False,is_pretokenized=True)
input_ids = inputs['input_ids']
output = model(input_ids)
hidden_states = output[-2]

In [22]:
import scipy
preds, labs, metrics = model.predict(ds_t['test'])
print(scipy.stats.pearsonr(labs, preds))

from matplotlib import pyplot as plt
preds.flatten()
plt.scatter(preds, actual, alpha=0.5)
plt.ylabel('true score')
plt.xlabel('predicted score')
plt.title('Model Accuracy')
plt.show()

Some weights of the model checkpoint at ./test/checkpoint-6810 were not used when initializing LongformerForSequenceClassification: ['longformer.embeddings.position_ids']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AttributeError: 'LongformerForSequenceClassification' object has no attribute 'predict'

In [None]:
def model_init():
    return LongformerForSequenceClassification.from_pretrained(model_name, num_labels=1, return_dict=True).to(device)

training_args = TrainingArguments(
    'test', 
    evaluation_strategy='epoch', 
    disable_tqdm=True,
    gradient_accumulation_steps=4, 
    gradient_checkpointing=True,
    report_to='wandb',)

trainer = Trainer(
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=ds_t['train'],
    eval_dataset=ds_t['valid'],
    model_init=model_init,
    compute_metrics=compute_metrics_for_regression,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),    
)

trainer.train()

trainer.hyperparameter_search(
    direction='minimize', 
    backend='ray', 
    n_trials=10, # number of trials

)