### Fine-Tune Longformer Mini

- Joel Stremmel
- 04-11-23

##### About

Fine-Tune Longformer Mini on the formatted data using K-Fold Cross-Validation and save the scores.

##### Imports

In [1]:
import os
import re
import glob
import pickle
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

##### Set Parameters

In [2]:
# Tokenization and batching
max_seq_len = 1024          # token length that every text is padded/truncated to
batch_size = 32             # per-device training batch size (also the tokenization map batch size)
accumulation_steps = 1      # gradient accumulation steps

# Optimizer and learning-rate schedule
lr = 5e-5
weight_decay = 1e-2
adam_beta1, adam_beta2 = 0.9, 0.999
adam_epsilon = 1e-8
warmup_steps = 10

# Training-loop settings
epochs = 100
fp16 = True
seed = 44
logging_steps = 1
num_workers = 8             # dataloader worker processes

# Output location and pretrained checkpoint
output_dir = "lf_output"
lm_path = "kiddothe2b/longformer-mini-1024"

##### Disable Tokenizer Parallelism
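Disabling tokenizer parallelism mostly avoids the warnings that the tokenizers library emits when worker processes are forked after the tokenizer has already been used.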

In [3]:
# Set the environment variable that switches off parallelism in the tokenizers library
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

##### Load Formatted Data

In [4]:
# Read the pickled fold containers for the inputs (X) and the targets (y).
# NOTE: pickle can execute arbitrary code while loading; only open files you trust.
with open('data/X_folds.pkl', mode='rb') as x_handle:
    X_folds = pickle.load(x_handle)

with open('data/y_folds.pkl', mode='rb') as y_handle:
    y_folds = pickle.load(y_handle)

##### Check Data Shape

In [5]:
# Both containers must describe the same number of folds, and cross-validation
# needs at least two of them (one held out, at least one to train on).
assert len(X_folds) == len(y_folds), "Expected the same number of folds in X and y."
assert len(X_folds) >= 2, "Expected at least two folds for cross-validation."

# Flatten the fold containers into lists; X[i] and y[i] are paired by position below
X = list(X_folds.values())
y = list(y_folds.values())

# Because folds are paired by position, every pair must hold the same number of
# samples; a mismatch means the texts and labels are misaligned.
for fold_idx, (X_fold, y_fold) in enumerate(zip(X, y)):
    assert len(X_fold) == len(y_fold), (
        f"Fold {fold_idx} has {len(X_fold)} texts but {len(y_fold)} labels."
    )

##### Check Target Prevalence

In [6]:
# Pool the labels from every fold and report their mean
# (the positive rate, assuming labels are coded 0/1)
all_labels = np.concatenate(y)
print(f"Target prevalance: {np.mean(all_labels)}.")

Target prevalance: 0.5.


##### Check that GPU is Available

In [7]:
# Stop early when CUDA is unavailable, then report the installed torch version
cuda_ready = torch.cuda.is_available()
assert cuda_ready, "Run this script on a GPU."
print(torch.__version__)

1.8.1+cu101


##### Tokenize Text and Fit Model to Each Fold

In [8]:
# The tokenizer is identical for every fold, so load it once outside the loop
tokenizer = AutoTokenizer.from_pretrained(lm_path)


def tokenize_function(batch):
    """Tokenize the "text" column of a batch, padding/truncating to max_seq_len tokens."""
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=max_seq_len
    )


# Out-of-fold class-1 probabilities and the matching true labels, one array per fold
y_probs, y_trues = [], []
for i in range(len(X)):

    print(f"Fitting model using fold {i} as out of fold data.")

    # Seed numpy and torch before shuffling and before the model is created.
    # The seed passed to TrainingArguments is only applied once the Trainer is
    # built, which is too late for the shuffle below and for the random
    # initialization of the new classification head.
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Identify train folds (every fold except fold i) and shuffle samples
    X_train = np.concatenate(X[0:i] + X[i+1:], axis=0)
    y_train = np.concatenate(y[0:i] + y[i+1:], axis=0)
    indices = np.arange(len(y_train))
    np.random.shuffle(indices)
    X_train, y_train = X_train[indices], y_train[indices]

    # Identify the held-out test fold
    X_test, y_test = X[i], y[i]

    # Format text and label data as HuggingFace datasets
    train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
    test_dataset = Dataset.from_dict({"text": X_test, "label": y_test})

    # Load a fresh model so that weights are reset with each new iteration
    model = AutoModelForSequenceClassification.from_pretrained(
        lm_path,
        num_labels=2,
        return_dict=True,
        problem_type="single_label_classification"
    )

    # Tokenize train dataset
    train_dataset = train_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
        batch_size=batch_size
    )
    train_dataset.set_format("pt")

    # Tokenize test dataset
    test_dataset = test_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
        batch_size=batch_size
    )
    test_dataset.set_format("pt")

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=accumulation_steps,
        warmup_steps=warmup_steps,
        logging_steps=logging_steps,
        weight_decay=weight_decay,
        learning_rate=lr,
        seed=seed,
        adam_beta1=adam_beta1,
        adam_beta2=adam_beta2,
        adam_epsilon=adam_epsilon,
        dataloader_num_workers=num_workers,
        fp16=fp16,
        logging_strategy="steps",
        save_strategy="no",
        lr_scheduler_type='linear',
        optim="adamw_torch",
        run_name='lf',
        do_eval=False,
        fp16_full_eval=False,
        sharded_ddp=False,
        gradient_checkpointing=True,
        # No checkpoints are saved and no evaluation is run, so there is never
        # a "best" model to reload; the final weights are used for prediction.
        load_best_model_at_end=False,
        prediction_loss_only=False,
        disable_tqdm=True,
        logging_dir=None,
    )

    # Define model training
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset
    )

    # Train model
    trainer.train()

    # Predict on the held-out fold
    output = trainer.predict(test_dataset)
    labels = output.label_ids

    # The head emits two logits trained with cross-entropy, so the class-1
    # probability is the softmax over both logits, not an element-wise sigmoid.
    logits = torch.tensor(output.predictions).double()
    y_prob = torch.softmax(logits, dim=-1).numpy()[:, 1]

    # Save scores and labels
    y_probs.append(y_prob)
    y_trues.append(labels)

    # Release this fold's model before the next one is loaded
    del trainer, model
    torch.cuda.empty_cache()

Fitting model using fold 0 as out of fold data.


Some weights of the model checkpoint at kiddothe2b/longformer-mini-1024 were not used when initializing LongformerForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'longformer.embeddings.position_ids']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at kiddothe2b/longformer-mini-1024 and are newly

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

{'loss': 0.698, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 0.7025, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.6969, 'learning_rate': 1.5e-05, 'epoch': 1.5}
{'loss': 0.6966, 'learning_rate': 2e-05, 'epoch': 2.0}
{'loss': 0.678, 'learning_rate': 2.5e-05, 'epoch': 2.5}
{'loss': 0.6969, 'learning_rate': 3e-05, 'epoch': 3.0}
{'loss': 0.6701, 'learning_rate': 3.5e-05, 'epoch': 3.5}
{'loss': 0.6919, 'learning_rate': 4e-05, 'epoch': 4.0}
{'loss': 0.6643, 'learning_rate': 4.5e-05, 'epoch': 4.5}
{'loss': 0.6983, 'learning_rate': 5e-05, 'epoch': 5.0}
{'loss': 0.6894, 'learning_rate': 4.973684210526316e-05, 'epoch': 5.5}
{'loss': 0.6354, 'learning_rate': 4.9473684210526315e-05, 'epoch': 6.0}
{'loss': 0.6571, 'learning_rate': 4.921052631578947e-05, 'epoch': 6.5}
{'loss': 0.6423, 'learning_rate': 4.8947368421052635e-05, 'epoch': 7.0}
{'loss': 0.6159, 'learning_rate': 4.868421052631579e-05, 'epoch': 7.5}
{'loss': 0.6729, 'learning_rate': 4.842105263157895e-05, 'epoch': 8.0}
{'loss': 0.6183,

{'loss': 0.1564, 'learning_rate': 2.1578947368421053e-05, 'epoch': 59.0}
{'loss': 0.1117, 'learning_rate': 2.1315789473684212e-05, 'epoch': 59.5}
{'loss': 0.1275, 'learning_rate': 2.105263157894737e-05, 'epoch': 60.0}
{'loss': 0.0562, 'learning_rate': 2.078947368421053e-05, 'epoch': 60.5}
{'loss': 0.2231, 'learning_rate': 2.0526315789473685e-05, 'epoch': 61.0}
{'loss': 0.1572, 'learning_rate': 2.0263157894736842e-05, 'epoch': 61.5}
{'loss': 0.0589, 'learning_rate': 2e-05, 'epoch': 62.0}
{'loss': 0.0674, 'learning_rate': 1.9736842105263158e-05, 'epoch': 62.5}
{'loss': 0.1764, 'learning_rate': 1.9473684210526315e-05, 'epoch': 63.0}
{'loss': 0.0956, 'learning_rate': 1.9210526315789474e-05, 'epoch': 63.5}
{'loss': 0.158, 'learning_rate': 1.8947368421052634e-05, 'epoch': 64.0}
{'loss': 0.1399, 'learning_rate': 1.868421052631579e-05, 'epoch': 64.5}
{'loss': 0.0833, 'learning_rate': 1.8421052631578947e-05, 'epoch': 65.0}
{'loss': 0.1378, 'learning_rate': 1.8157894736842107e-05, 'epoch': 65.5}

Some weights of the model checkpoint at kiddothe2b/longformer-mini-1024 were not used when initializing LongformerForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'longformer.embeddings.position_ids']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at kiddothe2b/longformer-mini-1024 and are newly

Map:   0%|          | 0/57 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

{'loss': 0.6798, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 0.6816, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.6954, 'learning_rate': 1.5e-05, 'epoch': 1.5}
{'loss': 0.6815, 'learning_rate': 2e-05, 'epoch': 2.0}
{'loss': 0.6849, 'learning_rate': 2.5e-05, 'epoch': 2.5}
{'loss': 0.6762, 'learning_rate': 3e-05, 'epoch': 3.0}
{'loss': 0.6759, 'learning_rate': 3.5e-05, 'epoch': 3.5}
{'loss': 0.6771, 'learning_rate': 4e-05, 'epoch': 4.0}
{'loss': 0.6682, 'learning_rate': 4.5e-05, 'epoch': 4.5}
{'loss': 0.6837, 'learning_rate': 5e-05, 'epoch': 5.0}
{'loss': 0.695, 'learning_rate': 4.973684210526316e-05, 'epoch': 5.5}
{'loss': 0.6241, 'learning_rate': 4.9473684210526315e-05, 'epoch': 6.0}
{'loss': 0.6768, 'learning_rate': 4.921052631578947e-05, 'epoch': 6.5}
{'loss': 0.6314, 'learning_rate': 4.8947368421052635e-05, 'epoch': 7.0}
{'loss': 0.656, 'learning_rate': 4.868421052631579e-05, 'epoch': 7.5}
{'loss': 0.6294, 'learning_rate': 4.842105263157895e-05, 'epoch': 8.0}
{'loss': 0.6257,

{'loss': 0.0802, 'learning_rate': 2.1578947368421053e-05, 'epoch': 59.0}
{'loss': 0.0877, 'learning_rate': 2.1315789473684212e-05, 'epoch': 59.5}
{'loss': 0.2442, 'learning_rate': 2.105263157894737e-05, 'epoch': 60.0}
{'loss': 0.1436, 'learning_rate': 2.078947368421053e-05, 'epoch': 60.5}
{'loss': 0.1696, 'learning_rate': 2.0526315789473685e-05, 'epoch': 61.0}
{'loss': 0.2064, 'learning_rate': 2.0263157894736842e-05, 'epoch': 61.5}
{'loss': 0.0757, 'learning_rate': 2e-05, 'epoch': 62.0}
{'loss': 0.1516, 'learning_rate': 1.9736842105263158e-05, 'epoch': 62.5}
{'loss': 0.1529, 'learning_rate': 1.9473684210526315e-05, 'epoch': 63.0}
{'loss': 0.1866, 'learning_rate': 1.9210526315789474e-05, 'epoch': 63.5}
{'loss': 0.1204, 'learning_rate': 1.8947368421052634e-05, 'epoch': 64.0}
{'loss': 0.1489, 'learning_rate': 1.868421052631579e-05, 'epoch': 64.5}
{'loss': 0.1538, 'learning_rate': 1.8421052631578947e-05, 'epoch': 65.0}
{'loss': 0.1274, 'learning_rate': 1.8157894736842107e-05, 'epoch': 65.5

Some weights of the model checkpoint at kiddothe2b/longformer-mini-1024 were not used when initializing LongformerForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'longformer.embeddings.position_ids']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at kiddothe2b/longformer-mini-1024 and are newly

Map:   0%|          | 0/57 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

{'loss': 0.6936, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 0.6873, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.7067, 'learning_rate': 1.5e-05, 'epoch': 1.5}
{'loss': 0.6952, 'learning_rate': 2e-05, 'epoch': 2.0}
{'loss': 0.6913, 'learning_rate': 2.5e-05, 'epoch': 2.5}
{'loss': 0.6817, 'learning_rate': 3e-05, 'epoch': 3.0}
{'loss': 0.6844, 'learning_rate': 3.5e-05, 'epoch': 3.5}
{'loss': 0.6789, 'learning_rate': 4e-05, 'epoch': 4.0}
{'loss': 0.6784, 'learning_rate': 4.5e-05, 'epoch': 4.5}
{'loss': 0.6648, 'learning_rate': 5e-05, 'epoch': 5.0}
{'loss': 0.6612, 'learning_rate': 4.973684210526316e-05, 'epoch': 5.5}
{'loss': 0.6731, 'learning_rate': 4.9473684210526315e-05, 'epoch': 6.0}
{'loss': 0.6531, 'learning_rate': 4.921052631578947e-05, 'epoch': 6.5}
{'loss': 0.6617, 'learning_rate': 4.8947368421052635e-05, 'epoch': 7.0}
{'loss': 0.6691, 'learning_rate': 4.868421052631579e-05, 'epoch': 7.5}
{'loss': 0.6201, 'learning_rate': 4.842105263157895e-05, 'epoch': 8.0}
{'loss': 0.620

{'loss': 0.0528, 'learning_rate': 2.1842105263157896e-05, 'epoch': 59.0}
{'loss': 0.0687, 'learning_rate': 2.1578947368421053e-05, 'epoch': 59.5}
{'loss': 0.1732, 'learning_rate': 2.1315789473684212e-05, 'epoch': 60.0}
{'loss': 0.1124, 'learning_rate': 2.105263157894737e-05, 'epoch': 60.5}
{'loss': 0.1447, 'learning_rate': 2.078947368421053e-05, 'epoch': 61.0}
{'loss': 0.1679, 'learning_rate': 2.0526315789473685e-05, 'epoch': 61.5}
{'loss': 0.0594, 'learning_rate': 2.0263157894736842e-05, 'epoch': 62.0}
{'loss': 0.1774, 'learning_rate': 2e-05, 'epoch': 62.5}
{'loss': 0.0829, 'learning_rate': 1.9736842105263158e-05, 'epoch': 63.0}
{'loss': 0.167, 'learning_rate': 1.9473684210526315e-05, 'epoch': 63.5}
{'loss': 0.1051, 'learning_rate': 1.9210526315789474e-05, 'epoch': 64.0}
{'loss': 0.1288, 'learning_rate': 1.8947368421052634e-05, 'epoch': 64.5}
{'loss': 0.1108, 'learning_rate': 1.868421052631579e-05, 'epoch': 65.0}
{'loss': 0.1003, 'learning_rate': 1.8421052631578947e-05, 'epoch': 65.5}

Some weights of the model checkpoint at kiddothe2b/longformer-mini-1024 were not used when initializing LongformerForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'longformer.embeddings.position_ids']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at kiddothe2b/longformer-mini-1024 and are newly

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

{'loss': 0.6865, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 0.7122, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.7037, 'learning_rate': 1.5e-05, 'epoch': 1.5}
{'loss': 0.7075, 'learning_rate': 2e-05, 'epoch': 2.0}
{'loss': 0.6808, 'learning_rate': 2.5e-05, 'epoch': 2.5}
{'loss': 0.6838, 'learning_rate': 3e-05, 'epoch': 3.0}
{'loss': 0.6836, 'learning_rate': 3.5e-05, 'epoch': 3.5}
{'loss': 0.6898, 'learning_rate': 4e-05, 'epoch': 4.0}
{'loss': 0.6586, 'learning_rate': 4.5e-05, 'epoch': 4.5}
{'loss': 0.6852, 'learning_rate': 5e-05, 'epoch': 5.0}
{'loss': 0.6241, 'learning_rate': 4.973684210526316e-05, 'epoch': 5.5}
{'loss': 0.6966, 'learning_rate': 4.9473684210526315e-05, 'epoch': 6.0}
{'loss': 0.6518, 'learning_rate': 4.921052631578947e-05, 'epoch': 6.5}
{'loss': 0.6486, 'learning_rate': 4.8947368421052635e-05, 'epoch': 7.0}
{'loss': 0.6592, 'learning_rate': 4.868421052631579e-05, 'epoch': 7.5}
{'loss': 0.6163, 'learning_rate': 4.842105263157895e-05, 'epoch': 8.0}
{'loss': 0.607

{'loss': 0.0876, 'learning_rate': 2.1578947368421053e-05, 'epoch': 59.0}
{'loss': 0.0455, 'learning_rate': 2.1315789473684212e-05, 'epoch': 59.5}
{'loss': 0.1406, 'learning_rate': 2.105263157894737e-05, 'epoch': 60.0}
{'loss': 0.1181, 'learning_rate': 2.078947368421053e-05, 'epoch': 60.5}
{'loss': 0.0962, 'learning_rate': 2.0526315789473685e-05, 'epoch': 61.0}
{'loss': 0.1006, 'learning_rate': 2.0263157894736842e-05, 'epoch': 61.5}
{'loss': 0.0943, 'learning_rate': 2e-05, 'epoch': 62.0}
{'loss': 0.1631, 'learning_rate': 1.9736842105263158e-05, 'epoch': 62.5}
{'loss': 0.0442, 'learning_rate': 1.9473684210526315e-05, 'epoch': 63.0}
{'loss': 0.0401, 'learning_rate': 1.9210526315789474e-05, 'epoch': 63.5}
{'loss': 0.1772, 'learning_rate': 1.8947368421052634e-05, 'epoch': 64.0}
{'loss': 0.0885, 'learning_rate': 1.868421052631579e-05, 'epoch': 64.5}
{'loss': 0.1119, 'learning_rate': 1.8421052631578947e-05, 'epoch': 65.0}
{'loss': 0.0429, 'learning_rate': 1.8157894736842107e-05, 'epoch': 65.5

Some weights of the model checkpoint at kiddothe2b/longformer-mini-1024 were not used when initializing LongformerForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'longformer.embeddings.position_ids']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at kiddothe2b/longformer-mini-1024 and are newly

Map:   0%|          | 0/59 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

{'loss': 0.6856, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 0.6948, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.6737, 'learning_rate': 1.5e-05, 'epoch': 1.5}
{'loss': 0.7271, 'learning_rate': 2e-05, 'epoch': 2.0}
{'loss': 0.6925, 'learning_rate': 2.5e-05, 'epoch': 2.5}
{'loss': 0.6949, 'learning_rate': 3e-05, 'epoch': 3.0}
{'loss': 0.6806, 'learning_rate': 3.5e-05, 'epoch': 3.5}
{'loss': 0.6957, 'learning_rate': 4e-05, 'epoch': 4.0}
{'loss': 0.6887, 'learning_rate': 4.5e-05, 'epoch': 4.5}
{'loss': 0.6745, 'learning_rate': 5e-05, 'epoch': 5.0}
{'loss': 0.6789, 'learning_rate': 4.973684210526316e-05, 'epoch': 5.5}
{'loss': 0.6635, 'learning_rate': 4.9473684210526315e-05, 'epoch': 6.0}
{'loss': 0.6836, 'learning_rate': 4.921052631578947e-05, 'epoch': 6.5}
{'loss': 0.6308, 'learning_rate': 4.8947368421052635e-05, 'epoch': 7.0}
{'loss': 0.6288, 'learning_rate': 4.868421052631579e-05, 'epoch': 7.5}
{'loss': 0.6618, 'learning_rate': 4.842105263157895e-05, 'epoch': 8.0}
{'loss': 0.632

{'loss': 0.0935, 'learning_rate': 2.2105263157894736e-05, 'epoch': 59.0}
{'loss': 0.1387, 'learning_rate': 2.1842105263157896e-05, 'epoch': 59.5}
{'loss': 0.2127, 'learning_rate': 2.1578947368421053e-05, 'epoch': 60.0}
{'loss': 0.198, 'learning_rate': 2.1315789473684212e-05, 'epoch': 60.5}
{'loss': 0.1197, 'learning_rate': 2.105263157894737e-05, 'epoch': 61.0}
{'loss': 0.1671, 'learning_rate': 2.078947368421053e-05, 'epoch': 61.5}
{'loss': 0.1476, 'learning_rate': 2.0526315789473685e-05, 'epoch': 62.0}
{'loss': 0.1874, 'learning_rate': 2.0263157894736842e-05, 'epoch': 62.5}
{'loss': 0.1418, 'learning_rate': 2e-05, 'epoch': 63.0}
{'loss': 0.1341, 'learning_rate': 1.9736842105263158e-05, 'epoch': 63.5}
{'loss': 0.2222, 'learning_rate': 1.9473684210526315e-05, 'epoch': 64.0}
{'loss': 0.1218, 'learning_rate': 1.9210526315789474e-05, 'epoch': 64.5}
{'loss': 0.2267, 'learning_rate': 1.8947368421052634e-05, 'epoch': 65.0}
{'loss': 0.2523, 'learning_rate': 1.868421052631579e-05, 'epoch': 65.5}

##### Save Model Probabilities on Test Folds and True Labels

In [9]:
# Create the output directory first so that a missing folder cannot discard
# the results after all folds have finished training.
os.makedirs('results', exist_ok=True)

# True labels of each held-out fold
with open('results/lfm_y_trues.pkl', 'wb') as f:
    pickle.dump(y_trues, f)

# Predicted class-1 probabilities of each held-out fold
with open('results/lfm_y_probs.pkl', 'wb') as f:
    pickle.dump(y_probs, f)