In [1]:
#24 new backbone pretrained
DEBUG = False

In [2]:
EXP = 'PL26'

In [3]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import pytorch_lightning as pl

import pandas as pd
import numpy as np
import random
import re
import itertools
import argparse

from torch.utils.data import Dataset
import spacy
import ast
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import pickle
import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer, get_linear_schedule_with_warmup
import warnings
from torch.optim import Adam, SGD, AdamW

import wandb
from pytorch_lightning.loggers import WandbLogger

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
pl.seed_everything(42, workers=True)

Global seed set to 42


42

In [5]:
class MyDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=1024, stage='train', rand_prob=0.1, lowup_proba=0.0, swap_proba=0.0):
        self.df = df
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.mask_token = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
        self.stage = stage
        self.rand_prob = rand_prob
        self.lowup_proba = lowup_proba
        self.swap_proba = swap_proba

        self.essay_id = df['essay_id'].values
        self.input_ids = df['input_ids'].values
        self.attention_mask = df['attention_mask'].values
        self.token_class_labels = df['token_class_labels'].values
        self.token_scores_labels = df['token_scores_labels'].values
        self.token_examples_mapping = df['token_examples_mapping'].values
        self.examples_scores = df['examples_scores'].values
        self.examples_classes = df['examples_classes'].values    
        
    def __getitem__(self, idx):
        essay_id = self.essay_id[idx]

        token_examples_mapping = self.token_examples_mapping[idx]
        examples_scores = self.examples_scores[idx]
        examples_classes = self.examples_classes[idx]

#         token_examples_mapping = torch.tensor(token_examples_mapping, dtype=torch.long)
#         examples_scores = torch.tensor(examples_scores + [-1] * (40 - len(examples_scores)), dtype=torch.long)
#         examples_classes = torch.tensor(examples_classes + [-1] * (40 - len(examples_classes)), dtype=torch.long)
        
        input_ids = self.input_ids[idx]
        attention_mask = self.attention_mask[idx]
        token_class_labels = self.token_class_labels[idx]
        token_scores_labels = self.token_scores_labels[idx]

#         input_ids = torch.tensor(input_ids, dtype=torch.long)
#         attention_mask = torch.tensor(attention_mask, dtype=torch.long)
#         token_class_labels = torch.tensor(token_class_labels, dtype=torch.long)
#         token_scores_labels = torch.tensor(token_scores_labels, dtype=torch.long)

#         if self.stage == 'train':
#             ix = torch.rand(size=(len(input_ids),)) < self.rand_prob
#             input_ids[ix] = self.mask_token
        
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_class_labels": token_class_labels,
            "token_scores_labels": token_scores_labels,
            "token_examples_mapping": token_examples_mapping,
            "examples_scores": examples_scores,
            "examples_classes": examples_classes
        }

    def __len__(self):
        return len(self.df)

In [6]:
emb_dim = 32
pooler_dropout = 0.1

In [7]:
from transformers.activations import GELUActivation

class Pooler(nn.Module):
    def __init__(self, hidden_size, dropout):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.act = GELUActivation()
        self.dropout = nn.Dropout(dropout)

    def forward(self, features):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.

        features = self.dropout(features)
        features = self.dense(features)
        features = self.act(features)
        return features

In [8]:
class MyModule(pl.LightningModule):
    def __init__(self, lr, model_checkpoint, num_classes, num_classes_class, emb_dim):
        super().__init__()
        self.lr = lr
        self.num_classes = num_classes
        self.num_classes_class = num_classes_class
        self.emb_dim = emb_dim
        self.name = model_checkpoint
        self.pad_idx = 1 if "roberta" in self.name else 0
        config = AutoConfig.from_pretrained(model_checkpoint, output_hidden_states=True)
        self.longformer = AutoModel.from_pretrained(model_checkpoint, config=config)
        self.nb_features = config.hidden_size
        self.logits = nn.Linear(self.nb_features, num_classes)
        self.pooler_class = Pooler(self.nb_features, pooler_dropout)
        self.pooler_scores = Pooler(self.nb_features, pooler_dropout)
        self.pooler = Pooler(self.nb_features + self.emb_dim, pooler_dropout)
        self.example_logits = nn.Linear(self.nb_features + self.emb_dim, num_classes)
        self.class_logits = nn.Linear(self.nb_features, num_classes_class)  
        transformers.logging.set_verbosity_error()
        self.embedding = nn.Embedding(num_classes_class, emb_dim, max_norm=True)
    
    def load_model(self, path):
        self.load_state_dict(torch.load(path, map_location='cuda:0'), strict=False)
        print('Model Loaded!')
    
    def configure_optimizers(self):
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in self.named_parameters() if (not any(nd in n for nd in no_decay)) and \
                       ('deberta' in n)],
             'lr': self.lr, 'weight_decay': 0.01},
            {'params': [p for n, p in self.named_parameters() if (any(nd in n for nd in no_decay)) and \
                       ('deberta' in n)],
             'lr': self.lr, 'weight_decay': 0.0},
            {'params': [p for n, p in self.named_parameters() if (not any(nd in n for nd in no_decay)) and \
                       ('deberta' not in n)],
             'lr': self.lr * 10, 'weight_decay': 0.01},
            {'params': [p for n, p in self.named_parameters() if (any(nd in n for nd in no_decay)) and \
                       ('deberta' not in n)],
             'lr': self.lr * 10, 'weight_decay': 0.0},
        ]
        optimizer = AdamW(optimizer_parameters, lr=self.lr)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=100,
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]
                
    def training_step(self, train_batch, batch_idx):
        input_ids, attention_mask, token_scores_labels, token_examples_mapping, \
        examples_scores, examples_classes, token_class_labels = \
            train_batch["input_ids"], train_batch["attention_mask"], train_batch["token_scores_labels"], \
            train_batch['token_examples_mapping'], train_batch['examples_scores'], \
            train_batch['examples_classes'], train_batch['token_class_labels']
        
        hidden_states = self.longformer(
            input_ids,
            attention_mask=attention_mask,
        )[-1]
        features = hidden_states[-1]
        logits = self.logits(self.pooler_scores(features))
        class_logits = self.class_logits(self.pooler_class(features))
        loss = F.cross_entropy(logits.view(-1, self.num_classes), token_scores_labels.view(-1))
        class_loss = F.cross_entropy(class_logits.view(-1, self.num_classes_class), token_class_labels.view(-1))

        # Convert to examples loss
        bs, ml, nc1 = logits.shape
        
        batch_preds = []
        batch_targs = []
        
        for i in range(bs):
            example_preds = []
            example_targs = []
            num_examples = token_examples_mapping[i].max()
            assert examples_scores[i,num_examples] >= 0 # and examples_scores[i,num_examples+1] < 0 # truncation breaks this
            for j in range(num_examples + 1):
                indices = token_examples_mapping[i] == j
                fts = features[i][indices].mean(dim=0)
                class_idx = examples_classes[i,j]
                emb = self.embedding(class_idx)
                example_features = self.pooler(torch.cat([emb,fts]))
                preds = self.example_logits(example_features)
                example_preds.append(preds)
                example_targs.append(examples_scores[i,j].view(1))
                
            example_preds = torch.cat(example_preds, dim=0).view(-1, nc1)
            example_targs = torch.cat(example_targs, dim=0)
            batch_preds.append(example_preds)
            batch_targs.append(example_targs)
        
        batch_preds = torch.cat(batch_preds, dim=0).view(-1, nc1)
        batch_targs = torch.cat(batch_targs, dim=0)
        
        example_loss = F.cross_entropy(batch_preds, batch_targs)
        
        if self.current_epoch == 0:
            total_loss = loss + class_loss + example_loss
        if self.current_epoch == 1:
            total_loss = 0.5 * loss + 0.5 * class_loss + example_loss
        if self.current_epoch == 2:
            total_loss = 0.1 * loss + 0.1 * class_loss + example_loss
            
        self.log('train_scores_loss', loss)
        self.log('train_classes_loss', class_loss)
        self.log('train_examples_loss', example_loss)
        self.log('train_total_loss', total_loss)

        return total_loss
        
    def validation_step(self, val_batch, batch_idx):
        input_ids, attention_mask, token_scores_labels, token_examples_mapping, \
        examples_scores, examples_classes, token_class_labels = \
            val_batch["input_ids"], val_batch["attention_mask"], val_batch["token_scores_labels"], \
            val_batch['token_examples_mapping'], val_batch['examples_scores'], val_batch['examples_classes'], \
            val_batch['token_class_labels']
        hidden_states = self.longformer(
            input_ids,
            attention_mask=attention_mask,
        )[-1]
        features = hidden_states[-1]
        logits = self.logits(features)
        class_logits = self.class_logits(features)
        y_pred = F.log_softmax(logits, dim=-1)                                                
        loss = F.cross_entropy(logits.view(-1, self.num_classes), token_scores_labels.view(-1))
        class_loss = F.cross_entropy(class_logits.view(-1, self.num_classes_class), token_class_labels.view(-1))
        self.log('val_loss', loss)
        self.log('val_class_loss', class_loss)
        return {"preds": y_pred,
                "logits": logits,
                "features": features,
                "val_losses": loss,
                "token_examples_mapping": token_examples_mapping,
                "examples_scores": examples_scores,
                "examples_classes": examples_classes}   
    
    def validation_epoch_end(self, validation_step_outputs):

        bs, ml, nc1 = validation_step_outputs[0]["preds"].shape
        ml2 = validation_step_outputs[0]["examples_scores"].shape[-1]
        all_preds = [x["logits"][0] for x in validation_step_outputs]
        all_features = [x["features"][0] for x in validation_step_outputs]
        all_mappings = [x["token_examples_mapping"][0] for x in validation_step_outputs]
        all_scores = [x["examples_scores"][0] for x in validation_step_outputs]
        all_classes = [x["examples_classes"][0] for x in validation_step_outputs]

        num_texts = len(all_scores)
        
        example_preds = []
        example_targs = []
        
        for i in range(num_texts):
            num_examples = all_mappings[i].max()
            for j in range(num_examples + 1):
                indices = all_mappings[i] == j
                fts = all_features[i][indices].mean(dim=0)
                class_idx = all_classes[i][j]
                emb = self.embedding(class_idx)
                example_features = self.pooler(torch.cat([emb,fts]))
                preds = self.example_logits(example_features)
                example_preds.append(preds)
                example_targs.append(all_scores[i][j].view(1))        
                               
        example_preds = torch.cat(example_preds, dim=0).view(-1, nc1)
        example_targs = torch.cat(example_targs, dim=0)
        
        example_loss = F.cross_entropy(example_preds, example_targs)
        self.log('example_loss', example_loss)
        print(example_loss)
        
    def predict_step(self, val_batch, batch_idx):
        input_ids, attention_mask, token_scores_labels, token_examples_mapping, examples_scores, examples_classes = \
            val_batch["input_ids"], val_batch["attention_mask"], val_batch["token_scores_labels"], \
            val_batch['token_examples_mapping'], val_batch['examples_scores'], val_batch['examples_classes']
        hidden_states = self.longformer(
            input_ids,
            attention_mask=attention_mask,
        )[-1]
        features = hidden_states[-1]
        logits = self.logits(features)
        y_pred = F.softmax(logits, dim=-1)  
        
        bs, ml, nc1 = logits.shape
        ml2 = 40
        
        batch_preds = []
        batch_targs = []
        
        for i in range(bs):
            example_preds = []
            example_targs = []
            num_examples = token_examples_mapping[i].max()
            assert examples_scores[i,num_examples] >= 0 # and examples_scores[i,num_examples+1] < 0 # truncation breaks this
            for j in range(num_examples + 1):               
                indices = token_examples_mapping[i] == j
                fts = features[i][indices].mean(dim=0)
                class_idx = examples_classes[i,j]
                emb = self.embedding(class_idx)
                example_features = self.pooler(torch.cat([emb,fts]))
                preds = self.example_logits(example_features)
                example_preds.append(preds)
                example_targs.append(examples_scores[i,j].view(1))                  
                
            example_preds = torch.cat(example_preds, dim=0).view(-1, nc1)
            example_targs = torch.cat(example_targs, dim=0)
            batch_preds.append(example_preds)
            batch_targs.append(example_targs)
        
        return batch_preds, batch_targs
        

In [9]:
class Collate:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        output = dict()
        output["input_ids"] = [sample["input_ids"] for sample in batch]
        output["attention_mask"] = [sample["attention_mask"] for sample in batch]
        output["token_class_labels"] = [sample["token_class_labels"] for sample in batch]
        output["token_scores_labels"] = [sample["token_scores_labels"] for sample in batch]
        output["token_examples_mapping"] = [sample["token_examples_mapping"] for sample in batch]
        output["examples_scores"] = [sample["examples_scores"] for sample in batch]
        output["examples_classes"] = [sample["examples_classes"] for sample in batch]

        # calculate max token length of this batch
        batch_max = max([len(ids) for ids in output["input_ids"]])
        batch_max_ex = max([len(sco) for sco in output["examples_scores"]])

        # add padding
        if self.tokenizer.padding_side == "right":
            output["input_ids"] = [s + (batch_max - len(s)) * [self.tokenizer.pad_token_id] for s in output["input_ids"]]
            output["attention_mask"] = [s + (batch_max - len(s)) * [0] for s in output["attention_mask"]]
            output["token_class_labels"] = [s + (batch_max - len(s)) * [-100] for s in output["token_class_labels"]]
            output["token_scores_labels"] = [s + (batch_max - len(s)) * [-100] for s in output["token_scores_labels"]]
            output["token_examples_mapping"] = [s + (batch_max - len(s)) * [-1] for s in output["token_examples_mapping"]]
            output["examples_scores"] = [s + (batch_max_ex - len(s)) * [-1] for s in output["examples_scores"]]
            output["examples_classes"] = [s + (batch_max_ex - len(s)) * [-1] for s in output["examples_classes"]]

        else:
            output["input_ids"] = [(batch_max - len(s)) * [self.tokenizer.pad_token_id] + s for s in output["input_ids"]]
            output["attention_mask"] = [(batch_max - len(s)) * [0] + s for s in output["attention_mask"]]
            output["token_class_labels"] = [(batch_max - len(s)) * [-100] + s for s in output["token_class_labels"]]
            output["token_scores_labels"] = [(batch_max - len(s)) * [-100] + s for s in output["token_scores_labels"]]
            output["token_examples_mapping"] = [(batch_max - len(s)) * [-1] + s for s in output["token_examples_mapping"]]
            output["examples_scores"] = [(batch_max_ex - len(s)) * [-1] + s for s in output["examples_scores"]]
            output["examples_classes"] = [(batch_max_ex - len(s)) * [-1] + s for s in output["examples_classes"]]

        # convert to tensors
        output["input_ids"] = torch.tensor(output["input_ids"], dtype=torch.long)
        output["attention_mask"] = torch.tensor(output["attention_mask"], dtype=torch.long)
        output["token_class_labels"] = torch.tensor(output["token_class_labels"], dtype=torch.long)
        output["token_scores_labels"] = torch.tensor(output["token_scores_labels"], dtype=torch.long)
        output["token_examples_mapping"] = torch.tensor(output["token_examples_mapping"], dtype=torch.long)
        output["examples_scores"] = torch.tensor(output["examples_scores"], dtype=torch.long)
        output["examples_classes"] = torch.tensor(output["examples_classes"], dtype=torch.long)

        return output


In [10]:
from pytorch_lightning.callbacks import ModelCheckpoint

CKPT_PTH = '../output/PLpret1/0/feedback-epoch=01-val_class_loss=2.03.ckpt'
model_checkpoint = 'microsoft/deberta-v3-large'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
collate_fn = Collate(tokenizer)
bs = 2
project = 'fbck'
seed = 42
randmask_proba = 0.1
lr = 3e-6
epochs = 1 if DEBUG else 2
num_classes = 3
num_classes_class = 8

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
import pickle
with open('processed-deberta-v3-large-nbroad.pickle', 'rb') as handle:
    pdf = pickle.load(handle)

In [12]:
if DEBUG: pdf = pdf.sample(n=200, random_state=4)

In [13]:
def train_fold(fold):
    
    print()
    print('*' * 100)
    print(f'Training fold {fold}')
    print('*' * 100)
    
    df_train = pdf[pdf.fold != fold].reset_index(drop=True)
    df_valid = pdf[pdf.fold == fold].reset_index(drop=True)

    tags = ['debug'] if DEBUG else ['PL', 'train', f'fold_{fold}']

    run = wandb.init(project=project, 
                       name=f"{EXP}_fold_{fold}",
                       tags=tags,
                       group=f"{EXP}")
    run.log_code()

    pl.seed_everything(seed)


    train_dataset = MyDataset(
        df_train,
        tokenizer,
        stage='train',
        rand_prob=randmask_proba
    )

    valid_dataset = MyDataset(
        df_valid,
        tokenizer,
        stage='valid',
        rand_prob=randmask_proba
    )

    train_loader = DataLoader(train_dataset,
                              batch_size=bs,
                              shuffle=True,
                              collate_fn=collate_fn,
                              num_workers=4, pin_memory=True, drop_last=True)

    val_loader = DataLoader(valid_dataset,
                              batch_size=1,
                              shuffle=False,
                              collate_fn=collate_fn,
                              num_workers=4, pin_memory=True, drop_last=False)

    warnings.filterwarnings("ignore")

    model = MyModule(lr=lr,
                     model_checkpoint=model_checkpoint, 
                     num_classes=num_classes,
                     num_classes_class=num_classes_class,
                     emb_dim=emb_dim,
                    )
    
    model.load_state_dict(torch.load(CKPT_PTH)['state_dict'])

    wandb_logger = WandbLogger(project=project)

    checkpoint_callback = ModelCheckpoint(
        save_top_k=1,
        monitor="example_loss",
        mode="min",
        dirpath=f"../output/{EXP}/{fold}",
        filename="feedback-{epoch:02d}-{example_loss:.2f}",
    )

    trainer = pl.Trainer(precision=16, 
                         accelerator="gpu", devices=1, max_epochs=epochs,
                         log_every_n_steps=100, logger=wandb_logger,
                         # val_check_interval=0.2,
                         # check_val_every_n_epoch=2,
                         default_root_dir=f"../output/{EXP}",
                         callbacks=[checkpoint_callback],
                         accumulate_grad_batches=2,
                         )

    trainer.fit(model, train_loader, val_loader)

    wandb.finish()

In [None]:
rng = 1 if DEBUG else 5
for fold in range(rng):
    train_fold(fold)


****************************************************************************************************
Training fold 0
****************************************************************************************************


[34m[1mwandb[0m: Currently logged in as: [33mdarek[0m. Use [1m`wandb login --relogin`[0m to force relogin


Global seed set to 42
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)

Sanity Checking DataLoader 0: 100%|████████████████████████████████████| 2/2 [00:01<00:00,  1.66s/it]tensor(1.0482, device='cuda:0')
Epoch 0:  67%|██████████████████▋         | 1676/2515 [07:39<03:49,  3.65it/s, loss=1.72, v_num=p7rm]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                                            | 0/839 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                               | 0/839 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                       | 1/839 [00:00<02:04,  6.72it/s][A
Epoch 0:  67%|██████████████████▋         | 1677/2515 [07:39<03:49,  3.65it/s, loss=1.72, v_num=p7rm][A
Validation DataLoader 0:   0%|                                       | 2/839 [00:00<01:43,  8.10it/s][A
Epoch 0:  67%|██████████████████▋         | 1678/2515 [07:39<03:49,  3.65it/s, loss=1.72, v_num=p7rm][A
Epoch 0:  67%|██████████████████▋         | 1679/2515 [07:40<03:49,  3.65it/s, loss=1.72, v_num=p7rm]

In [None]:
ls ../output/PL23/0

In [None]:
# !rm -rf ../output/PL23

In [25]:
fold = 0
df_valid = pdf[pdf.fold == fold].reset_index(drop=True)

valid_dataset = MyDataset(
    df_valid,
    tokenizer,
    stage='valid',
    rand_prob=randmask_proba
)

val_loader = DataLoader(valid_dataset,
                          batch_size=1,
                          shuffle=False,
                          collate_fn=collate_fn,
                          num_workers=4, pin_memory=True, drop_last=False)

In [26]:
# or call with pretrained model
PATH = '../output/PL23/0/feedback-epoch=01-example_loss=0.62.ckpt'
model = MyModule.load_from_checkpoint(PATH, lr=lr,
                 model_checkpoint=model_checkpoint, 
                 num_classes=num_classes,
                 num_classes_class=num_classes_class,
                 emb_dim=emb_dim)
trainer = pl.Trainer(accelerator="gpu")
predictions = trainer.predict(model, dataloaders=val_loader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /home/darek/projects/fbck/notebooks/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|█████████████████████████████████████| 839/839 [01:03<00:00, 13.13it/s]


In [28]:
preds = torch.cat([p for b in predictions for p in b[0]])
targs = torch.cat([p for b in predictions for p in b[1]])

In [29]:
preds.shape, targs.shape

(torch.Size([7356, 3]), torch.Size([7356]))

In [30]:
F.cross_entropy(preds, targs)

tensor(0.6172)

In [31]:
(preds.argmax(dim=-1) == targs).sum()/preds.shape[0]

tensor(0.7282)

In [None]:
# SOLVED: why aren't these preds deterministic???????????????????? >>> I had a random mask in valid loader!!!

In [38]:
import os
best_checkpoints = []
for fold in range(5):
    folder = f'../output/PL20/{fold}'
    ckpt = os.listdir(folder)
    ckpt = folder + '/' + ckpt[0]
    best_checkpoints.append(ckpt)
    
best_checkpoints

['../output/PL20/0/feedback-epoch=01-example_loss=0.62.ckpt',
 '../output/PL20/1/feedback-epoch=01-example_loss=0.65.ckpt',
 '../output/PL20/2/feedback-epoch=01-example_loss=0.65.ckpt',
 '../output/PL20/3/feedback-epoch=01-example_loss=0.66.ckpt',
 '../output/PL20/4/feedback-epoch=01-example_loss=0.64.ckpt']

In [39]:
for fold in range(5):
    ckpt = best_checkpoints[fold]
    !~/gdrive upload {ckpt} --name pytorch_model_{fold}.ckpt

Uploading ../output/PL20/0/feedback-epoch=01-example_loss=0.62.ckpt
Uploaded 1MXFHLBOmYLrWBPUsHpa-fcZqk-rohQby at 29.7 MB/s, total 5.2 GB
Uploading ../output/PL20/1/feedback-epoch=01-example_loss=0.65.ckpt
Uploaded 16pHTntoAhHKDQWhK1Fo_lakzRpgvTYrv at 29.1 MB/s, total 5.2 GB
Uploading ../output/PL20/2/feedback-epoch=01-example_loss=0.65.ckpt
Uploaded 1pIMZ6ZOPOoJwOh27iZqNUgs4bTMULXZd at 29.1 MB/s, total 5.2 GB
Uploading ../output/PL20/3/feedback-epoch=01-example_loss=0.66.ckpt
Uploaded 1pvfXuGFsxgs571KLspjqbGxk3VpmKdGI at 29.7 MB/s, total 5.2 GB MB/s
Uploading ../output/PL20/4/feedback-epoch=01-example_loss=0.64.ckpt
Uploaded 19btDmymxIhd746twP9w53hLy8FtrTeyf at 28.0 MB/s, total 5.2 GB
