# 6.8620 Final Project: Sarcasm + Emotion

* [proposal](https://www.overleaf.com/project/662876554d9d1a627cdd74d3)
* pset refs:
    * *[ps3](https://colab.research.google.com/drive/1yaPzyM9dKCRH1huGqNW1gtyTPet_P4a5?pli=1&usp=drive_fs#scrollTo=gXCHTYCp2FGJ): acoustic models (data processing)
    * [ps4](https://colab.research.google.com/drive/1BKiIERrEerfkm8Z0bPZcLRLXMjvytmBo?pli=1&usp=drive_fs): forced alignments, HMM, HMM-GMM, neural alignment (terminal data)
    * [ps5](https://colab.research.google.com/drive/1jlkOQdIkWP3LmaSsdE9VMtImU8QA_dng?pli=1&usp=drive_fs): n-gram LMs (terminal data)
    * [ps6](https://colab.research.google.com/drive/17uz4Ryfwm9F3s4Jhv1yYsYeQHQs6h408?pli=1&usp=drive_fs): neural LMs (nltk brown)
    * *[ps7](https://colab.research.google.com/drive/1Dx5392Ph_lE3J9bvWXhQiabS11YtfK8W?pli=1&usp=drive_fs): end-to-end ASR
    * **[ps8](https://colab.research.google.com/drive/1235jJYQgfHRuX_TbU7GMTpzlJIKYrDDc?pli=1&usp=drive_fs): wav2vec, xlsr (Wav2vecFeatureExtractor section)

# Initialization

In [1]:
!pip install -qU transformers datasets evaluate accelerate peft huggingface_hub peft

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m276.5/542.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import pickle
import numpy as np
import json
import librosa
from IPython.display import Audio as AudioDisplay
import pandas as pd
import soundfile as sf
import random
import time
import tqdm
from sklearn.metrics import accuracy_score

from transformers import HubertModel, HubertConfig, PretrainedConfig, PreTrainedModel, TrainingArguments, Trainer
from transformers import AutoModel, AutoFeatureExtractor, AutoModelForAudioClassification, AutoProcessor
from datasets import load_dataset, Audio, Dataset
import evaluate
from peft import get_peft_model, LoraConfig, TaskType

In [3]:
def count_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def mount_gdrive():
    from google.colab import drive
    drive.mount('/content/gdrive')

In [4]:
SEED = 14
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Data

## General

In [5]:
# https://huggingface.co/docs/transformers/en/model_doc/wav2vec2#transformers.Wav2Vec2Processor
hubert_processor = AutoProcessor.from_pretrained('facebook/hubert-large-ls960-ft') # is using a processor different from the model ok?

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [6]:
# using processor: https://huggingface.co/docs/transformers/en/model_doc/wav2vec2#transformers.Wav2Vec2FeatureExtractor
def preprocess_dataset(batch):
    output = hubert_processor(
        audio=[audio['array'] for audio in batch['audio']],
        sampling_rate=16_000,
        padding='longest',
        return_tensors='pt',
    )
    return output

In [7]:
def get_dataset_splits(ds):
    train_valtest = ds.train_test_split(test_size=0.2, seed=SEED)
    val_test = train_valtest['test'].train_test_split(test_size=0.5, seed=SEED)

    train_set = train_valtest['train']
    val_set = val_test['train']
    test_set = val_test['test']

    return train_set, val_set, test_set

In [8]:
def onehot_e(batch):
    label = torch.zeros(8)
    label[batch['labels']-1] = 1
    batch['one_hot'] = label.to(torch.int64)
    return batch

## MUStARD: sarcasm detection

([paper](https://aclanthology.org/P19-1455/), [github](https://github.com/soujanyaporia/MUStARD))

In [9]:
!gdown 1--U_CMuzrXZ7t01sFgeh0gKp-V8_D6pz
!unzip sarcasm_ds.zip

Downloading...
From (original): https://drive.google.com/uc?id=1--U_CMuzrXZ7t01sFgeh0gKp-V8_D6pz
From (redirected): https://drive.google.com/uc?id=1--U_CMuzrXZ7t01sFgeh0gKp-V8_D6pz&confirm=t&uuid=05e2556e-6594-4022-825a-3cda346ce21e
To: /content/sarcasm_ds.zip
100% 218M/218M [00:03<00:00, 60.3MB/s]
Archive:  sarcasm_ds.zip
   creating: sarcasm_ds/
  inflating: sarcasm_ds/data-00000-of-00004.arrow  
  inflating: sarcasm_ds/data-00001-of-00004.arrow  
  inflating: sarcasm_ds/state.json   
  inflating: sarcasm_ds/data-00003-of-00004.arrow  
  inflating: sarcasm_ds/data-00002-of-00004.arrow  
  inflating: sarcasm_ds/dataset_info.json  


In [10]:
sarcasm_ds = Dataset.load_from_disk('sarcasm_ds')
print(sarcasm_ds.features)
print(sarcasm_ds)

{'identifier': Value(dtype='string', id=None), 'labels': Value(dtype='int64', id=None), 'input_values': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None)}
Dataset({
    features: ['identifier', 'labels', 'input_values', 'attention_mask'],
    num_rows: 690
})


In [11]:
sarcasm_train_set, sarcasm_val_set, sarcasm_test_set = get_dataset_splits(sarcasm_ds)
print(sarcasm_train_set.features)
print(sarcasm_train_set)

sarcasm_train_loader = DataLoader(sarcasm_train_set, batch_size=2)
sarcasm_val_loader = DataLoader(sarcasm_val_set, batch_size=2)
sarcasm_test_loader = DataLoader(sarcasm_test_set, batch_size=2)

{'identifier': Value(dtype='string', id=None), 'labels': Value(dtype='int64', id=None), 'input_values': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None)}
Dataset({
    features: ['identifier', 'labels', 'input_values', 'attention_mask'],
    num_rows: 552
})


In [12]:
print(sarcasm_train_set[0])

{'identifier': '2_111', 'labels': tensor(0), 'input_values': tensor([-0.1981, -0.6425, -0.3329,  ...,  0.0000,  0.0000,  0.0000]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0])}


## RAVDESS: emotion detection

([website](https://zenodo.org/records/1188976#.YFZuJ0j7SL8))

To keep dataset sizes comparable, take 100 samples of each of the 6 emotions in RAVDESS to create a sampled dataset of ~600 examples, similar size to MUStARD

In [13]:
!gdown 1UfcwmyRjTGRP1g4NQa92Uzs9z0PH-hil
!unzip emotion_ds.zip

Downloading...
From (original): https://drive.google.com/uc?id=1UfcwmyRjTGRP1g4NQa92Uzs9z0PH-hil
From (redirected): https://drive.google.com/uc?id=1UfcwmyRjTGRP1g4NQa92Uzs9z0PH-hil&confirm=t&uuid=ffaefa47-2f88-4c4c-8083-183f87e74203
To: /content/emotion_ds.zip
100% 151M/151M [00:01<00:00, 98.2MB/s]
Archive:  emotion_ds.zip
   creating: emotion_dataset/
   creating: emotion_ds/
  inflating: emotion_ds/state.json   
  inflating: emotion_ds/data-00000-of-00001.arrow  
  inflating: emotion_ds/dataset_info.json  


In [14]:
emotion_ds = Dataset.load_from_disk('emotion_ds')
print(emotion_ds.features)
print(emotion_ds)

{'labels': Value(dtype='int64', id=None), 'intensity': Value(dtype='int64', id=None), 'statement': Value(dtype='int64', id=None), 'actor': Value(dtype='int64', id=None), 'input_values': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None)}
Dataset({
    features: ['labels', 'intensity', 'statement', 'actor', 'input_values', 'attention_mask'],
    num_rows: 720
})


In [15]:
emotion_train_set, emotion_val_set, emotion_test_set = get_dataset_splits(emotion_ds)
print(emotion_train_set.features)
print(emotion_train_set)

emotion_train_loader = DataLoader(emotion_train_set, batch_size=2)
emotion_val_loader = DataLoader(emotion_val_set, batch_size=2)
emotion_test_loader = DataLoader(emotion_test_set, batch_size=2)

{'labels': Value(dtype='int64', id=None), 'intensity': Value(dtype='int64', id=None), 'statement': Value(dtype='int64', id=None), 'actor': Value(dtype='int64', id=None), 'input_values': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None)}
Dataset({
    features: ['labels', 'intensity', 'statement', 'actor', 'input_values', 'attention_mask'],
    num_rows: 576
})


In [16]:
print(emotion_train_set[0])

{'labels': tensor(3), 'intensity': tensor(2), 'statement': tensor(1), 'actor': tensor(23), 'input_values': tensor([0.0004, 0.0016, 0.0004,  ..., 0.0000, 0.0000, 0.0000]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0])}


In [17]:
emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised'] # aligned with ravdess' definition
emotion_id2label = {}
emotion_label2id = {}
for i in range(8): # each id is one less than what is defined by ravdess (eg. neutral in ravdess = 1, neutral id here = 0)
    emotion_id2label[i] = emotions[i-1]
    emotion_label2id[emotions[i-1]] = i
emotion_id2label

{0: 'surprised',
 1: 'neutral',
 2: 'calm',
 3: 'happy',
 4: 'sad',
 5: 'angry',
 6: 'fearful',
 7: 'disgust'}

# Models

## HuBERT

([paper](https://arxiv.org/abs/2106.07447), [github](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert), [documentation](https://huggingface.co/docs/transformers/en/model_doc/hubert#transformers.HubertModel), [model card](https://huggingface.co/facebook/hubert-base-ls960))

## General

In [18]:
hubert_model_name = 'facebook/hubert-base-ls960'
hubert_model = HubertModel.from_pretrained(hubert_model_name) # 94M params

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

In [19]:
# lora_config = LoraConfig(
#             task_type=TaskType.FEATURE_EXTRACTION, #r=8, lora_alpha=32, lora_dropout=0.1
# )
# peft_hubert_model = get_peft_model(hubert_model, lora_config)
# print(count_params(peft_hubert_model))

In [20]:
class HubertClassifier(nn.Module):
    def __init__(
            self,
            hubert_model_name=hubert_model_name, # 'facebook/hubert-base-ls960'
            num_classes=2,
            transplant_hubert=None,
            transplant_classifier=None,
    ):
        super().__init__()
        if transplant_hubert is None:
            self.hubert = HubertModel.from_pretrained(hubert_model_name)
        else:
            print('transplanted hubert!')
            self.hubert = transplant_hubert
        # TODO: PEFT?

        if transplant_classifier is None:
            self.classifier = nn.Sequential(
                nn.Linear(768, 32),
                nn.ReLU(),
                nn.Dropout(0.1),
                nn.Linear(32, num_classes)
            )
        else:
            print('transplanted classifier!')
            self.classifier = transplant_classifier

        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_values=None, attention_mask=None, labels=None):
        hubert_output = self.hubert(input_values=input_values, attention_mask=attention_mask)
        pooler_output = torch.mean(hubert_output.last_hidden_state, dim=1) # replace with PEFT?
        logits = self.classifier(pooler_output)
        out = {
            'logits': logits,
            'pooler_output': pooler_output,
        }

        if labels is not None:
            loss = self.loss_fn(logits, labels)
            out['loss'] = loss

        return out

In [21]:
# custom huggingface config/model definition
# https://huggingface.co/docs/transformers/en/custom_models
class HubertClassifierConfig(PretrainedConfig):
    def __init__(
            self,
            hubert_model_name = hubert_model_name,
            num_classes = 2,
            **kwargs,
    ):
        self.hubert_model_name = hubert_model_name
        self.num_classes = num_classes
        super().__init__(**kwargs)

class HubertClassifierModel(PreTrainedModel):
    config_class = HubertClassifierConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = HubertClassifier(
            hubert_model_name=config.hubert_model_name,
            num_classes=config.num_classes
        )

    def forward(self, input_values=None, attention_mask=None, labels=None):
        return self.model.forward(input_values=input_values, attention_mask=attention_mask, labels=labels)

In [27]:
def train(hubert_classifier, train_loader, val_loader, num_epochs=2):
    train_losses, val_losses = [], []

    optimizer = torch.optim.Adam(hubert_classifier.parameters(), lr=5e-3)
    hubert_classifier = hubert_classifier.to(DEVICE)
    start_time = time.time()

    for epoch in range(num_epochs):
        hubert_classifier.train()
        epoch_train_loss = 0
        for ix, batch in tqdm.tqdm(enumerate(train_loader), total=len(train_loader)):
            input_values = batch['input_values'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            out = hubert_classifier(input_values=input_values, attention_mask=attention_mask, labels=labels)
            print(f'{out["logits"]=}')
            print(f'{batch["labels"]=}')

            loss = out['loss']
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            epoch_train_loss += loss.item()
        epoch_train_loss /= len(train_loader)
        train_losses.append(epoch_train_loss)

        hubert_classifier.eval()
        epoch_val_loss = 0
        for ix, batch in tqdm.tqdm(enumerate(val_loader), total=len(val_loader)):
            input_values = batch['input_values'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            out = hubert_classifier(input_values=input_values, attention_mask=attention_mask, labels=labels)
            loss = out['loss']
            epoch_val_loss += loss.item()
        epoch_val_loss /= len(val_loader)
        val_losses.append(epoch_val_loss)


        print(f'Epoch {epoch+1}: Loss (train/val): {epoch_train_loss}/{epoch_val_loss}')
        print()

    print(f'Total training time elapsed: {round(time.time() - start_time, 2)}')

    return train_losses, val_losses

In [23]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    print(f'{predictions=}')
    print(f'{predictions.shape=}')
    predictions = np.argmax(predictions, axis=1)
    return accuracy_score(labels, predictions)

In [24]:
h_config = HubertClassifierConfig(num_classes=2)
h_model = HubertClassifierModel(h_config)
print(h_model)

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

HubertClassifierModel(
  (model): HubertClassifier(
    (hubert): HubertModel(
      (feature_extractor): HubertFeatureEncoder(
        (conv_layers): ModuleList(
          (0): HubertGroupNormConvLayer(
            (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
            (activation): GELUActivation()
            (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
          )
          (1-4): 4 x HubertNoLayerNormConvLayer(
            (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
            (activation): GELUActivation()
          )
          (5-6): 2 x HubertNoLayerNormConvLayer(
            (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
            (activation): GELUActivation()
          )
        )
      )
      (feature_projection): HubertFeatureProjection(
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (projection): Linear(in_features=512, out_features=768, bias=True)
  

## Sarcasm Hubert

In [29]:
# base model testing
sarcasm_hubert = HubertClassifier(num_classes=2)

train_losses, val_losses = train(sarcasm_hubert, sarcasm_train_loader, sarcasm_val_loader, num_epochs=1)

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

out["logits"]=tensor([[0.2056, 0.0096],
        [0.2051, 0.0150]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


  0%|          | 1/276 [00:00<01:35,  2.87it/s]

out["logits"]=tensor([[1.0095, 0.0211],
        [1.0253, 0.0140]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


  1%|          | 3/276 [00:00<01:23,  3.25it/s]

out["logits"]=tensor([[0.1984, 0.2190],
        [0.1952, 0.2216]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])
out["logits"]=tensor([[0.2107, 0.7136],
        [0.2093, 0.6793]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


  1%|▏         | 4/276 [00:01<01:25,  3.17it/s]

out["logits"]=tensor([[ 1.4477, -0.6383],
        [ 1.2959, -0.7101]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


  2%|▏         | 6/276 [00:01<01:23,  3.23it/s]

out["logits"]=tensor([[ 0.4682, -0.4829],
        [ 0.5191, -0.4814]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])
out["logits"]=tensor([[0.0638, 0.8288],
        [0.1864, 0.4544]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


  3%|▎         | 7/276 [00:02<01:27,  3.07it/s]

out["logits"]=tensor([[ 0.6442, -0.4073],
        [ 1.0807, -0.3487]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


  3%|▎         | 8/276 [00:02<01:29,  2.99it/s]

out["logits"]=tensor([[-0.8627,  2.3052],
        [-0.9927,  2.5130]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


  3%|▎         | 9/276 [00:02<01:28,  3.01it/s]

out["logits"]=tensor([[-0.0698,  0.6771],
        [-0.1096,  0.6365]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


  4%|▎         | 10/276 [00:03<01:30,  2.94it/s]

out["logits"]=tensor([[-0.0078,  0.3460],
        [ 0.0117,  0.3106]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


  4%|▍         | 11/276 [00:03<01:29,  2.95it/s]

out["logits"]=tensor([[-0.0629,  0.7669],
        [-0.0562,  0.7922]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


  4%|▍         | 12/276 [00:03<01:30,  2.92it/s]

out["logits"]=tensor([[-0.0518,  0.9116],
        [-0.0611,  0.8950]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


  5%|▍         | 13/276 [00:04<01:31,  2.88it/s]

out["logits"]=tensor([[ 0.0395,  0.7156],
        [-0.0539,  0.3114]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


  5%|▌         | 15/276 [00:04<01:26,  3.02it/s]

out["logits"]=tensor([[ 0.0261,  0.8288],
        [-0.0423,  0.8593]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])
out["logits"]=tensor([[-0.0068,  0.7308],
        [-0.0214,  0.7908]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


  6%|▌         | 16/276 [00:05<01:24,  3.07it/s]

out["logits"]=tensor([[-0.2848,  1.0669],
        [-0.2828,  1.0678]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


  7%|▋         | 18/276 [00:05<01:20,  3.22it/s]

out["logits"]=tensor([[ 0.0525,  0.5723],
        [-0.0105,  0.4929]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])
out["logits"]=tensor([[0.0521, 0.1162],
        [0.0515, 0.1157]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


  7%|▋         | 19/276 [00:06<01:21,  3.14it/s]

out["logits"]=tensor([[0.0540, 0.0361],
        [0.0539, 0.0321]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


  7%|▋         | 20/276 [00:06<01:20,  3.17it/s]

out["logits"]=tensor([[0.0626, 0.0059],
        [0.0744, 0.0133]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


  8%|▊         | 21/276 [00:06<01:21,  3.12it/s]

out["logits"]=tensor([[0.1010, 0.0116],
        [0.1016, 0.0121]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


  8%|▊         | 22/276 [00:07<01:24,  3.02it/s]

out["logits"]=tensor([[0.0957, 0.0475],
        [0.2409, 0.0423]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


  8%|▊         | 23/276 [00:07<01:25,  2.95it/s]

out["logits"]=tensor([[0.0850, 0.1057],
        [0.2516, 0.0599]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


  9%|▊         | 24/276 [00:07<01:23,  3.01it/s]

out["logits"]=tensor([[0.0799, 0.1962],
        [0.1031, 0.2180]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


  9%|▉         | 25/276 [00:08<01:21,  3.07it/s]

out["logits"]=tensor([[-0.1427,  0.0467],
        [ 0.2843,  0.1152]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


  9%|▉         | 26/276 [00:08<01:19,  3.13it/s]

out["logits"]=tensor([[-0.0149,  0.3105],
        [-0.0172,  0.3092]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 10%|▉         | 27/276 [00:08<01:22,  3.01it/s]

out["logits"]=tensor([[-0.1203,  0.4560],
        [-0.0690,  0.4875]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 10%|█         | 28/276 [00:09<01:22,  2.99it/s]

out["logits"]=tensor([[ 0.2415,  0.2227],
        [-0.4918,  0.2852]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 11%|█         | 29/276 [00:09<01:22,  2.99it/s]

out["logits"]=tensor([[-0.3207,  0.7160],
        [-0.6032,  0.3836]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 11%|█         | 31/276 [00:10<01:17,  3.16it/s]

out["logits"]=tensor([[-0.3484,  0.7570],
        [ 0.1797,  0.3727]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])
out["logits"]=tensor([[ 0.2103,  0.2996],
        [-0.5795,  0.4333]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 12%|█▏        | 32/276 [00:10<01:18,  3.11it/s]

out["logits"]=tensor([[-0.6517,  0.4101],
        [-0.3966,  0.6988]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 12%|█▏        | 33/276 [00:10<01:20,  3.02it/s]

out["logits"]=tensor([[-0.6435,  0.5044],
        [-0.4807,  0.7373]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 12%|█▏        | 34/276 [00:11<01:19,  3.06it/s]

out["logits"]=tensor([[ 0.2404,  0.1772],
        [-0.4841,  0.6914]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 13%|█▎        | 35/276 [00:11<01:19,  3.04it/s]

out["logits"]=tensor([[ 0.2658,  0.1721],
        [-0.4597,  0.7047]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 13%|█▎        | 36/276 [00:11<01:18,  3.04it/s]

out["logits"]=tensor([[-0.4545,  0.6796],
        [-0.4540,  0.6799]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 14%|█▍        | 38/276 [00:12<01:17,  3.09it/s]

out["logits"]=tensor([[-1.1899,  2.4147],
        [-0.8142,  1.3843]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 14%|█▍        | 39/276 [00:12<01:14,  3.20it/s]

out["logits"]=tensor([[-0.3926,  0.6037],
        [-0.3937,  0.6042]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])
out["logits"]=tensor([[-0.4293,  0.5752],
        [ 0.2549,  0.0739]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 14%|█▍        | 40/276 [00:13<01:16,  3.07it/s]

out["logits"]=tensor([[-0.4176,  0.5302],
        [-0.4181,  0.5298]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 15%|█▍        | 41/276 [00:13<01:18,  2.97it/s]

out["logits"]=tensor([[0.2194, 0.0369],
        [0.2177, 0.0353]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 15%|█▌        | 42/276 [00:13<01:20,  2.92it/s]

out["logits"]=tensor([[-0.3468,  1.0961],
        [-0.1785,  0.7989]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 16%|█▌        | 43/276 [00:14<01:17,  3.00it/s]

out["logits"]=tensor([[-0.4596,  0.4817],
        [-0.4601,  0.4816]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 16%|█▌        | 44/276 [00:14<01:16,  3.05it/s]

out["logits"]=tensor([[-0.5008,  0.4784],
        [-0.5348,  0.4452]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 17%|█▋        | 46/276 [00:15<01:10,  3.27it/s]

out["logits"]=tensor([[-0.5654,  0.4920],
        [-0.5650,  0.4916]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])
out["logits"]=tensor([[-0.6411,  0.5300],
        [-0.6410,  0.5299]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 17%|█▋        | 47/276 [00:15<01:11,  3.18it/s]

out["logits"]=tensor([[-0.6731,  0.5556],
        [-0.6730,  0.5555]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 17%|█▋        | 48/276 [00:15<01:11,  3.19it/s]

out["logits"]=tensor([[-0.7248,  0.5973],
        [-0.7251,  0.5975]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 18%|█▊        | 49/276 [00:15<01:11,  3.19it/s]

out["logits"]=tensor([[-0.7955,  0.6548],
        [-0.7960,  0.6551]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 18%|█▊        | 50/276 [00:16<01:10,  3.19it/s]

out["logits"]=tensor([[-0.8250,  0.6786],
        [-0.8251,  0.6786]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 19%|█▉        | 52/276 [00:16<01:06,  3.36it/s]

out["logits"]=tensor([[-0.8733,  0.7179],
        [ 0.1309, -0.0369]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])
out["logits"]=tensor([[-0.8662,  0.7120],
        [-0.8661,  0.7119]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 19%|█▉        | 53/276 [00:17<01:07,  3.31it/s]

out["logits"]=tensor([[-0.7790,  0.6412],
        [-0.7796,  0.6417]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 20%|█▉        | 54/276 [00:17<01:07,  3.28it/s]

out["logits"]=tensor([[-0.6470,  0.5354],
        [-0.6474,  0.5357]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 20%|█▉        | 55/276 [00:17<01:10,  3.12it/s]

out["logits"]=tensor([[ 0.1354, -0.0414],
        [-0.5212,  0.4356]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 20%|██        | 56/276 [00:18<01:09,  3.16it/s]

out["logits"]=tensor([[ 0.1367, -0.0427],
        [-0.4251,  0.3604]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 21%|██        | 57/276 [00:18<01:11,  3.07it/s]

out["logits"]=tensor([[-0.3298,  0.2869],
        [-0.3301,  0.2871]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 21%|██        | 58/276 [00:18<01:10,  3.11it/s]

out["logits"]=tensor([[-0.2455,  0.2228],
        [-0.2453,  0.2227]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 21%|██▏       | 59/276 [00:19<01:10,  3.08it/s]

out["logits"]=tensor([[-0.2723,  0.8701],
        [-0.4972,  1.3774]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 22%|██▏       | 60/276 [00:19<01:09,  3.11it/s]

out["logits"]=tensor([[-0.0988,  0.1126],
        [-0.0988,  0.1125]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 22%|██▏       | 61/276 [00:19<01:08,  3.14it/s]

out["logits"]=tensor([[-0.0332,  0.0651],
        [-0.0335,  0.0653]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 22%|██▏       | 62/276 [00:20<01:10,  3.03it/s]

out["logits"]=tensor([[ 0.1499, -0.0558],
        [ 0.0191,  0.0281]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 23%|██▎       | 63/276 [00:20<01:08,  3.09it/s]

out["logits"]=tensor([[0.0587, 0.0006],
        [0.0588, 0.0006]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 23%|██▎       | 64/276 [00:20<01:10,  3.00it/s]

out["logits"]=tensor([[ 0.0901, -0.0207],
        [ 0.0900, -0.0206]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 24%|██▎       | 65/276 [00:21<01:10,  3.00it/s]

out["logits"]=tensor([[ 0.1154, -0.0375],
        [ 0.1198, -0.0355]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 24%|██▍       | 66/276 [00:21<01:11,  2.93it/s]

out["logits"]=tensor([[ 0.1357, -0.0506],
        [ 0.1357, -0.0507]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 24%|██▍       | 67/276 [00:21<01:09,  3.00it/s]

out["logits"]=tensor([[ 0.1520, -0.0609],
        [ 0.1521, -0.0610]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 25%|██▍       | 68/276 [00:22<01:07,  3.06it/s]

out["logits"]=tensor([[ 0.1573, -0.0642],
        [ 0.1572, -0.0641]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 25%|██▌       | 69/276 [00:22<01:06,  3.11it/s]

out["logits"]=tensor([[ 0.1605, -0.0664],
        [ 0.1605, -0.0664]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 26%|██▌       | 71/276 [00:23<01:04,  3.18it/s]

out["logits"]=tensor([[ 0.1614, -0.0673],
        [ 0.1614, -0.0673]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 26%|██▌       | 72/276 [00:23<01:03,  3.22it/s]

out["logits"]=tensor([[ 0.1620, -0.0680],
        [ 0.1620, -0.0680]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 26%|██▋       | 73/276 [00:23<01:01,  3.28it/s]

out["logits"]=tensor([[ 0.6325, -0.4800],
        [ 0.6324, -0.4799]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])
out["logits"]=tensor([[ 0.1641, -0.0700],
        [ 0.1641, -0.0700]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 27%|██▋       | 75/276 [00:24<01:01,  3.29it/s]

out["logits"]=tensor([[ 0.1639, -0.0699],
        [ 0.1639, -0.0699]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])
out["logits"]=tensor([[ 0.1631, -0.0690],
        [ 0.1630, -0.0689]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 28%|██▊       | 76/276 [00:24<01:03,  3.14it/s]

out["logits"]=tensor([[ 0.1614, -0.0673],
        [ 0.1614, -0.0673]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 28%|██▊       | 78/276 [00:25<01:02,  3.18it/s]

out["logits"]=tensor([[ 0.1592, -0.0651],
        [ 0.1592, -0.0651]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])
out["logits"]=tensor([[ 0.1578, -0.0637],
        [ 0.1578, -0.0637]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 29%|██▉       | 80/276 [00:25<01:01,  3.21it/s]

out["logits"]=tensor([[ 0.1572, -0.0631],
        [ 0.1572, -0.0631]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])
out["logits"]=tensor([[ 0.1559, -0.0618],
        [ 0.1559, -0.0618]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 29%|██▉       | 81/276 [00:26<01:02,  3.12it/s]

out["logits"]=tensor([[ 0.1546, -0.0605],
        [ 0.1546, -0.0605]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 30%|██▉       | 82/276 [00:26<01:02,  3.12it/s]

out["logits"]=tensor([[ 0.1534, -0.0593],
        [ 0.1534, -0.0593]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 30%|███       | 83/276 [00:26<01:01,  3.12it/s]

out["logits"]=tensor([[ 0.3196, -0.2205],
        [ 0.3202, -0.2211]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 30%|███       | 84/276 [00:27<01:00,  3.15it/s]

out["logits"]=tensor([[ 0.1497, -0.0556],
        [ 0.1497, -0.0556]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 31%|███       | 85/276 [00:27<01:02,  3.06it/s]

out["logits"]=tensor([[ 0.1479, -0.0538],
        [ 0.1479, -0.0538]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 31%|███       | 86/276 [00:27<01:01,  3.07it/s]

out["logits"]=tensor([[ 0.1667, -0.0398],
        [ 0.1670, -0.0397]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 32%|███▏      | 87/276 [00:28<01:01,  3.10it/s]

out["logits"]=tensor([[ 0.1427, -0.0486],
        [ 0.1427, -0.0486]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 32%|███▏      | 88/276 [00:28<01:02,  3.03it/s]

out["logits"]=tensor([[ 0.1394, -0.0453],
        [ 0.1394, -0.0453]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 33%|███▎      | 90/276 [00:29<00:57,  3.21it/s]

out["logits"]=tensor([[ 0.1371, -0.0430],
        [ 0.1371, -0.0430]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])
out["logits"]=tensor([[ 0.1356, -0.0415],
        [ 0.1356, -0.0415]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 33%|███▎      | 91/276 [00:29<00:58,  3.17it/s]

out["logits"]=tensor([[ 0.1349, -0.0408],
        [ 0.1349, -0.0408]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 33%|███▎      | 92/276 [00:29<00:59,  3.09it/s]

out["logits"]=tensor([[ 0.1349, -0.0408],
        [ 0.1349, -0.0408]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 34%|███▍      | 94/276 [00:30<00:55,  3.27it/s]

out["logits"]=tensor([[ 0.1355, -0.0414],
        [ 0.1355, -0.0414]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])
out["logits"]=tensor([[ 0.1353, -0.0412],
        [ 0.1353, -0.0412]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 34%|███▍      | 95/276 [00:30<00:55,  3.28it/s]

out["logits"]=tensor([[ 0.1344, -0.0403],
        [ 0.1344, -0.0403]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 35%|███▌      | 97/276 [00:31<00:53,  3.34it/s]

out["logits"]=tensor([[ 0.1335, -0.0394],
        [ 0.1335, -0.0394]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 36%|███▌      | 98/276 [00:31<00:51,  3.48it/s]

out["logits"]=tensor([[ 0.1327, -0.0386],
        [ 0.1327, -0.0386]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])
out["logits"]=tensor([[ 0.1312, -0.0371],
        [ 0.1312, -0.0371]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 36%|███▌      | 100/276 [00:32<00:50,  3.50it/s]

out["logits"]=tensor([[ 0.1298, -0.0357],
        [ 0.1298, -0.0357]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])
out["logits"]=tensor([[ 0.1285, -0.0344],
        [ 0.1285, -0.0344]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 37%|███▋      | 101/276 [00:32<00:51,  3.38it/s]

out["logits"]=tensor([[ 0.1279, -0.0338],
        [ 0.1279, -0.0338]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 37%|███▋      | 102/276 [00:32<00:52,  3.29it/s]

out["logits"]=tensor([[ 0.1280, -0.0339],
        [ 0.1280, -0.0339]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 37%|███▋      | 103/276 [00:33<00:54,  3.17it/s]

out["logits"]=tensor([[ 0.1280, -0.0340],
        [ 0.1280, -0.0340]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 38%|███▊      | 104/276 [00:33<00:54,  3.16it/s]

out["logits"]=tensor([[ 0.1280, -0.0340],
        [ 0.1373, -0.0416]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 38%|███▊      | 105/276 [00:33<00:54,  3.13it/s]

out["logits"]=tensor([[ 0.1280, -0.0339],
        [ 0.1280, -0.0339]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 39%|███▉      | 107/276 [00:34<00:52,  3.19it/s]

out["logits"]=tensor([[ 0.1278, -0.0338],
        [ 0.1278, -0.0338]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 39%|███▉      | 108/276 [00:34<00:50,  3.30it/s]

out["logits"]=tensor([[ 0.1270, -0.0329],
        [ 0.1270, -0.0329]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])
out["logits"]=tensor([[ 0.1255, -0.0314],
        [ 0.1255, -0.0314]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 39%|███▉      | 109/276 [00:34<00:52,  3.18it/s]

out["logits"]=tensor([[ 0.1241, -0.0300],
        [ 0.1241, -0.0300]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 40%|████      | 111/276 [00:35<00:51,  3.21it/s]

out["logits"]=tensor([[ 0.1228, -0.0287],
        [ 0.1228, -0.0287]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])
out["logits"]=tensor([[ 0.1209, -0.0268],
        [ 0.1209, -0.0268]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 41%|████      | 113/276 [00:36<00:49,  3.31it/s]

out["logits"]=tensor([[ 0.1191, -0.0250],
        [ 0.1191, -0.0250]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 41%|████▏     | 114/276 [00:36<00:47,  3.39it/s]

out["logits"]=tensor([[ 0.1174, -0.0233],
        [ 0.1174, -0.0233]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])
out["logits"]=tensor([[ 0.1159, -0.0218],
        [ 0.1159, -0.0218]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 42%|████▏     | 115/276 [00:36<00:48,  3.30it/s]

out["logits"]=tensor([[ 0.1144, -0.0203],
        [ 0.1144, -0.0203]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 42%|████▏     | 117/276 [00:37<00:48,  3.27it/s]

out["logits"]=tensor([[ 0.1131, -0.0190],
        [ 0.1131, -0.0190]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])
out["logits"]=tensor([[ 0.1111, -0.0170],
        [ 0.1111, -0.0170]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 43%|████▎     | 119/276 [00:37<00:48,  3.26it/s]

out["logits"]=tensor([[ 0.1093, -0.0152],
        [ 0.1093, -0.0152]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])
out["logits"]=tensor([[ 0.1069, -0.0129],
        [ 0.1069, -0.0129]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 44%|████▍     | 121/276 [00:38<00:47,  3.27it/s]

out["logits"]=tensor([[ 0.1047, -0.0107],
        [ 0.1047, -0.0107]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])
out["logits"]=tensor([[ 0.1021, -0.0080],
        [ 0.1021, -0.0080]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 45%|████▍     | 123/276 [00:39<00:46,  3.26it/s]

out["logits"]=tensor([[ 0.0989, -0.0049],
        [ 0.0989, -0.0049]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])
out["logits"]=tensor([[ 0.0968, -0.0027],
        [ 0.0968, -0.0027]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 45%|████▌     | 125/276 [00:39<00:44,  3.40it/s]

out["logits"]=tensor([[ 0.0955, -0.0014],
        [ 0.0955, -0.0014]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 46%|████▌     | 126/276 [00:40<00:42,  3.53it/s]

out["logits"]=tensor([[ 0.0943, -0.0002],
        [ 0.0943, -0.0002]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])
out["logits"]=tensor([[0.0932, 0.0009],
        [0.0932, 0.0009]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 46%|████▌     | 127/276 [00:40<00:43,  3.40it/s]

out["logits"]=tensor([[0.0929, 0.0012],
        [0.0929, 0.0012]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 47%|████▋     | 129/276 [00:41<00:44,  3.27it/s]

out["logits"]=tensor([[0.0918, 0.0022],
        [0.0918, 0.0022]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])
out["logits"]=tensor([[0.0909, 0.0032],
        [0.0909, 0.0032]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 47%|████▋     | 131/276 [00:41<00:45,  3.22it/s]

out["logits"]=tensor([[0.0907, 0.0034],
        [0.0907, 0.0034]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 48%|████▊     | 132/276 [00:41<00:44,  3.26it/s]

out["logits"]=tensor([[0.0898, 0.0043],
        [0.0898, 0.0043]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])
out["logits"]=tensor([[0.0896, 0.0044],
        [0.0896, 0.0044]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 49%|████▊     | 134/276 [00:42<00:44,  3.21it/s]

out["logits"]=tensor([[0.0902, 0.0039],
        [0.0902, 0.0039]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 49%|████▉     | 135/276 [00:42<00:43,  3.26it/s]

out["logits"]=tensor([[0.0899, 0.0042],
        [0.0899, 0.0042]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])
out["logits"]=tensor([[0.0897, 0.0044],
        [0.0897, 0.0044]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 49%|████▉     | 136/276 [00:43<00:43,  3.22it/s]

out["logits"]=tensor([[0.0894, 0.0047],
        [0.0894, 0.0047]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 50%|████▉     | 137/276 [00:43<00:43,  3.19it/s]

out["logits"]=tensor([[0.0898, 0.0042],
        [0.0898, 0.0042]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 50%|█████     | 138/276 [00:43<00:44,  3.11it/s]

out["logits"]=tensor([[0.0909, 0.0032],
        [0.0909, 0.0032]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 50%|█████     | 139/276 [00:44<00:44,  3.05it/s]

out["logits"]=tensor([[0.0925, 0.0016],
        [0.0925, 0.0016]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 51%|█████     | 140/276 [00:44<00:44,  3.07it/s]

out["logits"]=tensor([[ 0.0945, -0.0004],
        [ 0.0945, -0.0004]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 51%|█████▏    | 142/276 [00:45<00:42,  3.16it/s]

out["logits"]=tensor([[ 0.0963, -0.0023],
        [ 0.0963, -0.0023]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 52%|█████▏    | 143/276 [00:45<00:41,  3.22it/s]

out["logits"]=tensor([[ 0.0973, -0.0032],
        [ 0.0973, -0.0032]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])
out["logits"]=tensor([[ 0.0981, -0.0040],
        [ 0.0981, -0.0040]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 52%|█████▏    | 144/276 [00:45<00:42,  3.13it/s]

out["logits"]=tensor([[ 0.0988, -0.0047],
        [ 0.0988, -0.0047]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 53%|█████▎    | 145/276 [00:46<00:42,  3.11it/s]

out["logits"]=tensor([[ 0.0994, -0.0053],
        [ 0.0994, -0.0053]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 53%|█████▎    | 146/276 [00:46<00:42,  3.06it/s]

out["logits"]=tensor([[ 0.0992, -0.0051],
        [ 0.0992, -0.0051]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 53%|█████▎    | 147/276 [00:46<00:41,  3.08it/s]

out["logits"]=tensor([[ 0.0983, -0.0042],
        [ 0.0983, -0.0042]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 54%|█████▍    | 149/276 [00:47<00:39,  3.24it/s]

out["logits"]=tensor([[ 0.0981, -0.0041],
        [ 0.0981, -0.0041]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 54%|█████▍    | 150/276 [00:47<00:36,  3.42it/s]

out["logits"]=tensor([[ 0.0973, -0.0032],
        [ 0.0973, -0.0032]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 55%|█████▍    | 151/276 [00:47<00:35,  3.56it/s]

out["logits"]=tensor([[ 0.0965, -0.0024],
        [ 0.0965, -0.0024]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])
out["logits"]=tensor([[ 0.0964, -0.0024],
        [ 0.0964, -0.0024]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 55%|█████▌    | 152/276 [00:48<00:37,  3.34it/s]

out["logits"]=tensor([[ 0.0970, -0.0029],
        [ 0.0970, -0.0029]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 56%|█████▌    | 154/276 [00:48<00:36,  3.32it/s]

out["logits"]=tensor([[ 0.0982, -0.0041],
        [ 0.0982, -0.0041]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 56%|█████▌    | 155/276 [00:49<00:36,  3.33it/s]

out["logits"]=tensor([[ 0.0985, -0.0044],
        [ 0.0985, -0.0044]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 57%|█████▋    | 156/276 [00:49<00:35,  3.34it/s]

out["logits"]=tensor([[ 0.0988, -0.0047],
        [ 0.0988, -0.0047]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])
out["logits"]=tensor([[ 0.0990, -0.0049],
        [ 0.0990, -0.0049]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 57%|█████▋    | 157/276 [00:49<00:37,  3.21it/s]

out["logits"]=tensor([[ 0.0984, -0.0044],
        [ 0.0984, -0.0044]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 57%|█████▋    | 158/276 [00:50<00:37,  3.12it/s]

out["logits"]=tensor([[ 0.0979, -0.0039],
        [ 0.0979, -0.0039]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 58%|█████▊    | 159/276 [00:50<00:37,  3.12it/s]

out["logits"]=tensor([[ 0.0968, -0.0027],
        [ 0.0968, -0.0027]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 58%|█████▊    | 161/276 [00:51<00:36,  3.19it/s]

out["logits"]=tensor([[ 0.0957, -0.0016],
        [ 0.0957, -0.0016]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 59%|█████▊    | 162/276 [00:51<00:35,  3.25it/s]

out["logits"]=tensor([[ 0.0947, -0.0006],
        [ 0.0947, -0.0006]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 59%|█████▉    | 163/276 [00:51<00:34,  3.29it/s]

out["logits"]=tensor([[0.0937, 0.0003],
        [0.0937, 0.0003]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])
out["logits"]=tensor([[0.0935, 0.0005],
        [0.0935, 0.0005]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 59%|█████▉    | 164/276 [00:51<00:35,  3.17it/s]

out["logits"]=tensor([[0.0933, 0.0008],
        [0.0933, 0.0008]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 60%|█████▉    | 165/276 [00:52<00:35,  3.16it/s]

out["logits"]=tensor([[0.0924, 0.0017],
        [0.0924, 0.0017]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 61%|██████    | 167/276 [00:52<00:34,  3.17it/s]

out["logits"]=tensor([[0.0916, 0.0025],
        [0.0916, 0.0025]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])
out["logits"]=tensor([[0.0915, 0.0026],
        [0.0915, 0.0026]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 61%|██████    | 168/276 [00:53<00:34,  3.16it/s]

out["logits"]=tensor([[0.0913, 0.0027],
        [0.0913, 0.0027]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 62%|██████▏   | 170/276 [00:53<00:33,  3.16it/s]

out["logits"]=tensor([[0.0919, 0.0022],
        [0.0919, 0.0022]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])
out["logits"]=tensor([[0.0930, 0.0011],
        [0.0930, 0.0011]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 62%|██████▏   | 171/276 [00:54<00:33,  3.15it/s]

out["logits"]=tensor([[9.3981e-02, 9.4165e-05],
        [9.3981e-02, 9.4165e-05]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 62%|██████▏   | 172/276 [00:54<00:33,  3.14it/s]

out["logits"]=tensor([[ 9.4155e-02, -7.9978e-05],
        [ 9.4155e-02, -7.9978e-05]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 63%|██████▎   | 173/276 [00:54<00:32,  3.14it/s]

out["logits"]=tensor([[0.0936, 0.0005],
        [0.0936, 0.0005]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 63%|██████▎   | 174/276 [00:55<00:33,  3.08it/s]

out["logits"]=tensor([[0.0938, 0.0003],
        [0.0938, 0.0003]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 64%|██████▍   | 176/276 [00:55<00:30,  3.23it/s]

out["logits"]=tensor([[0.0932, 0.0009],
        [0.0932, 0.0009]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])
out["logits"]=tensor([[0.0920, 0.0021],
        [0.0920, 0.0021]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 64%|██████▍   | 178/276 [00:56<00:30,  3.25it/s]

out["logits"]=tensor([[0.0902, 0.0039],
        [0.0902, 0.0039]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])
out["logits"]=tensor([[0.0885, 0.0055],
        [0.0885, 0.0055]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 65%|██████▍   | 179/276 [00:56<00:30,  3.21it/s]

out["logits"]=tensor([[0.0877, 0.0064],
        [0.0877, 0.0064]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 65%|██████▌   | 180/276 [00:57<00:30,  3.19it/s]

out["logits"]=tensor([[0.0869, 0.0071],
        [0.0869, 0.0071]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 66%|██████▌   | 181/276 [00:57<00:30,  3.10it/s]

out["logits"]=tensor([[0.0862, 0.0079],
        [0.0862, 0.0079]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 66%|██████▌   | 182/276 [00:57<00:30,  3.10it/s]

out["logits"]=tensor([[0.0848, 0.0092],
        [0.0848, 0.0092]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 67%|██████▋   | 184/276 [00:58<00:28,  3.25it/s]

out["logits"]=tensor([[0.0836, 0.0105],
        [0.0836, 0.0105]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 67%|██████▋   | 185/276 [00:58<00:27,  3.36it/s]

out["logits"]=tensor([[0.0825, 0.0116],
        [0.0825, 0.0116]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])
out["logits"]=tensor([[0.0821, 0.0120],
        [0.0821, 0.0120]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 67%|██████▋   | 186/276 [00:58<00:27,  3.31it/s]

out["logits"]=tensor([[0.0824, 0.0117],
        [0.0824, 0.0117]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 68%|██████▊   | 188/276 [00:59<00:25,  3.43it/s]

out["logits"]=tensor([[0.0833, 0.0108],
        [0.0833, 0.0108]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])
out["logits"]=tensor([[0.0841, 0.0100],
        [0.0841, 0.0100]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 69%|██████▉   | 190/276 [01:00<00:25,  3.37it/s]

out["logits"]=tensor([[0.0855, 0.0086],
        [0.0855, 0.0086]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 69%|██████▉   | 191/276 [01:00<00:24,  3.53it/s]

out["logits"]=tensor([[0.0867, 0.0074],
        [0.0867, 0.0074]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 70%|██████▉   | 192/276 [01:00<00:24,  3.49it/s]

out["logits"]=tensor([[0.0877, 0.0063],
        [0.0877, 0.0063]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 70%|██████▉   | 193/276 [01:00<00:24,  3.45it/s]

out["logits"]=tensor([[0.0887, 0.0054],
        [0.0887, 0.0054]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 70%|███████   | 194/276 [01:01<00:23,  3.42it/s]

out["logits"]=tensor([[0.0888, 0.0053],
        [0.0888, 0.0053]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])
out["logits"]=tensor([[0.0889, 0.0052],
        [0.0889, 0.0052]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 71%|███████   | 195/276 [01:01<00:24,  3.32it/s]

out["logits"]=tensor([[0.0890, 0.0051],
        [0.0890, 0.0051]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 71%|███████   | 196/276 [01:01<00:24,  3.21it/s]

out["logits"]=tensor([[0.0890, 0.0051],
        [0.0890, 0.0051]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 72%|███████▏  | 198/276 [01:02<00:24,  3.20it/s]

out["logits"]=tensor([[0.0897, 0.0044],
        [0.0897, 0.0044]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 72%|███████▏  | 199/276 [01:02<00:23,  3.31it/s]

out["logits"]=tensor([[0.0902, 0.0038],
        [0.0902, 0.0038]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 72%|███████▏  | 200/276 [01:03<00:22,  3.33it/s]

out["logits"]=tensor([[0.0900, 0.0040],
        [0.0900, 0.0040]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])
out["logits"]=tensor([[0.0892, 0.0049],
        [0.0892, 0.0049]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 73%|███████▎  | 201/276 [01:03<00:22,  3.28it/s]

out["logits"]=tensor([[0.0883, 0.0057],
        [0.0883, 0.0057]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 73%|███████▎  | 202/276 [01:03<00:22,  3.22it/s]

out["logits"]=tensor([[0.0876, 0.0065],
        [0.0876, 0.0065]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 74%|███████▎  | 203/276 [01:04<00:23,  3.14it/s]

out["logits"]=tensor([[0.0875, 0.0065],
        [0.0875, 0.0065]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 74%|███████▍  | 205/276 [01:04<00:22,  3.20it/s]

out["logits"]=tensor([[0.0875, 0.0066],
        [0.0875, 0.0066]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])
out["logits"]=tensor([[0.0867, 0.0074],
        [0.0867, 0.0074]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 75%|███████▍  | 206/276 [01:05<00:22,  3.12it/s]

out["logits"]=tensor([[0.0860, 0.0081],
        [0.0860, 0.0081]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 75%|███████▌  | 208/276 [01:05<00:21,  3.15it/s]

out["logits"]=tensor([[0.0847, 0.0094],
        [0.0847, 0.0094]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])
out["logits"]=tensor([[0.0827, 0.0113],
        [0.0827, 0.0113]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 76%|███████▌  | 209/276 [01:05<00:21,  3.16it/s]

out["logits"]=tensor([[0.0810, 0.0131],
        [0.0810, 0.0131]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 76%|███████▌  | 210/276 [01:06<00:20,  3.16it/s]

out["logits"]=tensor([[0.0794, 0.0147],
        [0.0794, 0.0147]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 76%|███████▋  | 211/276 [01:06<00:21,  3.08it/s]

out["logits"]=tensor([[0.0786, 0.0155],
        [0.0786, 0.0155]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 77%|███████▋  | 212/276 [01:06<00:20,  3.11it/s]

out["logits"]=tensor([[0.0779, 0.0162],
        [0.0779, 0.0162]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 77%|███████▋  | 213/276 [01:07<00:20,  3.12it/s]

out["logits"]=tensor([[0.0765, 0.0176],
        [0.0765, 0.0176]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 78%|███████▊  | 214/276 [01:07<00:19,  3.11it/s]

out["logits"]=tensor([[0.0746, 0.0195],
        [0.0746, 0.0195]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 78%|███████▊  | 215/276 [01:07<00:19,  3.12it/s]

out["logits"]=tensor([[0.0735, 0.0205],
        [0.0735, 0.0205]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 78%|███████▊  | 216/276 [01:08<00:19,  3.06it/s]

out["logits"]=tensor([[0.0726, 0.0215],
        [0.0726, 0.0215]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 79%|███████▉  | 218/276 [01:08<00:17,  3.23it/s]

out["logits"]=tensor([[0.0723, 0.0217],
        [0.0723, 0.0217]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])
out["logits"]=tensor([[0.0721, 0.0219],
        [0.0721, 0.0219]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 79%|███████▉  | 219/276 [01:09<00:17,  3.22it/s]

out["logits"]=tensor([[0.0719, 0.0221],
        [0.0719, 0.0221]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 80%|████████  | 221/276 [01:09<00:16,  3.33it/s]

out["logits"]=tensor([[0.0710, 0.0230],
        [0.0710, 0.0230]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])
out["logits"]=tensor([[0.0702, 0.0238],
        [0.0702, 0.0238]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 80%|████████  | 222/276 [01:10<00:16,  3.28it/s]

out["logits"]=tensor([[0.0702, 0.0239],
        [0.0702, 0.0239]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 81%|████████  | 223/276 [01:10<00:16,  3.19it/s]

out["logits"]=tensor([[0.0701, 0.0240],
        [0.0701, 0.0240]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 81%|████████  | 224/276 [01:10<00:16,  3.10it/s]

out["logits"]=tensor([[0.0707, 0.0234],
        [0.0707, 0.0234]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 82%|████████▏ | 226/276 [01:11<00:15,  3.32it/s]

out["logits"]=tensor([[0.0712, 0.0229],
        [0.0712, 0.0229]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 82%|████████▏ | 227/276 [01:11<00:14,  3.49it/s]

out["logits"]=tensor([[0.0723, 0.0217],
        [0.0723, 0.0217]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])
out["logits"]=tensor([[0.0740, 0.0201],
        [0.0740, 0.0201]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 83%|████████▎ | 228/276 [01:11<00:14,  3.38it/s]

out["logits"]=tensor([[0.0755, 0.0186],
        [0.0755, 0.0186]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 83%|████████▎ | 230/276 [01:12<00:13,  3.32it/s]

out["logits"]=tensor([[0.0769, 0.0172],
        [0.0769, 0.0172]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])
out["logits"]=tensor([[0.0773, 0.0167],
        [0.0773, 0.0167]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 84%|████████▎ | 231/276 [01:12<00:14,  3.19it/s]

out["logits"]=tensor([[0.0778, 0.0163],
        [0.0778, 0.0163]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 84%|████████▍ | 232/276 [01:13<00:13,  3.16it/s]

out["logits"]=tensor([[0.0788, 0.0153],
        [0.0788, 0.0153]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 84%|████████▍ | 233/276 [01:13<00:13,  3.16it/s]

out["logits"]=tensor([[0.0797, 0.0143],
        [0.0797, 0.0143]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 85%|████████▍ | 234/276 [01:13<00:13,  3.15it/s]

out["logits"]=tensor([[0.0805, 0.0135],
        [0.0805, 0.0135]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 86%|████████▌ | 236/276 [01:14<00:12,  3.20it/s]

out["logits"]=tensor([[0.0806, 0.0135],
        [0.0806, 0.0135]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])
out["logits"]=tensor([[0.0812, 0.0128],
        [0.0812, 0.0128]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 86%|████████▌ | 237/276 [01:14<00:12,  3.18it/s]

out["logits"]=tensor([[0.0825, 0.0116],
        [0.0825, 0.0116]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 86%|████████▌ | 238/276 [01:15<00:12,  3.10it/s]

out["logits"]=tensor([[0.0843, 0.0098],
        [0.0843, 0.0098]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 87%|████████▋ | 239/276 [01:15<00:12,  3.04it/s]

out["logits"]=tensor([[0.0859, 0.0082],
        [0.0859, 0.0082]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 87%|████████▋ | 241/276 [01:16<00:11,  3.12it/s]

out["logits"]=tensor([[0.0873, 0.0068],
        [0.0873, 0.0068]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])
out["logits"]=tensor([[0.0885, 0.0055],
        [0.0885, 0.0055]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 88%|████████▊ | 242/276 [01:16<00:11,  3.06it/s]

out["logits"]=tensor([[0.0896, 0.0044],
        [0.0896, 0.0044]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 88%|████████▊ | 243/276 [01:16<00:10,  3.01it/s]

out["logits"]=tensor([[0.0906, 0.0035],
        [0.0906, 0.0035]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 88%|████████▊ | 244/276 [01:17<00:10,  3.05it/s]

out["logits"]=tensor([[0.0914, 0.0027],
        [0.0914, 0.0027]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 89%|████████▉ | 245/276 [01:17<00:10,  3.01it/s]

out["logits"]=tensor([[0.0921, 0.0019],
        [0.0921, 0.0019]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 89%|████████▉ | 246/276 [01:17<00:09,  3.06it/s]

out["logits"]=tensor([[0.0934, 0.0006],
        [0.0934, 0.0006]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 89%|████████▉ | 247/276 [01:18<00:09,  3.07it/s]

out["logits"]=tensor([[ 0.0953, -0.0012],
        [ 0.0953, -0.0012]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 90%|█████████ | 249/276 [01:18<00:08,  3.23it/s]

out["logits"]=tensor([[ 0.0962, -0.0021],
        [ 0.0962, -0.0021]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 1])


 91%|█████████ | 250/276 [01:18<00:07,  3.28it/s]

out["logits"]=tensor([[ 0.0963, -0.0022],
        [ 0.0963, -0.0022]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])
out["logits"]=tensor([[ 0.0964, -0.0023],
        [ 0.0964, -0.0023]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 91%|█████████ | 251/276 [01:19<00:07,  3.17it/s]

out["logits"]=tensor([[ 0.0971, -0.0030],
        [ 0.0971, -0.0030]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 91%|█████████▏| 252/276 [01:19<00:07,  3.15it/s]

out["logits"]=tensor([[ 0.0977, -0.0036],
        [ 0.0977, -0.0036]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 92%|█████████▏| 253/276 [01:19<00:07,  3.14it/s]

out["logits"]=tensor([[ 0.0982, -0.0041],
        [ 0.0982, -0.0041]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 92%|█████████▏| 255/276 [01:20<00:06,  3.23it/s]

out["logits"]=tensor([[ 0.0986, -0.0046],
        [ 0.0986, -0.0046]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])
out["logits"]=tensor([[ 0.0990, -0.0049],
        [ 0.0990, -0.0049]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 93%|█████████▎| 256/276 [01:20<00:06,  3.15it/s]

out["logits"]=tensor([[ 0.0993, -0.0052],
        [ 0.0993, -0.0052]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 93%|█████████▎| 257/276 [01:21<00:06,  3.15it/s]

out["logits"]=tensor([[ 0.0995, -0.0054],
        [ 0.0995, -0.0054]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


 94%|█████████▍| 259/276 [01:21<00:05,  3.24it/s]

out["logits"]=tensor([[ 0.0996, -0.0056],
        [ 0.0996, -0.0056]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 94%|█████████▍| 260/276 [01:22<00:04,  3.28it/s]

out["logits"]=tensor([[ 0.1004, -0.0064],
        [ 0.1004, -0.0064]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])
out["logits"]=tensor([[ 0.1018, -0.0077],
        [ 0.1018, -0.0077]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 95%|█████████▍| 261/276 [01:22<00:04,  3.23it/s]

out["logits"]=tensor([[ 0.1037, -0.0096],
        [ 0.1037, -0.0096]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 95%|█████████▌| 263/276 [01:23<00:03,  3.26it/s]

out["logits"]=tensor([[ 0.1054, -0.0113],
        [ 0.1054, -0.0113]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])
out["logits"]=tensor([[ 0.1075, -0.0135],
        [ 0.1075, -0.0135]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 96%|█████████▌| 264/276 [01:23<00:03,  3.17it/s]

out["logits"]=tensor([[ 0.1094, -0.0154],
        [ 0.1094, -0.0154]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 96%|█████████▌| 265/276 [01:23<00:03,  3.10it/s]

out["logits"]=tensor([[ 0.1118, -0.0177],
        [ 0.1118, -0.0177]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 96%|█████████▋| 266/276 [01:24<00:03,  3.09it/s]

out["logits"]=tensor([[ 0.1146, -0.0205],
        [ 0.1146, -0.0205]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 97%|█████████▋| 267/276 [01:24<00:02,  3.11it/s]

out["logits"]=tensor([[ 0.1170, -0.0229],
        [ 0.1170, -0.0229]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 97%|█████████▋| 268/276 [01:24<00:02,  3.12it/s]

out["logits"]=tensor([[ 0.1192, -0.0251],
        [ 0.1192, -0.0251]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 97%|█████████▋| 269/276 [01:25<00:02,  3.06it/s]

out["logits"]=tensor([[ 0.1217, -0.0277],
        [ 0.1217, -0.0277]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 98%|█████████▊| 270/276 [01:25<00:01,  3.08it/s]

out["logits"]=tensor([[ 0.1247, -0.0306],
        [ 0.1247, -0.0306]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 0])


 98%|█████████▊| 271/276 [01:25<00:01,  3.09it/s]

out["logits"]=tensor([[ 0.1280, -0.0339],
        [ 0.1280, -0.0339]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 99%|█████████▉| 273/276 [01:26<00:00,  3.19it/s]

out["logits"]=tensor([[ 0.1309, -0.0369],
        [ 0.1309, -0.0369]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])
out["logits"]=tensor([[ 0.1335, -0.0394],
        [ 0.1335, -0.0394]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


 99%|█████████▉| 274/276 [01:26<00:00,  3.18it/s]

out["logits"]=tensor([[ 0.1358, -0.0417],
        [ 0.1358, -0.0417]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([1, 0])


100%|█████████▉| 275/276 [01:26<00:00,  3.16it/s]

out["logits"]=tensor([[ 0.1378, -0.0437],
        [ 0.1378, -0.0437]], device='cuda:0', grad_fn=<AddmmBackward0>)
batch["labels"]=tensor([0, 1])


100%|██████████| 276/276 [01:27<00:00,  3.16it/s]
100%|██████████| 35/35 [00:03<00:00,  9.80it/s]

Epoch 1: Loss (train/val): 0.7100810260930355/0.6921319995607649

Total training time elapsed: 90.8





In [None]:
# HF model
sarcasm_hubert_classifier_config = HubertClassifierConfig(num_classes=2)
sarcasm_hubert_classifier_model = HubertClassifierModel(sarcasm_hubert_classifier_config)

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

In [None]:
# HF trainer ref: https://huggingface.co/docs/transformers/main/en/trainer
# Trainer/TrainingArguments docs: https://huggingface.co/docs/transformers/main/en/main_classes/trainer
sarcasm_hubert_classifier_training_args = TrainingArguments(
    output_dir='sarcasm_hubert_classifier_model',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=5e-5,
    weight_decay=1e-2,
    num_train_epochs=10,
    # logging_steps=100,
    lr_scheduler_type='linear',
    evaluation_strategy='epoch', # eval_strategy?
    save_strategy='epoch',
    logging_strategy='epoch',
    # load_best_model_at_end=True,
)

sarcasm_hubert_classifier_trainer = Trainer(
    model=sarcasm_hubert_classifier_model,
    args=sarcasm_hubert_classifier_training_args,
    train_dataset=sarcasm_train_set,
    eval_dataset=sarcasm_val_set,
    # optimizers=( # need to specify optimizers explicity?
    #     torch.optim.AdamW,
    #     torch.optim.lr_scheduler.LinearLR
    # ),
)

In [None]:
sarcasm_hubert_classifier_trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Epoch,Training Loss,Validation Loss
1,0.7018,0.693337
2,0.7006,0.690571
3,0.7,0.702648
4,0.6936,0.698065
5,0.6965,0.696815
6,0.6927,0.695547
7,0.6957,0.695526
8,0.6936,0.695361
9,0.6949,0.695256
10,0.6911,0.695565


TrainOutput(global_step=2760, training_loss=0.6960503453793733, metrics={'train_runtime': 1931.3202, 'train_samples_per_second': 2.858, 'train_steps_per_second': 1.429, 'total_flos': 0.0, 'train_loss': 0.6960503453793733, 'epoch': 10.0})

In [None]:
mount_gdrive()
%cd /content/sarcasm_hubert_classifier_model
!zip -r checkpoint-1104.zip checkpoint-1104
!mv checkpoint-1104.zip /content/gdrive/MyDrive/2023-24/24spring/6.8620/project/models/sarcasm_hubert
%cd /content

/content/sarcasm_hubert_classifier_model
updating: checkpoint-1104/ (stored 0%)
updating: checkpoint-1104/optimizer.pt (deflated 7%)
updating: checkpoint-1104/training_args.bin (deflated 51%)
updating: checkpoint-1104/model.safetensors (deflated 7%)
updating: checkpoint-1104/config.json (deflated 24%)
updating: checkpoint-1104/trainer_state.json (deflated 71%)
updating: checkpoint-1104/rng_state.pth (deflated 25%)
updating: checkpoint-1104/scheduler.pt (deflated 55%)
/content


In [None]:
!rm -rf sarcasm_hubert_classifier_model

## Emotion Hubert

In [60]:
# base model testing
emotion_hubert = HubertClassifier(num_classes=8)

train_losses, val_losses = train(emotion_hubert, emotion_train_loader, emotion_val_loader, num_epochs=4)

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

input_values=tensor([[ 3.9280e-04,  1.5681e-03,  3.6434e-04,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-5.0203e-06, -5.0203e-06, -5.0203e-06,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]], device='cuda:0')
labels=tensor([3, 0], device='cuda:0')
input_values=tensor([[ 1.2375e-04,  1.2375e-04,  1.2375e-04,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-1.3493e-05, -1.3493e-05, -1.3493e-05,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]], device='cuda:0')
labels=tensor([2, 6], device='cuda:0')


  1%|          | 3/288 [00:00<00:35,  7.95it/s]

input_values=tensor([[-1.7147e-03, -4.5738e-03, -3.9026e-03,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 8.2095e-04, -7.1211e-04, -9.2799e-05,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]], device='cuda:0')
labels=tensor([4, 1], device='cuda:0')
input_values=tensor([[ 2.5171e-04,  1.3527e-04, -5.7747e-05,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 1.3138e-06,  1.7473e-06,  1.4404e-06,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]], device='cuda:0')
labels=tensor([2, 4], device='cuda:0')
input_values=tensor([[ 1.0400e-03, -2.4044e-03, -1.4948e-02,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 7.3416e-06,  7.3416e-06,  7.3416e-06,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]], device='cuda:0')
labels=tensor([4, 4], device='cuda:0')


  2%|▏         | 7/288 [00:00<00:27, 10.07it/s]

input_values=tensor([[-0.0027, -0.0035, -0.0032,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0088,  0.0151,  0.0031,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')
labels=tensor([2, 2], device='cuda:0')
input_values=tensor([[-4.8050e-05,  6.0976e-05,  5.0038e-04,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 3.8618e-04,  3.8618e-04,  3.8618e-04,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]], device='cuda:0')
labels=tensor([5, 3], device='cuda:0')
input_values=tensor([[-5.7700e-07,  1.4468e-06, -7.2966e-07,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 3.4687e-05,  3.3084e-05,  4.0380e-05,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]], device='cuda:0')
labels=tensor([4, 2], device='cuda:0')


  3%|▎         | 9/288 [00:00<00:26, 10.69it/s]

input_values=tensor([[0.0022, 0.0026, 0.0031,  ..., 0.0000, 0.0000, 0.0000],
        [0.0014, 0.0014, 0.0014,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0')
labels=tensor([0, 6], device='cuda:0')
input_values=tensor([[ 2.7880e-06,  2.7880e-06,  2.7880e-06,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-3.1954e-05,  1.2429e-05, -4.3140e-05,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]], device='cuda:0')
labels=tensor([7, 6], device='cuda:0')
input_values=tensor([[ 0.0001, -0.0002, -0.0003,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0110,  0.0151,  0.0131,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')
labels=tensor([2, 1], device='cuda:0')


  5%|▍         | 13/288 [00:01<00:24, 11.25it/s]

input_values=tensor([[ 3.4303e-05,  3.7341e-06,  3.2002e-05,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-4.0431e-04, -4.1406e-04, -4.0364e-04,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]], device='cuda:0')
labels=tensor([6, 7], device='cuda:0')
input_values=tensor([[-2.9951e-05, -2.9951e-05, -2.9951e-05,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 8.8792e-05,  8.8618e-05,  8.8785e-05,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]], device='cuda:0')
labels=tensor([3, 0], device='cuda:0')
input_values=tensor([[0.0002, 0.0002, 0.0002,  ..., 0.0000, 0.0000, 0.0000],
        [0.0002, 0.0005, 0.0002,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0')
labels=tensor([0, 6], device='cuda:0')


  5%|▌         | 15/288 [00:01<00:24, 11.00it/s]

input_values=tensor([[ 0.0017,  0.0016,  0.0017,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0092, -0.0143, -0.0115,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')
labels=tensor([3, 2], device='cuda:0')
input_values=tensor([[ 0.0137,  0.0044, -0.0005,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0002,  0.0002,  0.0002,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')
labels=tensor([1, 7], device='cuda:0')
input_values=tensor([[-4.8162e-02, -7.4551e-02, -6.8407e-02,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-8.5724e-07, -8.5724e-07, -8.5724e-07,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]], device='cuda:0')
labels=tensor([1, 4], device='cuda:0')


  7%|▋         | 19/288 [00:01<00:24, 11.18it/s]

input_values=tensor([[ 0.0004,  0.0004,  0.0004,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0002, -0.0002, -0.0002,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')
labels=tensor([1, 7], device='cuda:0')
input_values=tensor([[-1.4482e-05, -1.4482e-05, -1.4482e-05,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 2.5338e-03,  3.6020e-03,  3.7799e-03,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]], device='cuda:0')
labels=tensor([7, 3], device='cuda:0')
input_values=tensor([[-1.1543e-05, -1.1072e-05, -1.2878e-05,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-1.7315e-02, -1.7450e-02, -3.8526e-03,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]], device='cuda:0')
labels=tensor([4, 1], device='cuda:0')


  7%|▋         | 21/288 [00:01<00:23, 11.32it/s]

input_values=tensor([[-0.0049, -0.0063, -0.0042,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0005, -0.0053, -0.0019,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')
labels=tensor([4, 4], device='cuda:0')
input_values=tensor([[-8.9162e-06, -8.9162e-06, -8.9162e-06,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 5.2640e-04,  5.2640e-04,  5.2640e-04,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]], device='cuda:0')
labels=tensor([2, 7], device='cuda:0')
input_values=tensor([[-0.0006, -0.0020, -0.0033,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0009, -0.0076, -0.0162,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')
labels=tensor([2, 3], device='cuda:0')


  8%|▊         | 23/288 [00:02<00:25, 10.57it/s]


KeyboardInterrupt: 

In [None]:
# HF model
emotion_hubert_classifier_config = HubertClassifierConfig(num_classes=8)
emotion_hubert_classifier_model = HubertClassifierModel(emotion_hubert_classifier_config)

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

In [None]:
# HF trainer ref: https://huggingface.co/docs/transformers/main/en/trainer
# Trainer/TrainingArguments docs: https://huggingface.co/docs/transformers/main/en/main_classes/trainer
emotion_hubert_classifier_training_args = TrainingArguments(
    output_dir='emotion_hubert_classifier_model',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=5e-5,
    weight_decay=1e-2,
    num_train_epochs=20,
    # logging_steps=100,
    lr_scheduler_type='linear',
    evaluation_strategy='epoch', # eval_strategy?
    save_strategy='epoch',
    logging_strategy='epoch',
    # load_best_model_at_end=True,
)

emotion_hubert_classifier_trainer = Trainer(
    model=emotion_hubert_classifier_model,
    args=emotion_hubert_classifier_training_args,
    train_dataset=emotion_train_set,
    eval_dataset=emotion_val_set,
    # compute_metrics=compute_metrics,
    # optimizers=( # need to specify optimizers explicity?
    #     torch.optim.AdamW,
    #     torch.optim.lr_scheduler.LinearLR
    # ),
)

In [None]:
emotion_hubert_classifier_trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Epoch,Training Loss,Validation Loss
1,2.025,2.131641
2,1.9214,1.91747
3,1.8842,1.837542
4,1.7879,1.652742
5,1.69,1.539122
6,1.4496,1.5503
7,1.2761,1.269641
8,1.0882,1.290077
9,0.8615,1.347866
10,0.7404,1.145923


TrainOutput(global_step=5760, training_loss=0.8790163649453058, metrics={'train_runtime': 1128.3563, 'train_samples_per_second': 10.21, 'train_steps_per_second': 5.105, 'total_flos': 0.0, 'train_loss': 0.8790163649453058, 'epoch': 20.0})

In [None]:
mount_gdrive()
%cd /content/emotion_hubert_classifier_model
!zip -r checkpoint-3456.zip checkpoint-3456
!du -h checkpoint-3456.zip
!mv checkpoint-3456.zip /content/gdrive/MyDrive/2023-24/24spring/6.8620/project/models/emotion_hubert
%cd /content

Mounted at /content/gdrive
/content/emotion_hubert_classifier_model
  adding: checkpoint-3456/ (stored 0%)
  adding: checkpoint-3456/optimizer.pt (deflated 7%)
  adding: checkpoint-3456/training_args.bin (deflated 51%)
  adding: checkpoint-3456/model.safetensors (deflated 7%)
  adding: checkpoint-3456/config.json (deflated 24%)
  adding: checkpoint-3456/trainer_state.json (deflated 78%)
  adding: checkpoint-3456/rng_state.pth (deflated 25%)
  adding: checkpoint-3456/scheduler.pt (deflated 56%)
1004M	checkpoint-3456.zip
/content


## General: Transplant

In [50]:
# custom huggingface config/model definition
# https://huggingface.co/docs/transformers/en/custom_models
class TransplantHubertClassifierConfig(PretrainedConfig):
    def __init__(
            self,
            hubert_model_name = hubert_model_name,
            num_classes = 2,
            transplant_hubert_checkpoint = None,
            **kwargs,
    ):
        self.hubert_model_name = hubert_model_name
        self.num_classes = num_classes
        self.transplant_hubert_checkpoint = transplant_hubert_checkpoint
        super().__init__(**kwargs)

class TransplantHubertClassifierModel(PreTrainedModel):
    config_class = HubertClassifierConfig

    def __init__(self, config):
        super().__init__(config)
        transplant_hubert = HubertClassifierModel.from_pretrained(config.transplant_hubert_checkpoint).model.hubert
        # freeze hubert params when transplanted
        for p in transplant_hubert.parameters():
            p.requires_grad = False

        transplant_classifier = HubertClassifierModel.from_pretrained(config.transplant_hubert_checkpoint).model.classifier
        self.model = HubertClassifier(
            hubert_model_name=config.hubert_model_name,
            num_classes=config.num_classes,
            transplant_hubert=transplant_hubert,
            # transplant_classifier=transplant_classifier # don't transplant classifier, except for testing
        )

    def forward(self, input_values=None, attention_mask=None, labels=None):
        return self.model.forward(input_values=input_values, attention_mask=attention_mask, labels=labels)

## Emotion-Sarcasm Hubert

In [None]:
# load emotion checkpoint
!gdown 1P4hapAd2-RYsbzKz0waiKpoD8bLHtdbX
!unzip checkpoint-3456.zip

Downloading...
From (original): https://drive.google.com/uc?id=1P4hapAd2-RYsbzKz0waiKpoD8bLHtdbX
From (redirected): https://drive.google.com/uc?id=1P4hapAd2-RYsbzKz0waiKpoD8bLHtdbX&confirm=t&uuid=c1f8c356-275b-4d5c-bc36-f1fbfac6dff7
To: /content/checkpoint-3456.zip
100% 1.05G/1.05G [00:25<00:00, 40.8MB/s]
Archive:  checkpoint-3456.zip
   creating: checkpoint-3456/
  inflating: checkpoint-3456/optimizer.pt  
  inflating: checkpoint-3456/training_args.bin  
  inflating: checkpoint-3456/model.safetensors  
  inflating: checkpoint-3456/config.json  
  inflating: checkpoint-3456/trainer_state.json  
  inflating: checkpoint-3456/rng_state.pth  
  inflating: checkpoint-3456/scheduler.pt  


In [None]:
# base model testing
emotion_sarcasm_hubert_config = TransplantHubertClassifierConfig(
    num_classes=2,
    transplant_hubert_checkpoint='checkpoint-3456'
)
emotion_sarcasm_hubert = TransplantHubertClassifierModel(emotion_sarcasm_hubert_config)

# e = HubertClassifierModel.from_pretrained('checkpoint-3456')

# train_losses, val_losses = train(e, sarcasm_train_loader, sarcasm_val_loader, num_epochs=4)

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

transplanted hubert!


In [None]:
count_params(emotion_sarcasm_hubert)

24674

In [None]:
new_emotion_hubert_classifier_training_args = TrainingArguments(
    output_dir='emotion_sarcasm_hubert',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=5e-5,
    weight_decay=1e-2,
    num_train_epochs=20,
    # logging_steps=100,
    lr_scheduler_type='linear',
    evaluation_strategy='epoch', # eval_strategy?
    save_strategy='epoch',
    logging_strategy='epoch',
    # load_best_model_at_end=True,
)

new_emotion_hubert_classifier_trainer = Trainer(
    model=emotion_sarcasm_hubert,
    args=new_emotion_hubert_classifier_training_args,
    train_dataset=sarcasm_train_set,
    eval_dataset=sarcasm_val_set,
    # compute_metrics=compute_metrics,
    # optimizers=( # need to specify optimizers explicity?
    #     torch.optim.AdamW,
    #     torch.optim.lr_scheduler.LinearLR
    # ),
)

In [None]:
new_emotion_hubert_classifier_trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Epoch,Training Loss,Validation Loss
1,0.6936,0.681437
2,0.6781,0.666476
3,0.6779,0.666878
4,0.6771,0.669884
5,0.6682,0.673033
6,0.672,0.681748
7,0.6659,0.679741
8,0.698,0.668653
9,0.6726,0.67882
10,0.6746,0.678398


TrainOutput(global_step=5520, training_loss=0.6759941045788751, metrics={'train_runtime': 630.4362, 'train_samples_per_second': 17.512, 'train_steps_per_second': 8.756, 'total_flos': 0.0, 'train_loss': 0.6759941045788751, 'epoch': 20.0})

In [None]:
mount_gdrive()
%cd /content/emotion_sarcasm_hubert
!zip -r checkpoint-5520.zip checkpoint-5520
!du -h checkpoint-5520.zip
!mv checkpoint-5520.zip /content/gdrive/MyDrive/2023-24/24spring/6.8620/project/models/emotion_sarcasm_hubert
%cd /content

Mounted at /content/gdrive
/content/emotion_sarcasm_hubert
  adding: checkpoint-5520/ (stored 0%)
  adding: checkpoint-5520/optimizer.pt (deflated 8%)
  adding: checkpoint-5520/training_args.bin (deflated 51%)
  adding: checkpoint-5520/model.safetensors (deflated 7%)
  adding: checkpoint-5520/config.json (deflated 31%)
  adding: checkpoint-5520/trainer_state.json (deflated 81%)
  adding: checkpoint-5520/rng_state.pth (deflated 25%)
  adding: checkpoint-5520/scheduler.pt (deflated 56%)
335M	checkpoint-5520.zip
/content


## Sarcasm-Emotion Hubert

In [51]:
# load sarcasm checkpoint
!gdown 1Em_Z2WRQY6yu7iWNSYu9xLo8NDB7FBh_
!unzip checkpoint-1104.zip

Downloading...
From (original): https://drive.google.com/uc?id=1Em_Z2WRQY6yu7iWNSYu9xLo8NDB7FBh_
From (redirected): https://drive.google.com/uc?id=1Em_Z2WRQY6yu7iWNSYu9xLo8NDB7FBh_&confirm=t&uuid=04b6cbbc-b6b0-42d7-bb9f-5111ae04788d
To: /content/checkpoint-1104.zip
100% 1.05G/1.05G [00:14<00:00, 75.1MB/s]
Archive:  checkpoint-1104.zip
   creating: checkpoint-1104/
  inflating: checkpoint-1104/optimizer.pt  
  inflating: checkpoint-1104/training_args.bin  
  inflating: checkpoint-1104/model.safetensors  
  inflating: checkpoint-1104/config.json  
  inflating: checkpoint-1104/trainer_state.json  
  inflating: checkpoint-1104/rng_state.pth  
  inflating: checkpoint-1104/scheduler.pt  


In [52]:
# base model testing
sarcasm_emotion_hubert_config = TransplantHubertClassifierConfig(
    num_classes=8,
    transplant_hubert_checkpoint='checkpoint-1104'
)
sarcasm_emotion_hubert = TransplantHubertClassifierModel(sarcasm_emotion_hubert_config)

# e = HubertClassifierModel.from_pretrained('checkpoint-3456')

# train_losses, val_losses = train(e, sarcasm_train_loader, sarcasm_val_loader, num_epochs=4)

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

transplanted hubert!


In [53]:
sarcasm_emotion_hubert_classifier_training_args = TrainingArguments(
    output_dir='sarcasm_emotion_hubert',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=5e-5,
    weight_decay=1e-2,
    num_train_epochs=20,
    # logging_steps=100,
    lr_scheduler_type='linear',
    evaluation_strategy='epoch', # eval_strategy?
    save_strategy='epoch',
    logging_strategy='epoch',
    # load_best_model_at_end=True,
)

sarcasm_emotion_hubert_classifier_trainer = Trainer(
    model=sarcasm_emotion_hubert,
    args=sarcasm_emotion_hubert_classifier_training_args,
    train_dataset=emotion_train_set,
    eval_dataset=emotion_val_set,
    # compute_metrics=compute_metrics,
    # optimizers=( # need to specify optimizers explicity?
    #     torch.optim.AdamW,
    #     torch.optim.lr_scheduler.LinearLR
    # ),
)

In [56]:
sarcasm_emotion_hubert_classifier_trainer.train()

Epoch,Training Loss,Validation Loss
1,2.0684,2.075519
2,2.0676,2.073698
3,2.0661,2.078223
4,2.0664,2.076091
5,2.0616,2.077411
6,2.0651,2.077406
7,2.0662,2.076551
8,2.0655,2.077134
9,2.0681,2.076619
10,2.0651,2.075911


TrainOutput(global_step=5760, training_loss=2.0654622077941895, metrics={'train_runtime': 298.344, 'train_samples_per_second': 38.613, 'train_steps_per_second': 19.307, 'total_flos': 0.0, 'train_loss': 2.0654622077941895, 'epoch': 20.0})

In [58]:
mount_gdrive()
%cd /content/sarcasm_emotion_hubert
!zip -r checkpoint-5760.zip checkpoint-5760
!du -h checkpoint-5760.zip
!mv checkpoint-5760.zip /content/gdrive/MyDrive/2023-24/24spring/6.8620/project/models/sarcasm_emotion_hubert
%cd /content

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/sarcasm_emotion_hubert
  adding: checkpoint-5760/ (stored 0%)
  adding: checkpoint-5760/optimizer.pt (deflated 8%)
  adding: checkpoint-5760/training_args.bin (deflated 51%)
  adding: checkpoint-5760/model.safetensors (deflated 7%)
  adding: checkpoint-5760/config.json (deflated 31%)
  adding: checkpoint-5760/trainer_state.json (deflated 81%)
  adding: checkpoint-5760/rng_state.pth (deflated 25%)
  adding: checkpoint-5760/scheduler.pt (deflated 56%)
335M	checkpoint-5760.zip
/content


## Tutorials

### Tutorial: HubertForCTC example

https://huggingface.co/docs/transformers/en/model_doc/hubert#transformers.HubertForCTC.forward.example

In [None]:
hubert_model_name = 'facebook/hubert-base-ls960'
processor = AutoProcessor.from_pretrained('facebook/hubert-large-ls960-ft') # is using a processor different from the model ok?
model = HubertModel.from_pretrained(hubert_model_name) # 94M params

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

In [None]:
dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [None]:
sampling_rate = dataset.features['audio'].sampling_rate
inputs = processor(dataset[0]['audio']['array'], sampling_rate=sampling_rate, return_tensors='pt')
with torch.no_grad():
    output = model(**inputs)
    pooler_output = torch.mean(output.last_hidden_state, dim=1) # no pooler_output provided by hubert, calculate by taking average over sequence dimension
inputs

{'input_values': tensor([[0.0386, 0.0337, 0.0322,  ..., 0.0070, 0.0095, 0.0169]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32)}

In [None]:
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='facebook/hubert-large-ls960-ft', vocab_size=32, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	1: AddedToken("<s>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	2: AddedToken("</s>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	3: AddedToken("<unk>", rstrip=True, lstrip=Tr

### Tutorial: HubertModel example

https://huggingface.co/docs/transformers/en/model_doc/hubert#transformers.HubertModel.forward.example

What is `ds['speech']`?

In [None]:
processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
model = HubertModel.from_pretrained("facebook/hubert-base-ls960")

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

In [None]:
def map_to_array(batch):
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

In [None]:
audio = ds[0]['audio']['array']
speech, sr = sf.read(ds[0]['file'])
all(audio == speech)

True

### Tutorial: audio classification

https://huggingface.co/docs/transformers/en/tasks/audio_classification

In [None]:
minds_dataset = load_dataset('PolyAI/minds14', name='en-US', split='train')
minds = minds_dataset.train_test_split(test_size=0.2)
print(minds['train'])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Dataset({
    features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
    num_rows: 450
})


In [None]:
minds = minds.remove_columns(["path", "transcription", "english_transcription", "lang_id"])
print(minds['train'])
minds['train'][0]

Dataset({
    features: ['audio', 'intent_class'],
    num_rows: 450
})


{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/28aa727f91fee90575c34956bab09d1716cfaf460c6afcba86a10f04a7d58b83/en-US~BALANCE/602b9c72bb1e6d0fbce91f87.wav',
  'array': array([ 0.00024414, -0.00024414,  0.        , ...,  0.01037598,
          0.0098877 ,  0.0098877 ]),
  'sampling_rate': 8000},
 'intent_class': 4}

In [None]:
labels = minds['train'].features['intent_class'].names
print(labels)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

id2label[str(2)]

['abroad', 'address', 'app_error', 'atm_limit', 'balance', 'business_loan', 'card_issues', 'cash_deposit', 'direct_debit', 'freeze', 'high_value_payment', 'joint_account', 'latest_transactions', 'pay_bill']


'app_error'

In [None]:
feature_extractor = AutoFeatureExtractor.from_pretrained('facebook/wav2vec2-base')



In [None]:
minds = minds.cast_column('audio', Audio(sampling_rate=16_000))
minds['train'][0]

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/28aa727f91fee90575c34956bab09d1716cfaf460c6afcba86a10f04a7d58b83/en-US~BALANCE/602b9c72bb1e6d0fbce91f87.wav',
  'array': array([ 2.35084444e-04,  3.73781659e-05, -2.34791543e-04, ...,
          1.11566903e-02,  9.70520079e-03,  4.98228893e-03]),
  'sampling_rate': 16000},
 'intent_class': 4}

In [None]:
def preprocess_function(examples):
    audio_arrays = [example['array'] for example in examples['audio']]
    inputs = feature_extractor(
        audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True
    )
    return inputs

encoded_minds = minds.map(preprocess_function, remove_columns='audio', batched=True)
encoded_minds = encoded_minds.rename_column('intent_class', 'label')
encoded_minds

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values'],
        num_rows: 450
    })
    test: Dataset({
        features: ['label', 'input_values'],
        num_rows: 113
    })
})

In [None]:
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return accuracy.compute(prediction=predictions, references=eval_pred.label_ids)

In [None]:
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    'facebook/wav2vec2-base', num_labels=num_labels, label2id=label2id, id2label=id2label
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir="my_awesome_mind_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss


TypeError: object of type 'NoneType' has no len()

In [None]:
encoded_minds['test']

Dataset({
    features: ['label', 'input_values'],
    num_rows: 113
})

# Evaluation

In [None]:
def eval(hubert_model, test_loader):
    preds_tensor = torch.tensor([]).to(DEVICE)
    labels_tensor = torch.tensor([]).to(DEVICE)

    hubert_model.eval()
    for ix, batch in tqdm.tqdm(enumerate(test_loader)):
        out = hubert_model(
            input_values=batch['input_values'].to(DEVICE),
            attention_mask=batch['attention_mask'].to(DEVICE),
            labels=batch['labels'].to(DEVICE)
        )
        preds = torch.argmax(torch.softmax(out['logits'], dim=1), dim=1)
        print(f'{preds=}')
        print(f'{preds.shape=}')
        print(f'{batch["labels"]=}')
        preds_tensor = torch.concat((preds_tensor, preds), dim=0)
        labels_tensor = torch.concat((labels_tensor, batch['labels'].to(DEVICE)), dim=0)

    print(f'{preds_tensor.shape=}')
    print(f'{labels_tensor.shape=}')


In [None]:
e_hubert = HubertClassifierModel(HubertClassifierConfig(num_classes=8)).to(DEVICE)

eval(e_hubert, emotion_test_loader)

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([4, 4])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([4, 7])
preds=tensor([2, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([1, 6])


6it [00:00, 15.16it/s]

preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([0, 1])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([1, 5])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([7, 2])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([7, 7])


10it [00:00, 16.87it/s]

preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([4, 4])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([3, 6])
preds=tensor([4, 2], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([7, 5])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([7, 0])


14it [00:00, 17.51it/s]

preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([2, 2])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([2, 4])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([5, 7])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([6, 5])


18it [00:01, 17.83it/s]

preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([3, 5])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([4, 7])
preds=tensor([2, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([0, 1])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([6, 7])


22it [00:01, 17.70it/s]

preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([4, 4])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([6, 6])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([5, 5])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([5, 2])


26it [00:01, 17.63it/s]

preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([4, 7])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([5, 7])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([3, 6])
preds=tensor([2, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([3, 4])


30it [00:01, 17.87it/s]

preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([1, 4])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([5, 2])
preds=tensor([4, 2], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([5, 5])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([1, 2])


34it [00:01, 18.14it/s]

preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([5, 3])
preds=tensor([2, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([7, 7])
preds=tensor([1, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([3, 1])
preds=tensor([4, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([7, 4])


36it [00:02, 17.29it/s]

preds=tensor([7, 4], device='cuda:0')
preds.shape=torch.Size([2])
batch["labels"]=tensor([2, 6])
preds_tensor.shape=torch.Size([72])
labels_tensor.shape=torch.Size([72])





# Visualization