In [1]:
!pip install transformers==4.14.1 -q
!pip install bitsandbytes
!pip install datasets==1.16.1 -q

[K     |████████████████████████████████| 3.4 MB 30.2 MB/s 
[K     |████████████████████████████████| 182 kB 96.9 MB/s 
[K     |████████████████████████████████| 880 kB 95.7 MB/s 
[K     |████████████████████████████████| 3.3 MB 80.1 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bitsandbytes
  Downloading bitsandbytes-0.35.4-py3-none-any.whl (62.5 MB)
[K     |████████████████████████████████| 62.5 MB 7.2 MB/s 
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.35.4
[K     |████████████████████████████████| 298 kB 23.1 MB/s 
[K     |████████████████████████████████| 212 kB 101.3 MB/s 
[K     |████████████████████████████████| 132 kB 89.7 MB/s 
[?25h

### Fine-tuning 6-Billion GPT-J (& other models) in colab with LoRA and 8-bit compression

This notebook is a simple example for fine-tuning [GPT-J-6B](https://huggingface.co/EleutherAI/gpt-j-6B) with limited memory. A detailed explanation of how it works can be found in [this model card](https://huggingface.co/hivemind/gpt-j-6B-8bit). It is heavily based on [this Colab](https://colab.research.google.com/drive/1ft6wQU0BhqG5PRlwgaZJv2VukKKjU4Es#scrollTo=vfdLQHOuEU7h). Huge thanks to Hivemind!

You can also finetune [GPT-Neo-2.7B](https://huggingface.co/gustavecortal/gpt-neo-2.7B-8bit), [French GPT-J (Cedille's Boris)](https://huggingface.co/gustavecortal/fr-boris-8bit) and [T0-3B](https://huggingface.co/gustavecortal/T0_3B-8bit) with limited memory.

Twitter: [@gustavecortal](https://twitter.com/gustavecortal)

In [2]:
from sklearn.model_selection import train_test_split

import transformers

import pandas as pd

import torch
import torch.nn.functional as F
from torch import nn
from torch.cuda.amp import custom_fwd, custom_bwd

from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise

from tqdm.auto import tqdm

In [3]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


## Converting the model to 8 bits

In [4]:
class FrozenBNBLinear(nn.Module):
    def __init__(self, weight, absmax, code, bias=None):
        assert isinstance(bias, nn.Parameter) or bias is None
        super().__init__()
        self.out_features, self.in_features = weight.shape
        self.register_buffer("weight", weight.requires_grad_(False))
        self.register_buffer("absmax", absmax.requires_grad_(False))
        self.register_buffer("code", code.requires_grad_(False))
        self.adapter = None
        self.bias = bias
 
    def forward(self, input):
        output = DequantizeAndLinear.apply(input, self.weight, self.absmax, self.code, self.bias)
        if self.adapter:
            output += self.adapter(input)
        return output
 
    @classmethod
    def from_linear(cls, linear: nn.Linear) -> "FrozenBNBLinear":
        weights_int8, state = quantize_blockise_lowmemory(linear.weight)
        return cls(weights_int8, *state, linear.bias)
 
    def __repr__(self):
        return f"{self.__class__.__name__}({self.in_features}, {self.out_features})"
 
 
class DequantizeAndLinear(torch.autograd.Function): 
    @staticmethod
    @custom_fwd
    def forward(ctx, input: torch.Tensor, weights_quantized: torch.ByteTensor,
                absmax: torch.FloatTensor, code: torch.FloatTensor, bias: torch.FloatTensor):
        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
        ctx.save_for_backward(input, weights_quantized, absmax, code)
        ctx._has_bias = bias is not None
        return F.linear(input, weights_deq, bias).clone()
 
    @staticmethod
    @custom_bwd
    def backward(ctx, grad_output: torch.Tensor):
        assert not ctx.needs_input_grad[1] and not ctx.needs_input_grad[2] and not ctx.needs_input_grad[3]
        input, weights_quantized, absmax, code = ctx.saved_tensors
        # grad_output: [*batch, out_features]
        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
        grad_input = grad_output @ weights_deq
        grad_bias = grad_output.flatten(0, -2).sum(dim=0) if ctx._has_bias else None
        return grad_input, None, None, None, grad_bias
 
 
class FrozenBNBEmbedding(nn.Module):
    def __init__(self, weight, absmax, code):
        super().__init__()
        self.num_embeddings, self.embedding_dim = weight.shape
        self.register_buffer("weight", weight.requires_grad_(False))
        self.register_buffer("absmax", absmax.requires_grad_(False))
        self.register_buffer("code", code.requires_grad_(False))
        self.adapter = None
 
    def forward(self, input, **kwargs):
        with torch.no_grad():
            # note: both quantuized weights and input indices are *not* differentiable
            weight_deq = dequantize_blockwise(self.weight, absmax=self.absmax, code=self.code)
            output = F.embedding(input, weight_deq, **kwargs)
        if self.adapter:
            output += self.adapter(input)
        return output 
 
    @classmethod
    def from_embedding(cls, embedding: nn.Embedding) -> "FrozenBNBEmbedding":
        weights_int8, state = quantize_blockise_lowmemory(embedding.weight)
        return cls(weights_int8, *state)
 
    def __repr__(self):
        return f"{self.__class__.__name__}({self.num_embeddings}, {self.embedding_dim})"
 
 
def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2 ** 20):
    assert chunk_size % 4096 == 0
    code = None
    chunks = []
    absmaxes = []
    flat_tensor = matrix.view(-1)
    for i in range((matrix.numel() - 1) // chunk_size + 1):
        input_chunk = flat_tensor[i * chunk_size: (i + 1) * chunk_size].clone()
        quantized_chunk, (absmax_chunk, code) = quantize_blockwise(input_chunk, code=code)
        chunks.append(quantized_chunk)
        absmaxes.append(absmax_chunk)
 
    matrix_i8 = torch.cat(chunks).reshape_as(matrix)
    absmax = torch.cat(absmaxes)
    return matrix_i8, (absmax, code)
 
 
def convert_to_int8(model):
    """Convert linear and embedding modules to 8-bit with optional adapters"""
    for module in list(model.modules()):
        for name, child in module.named_children():
            if isinstance(child, nn.Linear):
                print(name, child)
                setattr(
                    module,
                    name,
                    FrozenBNBLinear(
                        weight=torch.zeros(child.out_features, child.in_features, dtype=torch.uint8),
                        absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),
                        code=torch.zeros(256),
                        bias=child.bias,
                    ),
                )
            elif isinstance(child, nn.Embedding):
                setattr(
                    module,
                    name,
                    FrozenBNBEmbedding(
                        weight=torch.zeros(child.num_embeddings, child.embedding_dim, dtype=torch.uint8),
                        absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),
                        code=torch.zeros(256),
                    )
                )

You have to Monkey-Patch GPT-J before loading: 

In [5]:
class GPTJBlock(transformers.models.gptj.modeling_gptj.GPTJBlock):
    def __init__(self, config):
        super().__init__(config)

        convert_to_int8(self.attn)
        convert_to_int8(self.mlp)


class GPTJModel(transformers.models.gptj.modeling_gptj.GPTJModel):
    def __init__(self, config):
        super().__init__(config)
        convert_to_int8(self)
        

class GPTJForCausalLM(transformers.models.gptj.modeling_gptj.GPTJForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        convert_to_int8(self)


transformers.models.gptj.modeling_gptj.GPTJBlock = GPTJBlock

If you're using another 8-bit quantized model (e.g. T0-3B), remember to Monkey-Patch the model using convert_to_int8()

In [6]:
class T5ForConditionalGeneration(transformers.models.t5.modeling_t5.T5ForConditionalGeneration):
    def __init__(self, config):
        super().__init__(config)
        convert_to_int8(self)

transformers.models.t5.modeling_t5.T5ForConditionalGeneration = T5ForConditionalGeneration

In [7]:
config = transformers.GPTJConfig.from_pretrained("EleutherAI/gpt-j-6B")
tokenizer = transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

Downloading:   0%|          | 0.00/930 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.94k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/357 [00:00<?, ?B/s]

In [8]:
config.pad_token_id = config.eos_token_id
tokenizer.pad_token = config.pad_token_id

In [9]:
gpt = GPTJForCausalLM.from_pretrained("hivemind/gpt-j-6B-8bit", low_cpu_mem_usage=True)
#gpt = GPTJForCausalLM.from_pretrained("gustavecortal/fr-boris-8bit", low_cpu_mem_usage=True) French GPT-J Cedille's Boris

Downloading:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.75G [00:00<?, ?B/s]

k_proj Linear(in_features=4096, out_features=4096, bias=False)
v_proj Linear(in_features=4096, out_features=4096, bias=False)
q_proj Linear(in_features=4096, out_features=4096, bias=False)
out_proj Linear(in_features=4096, out_features=4096, bias=False)
fc_in Linear(in_features=4096, out_features=16384, bias=True)
fc_out Linear(in_features=16384, out_features=4096, bias=True)
k_proj Linear(in_features=4096, out_features=4096, bias=False)
v_proj Linear(in_features=4096, out_features=4096, bias=False)
q_proj Linear(in_features=4096, out_features=4096, bias=False)
out_proj Linear(in_features=4096, out_features=4096, bias=False)
fc_in Linear(in_features=4096, out_features=16384, bias=True)
fc_out Linear(in_features=16384, out_features=4096, bias=True)
k_proj Linear(in_features=4096, out_features=4096, bias=False)
v_proj Linear(in_features=4096, out_features=4096, bias=False)
q_proj Linear(in_features=4096, out_features=4096, bias=False)
out_proj Linear(in_features=4096, out_features=4096, 

## LoRA fine-tuning example

You can load my very small dataset composed of philosophical sentences: 

In [10]:
data = pd.read_csv('/content/drive/MyDrive/tarotto1218.csv')
data = data.loc[:,['sentence']]
train, test = train_test_split(data, test_size=0.01) 
train.to_csv('/content/train_pgbp_example.csv', index=False)
test.to_csv('/content/test_pgbp_example.csv', index=False)

In [11]:
print(data)

                                               sentence
0     <|endoftext|>What type of person do I like nex...
1     <|endoftext|>What type of person do I like nex...
2     <|endoftext|>What type of person do I like nex...
3     <|endoftext|>What type of person do I like nex...
4     <|endoftext|>What type of person do I like nex...
...                                                 ...
1479  <|endoftext|>Christmas, how do you spend with ...
1480  <|endoftext|>Christmas, how do you spend with ...
1481  <|endoftext|>Christmas, how do you spend with ...
1482  <|endoftext|>Christmas, how do you spend with ...
1483  <|endoftext|>Christmas, how do you spend with ...

[1484 rows x 1 columns]


In [12]:
from datasets import load_dataset
dataset = load_dataset('csv', data_files={'train': '/content/train_pgbp_example.csv',
                                              'test': '/content/test_pgbp_example.csv'})



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-a972ff377a7dce34/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-a972ff377a7dce34/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding=True, truncation=True, max_length= 800)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence"])
tokenized_datasets.set_format("torch")



  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [14]:
from torch.utils.data import DataLoader

full_train_dataset = tokenized_datasets["train"]
train_dataloader = DataLoader(full_train_dataset, shuffle=False, batch_size=8)

Add adapters to Embedding/MLP/Attention/LMHead layers

In [15]:
def add_adapters(model, adapter_dim=4, p = 0.1):
    assert adapter_dim > 0

    for name, module in model.named_modules():
      if isinstance(module, FrozenBNBLinear):
          if "attn" in name or "mlp" in name or "head" in name:
              print("Adding adapter to", name)
              module.adapter = nn.Sequential(
                nn.Linear(module.in_features, adapter_dim, bias=False),
                nn.Dropout(p=p),
                nn.Linear(adapter_dim, module.out_features, bias=False),
            )
              print("Initializing", name)
              nn.init.zeros_(module.adapter[2].weight)

          else:
              print("Not adding adapter to", name)
      elif isinstance(module, FrozenBNBEmbedding):
          print("Adding adapter to", name)
          module.adapter = nn.Sequential(
                nn.Embedding(module.num_embeddings, adapter_dim),
                nn.Dropout(p=p),
                nn.Linear(adapter_dim, module.embedding_dim, bias=False),
            )
          print("Initializing", name)
          nn.init.zeros_(module.adapter[2].weight)

add_adapters(gpt)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
gpt.to(device)

Adding adapter to transformer.wte
Initializing transformer.wte
Adding adapter to transformer.h.0.attn.k_proj
Initializing transformer.h.0.attn.k_proj
Adding adapter to transformer.h.0.attn.v_proj
Initializing transformer.h.0.attn.v_proj
Adding adapter to transformer.h.0.attn.q_proj
Initializing transformer.h.0.attn.q_proj
Adding adapter to transformer.h.0.attn.out_proj
Initializing transformer.h.0.attn.out_proj
Adding adapter to transformer.h.0.mlp.fc_in
Initializing transformer.h.0.mlp.fc_in
Adding adapter to transformer.h.0.mlp.fc_out
Initializing transformer.h.0.mlp.fc_out
Adding adapter to transformer.h.1.attn.k_proj
Initializing transformer.h.1.attn.k_proj
Adding adapter to transformer.h.1.attn.v_proj
Initializing transformer.h.1.attn.v_proj
Adding adapter to transformer.h.1.attn.q_proj
Initializing transformer.h.1.attn.q_proj
Adding adapter to transformer.h.1.attn.out_proj
Initializing transformer.h.1.attn.out_proj
Adding adapter to transformer.h.1.mlp.fc_in
Initializing transfor

GPTJForCausalLM(
  (transformer): GPTJModel(
    (wte): FrozenBNBEmbedding(50400, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0): GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): FrozenBNBLinear(4096, 4096)
          (v_proj): FrozenBNBLinear(4096, 4096)
          (q_proj): FrozenBNBLinear(4096, 4096)
          (out_proj): FrozenBNBLinear(4096, 4096)
        )
        (mlp): GPTJMLP(
          (fc_in): FrozenBNBLinear(4096, 16384)
          (fc_out): FrozenBNBLinear(16384, 4096)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (1): GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0

In [16]:
from bitsandbytes.optim import Adam8bit

gpt.gradient_checkpointing_enable()
optimizer = Adam8bit(gpt.parameters(), lr=3e-5, weight_decay=0.1)

In [17]:
num_epochs = 7
num_training_steps = num_epochs * len(train_dataloader)

In [18]:
lr_scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer, int(num_training_steps*0.1), num_training_steps
)

In [19]:
filepath = '/content/model.pt'

In [20]:
from tqdm.auto import tqdm

scaler = torch.cuda.amp.GradScaler()
progress_bar = tqdm(range(num_training_steps))
gpt.train()
gpt.gradient_checkpointing_enable()
k = 0

for epoch in range(num_epochs):
    for batch in train_dataloader:

        k = k + 1
        if k % 500 == 0:
          print(k)
          state = {'k' : k, 'epoch': num_epochs, 'lr_scheduler': lr_scheduler.state_dict(), 'state_dict': gpt.state_dict(), 'optimizer': optimizer.state_dict()}
          torch.save(state, filepath)

        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
          out = gpt.forward(**batch,)

          loss = F.cross_entropy(out.logits[:, :-1, :].flatten(0, -2), batch['input_ids'][:, 1:].flatten(),
                                reduction='mean', label_smoothing=0.1)
          
        print(loss)

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(gpt.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        lr_scheduler.step()
        progress_bar.update(1)

  0%|          | 0/1288 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.9356, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.8033, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.7478, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.0242, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.8330, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.9726, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(8.1388, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(8.1764, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(8.4591, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(8.5093, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.3774, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.7522, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.8907, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.6708, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(8.3099, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(8.1334, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.9009, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.5388, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.9212, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(8.2471, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(8.1187, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(8.3494, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(8.0862, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.8416, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.8642, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(8.4498, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.9054, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.3642, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.8570, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.6656, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.8301, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.2075, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.9835, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.2956, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(8.2269, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(8.0832, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.8851, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3890, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(8.2271, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(8.2244, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.8276, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.6252, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.9733, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.0220, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.5995, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.4377, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.6630, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.2489, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.0588, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.9333, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.6268, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.3831, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.9842, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.0342, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.9281, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.3554, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.5115, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.3198, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(8.1075, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.6607, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.9675, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.4033, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7927, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.2095, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7047, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.4008, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.4623, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.3228, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.1184, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.2832, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.1048, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.8429, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.3751, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.0590, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7836, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.2066, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.8556, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.8219, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.8504, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.9788, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.6938, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5961, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.8909, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4064, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5387, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1533, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0835, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9349, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.6708, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9169, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3029, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4168, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2774, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2008, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9044, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8460, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9100, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8947, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9413, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7414, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5604, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4791, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4656, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.2858, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.2363, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.0562, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.9613, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.9463, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.8098, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.7853, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.4354, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.3096, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.0931, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.0648, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.8887, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.8321, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.8164, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.5609, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.4512, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.3824, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.7569, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.4987, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1625, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.2982, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.3073, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.4709, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1223, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.2736, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.0618, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.3126, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.0605, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1483, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1052, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.2447, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.7352, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.0725, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1634, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1094, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9272, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.0388, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.0491, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9720, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.2173, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1202, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1243, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1663, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9804, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9552, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1756, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9799, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.0896, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1248, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1909, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7463, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.0701, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9092, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1550, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1689, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.0390, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9318, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8878, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8506, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9410, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9698, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9278, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9421, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.0410, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8304, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9912, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7895, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8204, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8618, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8763, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8544, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9214, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7090, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7842, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9348, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5727, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6940, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8732, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1188, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7252, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7457, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9391, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9868, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.0231, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.0525, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8016, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8637, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7652, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7629, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5942, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6669, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.0096, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9585, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8440, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8463, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8109, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7608, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8286, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8182, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8213, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6683, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7469, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7032, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6796, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8197, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1229, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6007, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8462, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.0210, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1309, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9276, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7304, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9769, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6877, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.0121, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5999, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6766, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8035, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1582, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6498, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6565, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6759, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8446, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6941, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1489, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8175, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8071, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7396, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9211, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9492, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6985, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7239, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7791, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6552, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9765, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5864, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7937, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6517, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7977, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5561, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6970, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9243, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7614, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8728, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8321, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9773, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8578, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7476, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8039, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6742, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6848, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8758, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8680, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7112, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7092, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9044, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6192, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7697, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7788, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7432, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6500, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7325, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8045, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5931, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6563, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7281, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.0070, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9145, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1279, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5563, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.0067, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6117, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5837, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8072, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6660, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8792, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7623, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6512, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7720, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7000, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7232, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7582, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7121, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6601, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9194, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8723, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9183, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8947, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7463, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5767, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6785, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6860, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6669, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5564, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7543, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7703, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7958, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8875, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7267, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6849, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6561, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9952, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7937, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5022, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6581, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6658, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8916, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5454, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7305, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5444, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7473, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5660, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6453, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5854, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7423, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.2467, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5957, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7194, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6428, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5037, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5964, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6440, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5622, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8133, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7024, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7449, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7757, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5846, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5537, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7954, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5963, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7262, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7413, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8494, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3943, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7082, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5756, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7991, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8318, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7069, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6120, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5768, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5529, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6237, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6543, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6283, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6530, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7454, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5474, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7023, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5096, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5651, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5814, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5983, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5857, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6450, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4583, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5275, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6720, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3289, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4194, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6192, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8736, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4952, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5452, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6826, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7281, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7888, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7978, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5691, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6210, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5230, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5300, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3947, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4469, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7783, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7297, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6267, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6168, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5881, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5505, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6060, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6138, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6329, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4668, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5678, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5182, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4833, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6150, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8905, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4070, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6628, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8254, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9267, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7525, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5323, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7963, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5036, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8329, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4189, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5046, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6423, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9581, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4995, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4925, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5078, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6597, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5118, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9517, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6531, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6374, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5807, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7704, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7721, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5354, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5726, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6195, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5000, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8360, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4099, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6233, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4996, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6328, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4060, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5371, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7666, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6106, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7181, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6717, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7852, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7163, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5991, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6381, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5155, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5381, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7322, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7045, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5613, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5925, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7573, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4494, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6237, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6339, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6181, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5153, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5906, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6657, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4698, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5434, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6041, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8436, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7836, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9585, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4437, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8709, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4782, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4665, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6384, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5282, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7518, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6396, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5272, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6172, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5851, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6094, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6310, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5939, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5481, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8545, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7475, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8072, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7655, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6187, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4716, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5572, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5740, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5496, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4355, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6196, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6515, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6746, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7428, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6238, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5643, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5372, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8546, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6932, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3747, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5290, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5477, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7851, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4387, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6092, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4399, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6347, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4767, device='cuda:0', grad_fn=<AddBackward0>)
500


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5481, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4697, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6261, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1129, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4862, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5994, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5367, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4103, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4830, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5422, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4688, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7110, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5817, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6491, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6590, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4725, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4446, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6822, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4856, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6380, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6435, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7377, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3223, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5985, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4839, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6891, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7263, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6081, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5225, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4750, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4731, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5094, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5237, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5561, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5441, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6472, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4454, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6139, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4111, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4825, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4909, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5111, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4919, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5293, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3720, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4428, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5759, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2385, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3351, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5386, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7831, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4032, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4692, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5816, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6165, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7123, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6969, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4825, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5305, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4321, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4331, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3351, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3476, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6771, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6351, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5273, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5230, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5002, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4763, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5096, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5299, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5617, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3970, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4999, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4453, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4080, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5295, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7947, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3454, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5959, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7405, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8429, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6610, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4495, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7196, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4500, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7640, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3462, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4406, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5746, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8731, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4418, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4154, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4388, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5774, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4215, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8639, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5673, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5654, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5125, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7031, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6835, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4585, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5267, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5574, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4326, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7260, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3343, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5355, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4335, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5635, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3497, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4580, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6896, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5429, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6409, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5989, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7096, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6468, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5380, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5667, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4527, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4825, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6601, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6368, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4928, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5360, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6890, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3771, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5536, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5663, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5588, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4493, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5233, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6032, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4092, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4903, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5479, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7793, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7260, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8897, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3954, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8034, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4241, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4047, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5810, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4732, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6875, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5799, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4661, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5584, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5361, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5585, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5598, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5383, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5057, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9767, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6831, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7457, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7095, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5663, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4247, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5003, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5163, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4973, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3830, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5618, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5895, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6119, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6815, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5772, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5160, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4917, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7924, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6580, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3292, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4739, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4965, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7295, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3956, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5568, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3973, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5764, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4312, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5069, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4143, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5779, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.0551, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4393, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5457, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4887, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3688, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4294, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4993, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4231, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6659, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5259, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6029, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6100, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4253, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3989, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6339, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4403, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5875, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5914, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6867, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2863, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5704, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4411, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6445, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6771, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5578, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4759, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4338, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4372, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4622, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4724, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5128, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4936, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5995, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4029, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5662, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3665, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4431, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4538, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4729, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4495, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4854, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3389, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4044, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5396, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.1966, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3038, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4992, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7401, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3580, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4374, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5334, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5659, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6642, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6503, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4446, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4841, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3928, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3965, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3019, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3110, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6316, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5917, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4814, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4781, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4621, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4342, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4674, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4923, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5183, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3644, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4655, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4115, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3746, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4882, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7443, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3151, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5719, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6906, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7972, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6174, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4109, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6727, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4164, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7264, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3130, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4057, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5367, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8269, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4044, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3817, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4045, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5341, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3883, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8269, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5295, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5260, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4770, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6643, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6375, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4210, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4947, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5229, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3984, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7135, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3006, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5003, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4035, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5240, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3206, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4287, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6466, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5022, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6005, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5637, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6697, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6082, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5068, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5273, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4174, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4497, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6163, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5961, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4605, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5060, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6449, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3458, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5161, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5263, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5260, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4194, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4882, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5714, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3756, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4636, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5153, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7401, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6918, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8486, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3665, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7698, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3868, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3718, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5491, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4410, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6523, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5442, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4300, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5256, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5068, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5302, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5279, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5098, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4755, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9384, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6516, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7140, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6595, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5372, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3970, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4687, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4834, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4675, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3562, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5298, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5631, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5737, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6433, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5450, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4836, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4623, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7603, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6251, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2998, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4407, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4633, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6980, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3701, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5289, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3677, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5426, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3977, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4784, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3851, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5484, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.0115, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4073, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5170, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4550, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3386, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4007, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4707, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3873, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6324, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4979, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5758, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5798, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3967, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3707, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6020, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4207, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5600, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5618, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6496, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2641, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5382, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4151, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6153, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6466, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5286, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4459, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4074, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4152, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4322, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4399, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4845, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4659, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5650, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3701, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5371, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3418, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4194, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4261, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4474, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4241, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4562, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3122, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3790, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5137, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.1743, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2831, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4770, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7075, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3321, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4094, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5074, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5408, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6420, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6265, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4185, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4564, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3646, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3742, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2784, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2904, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6015, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5616, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4517, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4489, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4336, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4071, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4408, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4718, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4930, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3416, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4401, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3882, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3478, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4589, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7121, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2965, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5453, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6640, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7690, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5959, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3865, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6413, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3902, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7072, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2926, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3817, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5127, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8014, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3803, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3578, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3847, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5057, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3623, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7980, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5070, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4979, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4554, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6346, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6097, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3967, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4750, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5016, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3757, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6629, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2810, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4748, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3815, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4981, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3006, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4043, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6199, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4773, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5772, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5341, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6444, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5778, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4817, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4979, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3968, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4235, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5862, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5721, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4405, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4834, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6183, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3257, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4918, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5119, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5021, device='cuda:0', grad_fn=<AddBackward0>)
1000


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3943, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4629, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5531, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3520, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4358, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4905, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7169, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6677, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8229, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3460, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7431, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3664, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3515, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5235, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4166, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6290, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5212, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4069, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5027, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4884, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5055, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5055, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4881, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4549, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7851, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6285, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6898, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6346, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5149, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3798, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4497, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4635, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4465, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3347, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5082, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5412, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5497, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6204, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5238, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4665, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4398, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7364, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6074, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2808, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4218, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4375, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6781, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3559, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5110, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3466, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5234, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3798, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4551, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3641, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5272, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9916, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3870, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4947, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4310, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3180, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3785, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4543, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3722, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6126, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4756, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5566, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5558, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3796, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3531, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5810, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4112, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5422, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5450, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6257, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2483, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5181, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3959, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6011, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6278, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5081, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4282, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3921, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3996, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4099, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4214, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4689, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4471, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5436, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3561, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5155, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3283, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4033, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4091, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4302, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4060, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4361, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2998, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3645, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4974, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.1576, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2723, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4574, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6879, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3156, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3927, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4895, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5229, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6242, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6111, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3986, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4355, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3457, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3562, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2629, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2698, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5849, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5396, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4363, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4275, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4187, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3940, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4234, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4516, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4688, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3261, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4246, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3778, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3361, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4418, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6936, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2802, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5324, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6485, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7528, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5770, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3725, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6274, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3773, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6947, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2768, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3634, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4947, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7825, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3631, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3423, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3679, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4890, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3503, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7798, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4937, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4870, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4391, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6185, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5965, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3854, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4586, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4852, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3620, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6455, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2637, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4608, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3670, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4847, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2872, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3898, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5961, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4613, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5565, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5223, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6268, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5651, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4711, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4857, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3819, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4105, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5697, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5561, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4264, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4671, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6048, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3093, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4768, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4953, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4849, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3844, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4514, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5406, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3360, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4231, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4798, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7000, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6537, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.8048, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3326, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7285, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3536, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3398, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5109, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4028, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6092, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5026, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3935, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4884, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4759, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4962, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4937, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4777, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4412, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6984, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6112, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6788, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6200, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4997, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3662, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4367, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4486, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4334, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3263, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4889, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5312, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5350, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6047, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5146, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4551, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4328, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.7262, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5962, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2708, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4082, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4263, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6615, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3476, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4979, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3350, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5117, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3685, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4494, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3543, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5115, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.9778, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3737, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4823, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4215, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3075, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3674, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4439, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3554, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6025, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4685, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5417, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5499, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3662, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3434, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5643, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3974, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5270, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5323, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6165, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2368, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5106, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3875, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5868, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6168, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4967, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4160, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3873, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3912, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4041, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4106, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4600, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4380, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5342, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3452, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.5058, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3173, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3942, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4029, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4231, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3954, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4298, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2920, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3563, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4871, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.1514, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.2646, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.4459, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.6766, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3084, device='cuda:0', grad_fn=<AddBackward0>)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(2.3842, device='cuda:0', grad_fn=<AddBackward0>)


## Text generation example

In [51]:
gpt.eval()
with torch.no_grad():
  prompt = tokenizer("You are a fortune teller who does tarot readings. Please answer the following questions with your interpretation of the cards in [].;does he like me?[The Devil]####", truncation=True, padding=True, max_length=500, return_tensors='pt')
  prompt = {key: value.to(device) for key, value in prompt.items()}
  out = gpt.generate(**prompt, max_length=500,  temperature=0.7, do_sample=True, )
  print(tokenizer.decode(out[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


You are a fortune teller who does tarot readings. Please answer the following questions with your interpretation of the cards in [].;does he like me?[The Devil]###### In the near future, there is a possibility that you will be involved with a new type of love. If you want to increase the probability of success, it is important to have a clear idea of "what you are looking for" and "what you want to do". If you have a new experience that you have never been interested in, you will surely be happy and will get the chance to fall in love. If you don't want to get stuck with the current love, it is important to deepen your understanding of love and deepen your romance knowledge.<|endoftext|>
