# pytorch擴充包：pytorch-lightning [Video](https://www.youtube.com/watch?v=O7dNXpgdWbo) + [GitHub](https://github.com/kwea123/pytorch-lightning-tutorial)

# <font color="red">Step 1--建立`資料夾(名為models)`</font>

In [None]:
!mkdir models

# <font color="red">Step 2--執行`%%writefile models/networks.py`(下一個cell)</font>

In [None]:
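%%writefile models/networks.py
# NOTE: the models/ folder must already exist in Colab (see Step 1).
# The %%writefile cell magic has to be the first line of the cell, so this note sits below it.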
import torch
from torch import nn

from einops import rearrange, reduce, repeat


class LinearModel(nn.Module):
    """Two-layer fully connected network for 28x28 single-channel images.

    The network maps a flattened 784-value image to ``hidden_dim`` features,
    applies an in-place ReLU, and projects to 10 outputs.

    Args:
        hidden_dim: width of the hidden layer.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        layers = [
            nn.Linear(28 * 28, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, 10),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten each image and feed it through the network.

        x: (B, 1, 28, 28) batch of images. The input has already been turned
        into a tensor with a channel axis by ``T.ToTensor()``, so it is not
        (B, 28, 28).
        """
        # The pattern requires a singleton channel axis and 28x28 spatial
        # size. 'b 1 x y -> b (x y)' without the explicit sizes would also
        # work; a plain x.flatten() would not, because it would merge the
        # batch axis as well.
        flat = rearrange(x, 'b 1 x y -> b (x y)', x=28, y=28)
        return self.net(flat)

Writing models/networks.py


# <font color="red">Step 3--執行`%%writefile opt.py`(下一個cell)</font>

In [None]:
%%writefile opt.py
import argparse

def get_opts(args=None):
    """Parse the training options.

    Args:
        args: optional list of argument strings to parse. When left as
            ``None`` (the default) the options are read from ``sys.argv``,
            exactly as before. Passing an explicit list makes the function
            usable from a notebook kernel or a test, where ``sys.argv`` does
            not hold the training options.

    Returns:
        argparse.Namespace with the attributes ``root_dir``, ``hidden_dim``,
        ``val_size``, ``batch_size``, ``lr``, ``num_epochs``, ``num_workers``
        and ``exp_name``.

    Raises:
        SystemExit: raised by argparse when ``--root_dir`` is missing or an
            argument cannot be parsed.
    """
    parser = argparse.ArgumentParser()

    # Data / model options.
    parser.add_argument('--root_dir', type=str, required=True,
                        help='root directory of dataset')
    parser.add_argument('--hidden_dim', type=int, default=128,
                        help='number of hidden dimensions')

    parser.add_argument('--val_size', type=int, default=5000,
                        help='size of validation set')

    # Optimisation options.
    parser.add_argument('--batch_size', type=int, default=128,
                        help='number of batch size')
    parser.add_argument('--lr', type=float, default=1e-4,
                        help='learning rate')
    parser.add_argument('--num_epochs', type=int, default=10,
                        help='number of epochs')
    parser.add_argument('--num_workers', type=int, default=4,
                        help='number of workers for data loader')

    # Used to name the checkpoint and log folders of a run.
    parser.add_argument('--exp_name', type=str, default='exp',
                        help='experiment name')

    return parser.parse_args(args)

Writing opt.py


# <font color="red">Step 4--執行`%%writefile train.py`(下一個cell)</font>

## <font color="red">`train.py`</font>


---


注意：在執行`!python train.py`時，`configure_optimizers` 中的 scheduler 無法運行；不管是 pytorch_lightning==1.6.0 或是 2.0 以上的版本，都會出現以下錯誤：

```
The provided lr scheduler CosineAnnealingLR doesn't follow PyTorch's LRScheduler API.
You should override the LightningModule.lr_scheduler_step hook with your own logic if you are using a custom LR scheduler.
```
估計是在Colab上pytorch 與 pytorch_lightning 不匹配(在本機上的solution如下一個cell)。

因此在使用(Colab)當下default torch版本，`train.py`不使用scheduler，pytorch_lightning==1.6.0



若在本機在執行`python train.py`關於錯誤
```
pytorch_lightning.utilities.exceptions.MisconfigurationException:
The provided lr scheduler `CosineAnnealingLR` doesn't follow PyTorch's LRScheduler API.
You should override the `LightningModule.lr_scheduler_step` hook with your own logic if you are using a custom LR scheduler.
```
的[解決方法之一](https://blog.csdn.net/yangyu0515/article/details/131945195)(用在本機)


In [None]:
%%writefile train.py
import torch
from torch.nn import functional as F

from opt import get_opts

# datasets
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader, random_split
from torchvision import transforms as T

# models
from models.networks import LinearModel

# optimizer
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR

from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, TQDMProgressBar
from pytorch_lightning.loggers import TensorBoardLogger

seed_everything(1234, workers=True) # 固定所有的亂數。workers=True讓取data也會是固定的，而非亂數


def get_learning_rate(optimizer):
    """Return the 'lr' entry of the optimizer's first parameter group.

    Returns None when the optimizer has no parameter groups.
    """
    return next((group['lr'] for group in optimizer.param_groups), None)


class MNISTSystem(LightningModule):
    """Lightning training system for an MNIST classifier.

    This variant uses the ``validation_epoch_end(outputs)`` hook and runs
    without a learning-rate scheduler (see ``configure_optimizers``).

    Args:
        hparams: parsed options; the code reads ``hidden_dim``, ``root_dir``,
            ``val_size``, ``num_workers``, ``batch_size`` and ``lr`` from it.
    """

    def __init__(self, hparams):
        super().__init__()
        # Keep every hyperparameter so it is reachable as self.hparams.<name>.
        self.save_hyperparameters(hparams)

        # The network is defined in a separate file (models/networks.py).
        # The inline equivalent would be:
        #   nn.Sequential(nn.Linear(28*28, hidden_dim),
        #                 nn.ReLU(True),
        #                 nn.Linear(hidden_dim, 10))
        self.net = LinearModel(self.hparams.hidden_dim)

    def forward(self, x):
        """Run the wrapped network (the reshape lives in models/networks.py)."""
        return self.net(x)

    def prepare_data(self):
        """Download the MNIST train and test splits into ``root_dir``."""
        for is_train in (True, False):
            MNIST(self.hparams.root_dir, train=is_train, download=True)

    def setup(self, stage=None):
        """Load the training split and carve the validation subset out of it.

        The last ``val_size`` slots of the random split become the validation
        set; the remainder is used for training.
        """
        full_train = MNIST(self.hparams.root_dir,
                           train=True,
                           download=False,
                           transform=T.ToTensor())
        num_val = self.hparams.val_size
        num_train = len(full_train) - num_val
        splits = random_split(full_train, [num_train, num_val])
        self.train_dataset, self.val_dataset = splits

    def train_dataloader(self):
        """Return the shuffled loader over the training subset."""
        loader_kwargs = dict(
            shuffle=True,
            num_workers=self.hparams.num_workers,
            # The batch size cannot grow without limit; a batch has to fit in
            # (GPU) memory.
            batch_size=self.hparams.batch_size,
            # Pinned host memory is intended to speed up transfers to a GPU.
            pin_memory=True,
        )
        return DataLoader(self.train_dataset, **loader_kwargs)

    def val_dataloader(self):
        """Return the unshuffled loader over the validation subset."""
        loader_kwargs = dict(
            shuffle=False,
            num_workers=self.hparams.num_workers,
            batch_size=self.hparams.batch_size,
            pin_memory=True,
        )
        return DataLoader(self.val_dataset, **loader_kwargs)

    def configure_optimizers(self):
        """Create the Adam optimizer over the network parameters.

        The optimizer is also kept on ``self.optimizer`` so that
        ``training_step`` can log its learning rate.
        """
        self.optimizer = Adam(self.net.parameters(), lr=self.hparams.lr)

        # The scheduler is intentionally disabled in this file. With one it
        # would be:
        #   scheduler = CosineAnnealingLR(self.optimizer,
        #                                 T_max=self.hparams.num_epochs,
        #                                 eta_min=self.hparams.lr/1e2)
        #   return [self.optimizer], [scheduler]
        # Several optimizers/schedulers (e.g. for a GAN) can be returned as
        #   [optimizer_A, optimizer_B], [scheduler_A, scheduler_B]
        # Without a scheduler, returning the optimizer alone is enough.
        return self.optimizer

    def training_step(self, batch, batch_idx):
        """Compute the cross-entropy loss of one training batch and log it.

        ``batch`` comes from ``train_dataloader``; ``batch_idx`` is unused.
        """
        images, labels = batch
        # self(...) dispatches to forward().
        loss = F.cross_entropy(self(images), labels)

        self.log('lr', get_learning_rate(self.optimizer))
        # Pass prog_bar=True to also show the loss in the progress bar.
        self.log('train/loss', loss)

        return loss

    def validation_step(self, batch, batch_idx):
        """Return the loss and accuracy of one validation batch.

        The labels are compared directly with the argmax of the logits, so
        they are treated as class indices here.
        """
        images, labels = batch
        logits = self(images)

        loss = F.cross_entropy(logits, labels)
        hits = torch.eq(torch.argmax(logits, -1), labels).to(torch.float32)
        acc = torch.sum(hits) / len(labels)

        return {'val_loss': loss,
                'val_acc': acc}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation metrics and log them."""
        losses = [step['val_loss'] for step in outputs]
        accs = [step['val_acc'] for step in outputs]

        self.log('val/loss', torch.stack(losses).mean(), prog_bar=True)
        self.log('val/acc', torch.stack(accs).mean(), prog_bar=True)


if __name__ == '__main__':
    hparams = get_opts()
    system = MNISTSystem(hparams)

    # Checkpointing: every experiment gets its own folder. The 5 checkpoints
    # with the highest val/acc are kept (use mode='min' when monitoring
    # val/loss instead; save_top_k=-1 keeps every checkpoint).
    checkpoint_cb = ModelCheckpoint(
        dirpath=f'ckpts/{hparams.exp_name}',
        filename='{epoch:d}',
        monitor='val/acc',
        mode='max',
        save_top_k=5,
    )
    # Refresh the progress bar on every step.
    progress_bar = TQDMProgressBar(refresh_rate=1)

    # TensorBoard logs go to logs/<exp_name>.
    tb_logger = TensorBoardLogger(
        save_dir="logs",
        name=hparams.exp_name,
        default_hp_metric=False,
    )

    trainer_kwargs = dict(
        max_epochs=hparams.num_epochs,     # upper bound on training epochs
        callbacks=[checkpoint_cb, progress_bar],
        # resume_from_checkpoint=hparams.ckpt_path would restart an
        # interrupted run from a saved checkpoint (not enabled here).
        logger=tb_logger,
        enable_model_summary=True,         # print the model structure at start
        accelerator='auto',                # pick GPU or CPU automatically
        devices=1,                         # number of devices to use
        num_sanity_val_steps=1,            # validation batches run before training
        benchmark=True,                    # let cuDNN pick the fastest algorithm for fixed-size inputs
        # profiler='simple' would print per-step timings (very verbose).
    )
    trainer = Trainer(**trainer_kwargs)

    # Start training.
    trainer.fit(system)

Writing train.py


## <font color="red">`train_u.py`</font>



---
前面提到，由於在Colab執行`python train.py`(原始)時，會出現關於scheduler的問題，因此安裝 pytorch_lightning > 2.0 版本，但執行`python train.py`(原始)又會出現新的錯誤(如下)

```
NotImplementedError: Support for `validation_epoch_end` has been removed in v2.0.0. `MNISTSystem` implements this method.
You can use the `on_validation_epoch_end` hook instead. To access outputs, save them in-memory as instance attributes.
You can find migration examples in https://github.com/Lightning-AI/lightning/pull/16520.
```
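`train_u.py`相對於原始的`train.py`共修改三處，如此就可以在Colab中正常使用：(1) 在 `__init__` 中新增 `self.validation_step_outputs = []`；(2) 在 `validation_step` 中用 `self.validation_step_outputs.append(log)` 保存每個 batch 的結果；(3) 把 `def validation_epoch_end(self, outputs): `改成 `def on_validation_epoch_end(self):` ，並改由 `self.validation_step_outputs` 取得 outputs。另外，`configure_optimizers` 中的 scheduler 在此版本恢復使用。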



In [None]:
# %%writefile train_u.py
import torch
from torch.nn import functional as F

from opt import get_opts

# datasets
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader, random_split
from torchvision import transforms as T

# models
from models.networks import LinearModel

# optimizer
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR

from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, TQDMProgressBar
from pytorch_lightning.loggers import TensorBoardLogger

seed_everything(1234, workers=True) # 固定所有的亂數。workers=True讓取data也會是固定的，而非亂數


def get_learning_rate(optimizer):
    """Return the learning rate stored in the first parameter group.

    Yields None when ``optimizer.param_groups`` is empty.
    """
    try:
        first_group = next(iter(optimizer.param_groups))
    except StopIteration:
        return None
    return first_group['lr']


class MNISTSystem(LightningModule):
    """Lightning training system for an MNIST classifier (pytorch_lightning >= 2.0).

    Compared with the ``validation_epoch_end(outputs)`` style, the outputs of
    ``validation_step`` are buffered on the instance and aggregated in
    ``on_validation_epoch_end``. A cosine-annealing learning-rate scheduler
    is enabled.

    Args:
        hparams: parsed options; the code reads ``hidden_dim``, ``root_dir``,
            ``val_size``, ``num_workers``, ``batch_size``, ``lr`` and
            ``num_epochs`` from it.
    """

    def __init__(self, hparams):
        super().__init__()
        # Keep every hyperparameter so it is reachable as self.hparams.<name>.
        self.save_hyperparameters(hparams)

        # Buffer for the per-batch validation results. validation_epoch_end,
        # which used to receive them, was removed in Lightning 2.0, so they
        # are collected here and consumed in on_validation_epoch_end.
        self.validation_step_outputs = []

        # The network is defined in a separate file (models/networks.py).
        # The inline equivalent would be:
        #   nn.Sequential(nn.Linear(28*28, hidden_dim),
        #                 nn.ReLU(True),
        #                 nn.Linear(hidden_dim, 10))
        self.net = LinearModel(self.hparams.hidden_dim)

    def forward(self, x):
        """Run the wrapped network (the reshape lives in models/networks.py)."""
        return self.net(x)

    def prepare_data(self):
        """Download the MNIST train and test splits into ``root_dir``."""
        MNIST(self.hparams.root_dir, train=True, download=True)
        MNIST(self.hparams.root_dir, train=False, download=True)

    def setup(self, stage=None):
        """Load the training split and carve the validation subset out of it.

        The last ``val_size`` slots of the random split become the validation
        set; the remainder is used for training.
        """
        dataset = MNIST(self.hparams.root_dir,
                        train=True,
                        download=False,
                        transform=T.ToTensor())
        train_length = len(dataset)
        self.train_dataset, self.val_dataset = \
            random_split(dataset,
                         [train_length - self.hparams.val_size,
                          self.hparams.val_size])

    def train_dataloader(self):
        """Return the shuffled loader over the training subset."""
        return DataLoader(self.train_dataset,
                          shuffle=True,
                          num_workers=self.hparams.num_workers,
                          # A batch has to fit in (GPU) memory, so the batch
                          # size cannot grow without limit.
                          batch_size=self.hparams.batch_size,
                          # Pinned host memory is intended to speed up
                          # transfers to a GPU.
                          pin_memory=True)

    def val_dataloader(self):
        """Return the unshuffled loader over the validation subset."""
        return DataLoader(self.val_dataset,
                          shuffle=False,
                          num_workers=self.hparams.num_workers,
                          batch_size=self.hparams.batch_size,
                          pin_memory=True)

    def configure_optimizers(self):
        """Create the Adam optimizer and its cosine-annealing scheduler.

        The optimizer is also kept on ``self.optimizer`` so that
        ``training_step`` can log its learning rate.

        Returns:
            A ``([optimizer], [scheduler])`` pair.
        """
        self.optimizer = Adam(self.net.parameters(), lr=self.hparams.lr)

        scheduler = CosineAnnealingLR(self.optimizer,
                                      T_max=self.hparams.num_epochs,  # maximum number of epochs
                                      eta_min=self.hparams.lr / 1e2)  # minimum learning rate

        # Several optimizers/schedulers (e.g. for a GAN) can be returned as
        #   [optimizer_A, optimizer_B], [scheduler_A, scheduler_B]
        # Without a scheduler, returning the optimizer alone is enough.
        return [self.optimizer], [scheduler]

    def training_step(self, batch, batch_idx):
        """Compute the cross-entropy loss of one training batch and log it.

        ``batch`` comes from ``train_dataloader``; ``batch_idx`` is unused.
        """
        images, labels = batch
        logits_predicted = self(images)  # self(...) dispatches to forward()

        loss = F.cross_entropy(logits_predicted, labels)

        self.log('lr', get_learning_rate(self.optimizer))
        # Pass prog_bar=True to also show the loss in the progress bar.
        self.log('train/loss', loss)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute loss and accuracy of one validation batch and buffer them.

        The labels are compared directly with the argmax of the logits, so
        they are treated as class indices here.
        """
        images, labels = batch
        logits_predicted = self(images)

        loss = F.cross_entropy(logits_predicted, labels)
        correct = torch.eq(torch.argmax(logits_predicted, -1), labels)
        acc = torch.sum(correct.to(torch.float32)) / len(labels)

        log = {'val_loss': loss,
               'val_acc': acc}

        self.validation_step_outputs.append(log)

        return log

    def on_validation_epoch_end(self):
        """Average the buffered validation metrics, log them, reset the buffer.

        Replaces ``validation_epoch_end(self, outputs)``, which is no longer
        supported in Lightning 2.0.
        """
        outputs = self.validation_step_outputs
        if not outputs:
            # Nothing was collected; torch.stack would fail on an empty list.
            return

        mean_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        mean_acc = torch.stack([x['val_acc'] for x in outputs]).mean()

        self.log('val/loss', mean_loss, prog_bar=True)
        self.log('val/acc', mean_acc, prog_bar=True)

        # Reset the buffer. Otherwise the results of earlier epochs (and of
        # the sanity-check run) would be averaged into later epochs and the
        # list would grow for the whole training run.
        outputs.clear()


if __name__ == '__main__':
    hparams = get_opts()
    system = MNISTSystem(hparams)

    # Checkpointing: every experiment gets its own folder. The 5 checkpoints
    # with the highest val/acc are kept (use mode='min' when monitoring
    # val/loss instead; save_top_k=-1 keeps every checkpoint).
    checkpoint_options = dict(
        dirpath=f'ckpts/{hparams.exp_name}',
        filename='{epoch:d}',
        monitor='val/acc',
        mode='max',
        save_top_k=5,
    )
    checkpoint_cb = ModelCheckpoint(**checkpoint_options)
    # Refresh the progress bar on every step.
    progress_bar = TQDMProgressBar(refresh_rate=1)

    # TensorBoard logs go to logs/<exp_name>.
    tb_logger = TensorBoardLogger(save_dir="logs",
                                  name=hparams.exp_name,
                                  default_hp_metric=False)

    trainer = Trainer(
        max_epochs=hparams.num_epochs,     # upper bound on training epochs
        callbacks=[checkpoint_cb, progress_bar],
        # resume_from_checkpoint=hparams.ckpt_path would restart an
        # interrupted run from a saved checkpoint (not enabled here).
        logger=tb_logger,
        enable_model_summary=True,         # print the model structure at start
        accelerator='auto',                # pick GPU or CPU automatically
        devices=1,                         # number of devices to use
        num_sanity_val_steps=1,            # validation batches run before training
        benchmark=True,                    # let cuDNN pick the fastest algorithm for fixed-size inputs
        # profiler='simple' would print per-step timings (very verbose).
    )

    # Start training.
    trainer.fit(system)

Writing train_u.py




```python
for param_group in optimizer.param_groups:
  return param_group['lr']

# optimizer.param_groups長相，舉例
[{'amsgrad': False,
  'betas': (0.9, 0.999),
  'eps': 1e-08,
  'lr': 0.001,
  'params': [tensor([[ 2.9064, -0.2141, -0.4037],
           [-0.5718,  1.0375, -0.6862],
           [-0.8372,  0.4380, -0.1572]])],
  'weight_decay': 0}]
```
[optimizer.param_groups的參考](https://www.jb51.net/article/213735.htm)



# <font color="red">Step 5--安裝`pytorch_lightning`與`einops`</font>

注意：此`train.py`（相對於原始版本）有修改一處——不使用 `configure_optimizers` 中的 scheduler；以下實驗是配合`pytorch_lightning==1.6.0`

In [None]:
!pip install pytorch_lightning==1.6.0 --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m582.1/582.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.4/840.4 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install pytorch_lightning==2.0.0 --quiet
# !pip install pytorch_lightning==2.0.1.post0 --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m715.6/715.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.4/840.4 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install einops==0.4.1

Collecting einops==0.4.1
  Downloading einops-0.4.1-py3-none-any.whl (28 kB)
Installing collected packages: einops
Successfully installed einops-0.4.1


In [None]:
# 測試用
# import pytorch_lightning
# pytorch_lightning.__version__

'2.0.1.post0'

# <font color="red">Step 6 -- 執行 `!python train.py 選項`(下一個cell)</font>

## <font color="red">執行`train.py`</font>

In [None]:
!python train.py --root_dir "./" --num_epoch 50 --exp_name train_50

2024-03-07 05:43:07.095839: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-07 05:43:07.095897: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-07 05:43:07.097745: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Global seed set to 1234
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz
100% 99124

## <font color="red">執行`train_u.py`</font>

In [None]:
# 使用GPU 且 pytorch_lightning >= 2.0
!python train_u.py --root_dir "./" --num_epoch 50 --exp_name train_u_50

Global seed set to 1234
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz
100% 9912422/9912422 [00:00<00:00, 111035560.80it/s]
Extracting ./MNIST/raw/train-images-idx3-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./MNIST/raw/train-labels-idx1-ubyte.gz
100% 28881/28881 [00:00<00:00, 171337615.03it/s]
Extracting ./MNIST/raw/train-labels-idx1-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./MNIST/raw/t10k-images-idx3-ubyte.gz
100% 1648877/1648877 [00:00<00:00, 

In [None]:
# 使用CPU(pytorch_lightning==2.0.0 或 2.0.1.post0 結果都一樣)，會反覆出現"...Validation DataLoader..."
# !python train_u.py --root_dir "./" --num_epoch 5 --exp_name train_u_5

Global seed set to 1234
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz
100% 9912422/9912422 [00:00<00:00, 109676164.06it/s]
Extracting ./MNIST/raw/train-images-idx3-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./MNIST/raw/train-labels-idx1-ubyte.gz
100% 28881/28881 [00:00<00:00, 145946619.07it/s]
Extracting ./MNIST/raw/train-labels-idx1-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./MNIST/raw/t10k-images-idx3-ubyte.gz
100% 1648877/1648877 [00:00<00:00, 29868

## <font color="red">壓縮資料夾，用以下載</font>

In [None]:
# !zip -r logs_train_50.zip logs/train_50
!zip -r logs_train_u_50.zip logs/train_u_50

  adding: logs/train_u_50/ (stored 0%)
  adding: logs/train_u_50/version_0/ (stored 0%)
  adding: logs/train_u_50/version_0/hparams.yaml (deflated 16%)
  adding: logs/train_u_50/version_0/events.out.tfevents.1709796671.0874785c57eb.2014.0 (deflated 69%)


In [None]:
!zip -r ckpts_train_u_50.zip ckpts/train_u_50

  adding: ckpts/train_u_50/ (stored 0%)
  adding: ckpts/train_u_50/epoch=49.ckpt (deflated 11%)
  adding: ckpts/train_u_50/epoch=48.ckpt (deflated 11%)
  adding: ckpts/train_u_50/epoch=45.ckpt (deflated 11%)
  adding: ckpts/train_u_50/epoch=46.ckpt (deflated 11%)
  adding: ckpts/train_u_50/epoch=47.ckpt (deflated 11%)


## <font color="red">上傳後，解壓使用</font>

In [None]:
!unzip ckpts_train_u_50.zip
!unzip logs_train_u_50.zip

Archive:  ckpts_train_u_50.zip
   creating: ckpts/train_u_50/
  inflating: ckpts/train_u_50/epoch=49.ckpt  
  inflating: ckpts/train_u_50/epoch=48.ckpt  
  inflating: ckpts/train_u_50/epoch=45.ckpt  
  inflating: ckpts/train_u_50/epoch=46.ckpt  
  inflating: ckpts/train_u_50/epoch=47.ckpt  
Archive:  logs_train_u_50.zip
   creating: logs/train_u_50/
   creating: logs/train_u_50/version_0/
  inflating: logs/train_u_50/version_0/hparams.yaml  
  inflating: logs/train_u_50/version_0/events.out.tfevents.1709796671.0874785c57eb.2014.0  


# <font color="red">Step 7 -- 用 `tensorboard` 檢視實驗log</font>
[Can I use TensorBoard with Google Colab?](https://stackoverflow.com/questions/47818822/can-i-use-tensorboard-with-google-colab)

In [None]:
!pip install tensorrt --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for tensorrt (setup.py) ... [?25l[?25hdone


In [None]:
%load_ext tensorboard

In [None]:
# 產生的結果無法儲存
%tensorboard --logdir logs

In [None]:
!rm -rf logs

In [None]:
!rm -rf ckpts