In [1]:
# Fixing path: later cells use paths relative to the lab directory
#  (e.g. training/run_experiment.py), so first check the kernel's working directory.
import os

os.getcwd()   # Verify where it is right now...

'/home/jobquiroz/full_stack_deep_learning/lab04'

In [2]:
# Solution, go to lab directory.
# The default is the original author's checkout; set the FSDL_LAB_DIR environment
#  variable to point at your own copy instead of editing this cell.
lab_dir = os.environ.get("FSDL_LAB_DIR", '/home/jobquiroz/full_stack_deep_learning/lab04/')
os.chdir(lab_dir)

In [3]:
from IPython.display import display, HTML, IFrame

full_width = True
frame_height = 720  # adjust for your screen

# CSS overrides injected into the notebook page when full-width mode is on
full_width_styles = (
    "<style>.container { width:100% !important; }</style>",
    "<style>.output_result { max-width:100% !important; }</style>",
)

if full_width:  # if we want the notebook to take up the whole width
    # add each style rule to the notebook's HTML directly, one display call per rule
    for css in full_width_styles:
        display(HTML(css))

In [4]:
from text_recognizer.data.iam import IAM  # base dataset of images of handwritten text
from text_recognizer.data import IAMLines  # processed version split into individual lines
from text_recognizer.models import LineCNNTransformer  # simple CNN encoder / Transformer decoder

# print the class docstring of the base IAM dataset
print(IAM.__doc__)

# uncomment a line below for details on either class
#  (the IPython `??` suffix displays an object's source code)
# IAMLines??  
# LineCNNTransformer??

A dataset of images of handwritten text written on a form underneath a typewritten prompt.

    "The IAM Lines dataset, first published at the ICDAR 1999, contains forms of unconstrained handwritten text,
    which were scanned at a resolution of 300dpi and saved as PNG images with 256 gray levels."
    From http://www.fki.inf.unibe.ch/databases/iam-handwriting-database

    Images are identified by their "form ID". These IDs are used to separate train, validation and test splits,
    as keys for dictonaries returning label and image crop region data, and more.

    The data split we will use is
    IAM lines Large Writer Independent Text Line Recognition Task (LWITLRT): 9,862 text lines.
        The validation set has been merged into the train set.
        The train set has 7,101 lines from 326 writers.
        The test set has 1,861 lines from 128 writers.
        The text lines of all data sets are mutually exclusive, thus each writer has contributed to one set only.
    


The cell below will train a model on 10% of the data for two epochs.

It takes up to a few minutes to run on commodity hardware, including data download and preprocessing. As it's running, continue reading below.

In [15]:
%%time
import torch


gpus = int(torch.cuda.is_available()) 

%run training/run_experiment.py --model_class LineCNNTransformer --data_class IAMLines \
  --loss transformer --batch_size 32 --gpus {gpus} --max_epochs 2 \
  --limit_train_batches 0.1 --limit_val_batches 0.1 --limit_test_batches 0.1 --log_every_n_steps 10

Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                      | Type               | Params
------------------------------------------------------------------
0  | model                     | LineCNNTransformer | 4.3 M 
1  | model.line_cnn            | LineCNN            | 1.6 M 
2  | model.embedding           | Embedding          | 21.2 K
3  | model.fc                  | Linear             | 21.3 K
4  | model.pos_encoder         | PositionalEncoding | 0     
5  | model.transformer_decoder | TransformerDecoder | 2.6 M 
6  | train_acc                 | Accuracy           | 0     
7  | val_acc                   | Accuracy           | 0     
8  | test_acc      

Model State Dict Disk Size: 17.23 MB


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

Best model saved at: /home/jobquiroz/full_stack_deep_learning/lab04/training/logs/lightning_logs/version_1/epoch=0000-validation.loss=3.127-validation.cer=1.893.ckpt


───────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
───────────────────────────────────────────────────────────────────────────────────────
        test/cer             2.03977632522583
        test/loss           3.2378978729248047
───────────────────────────────────────────────────────────────────────────────────────
CPU times: user 41.8 s, sys: 7.59 s, total: 49.4 s
Wall time: 50.1 s


### TensorBoard

In [6]:
# we use a sequence of bash commands to get the latest experiment's directory
#  by hand, you can just copy and paste it from the terminal

list_all_log_files = "find training/logs/lightning_logs/"  # find avoids issues ls has with \n in filenames
# keep only paths that end in "_<digits>"; at least one digit is required so that
#  a path merely ending in "_" is not mistaken for a versioned folder
filter_to_folders = "grep '_[0-9][0-9]*$'"  # regex match on end of line
sort_version_descending = "sort -Vr"  # uses "version" sorting (-V) and reverses (-r)
take_first = "head -n 1"  # the first n elements, n=1

In [7]:
# run the pipeline in the shell; `!` captures its output lines as a list, and the
#  trailing-comma unpacking requires that list to hold exactly one line
latest_log, = ! {list_all_log_files} | {filter_to_folders} | {sort_version_descending} | {take_first}
latest_log

'training/logs/lightning_logs/version_0'

In [8]:
# list the contents of the latest experiment's log directory
!ls -lh {latest_log}

total 99M
-rw-r--r-- 1 jobquiroz jobquiroz  50M Aug 30 01:19 'epoch=0000-validation.loss=3.127-validation.cer=1.893.ckpt'
-rw-r--r-- 1 jobquiroz jobquiroz  50M Aug 30 01:19 'epoch=0001-validation.loss=3.122-validation.cer=1.893.ckpt'
-rw-r--r-- 1 jobquiroz jobquiroz 1.3K Aug 30 01:19  events.out.tfevents.1661822350.deep-learning.17708.0
-rw-r--r-- 1 jobquiroz jobquiroz  176 Aug 30 01:19  events.out.tfevents.1661822395.deep-learning.17708.1
-rw-r--r-- 1 jobquiroz jobquiroz    3 Aug 30 01:19  hparams.yaml


In [9]:
# load the IPython extension for TensorBoard before using the %tensorboard magic below
%load_ext tensorboard

In [16]:
# same command works in terminal, with "{arguments}" replaced with values or "$VARIABLES"

port = 6006  # pick an open port on your machine
host = "0.0.0.0" # allow connections from the internet
                 #   watch out! make sure you turn TensorBoard off
                 #   when you are finished (a later cell kills the TensorBoard processes)

# launch TensorBoard on just the latest experiment's log directory
%tensorboard --logdir {latest_log} --port {port} --host {host}

TensorBoard for all experiments under `training/logs/lightning_logs`:

In [17]:
%tensorboard --logdir training/logs/lightning_logs --port {port + 1} --host "0.0.0.0"

In [18]:
import tensorboard.manager

# get the process IDs for all tensorboard instances
pids = [tb.pid for tb in tensorboard.manager.get_all()]

done_with_tensorboard = False

if done_with_tensorboard:
    # kill processes
    for pid in pids:
        !kill {pid} 2> /dev/null
        
    # remove the temporary files that sometimes persist, see https://stackoverflow.com/a/59582163
    !rm -rf {tensorboard.manager._get_info_dir()}

## W & B

In [19]:
import wandb

# print wandb's module docstring, which lists its most commonly used functions
print(wandb.__doc__)

Use wandb to track machine learning work.

The most commonly used functions/objects are:
  - wandb.init — initialize a new run at the top of your training script
  - wandb.config — track hyperparameters and metadata
  - wandb.log — log metrics and media over time within your training loop

For guides and examples, see https://docs.wandb.com/guides.

For scripts and interactive notebooks, see https://github.com/wandb/examples.

For reference documentation, see https://docs.wandb.com/ref/python.



In [20]:
# show where run_experiment.py handles `args.wandb`: the matching line plus five
#  lines of trailing context (-A 5), truncated to the first six output lines
!grep "args.wandb" -A 5 training/run_experiment.py | head -n 6

    if args.wandb:
        logger = pl.loggers.WandbLogger(log_model="all", save_dir=str(log_dir), job_type="train")
        logger.watch(model, log_freq=max(100, args.log_every_n_steps))
        logger.log_hyperparams(vars(args))
        experiment_dir = logger.experiment.dir
    callbacks += [cb.ModelSizeLogger(), cb.LearningRateMonitor()]


In [21]:
from pytorch_lightning.loggers import WandbLogger


# the IPython `??` suffix displays the source code of WandbLogger
WandbLogger??

In [25]:
# log this machine in to W&B (reports the current user if already logged in)
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mjobquiroz[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [26]:
%%time
# same training command as the earlier run, but for 10 epochs and with --wandb enabled
%run training/run_experiment.py --model_class LineCNNTransformer --data_class IAMLines \
  --loss transformer --batch_size 32 --gpus {gpus} --max_epochs 10 \
  --log_every_n_steps 10 --wandb --limit_test_batches 0.1 \
  --limit_train_batches 0.1 --limit_val_batches 0.1
    
# keep a handle on the run before finishing it; later cells read its url and id
last_expt = wandb.run

wandb.finish()  # necessary in this style of in-notebook experiment running, not necessary in CLI

[34m[1mwandb[0m: Currently logged in as: [33mjobquiroz[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                      | Type               | Params
------------------------------------------------------------------
0  | model                     | LineCNNTransformer | 4.3 M 
1  | model.line_cnn            | LineCNN            | 1.6 M 
2  | model.embedding           | Embedding          | 21.2 K
3  | model.fc                  | Linear             | 21.3 K
4  | model.pos_encoder         | PositionalEncoding | 0     
5  | model.transformer_decoder | TransformerDecoder | 2.6 M 
6  | train_acc                 | Accuracy           | 0     

Model State Dict Disk Size: 17.23 MB


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

Best model saved at: /home/jobquiroz/full_stack_deep_learning/lab04/training/logs/lightning_logs/version_2/epoch=0009-validation.loss=2.365-validation.cer=0.849.ckpt
Best model also uploaded to W&B 


───────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
───────────────────────────────────────────────────────────────────────────────────────
        test/cer             0.908748209476471
        test/loss           2.5084540843963623
───────────────────────────────────────────────────────────────────────────────────────


VBox(children=(Label(value='526.725 MB of 526.725 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0,…

0,1
epoch,▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇█
optimizer/lr-Adam,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
size/mb_disk,▁
size/nparams,▁
test/cer,▁
test/loss,▁
train/loss,██▆▆▆▇▇▇▇▆▅▃▂▃▃▂▂▂▂▂▁▁▂▁▁▁▂▁▁
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
validation/cer,███▇▇▇▂▇▇▁
validation/loss,███▅▃▂▂▂▁▁

0,1
epoch,10.0
optimizer/lr-Adam,0.001
size/mb_disk,17.22701
size/nparams,4297331.0
test/cer,0.90875
test/loss,2.50845
train/loss,2.49651
trainer/global_step,290.0
validation/cer,0.84905
validation/loss,2.36482


CPU times: user 2min 41s, sys: 32.4 s, total: 3min 13s
Wall time: 3min 31s


In [29]:
# display the run object captured from the W&B experiment above
last_expt

### Runs

In [30]:
# print the run's URL, then embed that page in the notebook
print(last_expt.url)
IFrame(last_expt.url, width="100%", height=frame_height)

https://wandb.ai/jobquiroz/full_stack_deep_learning-lab04/runs/1p4bakfy


In [31]:
# Build the URL of the run's train-predictions table artifact.
# Split on the final "/runs/" path segment only: a bare split("runs") would cut the
#  URL in the wrong place if the entity or project name happened to contain "runs".
project_url = last_expt.url.rsplit("/runs/", 1)[0]
table_versions_url = f"{project_url}/artifacts/run_table/run-{last_expt.id}-trainpredictions/"
table_data_url = table_versions_url + "v0/files/train/predictions.table.json"

# print the table's URL, then embed that page in the notebook
print(table_data_url)
IFrame(src=table_data_url, width="100%", height=frame_height)

https://wandb.ai/jobquiroz/full_stack_deep_learning-lab04/artifacts/run_table/run-1p4bakfy-trainpredictions/v0/files/train/predictions.table.json


In [32]:
from text_recognizer.callbacks.imtotext import ImageToTextTableLogger


# the IPython `??` suffix displays the source code of the ImageToTextTableLogger callback
ImageToTextTableLogger??

In [33]:
from text_recognizer.lit_models.base import BaseImageToTextLitModel

# the IPython `??` suffix displays the source code of the add_on_logged_batches method
BaseImageToTextLitModel.add_on_logged_batches??

**TODO:** This notebook is not finished yet; the remaining sections are still to be written.