In [1]:
import os

os.getcwd()   # show the kernel's current working directory before changing it

'/home/jobquiroz/full_stack_deep_learning/lab05/notebooks'

In [2]:
# Solution: go to the lab directory (the parent of notebooks/).
# The path is derived from the current working directory instead of a
# hard-coded, user-specific absolute path, so the notebook also runs on other
# machines. The guard keeps the cell idempotent: once we have left notebooks/,
# re-running it does not climb any further up the tree.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir(os.path.dirname(os.getcwd()))

In [3]:
!pre-commit run --all-files

trim trailing whitespace.................................................[42mPassed[m
check toml...............................................................[42mPassed[m
check yaml...............................................................[42mPassed[m
check json...............................................................[42mPassed[m
check for merge conflicts................................................[42mPassed[m
check for added large files..............................................[42mPassed[m
debug statements (python)................................................[42mPassed[m
detect private key.......................................................[42mPassed[m
black....................................................................[42mPassed[m
flake8...............................................(no files to check)[46;30mSkipped[m
shellcheck...........................................(no files to check)[46;30mSkipped[m


In [4]:
!pre-commit run --all-files

trim trailing whitespace.................................................[42mPassed[m
check toml...............................................................[42mPassed[m
check yaml...............................................................[42mPassed[m
check json...............................................................[42mPassed[m
check for merge conflicts................................................[42mPassed[m
check for added large files..............................................[42mPassed[m
debug statements (python)................................................[42mPassed[m
detect private key.......................................................[42mPassed[m
black....................................................................[42mPassed[m
flake8...............................................(no files to check)[46;30mSkipped[m
shellcheck...........................................(no files to check)[46;30mSkipped[m


In [5]:
!cat .pre-commit-config.yaml

repos:
  # a set of useful Python-based pre-commit hooks
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.1.0
    hooks:
      # list of definitions and supported hooks: https://pre-commit.com/hooks.html
      - id: trailing-whitespace      # removes any whitespace at the ends of lines
      - id: check-toml               # check toml syntax by loading all toml files
      - id: check-yaml               # check yaml syntax by loading all yaml files
      - id: check-json               # check-json syntax by loading all json files
      - id: check-merge-conflict     # check for files with merge conflict strings
        args: ['--assume-in-merge']  #  and run this check even when not explicitly in a merge
      - id: check-added-large-files  # check that no "large" files have been added
        args: ['--maxkb=10240']      #  where large means 10MB+, as in Hugging Face's git server
      - id: debug-statements         # check for python debug statements

In [6]:
!cat .pre-commit-config.yaml | grep repos -A 15

repos:
  # a set of useful Python-based pre-commit hooks
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.1.0
    hooks:
      # list of definitions and supported hooks: https://pre-commit.com/hooks.html
      - id: trailing-whitespace      # removes any whitespace at the ends of lines
      - id: check-toml               # check toml syntax by loading all toml files
      - id: check-yaml               # check yaml syntax by loading all yaml files
      - id: check-json               # check-json syntax by loading all json files
      - id: check-merge-conflict     # check for files with merge conflict strings
        args: ['--assume-in-merge']  #  and run this check even when not explicitly in a merge
      - id: check-added-large-files  # check that no "large" files have been added
        args: ['--maxkb=10240']      #  where large means 10MB+, as in Hugging Face's git server
      - id: debug-statements         # check for python debug statements

In [7]:
!cat .pre-commit-config.yaml | grep "flake8 python" -A 10

  # flake8 python linter with all the fixins
  - repo: https://github.com/PyCQA/flake8
    rev: 3.9.2
    hooks:
      - id: flake8
        exclude: (lab01|lab02|lab03|lab04|lab06|lab07|lab08)
        additional_dependencies: [
          flake8-annotations, flake8-bandit, flake8-bugbear, flake8-black, flake8-docstrings,
          flake8-import-order, darglint, mypy, pycodestyle, pydocstyle]
        args: ["--config", ".flake8"]
    # additional configuration of flake8 and extensions in .flake8


In [8]:
!cat .flake8

[flake8]
select = ANN,B,B9,BLK,C,D,E,F,I,S,W
  # only check selected error codes
max-complexity = 12
  # C9 - flake8 McCabe Complexity checker -- threshold
max-line-length = 120
  # E501 - flake8 -- line length too long, actually handled by black
extend-ignore =
  # E W - flake8 PEP style check
    E203,E402,E501,W503,  # whitespace, import, line length, binary operator line breaks
  # S - flake8-bandit safety check
    S101,S311,S105,  # assert removed in bytecode, pRNG not secure, hardcoded password
  # ANN - flake8-annotations type annotation check
    ANN,ANN002,ANN003,ANN101,ANN102,ANN202,  # ignore all for now, but always ignore some
  # D1 - flake8-docstrings docstring style check
    D100,D102,D103,D104,D105,  # missing docstrings
  # D2 D4 - flake8-docstrings docstring style check
    D200,D205,D400,D401,  # whitespace issues and first line content
  # DAR - flake8-darglint docstring correctness check
    DAR103,  # mismatched or missing type in docstring
a

In [9]:
!cat tasks/lint.sh

#!/bin/bash
set -uo pipefail
set +e

FAILURE=false

# apply automatic formatting
echo "black"
pre-commit run black || FAILURE=true

# check for python code style violations, see .flake8 for details
echo "flake8"
pre-commit run flake8 || FAILURE=true

# check for shell scripting style violations and common bugs
echo "shellcheck"
pre-commit run shellcheck || FAILURE=true

# check python types
echo "mypy"
pre-commit run mypy || FAILURE=true

if [ "$FAILURE" = true ]; then
  echo "Linting failed"
  exit 1
fi
echo "Linting passed"
exit 0


In [10]:
!cat tasks/lint.sh

#!/bin/bash
set -uo pipefail
set +e

FAILURE=false

# apply automatic formatting
echo "black"
pre-commit run black || FAILURE=true

# check for python code style violations, see .flake8 for details
echo "flake8"
pre-commit run flake8 || FAILURE=true

# check for shell scripting style violations and common bugs
echo "shellcheck"
pre-commit run shellcheck || FAILURE=true

# check python types
echo "mypy"
pre-commit run mypy || FAILURE=true

if [ "$FAILURE" = true ]; then
  echo "Linting failed"
  exit 1
fi
echo "Linting passed"
exit 0


In [11]:
script_filename = "tasks/lint.sh"
# run only the shellcheck hook, limited to this one file; IPython substitutes {script_filename}
!pre-commit run shellcheck --files {script_filename}

shellcheck...............................................................[42mPassed[m


In [12]:
!head -n 3 tasks/lint.sh

#!/bin/bash
set -uo pipefail
set +e


### Testing ML Codebases

**PyTest**

In [13]:
from text_recognizer.lit_models.metrics import test_character_error_rate

test_character_error_rate??

In [14]:
!pytest text_recognizer/lit_models/metrics.py

platform linux -- Python 3.7.13, pytest-7.1.1, pluggy-1.0.0
rootdir: /home/jobquiroz/full_stack_deep_learning/lab05
plugins: typeguard-2.13.3, anyio-3.6.1, cov-3.0.0
collected 1 item                                                               [0m[1m

text_recognizer/lit_models/metrics.py [32m.[0m[32m                                  [100%][0m



In [15]:
!ls text_recognizer/tests

test_callback_utils.py	test_iam.py


In [17]:
from text_recognizer.tests import test_callback_utils

test_callback_utils.__doc__

'Tests for the text_recognizer.callbacks.util module.'

In [18]:
from text_recognizer.callbacks.util import check_and_warn

check_and_warn??

In [19]:

test_callback_utils.test_check_and_warn_simple??

In [21]:
from text_recognizer.lit_models.util import first_appearance


first_appearance??

In [22]:
import torch


# four rows of three entries each; look up the value 3 in this batch
sample_rows = torch.tensor([[1, 2, 3], [2, 3, 3], [1, 1, 1], [3, 1, 1]])
first_appearance(sample_rows, 3)

tensor([2, 1, 3, 0])

In [23]:
!pytest --doctest-modules text_recognizer/lit_models/util.py

platform linux -- Python 3.7.13, pytest-7.1.1, pluggy-1.0.0
rootdir: /home/jobquiroz/full_stack_deep_learning/lab05
plugins: typeguard-2.13.3, anyio-3.6.1, cov-3.0.0
collected 2 items                                                              [0m[1m

text_recognizer/lit_models/util.py [32m.[0m[32m.[0m[32m                                    [100%][0m



In [24]:
!grep "assert" -r text_recognizer/data

text_recognizer/data/iam_paragraphs.py:        self.input_dims = metadata.DIMS  # We assert that this is correct in setup()
text_recognizer/data/iam_paragraphs.py:        self.output_dims = metadata.OUTPUT_DIMS  # We assert that this is correct in setup()
text_recognizer/data/iam_paragraphs.py:    assert input_dims is not None and input_dims[1] >= max_image_shape[0] and input_dims[2] >= max_image_shape[1]
text_recognizer/data/iam_paragraphs.py:    assert output_dims is not None and output_dims[0] >= properties["label_length"]["max"] + 2
text_recognizer/data/iam_paragraphs.py:    assert len(crops) == len(labels)
text_recognizer/data/iam_paragraphs.py:    assert len(ordered_crops) == len(ordered_labels)
text_recognizer/data/iam.py:    assert any(region is not None for region in line_regions), "Line regions cannot be None"
text_recognizer/data/iam_lines.py:        self.input_dims = metadata.DIMS  # We assert that this is correct in setup()
text_recognizer/data/iam_lines.py:       

In [25]:
from text_recognizer.tests.test_iam import test_iam_data_splits

test_iam_data_splits??

In [26]:
test_iam_data_splits()

In [27]:
!pytest --markers | head -n 10

@pytest.mark.no_cover: disable coverage for this test.

@pytest.mark.anyio: mark the (coroutine function) test to be run asynchronously via anyio.


@pytest.mark.skip(reason=None): skip the given test function with an optional reason. Example: skip(reason="no way of currently testing this") skips the test.

@pytest.mark.skipif(condition, ..., *, reason=...): skip the given test function if any of the conditions evaluate to True. Example: skipif(sys.platform == 'win32') skips the test if we are on the win32 platform. See https://docs.pytest.org/en/stable/reference/reference.html#pytest-mark-skipif



In [28]:
!wandb login  # one test requires wandb authentication

!pytest -m "not data and not slow"

[34m[1mwandb[0m: Currently logged in as: [33mjobquiroz[0m. Use [1m`wandb login --relogin`[0m to force relogin
platform linux -- Python 3.7.13, pytest-7.1.1, pluggy-1.0.0
rootdir: /home/jobquiroz/full_stack_deep_learning/lab05
plugins: typeguard-2.13.3, anyio-3.6.1, cov-3.0.0
collected 5 items                                                              [0m[1m

text_recognizer/tests/test_callback_utils.py [32m.[0m[32m.[0m[32m.[0m[32m                         [ 60%][0m
text_recognizer/tests/test_iam.py [32m.[0m[32m.[0m[32m                                     [100%][0m



In [29]:
!ls

notebooks  tasks  text_recognizer  training  wandb


In [30]:
from text_recognizer.data import FakeImageData


FakeImageData.__doc__

'Fake images dataset.'

In [31]:
!cat training/tests/test_run_experiment.sh

#!/bin/bash
set -uo pipefail
set +e

FAILURE=false

echo "running full loop test with CNN on fake data"
python training/run_experiment.py --data_class=FakeImageData --model_class=CNN --conv_dim=2 --fc_dim=2 --loss=cross_entropy --num_workers=4 --max_epochs=1 || FAILURE=true

echo "running fast_dev_run test of real model class on real data"
python training/run_experiment.py --data_class=IAMParagraphs --model_class=ResnetTransformer --loss=transformer \
  --tf_dim 4 --tf_fc_dim 2 --tf_layers 2 --tf_nhead 2 --batch_size 2 --lr 0.0001 \
  --fast_dev_run --num_sanity_val_steps 0 \
  --num_workers 1 || FAILURE=true

if [ "$FAILURE" = true ]; then
  echo "Test for run_experiment.py failed"
  exit 1
fi
echo "Tests for run_experiment.py passed"
exit 0


In [32]:
! ./training/tests/test_run_experiment.sh

running full loop test with CNN on fake data
Missing logger folder: training/logs/lightning_logs
Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name           | Type      | Params
---------------------------------------------
0 | model          | CNN       | 874   
1 | model.conv1    | ConvBlock | 20    
2 | model.conv2    | ConvBlock | 38    
3 | model.dropout  | Dropout   | 0     
4 | model.max_pool | MaxPool2d | 0     
5 | model.fc1      | Linear    | 786   
6 | model.fc2      | Linear    | 30    
7 | train_acc      | Accuracy  | 0     
8 | val_acc        | Accuracy  | 0     
9 | test_acc       | Accuracy  | 0     
---------------------------------------------
874       Trainable params
0         Non-traina

Tests for run_experiment.py passed


In [33]:
!cat training/tests/test_memorize_iam.sh

#!/bin/bash
set -uo pipefail
set +e

# tests whether we can achieve a criterion loss
#  on a single batch within a certain number of epochs

FAILURE=false

# constants and CLI args set by aiming for <5 min test on commodity GPU,
#   including data download step
MAX_EPOCHS="${1:-100}"  # syntax for basic optional arguments in bash
CRITERION="${2:-1.0}"

# train on GPU if it's available
GPU=$(python -c 'import torch; print(int(torch.cuda.is_available()))')

python ./training/run_experiment.py \
  --data_class=IAMParagraphs --model_class=ResnetTransformer --loss=transformer \
  --limit_test_batches 0.0 --overfit_batches 1 --num_sanity_val_steps 0 \
  --augment_data false --tf_dropout 0.0 \
  --gpus "$GPU" --precision 16 --batch_size 16 --lr 0.0001 \
  --log_every_n_steps 25 --max_epochs "$MAX_EPOCHS"  --num_workers 2 --wandb || FAILURE=true

python -c "import json; loss = json.load(open('training/logs/wandb/latest-run/files/wandb-summary.json'))['train/loss']; asse

In [34]:
print(int(torch.cuda.is_available()))

1


In [35]:
%%time
# switch for the memorization test below; set to False to skip it
running_memorization = True

if running_memorization:
    # handed to the script as its first and second positional arguments
    max_epochs = 1000
    loss_criterion = 0.05
    !./training/tests/test_memorize_iam.sh {max_epochs} {loss_criterion}

[34m[1mwandb[0m: Currently logged in as: [33mjobquiroz[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.13.2 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.12.17
[34m[1mwandb[0m: Run data is saved locally in [35m[1mtraining/logs/wandb/run-20220831_195214-2v3kp9yq[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mdauntless-frog-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/jobquiroz/full_stack_deep_learning-lab05_training[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/jobquiroz/full_stack_deep_learning-lab05_training/runs/2v3kp9yq[0m
[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
Using 16bit native Automatic Mixed Precision (AMP)
Trainer already configured with model summary callbacks: [<class 'pytorch_l

In [36]:
!python training/run_experiment.py --help | grep -A 1 -e "^\s*--profile\s"

  --profile             If passed, uses the PyTorch Profiler to track
                        computation, exported as a Chrome-style trace.


In [37]:
!cat training/run_experiment.py | grep args.profile -A 5

    if args.profile:
        sched = torch.profiler.schedule(wait=0, warmup=3, active=4, repeat=0)
        profiler = pl.profiler.PyTorchProfiler(export_to_chrome=True, schedule=sched, dirpath=experiment_dir)
        profiler.STEP_FUNCTIONS = {"training_step"}  # only profile training
    else:
        profiler = pl.profiler.PassThroughProfiler()


In [38]:
import glob

import torch
import wandb

from text_recognizer.data.base_data_module import DEFAULT_NUM_WORKERS


# make it easier to separate these from training runs
%env WANDB_JOB_TYPE=profile

batch_size = 16
num_workers = DEFAULT_NUM_WORKERS  # change this number later and see how the results change
gpus = 1  # must be run with accelerator

%run training/run_experiment.py --wandb --profile \
  --max_epochs=1 \
  --num_sanity_val_steps=0 --limit_val_batches=0 --limit_test_batches=0 \
  --model_class=ResnetTransformer --data_class=IAMParagraphs --loss=transformer \
  --batch_size={batch_size} --num_workers={num_workers} --precision=16 --gpus=1

latest_expt = wandb.run

try:  # add execution trace to logged and versioned binaries
    folder = wandb.run.dir
    trace_matcher = wandb.run.dir + "/*.pt.trace.json"
    trace_file = glob.glob(trace_matcher)[0]
    trace_at = wandb.Artifact(name=f"trace-{wandb.run.id}", type="trace")
    trace_at.add_file(trace_file, name="training_step.pt.trace.json")
    wandb.log_artifact(trace_at)
except IndexError:
    print("trace not found")

wandb.finish()

env: WANDB_JOB_TYPE=profile


[34m[1mwandb[0m: Currently logged in as: [33mjobquiroz[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
Using 16bit native Automatic Mixed Precision (AMP)
Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
IAMParagraphs.setup(fit): Loading IAM paragraph regions and lines...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                      | Type                    | Params
-----------------------------------------------------------------------
0  | model                     | ResnetTransformer       | 14.0 M
1  | model.resnet              | Sequential              | 11.2 M
2  | model.encoder_projection  | Conv2d                  | 131 K 
3  | model.enc_pos_encoder     | PositionalEncodingImage | 0     
4  | model.embedding           |

Model State Dict Disk Size: 56.06 MB


Training: 0it [00:00, ?it/s]

FIT Profiler Report
Profile stats for: records
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          ProfilerStep*        10.52%     291.233ms        48.38%        1.339s     334.731ms       0.000us         0.00%     126.620ms      31.655ms             4  
                        [pl][profile]run_training_batch         0.12%       3.191ms        35.47%     981.718ms     327.239ms       0.000us         0.00%     28

VBox(children=(Label(value='24.023 MB of 24.023 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, m…

0,1
epoch,▁
optimizer/lr-Adam,▁▁
size/mb_disk,▁
size/nparams,▁
train/loss,▁
trainer/global_step,▁▁███

0,1
epoch,0.0
optimizer/lr-Adam,0.001
size/mb_disk,56.06463
size/nparams,13988756.0
train/loss,3.17177
trainer/global_step,49.0


In [39]:
# the run's URL with a trailing "tensorboard" path segment
your_tensorboard_url = "/".join([latest_expt.url, "tensorboard"])

print(your_tensorboard_url)

https://wandb.ai/jobquiroz/full_stack_deep_learning-lab05_training/runs/3ri7hbh6/tensorboard


In [40]:
# a previously logged trace, kept for comparison with the link built below
example_trace_url = "https://wandb.ai/cfrye59/fsdl-text-recognizer-2022-training/artifacts/trace/trace-67j1qxws/latest/files/training_step.pt.trace.json"

# everything in the run URL before "/runs/" is the prefix for the artifact path
project_url = latest_expt.url.partition("/runs/")[0]
trace_files_url = f"{project_url}/artifacts/trace/trace-{latest_expt.id}/latest/files/"
trace_url = f"{trace_files_url}training_step.pt.trace.json"

print(trace_url)

https://wandb.ai/jobquiroz/full_stack_deep_learning-lab05_training/artifacts/trace/trace-3ri7hbh6/latest/files/training_step.pt.trace.json
