In [4]:
import torch

# Query the CUDA devices visible to PyTorch.
# Only the last expression of a cell is echoed, so the device count on the
# next line is computed but never displayed; the cell output is the name of
# device 0.
torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla V100-SXM2-16GB'

## Reproducibility Attempts on Network Deconvolution

https://github.com/yechengxi/deconvolution

### Parameters
* Model Architectures 
    * VGG-16 --- vgg16
    * ResNet-18 --- resnet18d
    * Preact-18 --- preact
    * DenseNet-121 --- densenet121
    * ResNext-29 --- resnext
    * MobileNet v2 --- mobilev2
    * DPN-92 --- dpn
    * PNASNetA --- pnasnetA
    * SENet-18 --- senet
    * EfficientNet --- efficient
* Datasets --- [CIFAR-10, CIFAR-100]
* Performance enhancement techniques --- [Batch Normalization, Network Deconvolution]
* Epochs --- [1, 20, 100]
* Attempts --- [1, 2, 3]
* Optimizer --- SGD

### Hyper-Parameters
* `batch_size` = 128
* `learning_rate` = 0.1
* `weight_decay` = 0.001

# <span style="color:blue">CIFAR-10</span>

# VGG-16 --- vgg16

#### batch normalization --- vgg16

In [6]:
# Batch-normalization baseline (--deconv False, --block-fc 0): VGG-16, 1 epoch, CIFAR-10.
# The dataset flag is cifar10 to match this section and the recorded output,
# which reports "Preparing CIFAR-10 dataset" and a cifar10 result path.
!python main.py --lr .1 --optimizer SGD --arch vgg16 --epochs 1 --dataset cifar10  --batch-size 128 --msg True --deconv False --block-fc 0 --wd .001

==> Preparing data..
| Preparing CIFAR-10 dataset...
Files already downloaded and verified
Files already downloaded and verified
2024-03-13 20:55:48.025766: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
==> Building model..
Namespace(msg=1, resume='', use_gpu=True, num_workers=16, result_path='checkpoints/cifar10,vgg16,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0/03-13-20.55', checkpoint_path='checkpoints/cifar10,vgg16,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0/03-13-20.55', checkpoint_epoch=-1, print_freq=20, seed=0, optimizer='SGD', lr=0.1, lr_scheduler='cosine', momentum=0.9, weight_decay=0.001, batch_size=1

#### network deconvolution --- vgg16

In [8]:
# Network-deconvolution run (--deconv True, --block-fc 512): VGG-16, 1 epoch, CIFAR-10.
!python main.py --lr .1 --optimizer SGD --arch vgg16 --epochs 1 --dataset cifar10  --batch-size 128 --msg True --deconv True --block-fc 512 --wd .001

==> Preparing data..
| Preparing CIFAR-10 dataset...
Files already downloaded and verified
Files already downloaded and verified
2024-03-13 21:00:08.063912: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
==> Building model..
************ Batch norm disabled when deconv is used. ************
Namespace(msg=1, resume='', use_gpu=True, num_workers=16, result_path='checkpoints/cifar10,vgg16,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.1,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.512/03-13-20.59', checkpoint_path='checkpoints/cifar10,vgg16,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.1,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.512/03-13-20.59', checkpoint_epoch=-1, print_freq=20, seed=0, optimizer='SGD', lr=0.1

# ResNet-18 --- resnet18d

#### batch normalization --- resnet18d

Issue resolved: `No module named 'torchvision.models.utils'` (see the Stack Overflow link below).

https://stackoverflow.com/questions/70998767/no-module-named-torchvision-models-utils

In [None]:
# Batch-normalization baseline (--deconv False, --block-fc 0): ResNet-18 (resnet18d), 20 epochs, CIFAR-10.
!python main.py --lr .1 --optimizer SGD --arch resnet18d --epochs 20 --dataset cifar10  --batch-size 128 --msg True --deconv False --block-fc 0 --wd .001

#### network deconvolution --- resnet18d

In [None]:
# Network-deconvolution run (--deconv True, --block-fc 512): ResNet-18 (resnet18d), 20 epochs, CIFAR-10.
!python main.py --lr .1 --optimizer SGD --arch resnet18d --epochs 20 --dataset cifar10  --batch-size 128 --msg True --deconv True --block-fc 512 --wd .001

In [None]:
# CUDA_VISIBLE_DEVICES=0 python main.py --lr .1 --optimizer SGD --arch resnet --epochs 20 --dataset cifar10  --batch-size 128 --msg True --deconv True --block-fc 512 --wd .001

# Preact-18 --- preact

#### batch normalization --- preact

```
Traceback (most recent call last):
  File "/home/oruma001/CS895/project/codebase/main.py", line 271, in <module>
    net = PreActResNet18(num_classes=args.num_outputs,deconv=args.deconv,delinear=args.delinear,channel_deconv=args.channel_deconv)
NameError: name 'PreActResNet18' is not defined
```

In [7]:
# Batch-normalization baseline (--deconv False, --block-fc 0): PreAct-18, 20 epochs, CIFAR-10.
# NOTE(review): the recorded output below shows ep.1 in result_path, so it came
# from an earlier --epochs 1 run; re-run this cell to refresh it.
!python main.py --lr .1 --optimizer SGD --arch preact --epochs 20 --dataset cifar10  --batch-size 128 --msg True --deconv False --block-fc 0 --wd .001

==> Preparing data..
| Preparing CIFAR-10 dataset...
Files already downloaded and verified
Files already downloaded and verified
2024-03-14 20:28:07.261707: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
==> Building model..
Namespace(msg=1, resume='', use_gpu=True, num_workers=16, result_path='checkpoints/cifar10,preact,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0/03-14-20.28', checkpoint_path='checkpoints/cifar10,preact,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0/03-14-20.28', checkpoint_epoch=-1, print_freq=20, seed=0, optimizer='SGD', lr=0.1, lr_scheduler='cosine', momentum=0.9, weight_decay=0.001, batch_size

#### network deconvolution --- preact

In [8]:
# Network-deconvolution run (--deconv True, --block-fc 512): PreAct-18, 20 epochs, CIFAR-10.
# NOTE(review): the recorded output below shows ep.1 in result_path, so it came
# from an earlier --epochs 1 run; re-run this cell to refresh it.
!python main.py --lr .1 --optimizer SGD --arch preact --epochs 20 --dataset cifar10  --batch-size 128 --msg True --deconv True --block-fc 512 --wd .001

==> Preparing data..
| Preparing CIFAR-10 dataset...
Files already downloaded and verified
Files already downloaded and verified
2024-03-14 20:29:57.273898: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
==> Building model..
************ Batch norm disabled when deconv is used. ************
Namespace(msg=1, resume='', use_gpu=True, num_workers=16, result_path='checkpoints/cifar10,preact,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.1,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.512/03-14-20.29', checkpoint_path='checkpoints/cifar10,preact,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.1,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.512/03-14-20.29', checkpoint_epoch=-1, print_freq=20, seed=0, optimizer='SGD', lr=0

# DenseNet-121 --- densenet121

#### batch normalization --- densenet121

In [9]:
# Batch-normalization baseline (--deconv False, --block-fc 0): DenseNet-121, 20 epochs, CIFAR-10.
# NOTE(review): the recorded output below shows ep.1 in result_path, so it came
# from an earlier --epochs 1 run; re-run this cell to refresh it.
!python main.py --lr .1 --optimizer SGD --arch densenet121 --epochs 20 --dataset cifar10  --batch-size 128 --msg True --deconv False --block-fc 0 --wd .001

==> Preparing data..
| Preparing CIFAR-10 dataset...
Files already downloaded and verified
Files already downloaded and verified
2024-03-14 21:18:48.529854: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
==> Building model..
Namespace(msg=1, resume='', use_gpu=True, num_workers=16, result_path='checkpoints/cifar10,densenet121,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0/03-14-21.18', checkpoint_path='checkpoints/cifar10,densenet121,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0/03-14-21.18', checkpoint_epoch=-1, print_freq=20, seed=0, optimizer='SGD', lr=0.1, lr_scheduler='cosine', momentum=0.9, weight_decay=0.001, 

#### network deconvolution --- densenet121

In [None]:
# Network-deconvolution run (--deconv True, --block-fc 512): DenseNet-121, 20 epochs, CIFAR-10.
!python main.py --lr .1 --optimizer SGD --arch densenet121 --epochs 20 --dataset cifar10  --batch-size 128 --msg True --deconv True --block-fc 512 --wd .001

# ResNext-29 --- resnext

#### batch normalization --- resnext

In [None]:
# Batch-normalization baseline (--deconv False, --block-fc 0): ResNeXt-29, 20 epochs, CIFAR-10.
!python main.py --lr .1 --optimizer SGD --arch resnext --epochs 20 --dataset cifar10  --batch-size 128 --msg True --deconv False --block-fc 0 --wd .001

#### network deconvolution --- resnext

In [None]:
# Network-deconvolution run (--deconv True, --block-fc 512): ResNeXt-29, 20 epochs, CIFAR-10.
!python main.py --lr .1 --optimizer SGD --arch resnext --epochs 20 --dataset cifar10  --batch-size 128 --msg True --deconv True --block-fc 512 --wd .001

# MobileNet v2 --- mobilev2

#### batch normalization --- mobilev2

In [1]:
# Batch-normalization baseline (--deconv False, --block-fc 0): MobileNet v2, 20 epochs, CIFAR-10.
# NOTE(review): the recorded output below shows ep.1 in result_path, so it came
# from an earlier --epochs 1 run; re-run this cell to refresh it.
!python main.py --lr .1 --optimizer SGD --arch mobilev2 --epochs 20 --dataset cifar10  --batch-size 128 --msg True --deconv False --block-fc 0 --wd .001

==> Preparing data..
| Preparing CIFAR-10 dataset...
Files already downloaded and verified
Files already downloaded and verified
2024-03-14 23:43:15.074610: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
==> Building model..
Namespace(msg=1, resume='', use_gpu=True, num_workers=16, result_path='checkpoints/cifar10,mobilev2,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0/03-14-23.43', checkpoint_path='checkpoints/cifar10,mobilev2,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0/03-14-23.43', checkpoint_epoch=-1, print_freq=20, seed=0, optimizer='SGD', lr=0.1, lr_scheduler='cosine', momentum=0.9, weight_decay=0.001, batch_

#### network deconvolution --- mobilev2

In [None]:
# Network-deconvolution run (--deconv True, --block-fc 512): MobileNet v2, 20 epochs, CIFAR-10.
!python main.py --lr .1 --optimizer SGD --arch mobilev2 --epochs 20 --dataset cifar10  --batch-size 128 --msg True --deconv True --block-fc 512 --wd .001

# DPN-92 --- dpn

#### batch normalization --- dpn

In [3]:
# Batch-normalization baseline (--deconv False, --block-fc 0): DPN-92, 20 epochs, CIFAR-10.
# NOTE(review): the recorded output below shows ep.1 in result_path, so it came
# from an earlier --epochs 1 run; re-run this cell to refresh it.
!python main.py --lr .1 --optimizer SGD --arch dpn --epochs 20 --dataset cifar10  --batch-size 128 --msg True --deconv False --block-fc 0 --wd .001

==> Preparing data..
| Preparing CIFAR-10 dataset...
Files already downloaded and verified
Files already downloaded and verified
2024-03-15 00:41:32.501297: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
==> Building model..
Namespace(msg=1, resume='', use_gpu=True, num_workers=16, result_path='checkpoints/cifar10,dpn,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0/03-15-00.41', checkpoint_path='checkpoints/cifar10,dpn,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0/03-15-00.41', checkpoint_epoch=-1, print_freq=20, seed=0, optimizer='SGD', lr=0.1, lr_scheduler='cosine', momentum=0.9, weight_decay=0.001, batch_size=128, 

#### network deconvolution --- dpn

In [None]:
# Network-deconvolution run (--deconv True, --block-fc 512): DPN-92, 20 epochs, CIFAR-10.
!python main.py --lr .1 --optimizer SGD --arch dpn --epochs 20 --dataset cifar10  --batch-size 128 --msg True --deconv True --block-fc 512 --wd .001

# PNASNetA --- pnasnetA

#### batch normalization --- pnasnetA

In [None]:
# Batch-normalization baseline (--deconv False, --block-fc 0): PNASNetA, 1 epoch, CIFAR-10.
!python main.py --lr .1 --optimizer SGD --arch pnasnetA --epochs 1 --dataset cifar10  --batch-size 128 --msg True --deconv False --block-fc 0 --wd .001

#### network deconvolution --- pnasnetA

In [None]:
# Network-deconvolution run (--deconv True, --block-fc 512): PNASNetA, 20 epochs, CIFAR-10.
!python main.py --lr .1 --optimizer SGD --arch pnasnetA --epochs 20 --dataset cifar10  --batch-size 128 --msg True --deconv True --block-fc 512 --wd .001

# SENet-18 --- senet

#### batch normalization --- senet

In [1]:
# Batch-normalization baseline (--deconv False, --block-fc 0): SENet-18, 1 epoch, CIFAR-10.
!python main.py --lr .1 --optimizer SGD --arch senet --epochs 1 --dataset cifar10  --batch-size 128 --msg True --deconv False --block-fc 0 --wd .001

==> Preparing data..
| Preparing CIFAR-10 dataset...
Files already downloaded and verified
Files already downloaded and verified
2024-03-15 18:36:45.236938: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
==> Building model..
Namespace(msg=1, resume='', use_gpu=True, num_workers=16, result_path='checkpoints/cifar10,senet,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0/03-15-18.36', checkpoint_path='checkpoints/cifar10,senet,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0/03-15-18.36', checkpoint_epoch=-1, print_freq=20, seed=0, optimizer='SGD', lr=0.1, lr_scheduler='cosine', momentum=0.9, weight_decay=0.001, batch_size=1

#### network deconvolution --- senet

In [None]:
# Network-deconvolution run (--deconv True, --block-fc 512): SENet-18, 20 epochs, CIFAR-10.
!python main.py --lr .1 --optimizer SGD --arch senet --epochs 20 --dataset cifar10  --batch-size 128 --msg True --deconv True --block-fc 512 --wd .001

# EfficientNet --- efficient

#### batch normalization --- efficient

In [2]:
# Batch-normalization baseline (--deconv False, --block-fc 0): EfficientNet, 1 epoch, CIFAR-10.
!python main.py --lr .1 --optimizer SGD --arch efficient --epochs 1 --dataset cifar10  --batch-size 128 --msg True --deconv False --block-fc 0 --wd .001

==> Preparing data..
| Preparing CIFAR-10 dataset...
Files already downloaded and verified
Files already downloaded and verified
2024-03-15 18:38:18.327308: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
==> Building model..
Namespace(msg=1, resume='', use_gpu=True, num_workers=16, result_path='checkpoints/cifar10,efficient,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0/03-15-18.38', checkpoint_path='checkpoints/cifar10,efficient,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0/03-15-18.38', checkpoint_epoch=-1, print_freq=20, seed=0, optimizer='SGD', lr=0.1, lr_scheduler='cosine', momentum=0.9, weight_decay=0.001, batc

#### network deconvolution --- efficient

In [None]:
# Network-deconvolution run (--deconv True, --block-fc 512): EfficientNet, 20 epochs, CIFAR-10.
!python main.py --lr .1 --optimizer SGD --arch efficient --epochs 20 --dataset cifar10  --batch-size 128 --msg True --deconv True --block-fc 512 --wd .001

Attempt 1

sbatch --export=ALL,architecture='vgg16',epochs=100 -o vgg16_100_att1_BN.txt single_experiment.sh
sbatch --export=ALL,architecture='vgg16',epochs=100 -o vgg16_100_att1_dconv.txt single_experiment_net_deconv.sh


sbatch --export=ALL,architecture='resnet18d',epochs=100 -o resnet18d_100_att1_BN.txt single_experiment.sh
sbatch --export=ALL,architecture='resnet18d',epochs=100 -o resnet18d_100_att1_dconv.txt single_experiment_net_deconv.sh


sbatch --export=ALL,architecture='preact',epochs=100 -o preact_100_att1_BN.txt single_experiment.sh
sbatch --export=ALL,architecture='preact',epochs=100 -o preact_100_att1_dconv.txt single_experiment_net_deconv.sh


sbatch --export=ALL,architecture='densenet121',epochs=100 -o densenet121_100_att1_BN.txt single_experiment.sh
sbatch --export=ALL,architecture='densenet121',epochs=100 -o densenet121_100_att1_dconv.txt single_experiment_net_deconv.sh


sbatch --export=ALL,architecture='resnext',epochs=100 -o resnext_100_att1_BN.txt single_experiment.sh
sbatch --export=ALL,architecture='resnext',epochs=100 -o resnext_100_att1_dconv.txt single_experiment_net_deconv.sh


sbatch --export=ALL,architecture='mobilev2',epochs=100 -o mobilev2_100_att1_BN.txt single_experiment.sh
sbatch --export=ALL,architecture='mobilev2',epochs=100 -o mobilev2_100_att1_dconv.txt single_experiment_net_deconv.sh


sbatch --export=ALL,architecture='dpn',epochs=100 -o dpn_100_att1_BN.txt single_experiment.sh
sbatch --export=ALL,architecture='dpn',epochs=100 -o dpn_100_att1_dconv.txt single_experiment_net_deconv.sh


sbatch --export=ALL,architecture='pnasnetA',epochs=100 -o pnasnetA_100_att1_BN.txt single_experiment.sh
sbatch --export=ALL,architecture='pnasnetA',epochs=100 -o pnasnetA_100_att1_dconv.txt single_experiment_net_deconv.sh


sbatch --export=ALL,architecture='senet',epochs=100 -o senet_100_att1_BN.txt single_experiment.sh
sbatch --export=ALL,architecture='senet',epochs=100 -o senet_100_att1_dconv.txt single_experiment_net_deconv.sh


sbatch --export=ALL,architecture='efficient',epochs=100 -o efficient_100_att1_BN.txt single_experiment.sh
sbatch --export=ALL,architecture='efficient',epochs=100 -o efficient_100_att1_dconv.txt single_experiment_net_deconv.sh

In [2]:
# List the checkpoint directories written so far; each directory name encodes
# the run configuration (dataset, architecture, epochs, optimizer, ...).
!ls checkpoints

cifar10,densenet121,ep.100,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0
cifar10,densenet121,ep.100,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.1,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.512
cifar10,densenet121,ep.20,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0
cifar10,densenet121,ep.20,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.1,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.512
cifar10,dpn,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0
cifar10,dpn,ep.100,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0
cifar10,dpn,ep.100,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.1,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.512
cifar10,dpn,ep.20,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64

### Results

In [48]:
import glob
import os
import pandas as pd


# Gather every checkpoint directory whose name contains "ep.20", then put the
# paths in alphabetical order so later cells see a stable listing.
file_list_ = glob.glob('checkpoints/*ep.20*')
file_list = list(file_list_)
file_list.sort()

In [54]:
import glob
import os
import pandas as pd


def summarize_runs(run_dirs):
    """Print best top-1 accuracy and total time for every attempt of every run.

    Parameters
    ----------
    run_dirs : iterable of str
        Checkpoint directories whose *name* is a comma-separated configuration
        string: field 1 is the architecture, field 2 the epoch setting and
        field 9 the deconv flag ('deconv.0' is reported as 'BN').

    Every entry inside a run directory is treated as one attempt. Inside an
    attempt, every ``*.log`` file except ``train_batch.log`` is read as a
    tab-separated table that must provide ``top1`` and ``time`` columns.

    Raises
    ------
    IndexError
        If a directory name has fewer than ten comma-separated fields.
    KeyError
        If a log file lacks the ``top1`` or ``time`` column.
    """
    for num, run_dir in enumerate(run_dirs):
        # Parse only the directory name so that commas in parent folders
        # cannot shift the field positions.
        fields = os.path.basename(os.path.normpath(run_dir)).split(",")
        arch = fields[1]
        epochs_ = fields[2]
        mode_ = fields[9]
        if mode_ == 'deconv.0':
            mode_ = 'BN'
        print(num+1, arch, end='---')
        print(mode_)

        # glob returns entries in arbitrary order; sort so that the
        # "Attempt N" numbering is stable across re-runs.
        attempt_dirs = sorted(glob.glob(os.path.join(glob.escape(run_dir), '*')))
        for i, folder in enumerate(attempt_dirs):
            print(f"Attempt {i+1}")
            # Report train.log first, then the remaining logs alphabetically.
            log_files = sorted(
                glob.glob(os.path.join(glob.escape(folder), '*.log')),
                key=lambda path: (os.path.basename(path) != 'train.log', path),
            )
            for file in log_files:
                if os.path.basename(file) == 'train_batch.log':
                    continue  # per-batch log is not summarised
                df = pd.read_csv(file, sep='\t', engine='python')
                type_ = os.path.basename(file).replace('.log', '')
                print(f"{arch} - {epochs_} - mode: {mode_} - Attempt_{i+1} - {type_} - top1_acc: {df['top1'].max()} - time: {df['time'].sum():.2f}")

            print('\n')

        print('\n\n')


# Restrict the listing to CIFAR-10 runs: the summary lines do not print the
# dataset, so a bare '*ep.20*' pattern would silently mix in cifar100 results.
file_list_ = glob.glob('checkpoints/cifar10,*ep.20*')
file_list = sorted(file_list_)

summarize_runs(file_list)

1 densenet121---BN
Attempt 1
densenet121 - ep.20 - mode: BN - Attempt_1 - train - top1_acc: 98.32 - time: 1174.68
densenet121 - ep.20 - mode: BN - Attempt_1 - test - top1_acc: 93.1 - time: 111.86


Attempt 2
densenet121 - ep.20 - mode: BN - Attempt_2 - train - top1_acc: 98.108 - time: 1132.57
densenet121 - ep.20 - mode: BN - Attempt_2 - test - top1_acc: 93.02 - time: 91.34


Attempt 3
densenet121 - ep.20 - mode: BN - Attempt_3 - train - top1_acc: 98.304 - time: 1162.58
densenet121 - ep.20 - mode: BN - Attempt_3 - test - top1_acc: 93.21 - time: 97.98





2 densenet121---deconv.1
Attempt 1
densenet121 - ep.20 - mode: deconv.1 - Attempt_1 - train - top1_acc: 99.402 - time: 5028.65
densenet121 - ep.20 - mode: deconv.1 - Attempt_1 - test - top1_acc: 94.96 - time: 132.67


Attempt 2
densenet121 - ep.20 - mode: deconv.1 - Attempt_2 - train - top1_acc: 99.42 - time: 4630.60
densenet121 - ep.20 - mode: deconv.1 - Attempt_2 - test - top1_acc: 94.89 - time: 125.93


Attempt 3
densenet121 - ep.20

# <span style="color:blue">CIFAR-100</span>

# VGG-16 --- vgg16

#### batch normalization --- vgg16

In [1]:
# Batch-normalization baseline (--deconv False, --block-fc 0): VGG-16, 1 epoch, CIFAR-100.
!python main.py --lr .1 --optimizer SGD --arch vgg16 --epochs 1 --dataset cifar100 --batch-size 128 --msg True --deconv False --block-fc 0 --wd .001

==> Preparing data..
| Preparing CIFAR-100 dataset...
Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz
100%|███████████████████████| 169001437/169001437 [00:01<00:00, 86719800.89it/s]
Extracting ./data/cifar-100-python.tar.gz to ./data
Files already downloaded and verified
2024-03-19 19:38:41.530404: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
==> Building model..
Namespace(msg=1, resume='', use_gpu=True, num_workers=16, result_path='checkpoints/cifar100,vgg16,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.True,b.64,stride.3,it.5,eps.1e-05,bias.True,bfc.0/03-19-19.38', checkpoint_path='checkpoints/cifar100,vgg16,ep.1,SGD,0.1,cosine,bs.128,wd.0.001,bn.True,deconv.0,delinear.T

In [1]:
# Show the SLURM batch script used to submit CIFAR-100 network-deconvolution runs.
!cat single_experiment_net_deconv_cifar100.sh

#!/bin/bash

#SBATCH -c 8
#SBATCH -p gpu
#SBATCH --gres gpu:1

enable_lmod
module load container_env pytorch-gpu/1.13.0

export CUDA_HOME=/cm/shared/applications/cuda-toolkit/11.7.1/
export XLA_FLAGS=--xla_gpu_cuda_data_dir=$CUDA_HOME

# crun.tensorflow-gpu -p ~/envs/cs834_project python lemos_kerasnlp_for_slurm_job_training.py -itr $iteration -ep $epochs

crun.pytorch-gpu -p ~/envs/cs834_project python main.py --lr .1 --optimizer SGD --arch $architecture --epochs $epochs --dataset cifar100  --batch-size 128 --msg True --deconv True --block-fc 512 --wd .001

In [1]:
import glob
import os
import pandas as pd


def summarize_runs(run_dirs):
    """Print best top-1 accuracy and total time for every attempt of every run.

    Parameters
    ----------
    run_dirs : iterable of str
        Checkpoint directories whose *name* is a comma-separated configuration
        string: field 1 is the architecture, field 2 the epoch setting and
        field 9 the deconv flag ('deconv.0' is reported as 'BN').

    Every entry inside a run directory is treated as one attempt. Inside an
    attempt, every ``*.log`` file except ``train_batch.log`` is read as a
    tab-separated table that must provide ``top1`` and ``time`` columns.

    Raises
    ------
    IndexError
        If a directory name has fewer than ten comma-separated fields.
    KeyError
        If a log file lacks the ``top1`` or ``time`` column.
    """
    for num, run_dir in enumerate(run_dirs):
        # Parse only the directory name so that commas in parent folders
        # cannot shift the field positions.
        fields = os.path.basename(os.path.normpath(run_dir)).split(",")
        arch = fields[1]
        epochs_ = fields[2]
        mode_ = fields[9]
        if mode_ == 'deconv.0':
            mode_ = 'BN'
        print(num+1, arch, end='---')
        print(mode_)

        # glob returns entries in arbitrary order; sort so that the
        # "Attempt N" numbering is stable across re-runs.
        attempt_dirs = sorted(glob.glob(os.path.join(glob.escape(run_dir), '*')))
        for i, folder in enumerate(attempt_dirs):
            print(f"Attempt {i+1}")
            # Report train.log first, then the remaining logs alphabetically.
            log_files = sorted(
                glob.glob(os.path.join(glob.escape(folder), '*.log')),
                key=lambda path: (os.path.basename(path) != 'train.log', path),
            )
            for file in log_files:
                if os.path.basename(file) == 'train_batch.log':
                    continue  # per-batch log is not summarised
                df = pd.read_csv(file, sep='\t', engine='python')
                type_ = os.path.basename(file).replace('.log', '')
                print(f"{arch} - {epochs_} - mode: {mode_} - Attempt_{i+1} - {type_} - top1_acc: {df['top1'].max()} - time: {df['time'].sum():.2f}")

            print('\n')

        print('\n\n')


# CIFAR-100 runs trained for 20 epochs. Change the dataset prefix or the
# 'ep.N' part of the pattern to summarise other run groups
# (e.g. 'checkpoints/cifar10,*ep.100*').
file_list_ = glob.glob('checkpoints/cifar100,*ep.20*')

file_list = sorted(file_list_)
print(len(file_list))

summarize_runs(file_list)

20
1 densenet121---BN
Attempt 1
densenet121 - ep.20 - mode: BN - Attempt_1 - train - top1_acc: 92.956 - time: 1141.66
densenet121 - ep.20 - mode: BN - Attempt_1 - test - top1_acc: 75.45 - time: 94.32


Attempt 2
densenet121 - ep.20 - mode: BN - Attempt_2 - train - top1_acc: 93.052 - time: 1124.73
densenet121 - ep.20 - mode: BN - Attempt_2 - test - top1_acc: 75.78 - time: 94.72


Attempt 3
densenet121 - ep.20 - mode: BN - Attempt_3 - train - top1_acc: 92.858 - time: 1315.25
densenet121 - ep.20 - mode: BN - Attempt_3 - test - top1_acc: 75.36 - time: 95.62





2 densenet121---deconv.1
Attempt 1
densenet121 - ep.20 - mode: deconv.1 - Attempt_1 - train - top1_acc: 98.18 - time: 4716.44
densenet121 - ep.20 - mode: deconv.1 - Attempt_1 - test - top1_acc: 78.57 - time: 110.58


Attempt 2
densenet121 - ep.20 - mode: deconv.1 - Attempt_2 - train - top1_acc: 98.094 - time: 4668.14
densenet121 - ep.20 - mode: deconv.1 - Attempt_2 - test - top1_acc: 78.38 - time: 108.34


Attempt 3
densenet121 - e

In [23]:
# ImageNet training: ResNet-18 (-a resnet18) with 32 workers (-j 32); all other
# options are left at their defaults (the recorded Namespace shows deconv=False, epochs=90).
!python main_imagenet.py -a resnet18 -j 32 data/imagenet/ILSVRC/Data/CLS-LOC 

2024-03-24 23:51:48.660579: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
=> creating model 'resnet18'
Namespace(data='data/imagenet/ILSVRC/Data/CLS-LOC', arch='resnet18', workers=32, epochs=90, start_epoch=0, batch_size=256, lr=0.1, lr_scheduler='cosine', scheduler_step_size=30, milestone=0.3, multistep_gamma=0.1, momentum=0.9, weight_decay=0.0001, print_freq=10, resume='', evaluate=False, pretrained=False, world_size=-1, rank=-1, dist_url='tcp://224.66.41.62:23456', dist_backend='nccl', seed=None, gpu=None, multiprocessing_distributed=False, dataset='imagenet', tensorboard=True, save_plot=True, deconv=False, delinear=functools.partial(<class 'models.deconv.Delinear'>, block=64, eps=1e-05, n_iter=5), block=64, deconv_iter=5, eps=1e-05, bias=Tr

In [3]:
# https://www.kaggle.com/c/imagenet-object-localization-challenge/data
# dataset is ~160GB

In [None]:
# cityscapes
# `is not` --- syntax error --- had to use `!=` at line 166 of the train script
# ModuleNotFoundError: No module named 'torchvision.models.utils'
# fix: import load_state_dict_from_url from torch.hub instead
from torch.hub import load_state_dict_from_url


In [11]:
# Semantic-segmentation run without deconvolution (--deconv False): DeepLabV3-ResNet50
# on Cityscapes, batch size 8, 30 epochs, backbone not pretrained.
# NOTE(review): the recorded output below reports that no CUDA drivers were found
# and ends in a traceback raised from utils.init_distributed_mode.
!python Segmentation/train.py --dataset cityscapes --model deeplabv3_resnet50 -b 8 --epochs 30 --deconv False --pretrained-backbone False --lr 0.1

2024-04-11 18:44:48.966205: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-11 18:44:49.630482: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-11 18:44:52.076565: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
| distributed init (rank 0): env://
Traceback (most recent call last):
  File "/home/oruma001/CS895/project/codebase/Segmentation/train.py", line 611, in <module>
    main(args)
  File "/home/oruma001/CS895/project/codebase/Segmentation/train.py", line 189, in main
    utils.init_distributed_mode(args)
  File "/home/oruma001/CS895/project/codebase/Segmentation/utils.py", line 298, in init_