In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
def create_splits(input_datasets, percentage=0.2):
    """Build leave-one-out train/validation splits.

    For every position in ``input_datasets`` one split is produced in which
    the element at that position is the sole validation entry and all other
    elements, in their original order, form the training set.

    Args:
        input_datasets: Indexable, sized sequence of dataset identifiers.
        percentage: Accepted for interface compatibility; this function does
            not read it.

    Returns:
        A list with ``len(input_datasets)`` dicts, each of the form
        ``{'train_set': [...], 'val_set': [held_out_item]}``.
    """
    return [
        {
            'train_set': [
                item for pos, item in enumerate(input_datasets) if pos != held_out
            ],
            'val_set': [input_datasets[held_out]],
        }
        for held_out in range(len(input_datasets))
    ]

# create_splits([80, 68, 176, 81, 82, 71, 87])

In [3]:
from train import train_loop, prepare_config, prepare_training_set
import os
import subprocess

# Endpoints exported into the process environment before mlflow is imported,
# so that both this kernel and the training subprocesses (which receive
# os.environ) see the same settings.
mlflow_env = {
    'MLFLOW_TRACKING_URI': 'http://localhost:5000',
    'MLFLOW_S3_ENDPOINT_URL': 'http://localhost:9000',
}

os.environ.update(mlflow_env)

import mlflow
from tqdm.notebook import tqdm

# Point the MLflow client at the same address as MLFLOW_TRACKING_URI.
mlflow.set_tracking_uri("http://localhost:5000")
# NOTE(review): this registry URI is identical to MLFLOW_S3_ENDPOINT_URL
# (port 9000), not to the tracking server (port 5000). Confirm that a model
# registry is really served on this port; nothing in this notebook uses it.
mlflow.set_registry_uri("http://localhost:9000")

In [4]:
# One hand-picked split, kept inside a one-element tuple so that code which
# iterates over `training_data` receives one split dict per iteration.
training_data = (
    dict(
        train_set=[80, 68, 176, 81, 82, 71],
        val_set=[87],
    ),
)

In [None]:
# `training_data` is a tuple of split dicts, so indexing it with a string key
# raises TypeError; select the first split explicitly before reading its
# 'train_set' / 'val_set' entries.
first_split = training_data[0]
prepare_training_set(
    '/home/jfeil/IDP/training_pipeline/data/tennis_dataset',
    first_split['train_set'],
    first_split['val_set'],
    '/var/tmp/test',
)

In [6]:
# Use the leave-one-out split at index 3 (dataset 81 held out for validation).
# It is wrapped in a one-element tuple because the sweep loop iterates over
# `training_data` and indexes every item as a split dict; a bare dict would be
# iterated by its string keys and `len(training_data)` would count keys, not
# splits.
training_data = (create_splits([80, 68, 176, 81, 82, 71, 87])[3],)

{'train_set': [80, 68, 176, 82, 71, 87], 'val_set': [81]}

In [5]:
def create_experiments(key, values, pos):
    """Expand one parameter sweep into a tuple of override pairs.

    Each entry is a 2-tuple of dicts. The sweep loop in this notebook unpacks
    such a pair as ``(dataset_params, train_params)``.

    Args:
        key: Name of the parameter being varied.
        values: Iterable of values; one pair is produced per value.
        pos: ``0`` places ``{key: value}`` in the first dict of the pair; any
            other value places it in the second dict.

    Returns:
        A tuple of ``(dict, dict)`` pairs in the order of ``values``.
    """
    def as_pair(value):
        override = {key: value}
        return (override, {}) if pos == 0 else ({}, override)

    return tuple(as_pair(value) for value in values)

In [7]:
# Sweep SAT_FACTOR; pos=0 puts the override into the first dict of each pair.
train_experiments = create_experiments(
    key="SAT_FACTOR",
    values=[0.05, 0.1, 0.25, 0.35, 0.4, 0.5],
    pos=0,
)
train_experiments

(({'SAT_FACTOR': 0.05}, {}),
 ({'SAT_FACTOR': 0.1}, {}),
 ({'SAT_FACTOR': 0.25}, {}),
 ({'SAT_FACTOR': 0.35}, {}),
 ({'SAT_FACTOR': 0.4}, {}),
 ({'SAT_FACTOR': 0.5}, {}))

In [6]:
# Six identical END_EPOCH=1 runs (override in the second dict of each pair),
# keyed by 5, the value the sweep loop passes to mlflow as experiment_id.
train_experiments = {
    5: create_experiments(key="END_EPOCH", values=[1] * 6, pos=1),
}
train_experiments

{5: (({}, {'END_EPOCH': 1}),
  ({}, {'END_EPOCH': 1}),
  ({}, {'END_EPOCH': 1}),
  ({}, {'END_EPOCH': 1}),
  ({}, {'END_EPOCH': 1}),
  ({}, {'END_EPOCH': 1}))}

In [16]:
# Sweep definition: each key is used by the sweep loop as the mlflow
# experiment_id, each value is a tuple of (first-dict, second-dict) overrides.
train_experiments = {
    # LR_STEP variations, carried in the second dict of every pair.
    3: tuple(
        ({}, {'LR_STEP': milestones})
        for milestones in (
            [90, 120],
            [10, 20, 30, 40],
            [20, 40, 60, 80],
            [1, 2, 3, 4, 5],
            [5, 10, 20, 30],
        )
    ),
    # SAT_FACTOR variations, carried in the first dict of every pair.
    4: tuple(
        ({'SAT_FACTOR': factor}, {})
        for factor in (0.05, 0.1, 0.25, 0.35, 0.4, 0.5)
    ),
}

In [20]:
# Locations shared by every run of the sweep.
dataset_root = '/home/jfeil/IDP/training_pipeline/data/tennis_dataset'
work_dir = '/var/tmp/test'
base_config = '/home/jfeil/IDP/training_pipeline/configuration/384x288_d256x3_adam_lr1e-3_TransferLearning.yaml'

# One training run per (split, experiment id, override pair) combination.
experiment_count = len(training_data) * sum(
    len(runs) for runs in train_experiments.values()
)

# (experiment_id, dataset_params, train_params) of every run whose training
# subprocess exited with a non-zero return code.
failed_experiments = []

# The context manager closes the progress bar even when the sweep is
# interrupted or a run raises.
with tqdm(total=experiment_count) as progress_bar:
    for data_set in training_data:
        prepare_training_set(dataset_root, data_set['train_set'], data_set['val_set'], work_dir)
        for experiment_id, runs in train_experiments.items():
            for dataset_params, train_params in runs:
                with mlflow.start_run(experiment_id=experiment_id) as run:
                    config_path = prepare_config(work_dir, 'experiment_output', base_config, dataset_params, train_params)
                    try:
                        # Record which datasets were used for training/validation.
                        mlflow.log_params(data_set)
                        # Train in a separate process; it is handed the id of
                        # the run opened above via --mlflow-run.
                        result = subprocess.run(
                            ["python3", "train.py", "--cfg", config_path, "--mlflow-run", run.info.run_id],
                            env=os.environ,
                        )
                        if result.returncode != 0:
                            print(f"FAILED {(experiment_id, dataset_params, train_params)}")
                            failed_experiments.append((experiment_id, dataset_params, train_params))
                            # Mark the run as failed before the `with` block
                            # would otherwise close it as finished.
                            mlflow.end_run('FAILED')
                    finally:
                        # Always drop the per-run config file, including when
                        # logging or the subprocess call raises.
                        os.remove(config_path)
                    progress_bar.update(1)

if failed_experiments:
    print(failed_experiments)

  0%|          | 0/6 [00:00<?, ?it/s]

{'cfg': '/var/tmp/test/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:11:56.212487.yaml',
 'frequent': 20,
 'gpus': None,
 'workers': None}
{'CUDNN': {'BENCHMARK': True, 'DETERMINISTIC': False, 'ENABLED': True},
 'DATASET': {'DATASET': 'coco',
             'DATA_FORMAT': 'png',
             'FLIP': True,
             'HYBRID_JOINTS_TYPE': '',
             'ROOT': '/var/tmp/test',
             'ROT_FACTOR': 40,
             'SAT_FACTOR': 0.25,
             'SCALE_FACTOR': 0.15,
             'SELECT_DATA': False,
             'TEST_SET': 'val2017',
             'TRAIN_SET': 'train2017'},
 'DATA_DIR': '',
 'DEBUG': {'DEBUG': True,
           'SAVE_BATCH_IMAGES_GT': True,
           'SAVE_BATCH_IMAGES_PRED': True,
           'SAVE_HEATMAPS_GT': True,
           'SAVE_HEATMAPS_PRED': True},
 'GPUS': '0',
 'LOG_DIR': 'log',
 'LOSS': {'USE_TARGET_WEIGHT': True},
 'MODEL': {'EXTRA': {'DECONV_WITH_BIAS': False,
                     'FINAL_CONV_KERNEL': 1,
                     'HEATMA

6452e0ff5864424ea5589994c113adb3
=> creating /home/jfeil/IDP/baseline_implementations/microsoft_baseline/pose_estimation/experiment_output/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:11:56
=> creating log/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:11:56_2024-01-17-01-11


Epoch: [0][0/6]	Time 15.348s (15.348s)	Speed 2.1 samples/s	Data 8.605s (8.605s)	Loss 0.00199 (0.00199)	Accuracy 0.038 (0.038)
Test: [0/1]	Time 7.578 (7.578)	Loss 0.0020 (0.0020)	Accuracy 0.097 (0.097)
=> Writing results json to /home/jfeil/IDP/baseline_implementations/microsoft_baseline/pose_estimation/experiment_output/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:11:56/results/keypoints_val2017_results.json
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *keypoints*
Accumulating evaluation results...
DONE (t=0.00s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets= 20 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets= 20 ] = 0.002
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets= 20 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.000
 Average Precision  (AP) @[ IoU=0

7fc8a8aaf9a0495cb1efbd9017fad97f
=> creating /home/jfeil/IDP/baseline_implementations/microsoft_baseline/pose_estimation/experiment_output/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:13:30
=> creating log/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:13:30_2024-01-17-01-13


Epoch: [0][0/6]	Time 14.790s (14.790s)	Speed 2.2 samples/s	Data 8.337s (8.337s)	Loss 0.00197 (0.00197)	Accuracy 0.047 (0.047)
Test: [0/1]	Time 7.499 (7.499)	Loss 0.0020 (0.0020)	Accuracy 0.103 (0.103)
=> Writing results json to /home/jfeil/IDP/baseline_implementations/microsoft_baseline/pose_estimation/experiment_output/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:13:30/results/keypoints_val2017_results.json
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *keypoints*
Accumulating evaluation results...
DONE (t=0.00s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets= 20 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets= 20 ] = 0.002
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets= 20 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.001
 Average Precision  (AP) @[ IoU=0

5ccbdd16828146b98429a8b228742f3d
=> creating /home/jfeil/IDP/baseline_implementations/microsoft_baseline/pose_estimation/experiment_output/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:15:06
=> creating log/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:15:06_2024-01-17-01-15


Epoch: [0][0/6]	Time 15.062s (15.062s)	Speed 2.1 samples/s	Data 8.572s (8.572s)	Loss 0.00194 (0.00194)	Accuracy 0.029 (0.029)
Test: [0/1]	Time 7.454 (7.454)	Loss 0.0020 (0.0020)	Accuracy 0.117 (0.117)
=> Writing results json to /home/jfeil/IDP/baseline_implementations/microsoft_baseline/pose_estimation/experiment_output/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:15:06/results/keypoints_val2017_results.json
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *keypoints*
Accumulating evaluation results...
DONE (t=0.00s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets= 20 ] = 0.004
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets= 20 ] = 0.021
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets= 20 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.004
 Average Precision  (AP) @[ IoU=0

9837438361f244f5ba839b1825981246
=> creating /home/jfeil/IDP/baseline_implementations/microsoft_baseline/pose_estimation/experiment_output/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:16:39
=> creating log/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:16:39_2024-01-17-01-16


Epoch: [0][0/6]	Time 15.727s (15.727s)	Speed 2.0 samples/s	Data 9.351s (9.351s)	Loss 0.00196 (0.00196)	Accuracy 0.034 (0.034)
Test: [0/1]	Time 7.519 (7.519)	Loss 0.0020 (0.0020)	Accuracy 0.120 (0.120)
=> Writing results json to /home/jfeil/IDP/baseline_implementations/microsoft_baseline/pose_estimation/experiment_output/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:16:39/results/keypoints_val2017_results.json
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *keypoints*
Accumulating evaluation results...
DONE (t=0.00s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets= 20 ] = 0.004
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets= 20 ] = 0.020
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets= 20 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.007
 Average Precision  (AP) @[ IoU=0

35a98d74f3b24f8c82414a9ba1449b97
=> creating /home/jfeil/IDP/baseline_implementations/microsoft_baseline/pose_estimation/experiment_output/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:18:17
=> creating log/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:18:17_2024-01-17-01-18


Epoch: [0][0/6]	Time 15.296s (15.296s)	Speed 2.1 samples/s	Data 8.845s (8.845s)	Loss 0.00203 (0.00203)	Accuracy 0.028 (0.028)
Test: [0/1]	Time 7.464 (7.464)	Loss 0.0020 (0.0020)	Accuracy 0.090 (0.090)
=> Writing results json to /home/jfeil/IDP/baseline_implementations/microsoft_baseline/pose_estimation/experiment_output/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:18:17/results/keypoints_val2017_results.json
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *keypoints*
Accumulating evaluation results...
DONE (t=0.00s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets= 20 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets= 20 ] = 0.002
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets= 20 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.000
 Average Precision  (AP) @[ IoU=0

156d8419b96b466482305e1a98f1fc17
=> creating /home/jfeil/IDP/baseline_implementations/microsoft_baseline/pose_estimation/experiment_output/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:19:49
=> creating log/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:19:49_2024-01-17-01-19


Epoch: [0][0/6]	Time 15.507s (15.507s)	Speed 2.1 samples/s	Data 8.810s (8.810s)	Loss 0.00197 (0.00197)	Accuracy 0.010 (0.010)
Test: [0/1]	Time 7.444 (7.444)	Loss 0.0020 (0.0020)	Accuracy 0.120 (0.120)
=> Writing results json to /home/jfeil/IDP/baseline_implementations/microsoft_baseline/pose_estimation/experiment_output/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_01:19:49/results/keypoints_val2017_results.json
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *keypoints*
Accumulating evaluation results...
DONE (t=0.00s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets= 20 ] = 0.002
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets= 20 ] = 0.013
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets= 20 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.001
 Average Precision  (AP) @[ IoU=0

[]


In [16]:
# Display the (experiment_id, dataset_params, train_params) tuples collected
# by the sweep loop for runs whose training subprocess returned non-zero.
failed_experiments

'7ba78b801ce64775bb49f448c7089606'