In [1]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
def create_splits(input_datasets, percentage=0.2):
    """Build leave-one-out splits over ``input_datasets``.

    One split is produced per position: the entry at that position becomes the
    single-element validation set, and all remaining entries (in their original
    order) form the training set.

    Args:
        input_datasets: Sized, indexable collection of dataset entries.
        percentage: Accepted for interface compatibility; not used by the
            current implementation.

    Returns:
        A list with one ``{'train_set': [...], 'val_set': [...]}`` dict per
        entry of ``input_datasets``.
    """
    return [
        {
            'train_set': [
                entry
                for position, entry in enumerate(input_datasets)
                if position != held_out
            ],
            'val_set': [input_datasets[held_out]],
        }
        for held_out in range(len(input_datasets))
    ]

# create_splits([80, 68, 176, 81, 82, 71, 87])

In [3]:
from train import train_loop, prepare_config, prepare_training_set
import os
import subprocess

# Endpoints of the local MLflow setup: the tracking server and the S3 endpoint.
mlflow_env = {
    'MLFLOW_TRACKING_URI': 'http://localhost:5000',
    'MLFLOW_S3_ENDPOINT_URL': 'http://localhost:9000',
}

# Export them into this process' environment before mlflow is imported below.
os.environ.update(mlflow_env)

import mlflow
from tqdm.notebook import tqdm

# Point the MLflow client at the local tracking server.
mlflow.set_tracking_uri("http://localhost:5000")
# The registry URI previously pointed at port 9000, which this notebook
# configures as the S3 artifact endpoint (MLFLOW_S3_ENDPOINT_URL), not an
# MLflow server.
# NOTE(review): assumes the model registry is served by the tracking server on
# port 5000 -- confirm against the deployment.
mlflow.set_registry_uri("http://localhost:5000")

In [4]:
# Data splits to train on: a tuple of {'train_set': ..., 'val_set': ...} dicts
# (a single split here).
training_data = (
    {
        'train_set': [80, 68, 176, 81, 82, 71],
        'val_set': [87],
    },
)

In [None]:
# training_data is a tuple of split dicts, so it cannot be indexed by key
# directly; materialise the first split into the working directory.
prepare_training_set('/home/jfeil/IDP/training_pipeline/data/tennis_dataset', training_data[0]['train_set'], training_data[0]['val_set'], '/var/tmp/test')

In [6]:
# Use the fourth leave-one-out split. It is wrapped in a one-element tuple
# because the run loop iterates over training_data and expects each item to be
# a split dict; a bare dict would be iterated key by key.
training_data = (create_splits([80, 68, 176, 81, 82, 71, 87])[3],)

{'train_set': [80, 68, 176, 82, 71, 87], 'val_set': [81]}

In [5]:
def create_experiments(key, values, pos):
    """Build one experiment per value of a single swept parameter.

    Each experiment is a pair of override dicts. The swept parameter is placed
    in the first dict when ``pos == 0`` and in the second dict otherwise; the
    other dict of the pair is left empty.

    Args:
        key: Name of the parameter being varied.
        values: Iterable of values to assign to ``key``, one experiment each.
        pos: ``0`` to put the override in the first dict of the pair, any
            other value to put it in the second.

    Returns:
        A tuple of ``(first_overrides, second_overrides)`` pairs, in the order
        of ``values``.
    """
    return tuple(
        ({key: value}, {}) if pos == 0 else ({}, {key: value})
        for value in values
    )

In [7]:
# Sweep SAT_FACTOR; pos=0 puts each value into the first override dict of the pair.
train_experiments = create_experiments(
    "SAT_FACTOR",
    [0.05, 0.1, 0.25, 0.35, 0.4, 0.5],
    pos=0,
)
train_experiments

(({'SAT_FACTOR': 0.05}, {}),
 ({'SAT_FACTOR': 0.1}, {}),
 ({'SAT_FACTOR': 0.25}, {}),
 ({'SAT_FACTOR': 0.35}, {}),
 ({'SAT_FACTOR': 0.4}, {}),
 ({'SAT_FACTOR': 0.5}, {}))

In [6]:
# Six identical END_EPOCH=1 runs (second override dict, pos=1), grouped under
# key 5 of the experiments mapping.
train_experiments = {5: create_experiments("END_EPOCH", [1] * 6, pos=1)}
train_experiments

{5: (({}, {'END_EPOCH': 1}),
  ({}, {'END_EPOCH': 1}),
  ({}, {'END_EPOCH': 1}),
  ({}, {'END_EPOCH': 1}),
  ({}, {'END_EPOCH': 1}),
  ({}, {'END_EPOCH': 1}))}

In [20]:
# Experiments mapping: key -> tuple of (first_overrides, second_overrides) pairs.
train_experiments = {
    1: (
        ({'SAT_FACTOR': 0.4}, {'LR_STEP': [20, 60]}),
    ),
}

In [23]:
# Run every configured experiment on every data split, one MLflow run each.
# Expected shapes:
#   training_data     -> iterable of {'train_set': [...], 'val_set': [...]} dicts
#   train_experiments -> {experiment_id: ((dataset_params, train_params), ...)}
DATASET_ROOT = '/home/jfeil/IDP/training_pipeline/data/tennis_dataset'
WORK_DIR = '/var/tmp/test'
BASE_CONFIG = '/../experiments/coco/resnet152/384x288_d256x3_adam_lr1e-3_TrainingLoopDefault.yaml'

# Accept a single split dict as well: iterating a dict directly would yield its
# keys (and len() would count them) instead of treating it as one split.
data_sets = (training_data,) if isinstance(training_data, dict) else training_data

experiment_count = sum(len(runs) for runs in train_experiments.values()) * len(data_sets)

failed_experiments = []

# The context manager closes the progress bar even if a run raises.
with tqdm(total=experiment_count) as progress_bar:
    for data_set in data_sets:
        prepare_training_set(DATASET_ROOT, data_set['train_set'], data_set['val_set'], WORK_DIR)
        for experiment_id, runs in train_experiments.items():
            for dataset_params, train_params in runs:
                with mlflow.start_run(experiment_id=experiment_id) as run:
                    config_path = prepare_config(WORK_DIR, 'experiment_output', BASE_CONFIG, dataset_params, train_params)
                    try:
                        mlflow.log_params(data_set)
                        # Training runs in a child process that is handed the id of this MLflow run.
                        result = subprocess.run(["python3", "train.py", "--cfg", config_path, "--mlflow-run", run.info.run_id], env=os.environ)
                        if result.returncode != 0:
                            print(f"FAILED {(experiment_id, dataset_params, train_params)}")
                            failed_experiments += [(experiment_id, dataset_params, train_params)]
                            mlflow.end_run('FAILED')
                        # In-process alternative, kept for reference:
                        # train_loop(config_path, num_workers=16, enable_mlflow=False)
                    finally:
                        # Remove the per-run config file even if logging or launching raised.
                        os.remove(config_path)
                    progress_bar.update(1)

if failed_experiments:
    print(failed_experiments)

  0%|          | 0/1 [00:00<?, ?it/s]

{'cfg': '/var/tmp/test/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_16:31:43.565357.yaml',
 'frequent': 20,
 'gpus': None,
 'workers': None}
{'CUDNN': {'BENCHMARK': True, 'DETERMINISTIC': False, 'ENABLED': True},
 'DATASET': {'DATASET': 'coco',
             'DATA_FORMAT': 'png',
             'FLIP': True,
             'HYBRID_JOINTS_TYPE': '',
             'ROOT': '/var/tmp/test',
             'ROT_FACTOR': 40,
             'SAT_FACTOR': 0.4,
             'SCALE_FACTOR': 0.15,
             'SELECT_DATA': False,
             'TEST_SET': 'val2017',
             'TRAIN_SET': 'train2017'},
 'DATA_DIR': '',
 'DEBUG': {'DEBUG': True,
           'SAVE_BATCH_IMAGES_GT': True,
           'SAVE_BATCH_IMAGES_PRED': True,
           'SAVE_HEATMAPS_GT': True,
           'SAVE_HEATMAPS_PRED': True},
 'GPUS': '0',
 'LOG_DIR': 'log',
 'LOSS': {'USE_TARGET_WEIGHT': True},
 'MODEL': {'EXTRA': {'DECONV_WITH_BIAS': False,
                     'FINAL_CONV_KERNEL': 1,
                     'HEATMAP

54f5f756984e44dd8e673d9d8ca2be18
=> creating /home/jfeil/IDP/baseline_implementations/microsoft_baseline/pose_estimation/experiment_output/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_16:31:43
=> creating log/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_16:31:43_2024-01-17-16-31


Epoch: [0][0/6]	Time 14.575s (14.575s)	Speed 2.2 samples/s	Data 8.052s (8.052s)	Loss 0.00200 (0.00200)	Accuracy 0.019 (0.019)
Test: [0/1]	Time 7.251 (7.251)	Loss 0.0020 (0.0020)	Accuracy 0.103 (0.103)
=> Writing results json to /home/jfeil/IDP/baseline_implementations/microsoft_baseline/pose_estimation/experiment_output/coco/pose_resnet_152/384x288_d256x3_adam_lr1e-3_TransferLearning_2024-01-17_16:31:43/results/keypoints_val2017_results.json
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *keypoints*
Accumulating evaluation results...
DONE (t=0.00s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets= 20 ] = 0.001
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets= 20 ] = 0.012
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets= 20 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.002
 Average Precision  (AP) @[ IoU=0

In [11]:
# Inspect the runs registered under key 4. .get() is used so that this cell
# does not raise KeyError (and stop a full re-run) when the mapping has no
# such key, as with the definition that only contains key 1.
train_experiments.get(4, ())

(({'SAT_FACTOR': 0.25}, {}),
 ({'SAT_FACTOR': 0.35}, {}),
 ({'SAT_FACTOR': 0.4}, {}),
 ({'SAT_FACTOR': 0.5}, {}))

In [None]:
# Display the (experiment_id, dataset_params, train_params) tuples collected
# for runs whose training subprocess exited with a non-zero return code.
failed_experiments