# Installing WandB & Downgrading OpenCV

In [None]:
%%capture
# The %%capture cell magic above keeps the pip output out of the notebook.
!pip install --upgrade pip
!pip install wandb

# Downgrade opencv to solve some compatibility issues
# (the headless build is removed first, then re-installed at a pinned version)
!pip uninstall opencv-python-headless -y
!pip install opencv-python-headless==4.1.2.30

# Log Into WandB, and Mount to Drive


In [None]:
import tensorflow as tf
import tensorboard
from collections import defaultdict
from google.colab import drive
import tarfile
import re
import os
import wandb
import shutil
import csv
from tensorflow.python.summary.summary_iterator import summary_iterator

# Mount Google Drive; every path used further down in this notebook that
# starts with /content/drive/MyDrive lives on this mount.
drive.mount('/content/drive')

# Authenticate this runtime with Weights & Biases (needed for the sweep below)
!wandb login

# Load the TensorBoard notebook extension
%load_ext tensorboard

# Install Tensorflow Object Detection API

In [None]:
%%bash
# Install the Tensorflow object detection API
# This installation may take a while
sudo apt -y install protobuf-compiler
cd /content/drive/MyDrive/models/research
# Compile the protobuf definitions into Python modules, written in place
protoc object_detection/protos/*.proto --python_out=.   
# The TF2 setup.py has to sit in the research folder for pip to install it
cp object_detection/packages/tf2/setup.py .
python -m pip install .

# Test installation (uncomment to run the model builder tests)
# python /content/drive/MyDrive/models/research/object_detection/builders/model_builder_tf2_test.py

# Initialization of Path and Globals

* Number of classes (always 1 for this project: 'cots')
* Choose how many prediction images to make for the tfevent-eval file
* Choose if you want to save the tfevent files in wandb

In [None]:
# Number of object classes written into the pipeline config.
# Always 1 for this project (class: cots)
NUM_CLASSES: int = 1  
# Number of prediction images stored in tfevent-eval
# (written as `num_visualizations` into the eval_config of the pipeline config)
NUM_VISUALIZATIONS: int = 5
# Storing tfevent files in the wandb-run
# (when True, the train/eval tfevent files are uploaded with wandb.save)
STORE_TFEVENT_FILES: bool = True

In [None]:
# Shared path prefixes on the mounted Drive.
_RESEARCH_DIR = '/content/drive/MyDrive/models/research/'
_TRAINING_DATA_DIR = _RESEARCH_DIR + 'object_detection/train_cots/training_data/'

# Folder holding the downloaded / extracted pretrained checkpoints.
PRETRAINED_DIR = _RESEARCH_DIR + 'deploy/pretrained/'

# Dataset files and label map used by every model.
TRAIN_RECORD_FILE = _TRAINING_DATA_DIR + 'train.tfrecord'
VAL_RECORD_FILE = _TRAINING_DATA_DIR + 'validation.tfrecord'
TEST_RECORD_FILE = _TRAINING_DATA_DIR + 'test.tfrecord'
LABELMAP_FILE = _TRAINING_DATA_DIR + 'label_map.txt'


def _model_entry(model_name, base_config_path=None):
    '''Build one MODELS_CONFIG entry.

    The checkpoint archive is always `<model_name>.tar.gz`; the template
    config defaults to `<model_name>.config` unless given explicitly.
    '''
    if base_config_path is None:
        base_config_path = model_name + '.config'
    return {
        'model_name': model_name,
        'base_config_path': base_config_path,
        'pretrained_checkpoint': model_name + '.tar.gz',
    }


# Short model key (used in the sweep config) -> files belonging to that model.
MODELS_CONFIG = {
    'efficientdet-d0': _model_entry(
        'efficientdet_d0_coco17_tpu-32',
        base_config_path='ssd_efficientdet_d0_512x512_coco17_tpu-8.config',
    ),
    'resnet-50-faster': _model_entry('faster_rcnn_resnet50_v1_640x640_coco17_tpu-8'),
    'resnet-50': _model_entry('ssd_resnet50_v1_fpn_640x640_coco17_tpu-8'),
    # centernet uses the pipeline.config shipped inside its checkpoint archive
    'centernet': _model_entry(
        'centernet_hg104_512x512_coco17_tpu-8',
        base_config_path=PRETRAINED_DIR + 'centernet_hg104_512x512_coco17_tpu-8/pipeline.config',
    ),
    'mobilenet': _model_entry('ssd_mobilenet_v2_320x320_coco17_tpu-8'),
}

# Train Function

In [None]:
def train_eval(config, run_id):
    # initialization of model specific paths
    model = config['model']
    # dir for log training and metrics, gets reset every sweep
    model_dir = '/content/training/' + MODELS_CONFIG[model]['model_name']
    log_dir = '/content/drive/MyDrive/models/research/deploy/logs/' + run_id
    base_pipeline_file = MODELS_CONFIG[model]['base_config_path']
    pretrained_checkpoint = MODELS_CONFIG[model]['pretrained_checkpoint']
    fine_tune_checkpoint = PRETRAINED_DIR + MODELS_CONFIG[model]['model_name'] + '/checkpoint/ckpt-0'
    config_file = model_dir + '/deployed_config.config'

    # reset if exists log_dir and model_dir
    if os.path.exists(log_dir):
        shutil.rmtree(log_dir)
    os.makedirs(log_dir, exist_ok=True)

    if os.path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.makedirs(model_dir, exist_ok=True)

    # prepating environment
    prepare_env(model_dir, pretrained_checkpoint, base_pipeline_file)

    # update config file to match sweep_config
    update_config_file(config, base_pipeline_file, config_file, fine_tune_checkpoint)

    # Train the model, output get stored in model_dir
    !python /content/drive/MyDrive/models/research/object_detection/model_main_tf2.py \
        --pipeline_config_path {config_file} \
        --model_dir {model_dir} \
        --sample_1_of_n_eval_examples=1 \
        --checkpoint_every_n=1000 \
        --record_summaries=False \
        --alsologtostderr
    

    # Test the model
    !python /content/drive/MyDrive/models/research/object_detection/model_main_tf2.py \
        --pipeline_config_path {config_file} \
        --model_dir {model_dir} \
        --checkpoint_dir {model_dir} \
        --eval_timeout=4 \
        --wait_interval=2 \
        --alsologtostderr


    # Move tfevent files from model_dir to log_dir
    !cp -r {model_dir + '/train'} {log_dir + '/train'}
    !cp -r {model_dir + '/eval'} {log_dir + '/eval'}

    # Also store hpams in log_dir
    filename = 'hpams.csv'
    path = os.path.join(log_dir, filename)
    headers = list(config.keys())
    with open(path, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()
        writer.writerow(config)
    

def prepare_env(model_dir, pretrained_checkpoint, base_pipeline_file):
    '''download config file, and relevant model 
    information, also makes sure folders exist'''

    # Download pretrained weights if they arent already
    %cd {PRETRAINED_DIR}

    # check if tarfile file already exists
    if not os.path.exists(PRETRAINED_DIR + pretrained_checkpoint):
        download_tar = 'http://download.tensorflow.org/models/object_detection/tf2/20200711/' + pretrained_checkpoint
        !wget {download_tar}

    # check if unzipped tarfile folder exists
    if not os.path.exists(pretrained_checkpoint.split('.')[0]):
        tar = tarfile.open(PRETRAINED_DIR + pretrained_checkpoint)
        tar.extractall()
        tar.close()

    # get config file and set path of config file
    if not os.path.exists(base_pipeline_file):
        download_config = 'https://raw.githubusercontent.com/tensorflow/models/master/research/object_detection/configs/tf2/' + base_pipeline_file
        !wget {download_config}


def update_config_file(sweep_config, base_pipeline_file, config_file, fine_tune_checkpoint):
    '''
    Renders the pipeline config of one run from a template config.

    * sweep_config (dict) w/keys:
        'model'
        'batch_size'
        'num_steps'
        'learning_rate_base'     (only 'resnet-50-faster', 'efficientdet-d0')
        'initial_learning_rate'  (only 'centernet')
    * base_pipeline_file: path to template config for this model
    * config_file: where you want to store the update config file
    * fine_tune_checkpoint: checkpoint path written into the config

    The updated config is printed after it has been written. All
    substitutions are computed before config_file is touched, so a missing
    sweep_config key raises KeyError without leaving a half-written file.
    '''

    def set_field(pattern, replacement, text):
        # A callable replacement is inserted verbatim: backslashes or group
        # references inside a path are never interpreted by `re`.
        return re.sub(pattern, lambda _match: replacement, text)

    with open(base_pipeline_file) as f:
        config = f.read()

    # Set labelmap path
    config = set_field('label_map_path: ".*?"',
                       'label_map_path: "{}"'.format(LABELMAP_FILE), config)

    # Set fine_tune_checkpoint path
    config = set_field('fine_tune_checkpoint: ".*?"',
                       'fine_tune_checkpoint: "{}"'.format(fine_tune_checkpoint), config)

    # Set train / validation / test tf-record file paths
    record_files = (('train', TRAIN_RECORD_FILE),
                    ('val', VAL_RECORD_FILE),
                    ('test', TEST_RECORD_FILE))
    for split, record_file in record_files:
        config = set_field('(input_path: ".*?)(PATH_TO_BE_CONFIGURED/' + split + ')(.*?")',
                           'input_path: "{}"'.format(record_file), config)

    # Set number of classes.
    config = set_field('num_classes: [0-9]+',
                       'num_classes: {}'.format(NUM_CLASSES), config)

    # Hyper parameters
    ########################################################################
    num_steps = sweep_config['num_steps']

    # Set batch size
    # NOTE(review): this pattern matches every `batch_size:` entry in the
    # template, not only the one of train_config -- confirm that is intended.
    config = set_field('batch_size: [0-9]+',
                       'batch_size: {}'.format(sweep_config['batch_size']), config)

    # Set training steps
    config = set_field('num_steps: [0-9]+',
                       'num_steps: {}'.format(num_steps), config)

    # CHANGE LEARNING RATE PARAMETERS
    # models 'resnet-50-faster' and 'efficientdet-d0' are using
    # cosine_decay_learning_rate in momentum optimizer:
    if sweep_config['model'] in ['resnet-50-faster', 'efficientdet-d0']:
        learning_rate_base = sweep_config['learning_rate_base']
        warmup_learning_rate = learning_rate_base/100
        warmup_steps = num_steps//25

        config = set_field('learning_rate_base: [-.e0-9]+',
                           'learning_rate_base: {}'.format(learning_rate_base), config)
        config = set_field('warmup_learning_rate: [-.e0-9]+',
                           'warmup_learning_rate: {}'.format(warmup_learning_rate), config)
        config = set_field('total_steps: [0-9]+',
                           'total_steps: {}'.format(num_steps), config)
        config = set_field('warmup_steps: [0-9]+',
                           'warmup_steps: {}'.format(warmup_steps), config)

    # 'centernet' model is using manual_step_learning_rate:
    elif sweep_config['model'] in ['centernet']:
        initial_learning_rate = sweep_config['initial_learning_rate']

        config = set_field('initial_learning_rate: [-.e0-9]+',
                           'initial_learning_rate: {}'.format(initial_learning_rate), config)

        # The schedule lives in `schedule { step: ... learning_rate: ... }`
        # entries; they are rewritten in order of appearance:
        #   1st entry: at 60% of num_steps -> initial_learning_rate / 10
        #   2nd entry: at 80% of num_steps -> initial_learning_rate / 100
        new_schedule = iter([
            (int(num_steps * .6), initial_learning_rate/10),
            (int(num_steps * .8), initial_learning_rate/100),
        ])

        def set_schedule_entry(match):
            entry = next(new_schedule, None)
            if entry is None:
                # more entries in the template than we define: keep them
                return match.group(0)
            step, learning_rate = entry
            return '{}{}{}{}'.format(match.group(1), step, match.group(2), learning_rate)

        config = re.sub(r'(schedule\s*\{\s*step:\s*)[0-9]+(\s*learning_rate:\s*)[-+.eE0-9]+',
                        set_schedule_entry, config)
    ########################################################################

    # Set how many images you want visualized in the evaluation tb-event file
    config = set_field('eval_config: {',
                       'eval_config: {' + '\n  num_visualizations: {}'.format(NUM_VISUALIZATIONS), config)

    # Set fine-tune checkpoint type to detection
    # more details here: https://drive.google.com/file/d/1diOU07Qfc73R3DvZL9JyYfdMZWhAXCL5/view?usp=sharing
    config = set_field('fine_tune_checkpoint_type: "classification"',
                       'fine_tune_checkpoint_type: "{}"'.format('detection'), config)

    if os.path.exists(config_file):
        os.remove(config_file)
        print('deleted old initialization cofig file')

    with open(config_file, 'w') as f:
        f.write(config)

    # test if the config is updated:
    print(config)


def get_model_dir(model):
    '''Returns the training folder of a model.

    * model: key of MODELS_CONFIG; the folder is named after the
      'model_name' stored for that key
    '''
    model_name = MODELS_CONFIG[model]['model_name']
    return ''.join(('/content/training/', model_name))


def get_log_dir(sweep_id):
    '''Returns the log folder on Drive belonging to the given id.

    * sweep_id (str): appended as folder name to the shared logs folder
    '''
    logs_root = '/content/drive/MyDrive/models/research/deploy/logs/'
    return ''.join((logs_root, sweep_id))


def log_eval_metrics(path):
    '''Collects the metrics of an evaluation-tfevent file.

    Returns a list with one single-entry dict {tag: value} per summary
    value; values of the 'images' plugin are left out.
    '''
    return [
        {value.tag: tf.make_ndarray(value.tensor).item()}
        for event in summary_iterator(path)
        for value in event.summary.value
        if value.metadata.plugin_data.plugin_name != 'images'
    ]


def log_train_metrics(path):
    '''Collects the training information of a training-tfevent file.

    Returns a list with one dict {tag: value, 'Step': step} per summary
    value; values of the 'images' plugin are left out.
    '''
    return [
        {value.tag: tf.make_ndarray(value.tensor).item(), 'Step': event.step}
        for event in summary_iterator(path)
        for value in event.summary.value
        if value.metadata.plugin_data.plugin_name != 'images'
    ]

# Choose Model & Configure Sweep

In [None]:
# True if you want to start a new sweep,
# False if you are connecting to an existing sweep
new_sweep = True  
# number of runs to execute
count = 100
# sweep_id if you are connecting to an existing one
# (overwritten below when a new sweep is created)
sweep_id = 'btrorx9e'

# We chose to evaluate the three models: 'resnet-50-faster', 
#                                        'efficientdet-d0', 
#                                        'centernet'

if new_sweep:
    # model = 'centernet'

    sweep_config = {
        'method': 'grid',
        'parameters': {
            'model': {
                'values': ['resnet-50-faster', 'efficientdet-d0']
            },
            'batch_size': {
                'value': 3
            },
            'num_steps': {
                'values': [10000, 20000, 30000]
            },
            # For efficientdet-d0 and resnet-faster:
            'learning_rate_base': {
                'values': [0.1, 0.01]
            }
            # following is only for cosine learning rate (not centernet):
            # happens in the update_config_file() function:
            # warmup learning rate is calculated to learning_rate_base/100
            # warmup_steps is calculated to num_steps//25

            # For Centernet:
            # 'initial_learning_rate': {
            #     'values': [0.01, 0.03]
            # }
            # following is only for manual_step_learning_rate (only centernet):
            # happens in the update_config_file() function:
            # step_update_lr_1 is the step at 60% of num_steps, paired with
            #   learning_rate_1: 1/10 of initial_learning_rate
            # step_update_lr_2 is the step at 80% of num_steps, paired with
            #   learning_rate_2: 1/100 of initial_learning_rate
        }
    }
    #  Initialize new sweep
    sweep_id = wandb.sweep(sweep_config, project='model-sweeps', entity='cots-detectors')
    print('Created new sweel with sweep_id:{}'.format(sweep_id))



# function run by wandb sweep agent
def sweep_train_eval():
    '''Runs one trial of the sweep and pushes its results to wandb.

    Steps: start a wandb run, log its hyperparameters, train and evaluate
    with train_eval(), then read the metrics back from the tfevent files
    in the log dir and log them. When STORE_TFEVENT_FILES is set the
    tfevent files themselves are uploaded as well.

    Raises FileNotFoundError when training or evaluation left no tfevent
    file behind.
    '''

    def find_event_files(directory):
        # os.listdir() returns entries in arbitrary order, so sort them and
        # keep tfevent files only.
        # assumes the usual 'events.out.tfevents.*' file naming -- TODO confirm
        names = sorted(name for name in os.listdir(directory) if 'tfevents' in name)
        if not names:
            raise FileNotFoundError('no tfevent file found in {}'.format(directory))
        return [os.path.join(directory, name) for name in names]

    with wandb.init() as run:
        config = wandb.config
        run_id = run.id

        # tfevent files are stored in log_dir
        # checkpoint files during training are stored in model_dir
        log_dir = get_log_dir(run_id)

        wandb.log(dict(config))

        # stores tfevent files in log_dir
        # and training checkpoints in model_dir
        train_eval(dict(config), run_id)

        # read metrics from tfevent files and log to wandb
        train_files = find_event_files(os.path.join(log_dir, 'train'))
        eval_files = find_event_files(os.path.join(log_dir, 'eval'))

        # everything is read before the first metric is logged, so a broken
        # file fails the run before partial metrics are pushed
        train_metrics = [metric
                         for train_file in train_files
                         for metric in log_train_metrics(train_file)]
        eval_metrics = [metric
                        for eval_file in eval_files
                        for metric in log_eval_metrics(eval_file)]

        for metric in train_metrics:
            wandb.log(metric)

        for metric in eval_metrics:
            wandb.log(metric)

        if STORE_TFEVENT_FILES:
            for event_file in train_files + eval_files:
                wandb.save(event_file, base_path=log_dir)

# connects to sweep
# entity and project are passed explicitly so that the bare sweep_id of an
# existing sweep (new_sweep = False) is looked up in the same place
# wandb.sweep() creates new sweeps in, instead of in the account defaults
wandb.agent(sweep_id, function=sweep_train_eval,
            entity='cots-detectors', project='model-sweeps', count=count)