The goal of this notebook is to build a teacher-student training loop: we first train a teacher model on the labeled data, then use that teacher to pseudo-label additional (unlabeled) data, and finally replace the teacher with a student model that is trained on all samples, both labeled and pseudo-labeled.

# Loading data

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
import numpy as np
import datetime
import pickle
import time
import os
import pathlib
import matplotlib.pyplot as plt
import sys
import shutil
 
# Some stuff to make utils-function work
sys.path.append('../utils')
from pipeline import *
from create_model import *
from utils import *
from unlabeled_utils import *
from evaluate_model import *
%load_ext autoreload
%autoreload 2

# Jupyter-specific
%matplotlib inline

# Timestamp (YYYYmmdd-HHMMSS) taken when this cell runs; it names the
# "./logs/<project_time>/..." directory that every round of this run logs to.
project_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

Configuration parameters for the dataset, model, callbacks and logging

In [None]:
# Dataset locations. The default root is the original author's local path;
# set the HYPER_KVASIR_DIR environment variable to point at another copy of
# the data without having to edit the notebook.
data_root = pathlib.Path(
    os.environ.get("HYPER_KVASIR_DIR", '/home/henriklg/master-thesis/data/hyper-kvasir')
)
data_dir = data_root / 'labeled_ttv'
unlab_dir = data_root / 'unlabeled_ttv'

# The very first round is always the teacher of iteration 0.
iteration = 0
dir_name = str(iteration)+"_teacher"
log_dir = "./logs/{}/{}".format(project_time, dir_name)

# Single configuration dict handed to every helper in ../utils.
conf = {
    # Dataset
    "data_dir": data_dir,
    "unlab_dir": unlab_dir,
    "ds_info": 'hypkva',
    "augment": ["crop","flip","brightness","saturation","contrast","rotate"],
    "aug_mult": 0.4,
    "resample": True,
    "class_weight": False,
    "shuffle_buffer_size": 2000,        # 0=no shuffling
    "seed": 2511,
    "neg_class": None,                 # select neg class for binary ds (normal class)
    "outcast": None,                   # list of folders to drop - currently only works for 1 item
    # Model
    "model": 'EfficientNetB0',
    "weights": "imagenet",                   # which weights to initialize the model with
    "dropout": 0.2,
    "num_epochs": 3,
    "batch_size": 16,
    "img_shape": (128, 128, 3),
    "learning_rate": 0.001,
    "optimizer": 'Adam',
    "final_activation": 'softmax',     # sigmoid for binary ds
    # Callbacks
    "tensorboard": False,
    "learning_schedule": False,
    "decay_rate": 0,                   # 128:0.25   64:1.0   32:4.0   16:16   8:64
    "checkpoint": False,
    "early_stopp": False,
    "early_stopp_patience": 7,
    # Misc
    "verbosity": 0,
    "keep_threshold": 0.8,
    "dir_name": dir_name,
    "log_dir": log_dir,
    "cache_dir": "./cache",
    }

Create the training, test and validation datasets using utils/data_prep.py.  
Returns `tf.data.Dataset` objects that are shuffled, cached and batched.

In [None]:
# Build the dict of datasets used throughout the notebook (later cells read
# ds["train"], ds["test"] and ds["clean_train"]).
# NOTE(review): later cells also read conf["num_classes"] and conf["steps"],
# which are not set in the config cell - presumably create_dataset() adds
# them to `conf` in place; confirm in ../utils.
ds = create_dataset(conf)

In [None]:
# One-time bookkeeping; run this cell only before the first round.
sanity = []

# The combined training set starts out as ds["clean_train"].
ds["combined_train"] = ds["clean_train"]

# Per-class sample counts, one entry per dataset state (initial state first).
datasets_bin = [tf_bincount(ds["combined_train"], conf["num_classes"])]

In [None]:
def run_iteration(conf, ds, datasets_bin, sanity):
    """Run one training / pseudo-labeling round for the model described by ``conf``.

    Steps, in order:
      1. Ensure the on-disk cache directory for the training set exists.
      2. Build a fresh model and callbacks from ``conf`` and fit on
         ``ds["train"]``, validating on ``ds["test"]``.
      3. Save the model (only when ``conf["num_epochs"] > 9``) and evaluate it.
      4. Predict on the unlabeled dataset, sort the pseudo-labels from highest
         to lowest confidence and inspect them.
      5. Resample the pseudo-labeled samples, combine them with the existing
         data and update the bookkeeping lists.
      6. Delete the cache directory.

    Args:
        conf: configuration dict. Keys read directly in this function are
            "img_shape", "ds_info", "steps" (with "train" and "test"),
            "num_epochs", "verbosity" and "log_dir"; the dict is also passed
            on to the helper functions.
        ds: dict of datasets; "train" and "test" are read here and the whole
            dict is passed to ``evaluate_model`` and ``resample_and_combine``.
        datasets_bin: bookkeeping list passed to ``resample_and_combine`` and
            ``update_sanity``.
        sanity: bookkeeping list passed to ``update_sanity``.

    Returns:
        None.
        NOTE(review): ``datasets_bin``, ``sanity``, ``unlab_ds`` and
        ``unlab_size`` are rebound locally and never returned, so the caller
        only sees updates that the helpers make in place - confirm that this
        is intended.
    """
    ## Prep cache
    cache_dir = './cache/{}_{}_{}_resampled/'.format(
        conf["img_shape"][0],
        conf["ds_info"],
        "train"
    )
    # create directory if not already exist
    pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

    model = create_model(conf)
    callbacks = create_callbacks(conf)
    class_weights = get_class_weights(ds["train"], conf)

    start_time = time.time()
    history = model.fit(
            ds["train"],
            steps_per_epoch = conf["steps"]["train"],
            epochs = conf["num_epochs"],
            validation_data = ds["test"],
            validation_steps = conf["steps"]["test"],
            validation_freq = 1,
            class_weight = class_weights,
            callbacks = callbacks,
            verbose = 1
    )
    if conf["verbosity"]:
        print ("Time spent on training: {:.2f} minutes.".format(np.round(time.time() - start_time)/60))

    # Only persist models from longer runs; short runs are treated as throwaway.
    # A saved model can be reloaded with tf.keras.models.load_model(<log_dir>/model).
    if conf["num_epochs"] > 9:
        model.save(conf["log_dir"]+'/model')

    evaluate_model(model, history, ds, conf)

    unlab_ds, unlab_size = create_unlab_ds(conf)

    # Accumulators filled in by generate_labels().
    count = {"findings": 0, "total": 0}
    pseudo = {
        "pred_list": [],
        "lab_list": [],
        "name_list": []
    }

    pseudo, count = generate_labels(
        count, pseudo, unlab_ds, unlab_size, model, conf)

    # Sort in order of highest confidence to lowest
    pseudo_sorted = custom_sort(pseudo)

    checkout_findings(pseudo_sorted, conf)
    checkout_class("polyps", pseudo_sorted, conf)

    datasets_bin, added_samples = resample_and_combine(ds, conf, pseudo, pseudo_sorted, datasets_bin)

    sanity = update_sanity(sanity, len(added_samples), unlab_size, datasets_bin, conf)

    # Update unlab_ds
    unlab_ds, unlab_size = reduce_dataset(unlab_ds, unlab_size, remove=added_samples)

    # Remove cache
    try:
        shutil.rmtree(cache_dir)
    except OSError as e:
        # Report the directory that failed to be removed (the original code
        # referenced an undefined name here and raised NameError instead).
        print("Error: %s : %s" % (cache_dir, e.strerror))
    time.sleep(1)

In [None]:
# Per-role settings; each round copies every key of the active spec into `conf`.
teacher = {"name": "teacher", "aug_mult": 0.5, "dropout": 0.2}
student = {"name": "student", "aug_mult": 0.5, "dropout": 0.2}

# Two teacher -> student rounds (the same two dict objects, repeated).
models_list = [teacher, student] * 2

In [None]:
# Alternate teacher and student rounds. `conf` is updated in place before
# every round so the helpers see the settings of the model being trained.
for idx, curr_model in enumerate(models_list):
    iteration = idx // 2   # 0,0,1,1 etc
    dir_name = str(iteration)+'_'+curr_model["name"]
    # Keep both derived entries in sync; the config cell sets them together,
    # so updating only "log_dir" would leave "dir_name" stuck at "0_teacher".
    conf["dir_name"] = dir_name
    conf["log_dir"] = "./logs/{}/{}".format(project_time, dir_name)

    # Apply the per-role overrides (name, aug_mult, dropout, ...).
    conf.update(curr_model)

    run_iteration(conf, ds, datasets_bin, sanity)

    # refresh training data
    ds["train"] = prepare_for_training(
        ds=ds["combined_train"],
        ds_name='train_'+dir_name,
        conf=conf,
        cache=True
    )