The goal of this notebook is to build a teacher-student model: we first train a teacher on labeled data and use it to pseudo-label additional unlabeled data; we then swap the teacher for a student and train again on all the samples (labeled and pseudo-labeled).

# Loading data

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
import numpy as np
import datetime
import pickle
import time
import os
import pathlib
import matplotlib.pyplot as plt
import sys
 
# Some stuff to make utils-function work
sys.path.append('../utils')
from pipeline import *
from create_model import *
from utils import *
from unlabeled_utils import *
from evaluate_model import *
%load_ext autoreload
%autoreload 2

# Jupyter-specific
%matplotlib inline

# Timestamp (YYYYMMDD-HHMMSS) identifying this run; used below to build the log directories.
project_time = "{:%Y%m%d-%H%M%S}".format(datetime.datetime.now())

Some parameters

In [None]:
# Dataset locations. The defaults are the original author's paths; set the
# KVACAP_DATA_DIR / KVACAP_UNLAB_DIR environment variables to run the notebook
# on another machine without editing this cell.
data_dir = pathlib.Path(os.environ.get(
    "KVACAP_DATA_DIR",
    '/home/henriklg/master-thesis/data/kvasir-capsule/labeled_new_splits/'))
unlab_dir = pathlib.Path(os.environ.get(
    "KVACAP_UNLAB_DIR",
    '/home/henriklg/master-thesis/data/kvasir-capsule/unlabeled_ttv/'))

# Teacher/student rounds are numbered from 0; every model logs to its own
# sub-directory of ./logs/<project_time>/.
iteration = 0
dir_name = str(iteration)+"_teacher"
log_dir = "./logs/{}/{}".format(project_time, dir_name)

# Single configuration dict handed to all project helpers. Later cells
# overwrite some entries (log_dir, aug_mult, model, dropout, class_limit, ...)
# before each training round.
conf = {
    # Dataset
    "data_dir": data_dir,
    "unlab_dir": unlab_dir,
    "ds_info": 'kvacap',
    "augment": ["crop","flip","brightness","saturation","contrast","rotate"],
    "aug_mult": 0.8,
    "resample": True,
    "class_weight": False,
    "shuffle_buffer_size": 2000,       # no shuffling: 0
    "seed": 2511,
    "neg_class": None,                 # select neg class for binary ds (normal class)
    "outcast": None,                   # list of folders to drop - currently only supports 1 item
    # Model
    "model": 'EfficientNetB4',
    "weights": "imagenet",             # which weights to initialize the model with
    "dropout": 0.3,
    "num_epochs": 15,
    "batch_size": 8,
    "img_shape": (128, 128, 3),
    "learning_rate": 0.001,
    "optimizer": 'Adam',
    "final_activation": 'softmax',     # sigmoid for binary ds
    # Callbacks
    "tensorboard": False,
    "decay_rate": 0,                   # 128:0.25   64:1.0   32:4.0   16:16   8:64
    "checkpoint": False,
    "early_stopp_patience": 0,         # disable: 0
    # Misc
    "verbosity": 1,
    "keep_thresh": 0.95,
    "pseudo_thresh": 20000,
    "class_limit": 1000,               # passed as `limit` to resample_and_combine below
    "dir_name": dir_name,
    "log_dir": log_dir,
    "cache_dir": "./cache",
    }

Create the training, testing and validation datasets from utils/data_prep.py.  
Returns `tf.data.Dataset` objects for shuffled, cached and batched data.

In [None]:
# Build the datasets described by `conf`. Later cells read ds["train"], ds["val"] and
# ds["clean_train"], as well as conf["steps"], conf["ds_sizes"] and conf["num_classes"];
# none of those are set in the config cell, so create_dataset presumably adds them to
# `conf` in place (TODO confirm in utils).
ds = create_dataset(conf)

In [None]:
# First iteration only: initialise the state that is carried through all rounds.
# NOTE: re-running this cell after pseudo-labels were added resets that state.
sanity = []  # one entry is appended by update_sanity after every pseudo-labelling round
ds["unlab"] = create_unlab_ds(conf)  # the unlabeled data the teacher will predict on
# Bin counts of the clean training set, kept as the first element of a list that
# resample_and_combine receives and returns in every round.
datasets_bin = [tf_bincount(ds["clean_train"], conf["num_classes"])]
# The combined training set starts out as the clean labeled set.
ds["combined_train"] = ds["clean_train"]

## Recover from previous checkpoint

# Step 1: Train a teacher model on labeled images

In [None]:
# Build the first teacher from the current config (conf["model"], conf["dropout"], ... as set above).
model = create_model(conf)

In [None]:
# Callbacks handed to model.fit below; built from `conf` (presumably driven by its "Callbacks" entries — confirm in utils).
callbacks = create_callbacks(conf)

In [None]:
# Value passed as `class_weight` to model.fit below.
# NOTE(review): conf["class_weight"] is False above; presumably get_class_weights returns None in that case — confirm.
class_weights = get_class_weights(ds["train"], conf)

### Train the teacher model

In [None]:
# Train the teacher on the labeled training split, validating after every epoch.
start_time = time.time()

fit_options = dict(
    steps_per_epoch=conf["steps"]["train"],
    epochs=conf["num_epochs"],
    validation_data=ds["val"],
    validation_steps=conf["steps"]["val"],
    validation_freq=1,
    class_weight=class_weights,
    callbacks=callbacks,
    verbose=1,
)
history = model.fit(ds["train"], **fit_options)

# Elapsed wall-clock time: rounded to whole seconds, then reported in minutes.
elapsed_seconds = np.round(time.time() - start_time)
print("Time spent on training: {:.2f} minutes.".format(elapsed_seconds / 60))

### Evaluate the model

In [None]:
# Evaluate the model trained in the previous cell, given its training history, the datasets and the config.
evaluate_model(model, history, ds, conf)

# Step 2: use the teacher to generate pseudo labels on unlabeled images

### Run predictions on all unlabeled images

In [None]:
# Fresh accumulators for this pseudo-labelling round.
count = dict(findings=0, total=0)
pseudo = {key: [] for key in ("pred_list", "lab_list", "name_list")}

In [None]:
# Run `model` over the remaining unlabeled data; the predictions are collected in `pseudo`
# and the counters in `count` are returned updated.
# Presumably conf["keep_thresh"] decides which predictions are kept — confirm in utils.
pseudo, count = generate_labels(pseudo, count, ds["unlab"], model, conf)

## Inspect the classified images

In [None]:
# Sort in order of highest confidence to lowest
pseudo_sorted = custom_sort(pseudo)

# Inspect the sorted findings. show=False is passed explicitly
# (presumably suppresses inline image display — confirm in utils).
checkout_findings(pseudo_sorted, conf, show=False)

### Resample new findings, convert to tf.tensors and concatenate with original training data, and update unlab_ds

In [None]:
# Select pseudo-labelled samples, with conf["class_limit"] passed as `limit`.
# NOTE: this cell is not idempotent — every run consumes more of ds["unlab"].
datasets_bin, added_samples = resample_and_combine(
    ds,
    conf,
    pseudo,
    pseudo_sorted,
    datasets_bin,
    limit=conf["class_limit"],
)

# Remove the samples that were just added from the unlabeled dataset.
ds["unlab"] = reduce_dataset(ds["unlab"], remove=added_samples)

# Record the bookkeeping for this round and display the newest entry.
num_added = len(added_samples)
sanity, conf = update_sanity(sanity, num_added, datasets_bin, conf)
sanity[-1]

# Step 3: Train a student model on the combination of labeled images and pseudo labeled images

Now we have trained a teacher model and used it to predict on the unlabeled dataset, creating more samples with pseudo-labels.  
It's time for swapping the teacher with the student!

In [None]:
# Switch the run configuration over to the student of this iteration.
# NOTE(review): conf["dir_name"] is not updated here and keeps its initial value —
# confirm that no helper relies on it.
dir_name = "{}_student".format(iteration)
log_dir = "./logs/{}/{}".format(project_time, dir_name)

# Dataset: refresh the training-set size and the steps per epoch from the latest sanity entry.
conf["log_dir"] = log_dir
curr_train_size = sanity[-1]["curr_train_size"]
conf["ds_sizes"]["train"] = curr_train_size
conf["steps"]["train"] = curr_train_size // conf["batch_size"]
conf["aug_mult"] = 0.8

# Model
conf["model"] = 'EfficientNetB4'
conf["dropout"] = 0.2

In [None]:
# Rebuild the training pipeline from ds["combined_train"] under a name specific to this model.
train_ds_name = 'train_' + dir_name
ds["train"] = prepare_for_training(
    ds=ds["combined_train"],
    ds_name=train_ds_name,
    conf=conf,
    cache=True,
)

In [None]:
# Create the student model, its callbacks and the class weights.
# The training, evaluation and pseudo-labelling cells that follow all operate
# on `model`, so the new student must be bound to that name. Previously it was
# only stored in `student_model`, which meant the already-trained teacher was
# fitted again and the student was never trained.
model = create_model(conf)
student_model = model  # keep the original name available as an alias
callbacks = create_callbacks(conf)
class_weights = get_class_weights(ds["train"], conf)

In [None]:
# Train on the combined training set, validating after every epoch.
# NOTE(review): this fits whatever `model` is bound to; the student has to be
# assigned to `model` in the cell above for this to train the student.
start_time = time.time()

fit_options = dict(
    steps_per_epoch=conf["steps"]["train"],
    epochs=conf["num_epochs"],
    validation_data=ds["val"],
    validation_steps=conf["steps"]["val"],
    validation_freq=1,
    class_weight=class_weights,
    callbacks=callbacks,
)
history = model.fit(ds["train"], **fit_options)

# Elapsed wall-clock time: rounded to whole seconds, then reported in minutes.
elapsed_seconds = np.round(time.time() - start_time)
print("Time spent on training: {:.2f} minutes.".format(elapsed_seconds / 60))

### Evaluate the model

In [None]:
# Evaluate `model` after the training run above, given its history, the datasets and the config.
evaluate_model(model, history, ds, conf)

In [None]:
# Fresh accumulators for this pseudo-labelling round.
count = dict(findings=0, total=0)
pseudo = {key: [] for key in ("pred_list", "lab_list", "name_list")}

In [None]:
# Run `model` over the remaining unlabeled data; the predictions are collected in `pseudo`
# and the counters in `count` are returned updated.
pseudo, count = generate_labels(pseudo, count, ds["unlab"], model, conf)

### Inspect the classified images

In [None]:
# Sort in order of highest confidence to lowest
pseudo_sorted = custom_sort(pseudo)

# Inspect the sorted findings. show=False is passed explicitly
# (presumably suppresses inline image display — confirm in utils).
checkout_findings(pseudo_sorted, conf, show=False)

### Resample new findings, convert to tf.tensors and concatenate with original training data, and update unlab_ds

In [None]:
# Select pseudo-labelled samples, with conf["class_limit"] passed as `limit`.
# NOTE: this cell is not idempotent — every run consumes more of ds["unlab"].
datasets_bin, added_samples = resample_and_combine(
    ds,
    conf,
    pseudo,
    pseudo_sorted,
    datasets_bin,
    limit=conf["class_limit"],
)

# Remove the samples that were just added from the unlabeled dataset.
ds["unlab"] = reduce_dataset(ds["unlab"], remove=added_samples)

# Record the bookkeeping for this round and display the newest entry.
num_added = len(added_samples)
sanity, conf = update_sanity(sanity, num_added, datasets_bin, conf)
sanity[-1]

# Step 4: Iterate this algorithm a few times by treating the student as a teacher to relabel the unlabeled data and training a new student

# Second iteration - Part 1
# --- Teacher 2 ---

In [None]:
# Make changes: start the next iteration with a new teacher.
# NOTE(review): conf["dir_name"] is not updated here and keeps its initial value —
# confirm that no helper relies on it.
iteration += 1
dir_name = str(iteration)+"_teacher"
log_dir = "./logs/{}/{}".format(project_time, dir_name)

# Dataset
conf["log_dir"] = log_dir
# Always read the newest sanity entry. This cell used the fixed index sanity[1],
# which only matches the latest entry when the notebook is run exactly once in order.
curr_train_size = sanity[-1]["curr_train_size"]
conf["ds_sizes"]["train"] = curr_train_size
# steps_per_epoch has to follow the training-set size, as in every other
# "Make changes" cell; without this line model.fit keeps using the step count
# computed before the latest pseudo-labels were added.
conf["steps"]["train"] = curr_train_size//conf["batch_size"]
conf["aug_mult"] = 0.2
# Model
conf["model"] = 'EfficientNetB0'
conf["dropout"] = 0.1

In [None]:
# Rebuild the training pipeline from ds["combined_train"] under a name specific to this model.
train_ds_name = 'train_' + dir_name
ds["train"] = prepare_for_training(
    ds=ds["combined_train"],
    ds_name=train_ds_name,
    conf=conf,
    cache=True,
)

In [None]:
# Create a fresh model, callbacks and class weights from the updated config.
# `model` is rebound here, so the previously trained model is no longer reachable under that name.
model = create_model(conf)
callbacks = create_callbacks(conf) 
class_weights = get_class_weights(ds["train"], conf)

In [None]:
# Train the new model on the combined training set, validating after every epoch.
start_time = time.time()

fit_options = dict(
    steps_per_epoch=conf["steps"]["train"],
    epochs=conf["num_epochs"],
    validation_data=ds["val"],
    validation_steps=conf["steps"]["val"],
    validation_freq=1,
    class_weight=class_weights,
    callbacks=callbacks,
)
history = model.fit(ds["train"], **fit_options)

# Elapsed wall-clock time: rounded to whole seconds, then reported in minutes.
elapsed_seconds = np.round(time.time() - start_time)
print("Time spent on training: {:.2f} minutes.".format(elapsed_seconds / 60))

## Evaluate the model

In [None]:
# Evaluate the model trained in the previous cell, given its training history, the datasets and the config.
evaluate_model(model, history, ds, conf)

In [None]:
# Fresh accumulators for this pseudo-labelling round.
count = dict(findings=0, total=0)
pseudo = {key: [] for key in ("pred_list", "lab_list", "name_list")}

In [None]:
# Run `model` over the remaining unlabeled data; the predictions are collected in `pseudo`
# and the counters in `count` are returned updated.
pseudo, count = generate_labels(pseudo, count, ds["unlab"], model, conf)

### Inspect the classified images

In [None]:
# Sort in order of highest confidence to lowest
pseudo_sorted = custom_sort(pseudo)

# Inspect the sorted findings. show=False is passed explicitly
# (presumably suppresses inline image display — confirm in utils).
checkout_findings(pseudo_sorted, conf, show=False)

### Resample new findings, convert to tf.tensors and concatenate with original training data, and update unlab_ds

In [None]:
# Select pseudo-labelled samples, with conf["class_limit"] passed as `limit`.
# NOTE: this cell is not idempotent — every run consumes more of ds["unlab"].
datasets_bin, added_samples = resample_and_combine(
    ds,
    conf,
    pseudo,
    pseudo_sorted,
    datasets_bin,
    limit=conf["class_limit"],
)

# Remove the samples that were just added from the unlabeled dataset.
ds["unlab"] = reduce_dataset(ds["unlab"], remove=added_samples)

# Record the bookkeeping for this round and display the newest entry.
num_added = len(added_samples)
sanity, conf = update_sanity(sanity, num_added, datasets_bin, conf)
sanity[-1]

# Second iteration - Part 2
# --- Student 2 ---

In [None]:
# Switch the run configuration over to the student of this iteration.
dir_name = "{}_student".format(iteration)
log_dir = "./logs/{}/{}".format(project_time, dir_name)

# Dataset: refresh the training-set size and the steps per epoch from the latest sanity entry.
conf["log_dir"] = log_dir
curr_train_size = sanity[-1]["curr_train_size"]
conf["ds_sizes"]["train"] = curr_train_size
conf["steps"]["train"] = curr_train_size // conf["batch_size"]
conf["aug_mult"] = 0.6

# Model
conf["model"] = 'EfficientNetB4'
conf["dropout"] = 0.2

In [None]:
# Rebuild the training pipeline from ds["combined_train"] under a name specific to this model.
train_ds_name = 'train_' + dir_name
ds["train"] = prepare_for_training(
    ds=ds["combined_train"],
    ds_name=train_ds_name,
    conf=conf,
    cache=True,
)

In [None]:
# Create a fresh model, callbacks and class weights from the updated config.
# `model` is rebound here, so the previously trained model is no longer reachable under that name.
model = create_model(conf)
callbacks = create_callbacks(conf) 
class_weights = get_class_weights(ds["train"], conf)

In [None]:
# Train the new model on the combined training set, validating after every epoch.
start_time = time.time()

fit_options = dict(
    steps_per_epoch=conf["steps"]["train"],
    epochs=conf["num_epochs"],
    validation_data=ds["val"],
    validation_steps=conf["steps"]["val"],
    validation_freq=1,
    class_weight=class_weights,
    callbacks=callbacks,
)
history = model.fit(ds["train"], **fit_options)

# Elapsed wall-clock time: rounded to whole seconds, then reported in minutes.
elapsed_seconds = np.round(time.time() - start_time)
print("Time spent on training: {:.2f} minutes.".format(elapsed_seconds / 60))

## Evaluate the model

In [None]:
# Evaluate the model trained in the previous cell, given its training history, the datasets and the config.
evaluate_model(model, history, ds, conf)

In [None]:
# Fresh accumulators for this pseudo-labelling round.
count = dict(findings=0, total=0)
pseudo = {key: [] for key in ("pred_list", "lab_list", "name_list")}

In [None]:
# Run `model` over the remaining unlabeled data; the predictions are collected in `pseudo`
# and the counters in `count` are returned updated.
pseudo, count = generate_labels(pseudo, count, ds["unlab"], model, conf)

### Inspect the classified images

In [None]:
# Sort in order of highest confidence to lowest
pseudo_sorted = custom_sort(pseudo)

# Inspect the sorted findings. show=False is passed explicitly
# (presumably suppresses inline image display — confirm in utils).
checkout_findings(pseudo_sorted, conf, show=False)

### Resample new findings, convert to tf.tensors and concatenate with original training data, and update unlab_ds

In [None]:
# Select pseudo-labelled samples, with conf["class_limit"] passed as `limit`.
# NOTE: this cell is not idempotent — every run consumes more of ds["unlab"].
datasets_bin, added_samples = resample_and_combine(
    ds,
    conf,
    pseudo,
    pseudo_sorted,
    datasets_bin,
    limit=conf["class_limit"],
)

# Remove the samples that were just added from the unlabeled dataset.
ds["unlab"] = reduce_dataset(ds["unlab"], remove=added_samples)

# Record the bookkeeping for this round and display the newest entry.
num_added = len(added_samples)
sanity, conf = update_sanity(sanity, num_added, datasets_bin, conf)
sanity[-1]

# Third iteration - Part 1
# --- Teacher 3 ---

In [None]:
# Start the next iteration: the new teacher gets its own log directory.
# NOTE: `iteration` is incremented here, so this cell must run exactly once per round.
iteration += 1
dir_name = "{}_teacher".format(iteration)
log_dir = "./logs/{}/{}".format(project_time, dir_name)

# Dataset: refresh the training-set size and the steps per epoch from the latest sanity entry.
conf["log_dir"] = log_dir
curr_train_size = sanity[-1]["curr_train_size"]
conf["ds_sizes"]["train"] = curr_train_size
conf["steps"]["train"] = curr_train_size // conf["batch_size"]
conf["aug_mult"] = 0.2

# Model
conf["model"] = 'EfficientNetB0'
conf["dropout"] = 0.1

In [None]:
# Rebuild the training pipeline from ds["combined_train"] under a name specific to this model.
train_ds_name = 'train_' + dir_name
ds["train"] = prepare_for_training(
    ds=ds["combined_train"],
    ds_name=train_ds_name,
    conf=conf,
    cache=True,
)

In [None]:
# Create a fresh model, callbacks and class weights from the updated config.
# `model` is rebound here, so the previously trained model is no longer reachable under that name.
model = create_model(conf)
callbacks = create_callbacks(conf) 
class_weights = get_class_weights(ds["train"], conf)

In [None]:
# Train the new model on the combined training set, validating after every epoch.
start_time = time.time()

fit_options = dict(
    steps_per_epoch=conf["steps"]["train"],
    epochs=conf["num_epochs"],
    validation_data=ds["val"],
    validation_steps=conf["steps"]["val"],
    validation_freq=1,
    class_weight=class_weights,
    callbacks=callbacks,
)
history = model.fit(ds["train"], **fit_options)

# Elapsed wall-clock time: rounded to whole seconds, then reported in minutes.
elapsed_seconds = np.round(time.time() - start_time)
print("Time spent on training: {:.2f} minutes.".format(elapsed_seconds / 60))

## Evaluate the model

In [None]:
# Evaluate the model trained in the previous cell, given its training history, the datasets and the config.
evaluate_model(model, history, ds, conf)

In [None]:
# Fresh accumulators for this pseudo-labelling round.
count = dict(findings=0, total=0)
pseudo = {key: [] for key in ("pred_list", "lab_list", "name_list")}

In [None]:
# Run `model` over the remaining unlabeled data; the predictions are collected in `pseudo`
# and the counters in `count` are returned updated.
pseudo, count = generate_labels(pseudo, count, ds["unlab"], model, conf)

### Inspect the classified images

In [None]:
# Sort in order of highest confidence to lowest
pseudo_sorted = custom_sort(pseudo)

# Inspect the sorted findings. show=False is passed explicitly
# (presumably suppresses inline image display — confirm in utils).
checkout_findings(pseudo_sorted, conf, show=False)

### Resample new findings, convert to tf.tensors and concatenate with original training data, and update unlab_ds

In [None]:
# Select pseudo-labelled samples, with conf["class_limit"] passed as `limit`.
# NOTE: this cell is not idempotent — every run consumes more of ds["unlab"].
datasets_bin, added_samples = resample_and_combine(
    ds,
    conf,
    pseudo,
    pseudo_sorted,
    datasets_bin,
    limit=conf["class_limit"],
)

# Remove the samples that were just added from the unlabeled dataset.
ds["unlab"] = reduce_dataset(ds["unlab"], remove=added_samples)

# Record the bookkeeping for this round and display the newest entry.
num_added = len(added_samples)
sanity, conf = update_sanity(sanity, num_added, datasets_bin, conf)
sanity[-1]

# Third iteration - Part 2
# --- Student 3 ---

In [None]:
# Switch the run configuration over to the student of this iteration.
dir_name = "{}_student".format(iteration)
log_dir = "./logs/{}/{}".format(project_time, dir_name)

# Dataset: refresh the training-set size and the steps per epoch from the latest sanity entry.
conf["log_dir"] = log_dir
curr_train_size = sanity[-1]["curr_train_size"]
conf["ds_sizes"]["train"] = curr_train_size
conf["steps"]["train"] = curr_train_size // conf["batch_size"]
conf["aug_mult"] = 0.6

# Model
conf["model"] = 'EfficientNetB4'
conf["dropout"] = 0.2

In [None]:
# Rebuild the training pipeline from ds["combined_train"] under a name specific to this model.
train_ds_name = 'train_' + dir_name
ds["train"] = prepare_for_training(
    ds=ds["combined_train"],
    ds_name=train_ds_name,
    conf=conf,
    cache=True,
)

In [None]:
# Create a fresh model, callbacks and class weights from the updated config.
# `model` is rebound here, so the previously trained model is no longer reachable under that name.
model = create_model(conf)
callbacks = create_callbacks(conf) 
class_weights = get_class_weights(ds["train"], conf)

In [None]:
# Train the new model on the combined training set, validating after every epoch.
start_time = time.time()

fit_options = dict(
    steps_per_epoch=conf["steps"]["train"],
    epochs=conf["num_epochs"],
    validation_data=ds["val"],
    validation_steps=conf["steps"]["val"],
    validation_freq=1,
    class_weight=class_weights,
    callbacks=callbacks,
)
history = model.fit(ds["train"], **fit_options)

# Elapsed wall-clock time: rounded to whole seconds, then reported in minutes.
elapsed_seconds = np.round(time.time() - start_time)
print("Time spent on training: {:.2f} minutes.".format(elapsed_seconds / 60))

## Evaluate the model

In [None]:
# Evaluate the model trained in the previous cell, given its training history, the datasets and the config.
evaluate_model(model, history, ds, conf)

In [None]:
# Fresh accumulators for this pseudo-labelling round.
count = dict(findings=0, total=0)
pseudo = {key: [] for key in ("pred_list", "lab_list", "name_list")}

In [None]:
# Run `model` over the remaining unlabeled data; the predictions are collected in `pseudo`
# and the counters in `count` are returned updated.
pseudo, count = generate_labels(pseudo, count, ds["unlab"], model, conf)

### Inspect the classified images

In [None]:
# Sort in order of highest confidence to lowest
pseudo_sorted = custom_sort(pseudo)

# Inspect the sorted findings. show=False is passed explicitly
# (presumably suppresses inline image display — confirm in utils).
checkout_findings(pseudo_sorted, conf, show=False)

### Resample new findings, convert to tf.tensors and concatenate with original training data, and update unlab_ds

In [None]:
# Select pseudo-labelled samples, with conf["class_limit"] passed as `limit`.
# NOTE: this cell is not idempotent — every run consumes more of ds["unlab"].
datasets_bin, added_samples = resample_and_combine(
    ds,
    conf,
    pseudo,
    pseudo_sorted,
    datasets_bin,
    limit=conf["class_limit"],
)

# Remove the samples that were just added from the unlabeled dataset.
ds["unlab"] = reduce_dataset(ds["unlab"], remove=added_samples)

# Record the bookkeeping for this round and display the newest entry.
num_added = len(added_samples)
sanity, conf = update_sanity(sanity, num_added, datasets_bin, conf)
sanity[-1]

# 4th iteration - Part 1
# --- Teacher 4 ---

In [None]:
# Start the next iteration: the new teacher gets its own log directory.
# NOTE: `iteration` is incremented here, so this cell must run exactly once per round.
iteration += 1
dir_name = "{}_teacher".format(iteration)
log_dir = "./logs/{}/{}".format(project_time, dir_name)

# Dataset: refresh the training-set size and the steps per epoch from the latest sanity entry.
conf["log_dir"] = log_dir
curr_train_size = sanity[-1]["curr_train_size"]
conf["ds_sizes"]["train"] = curr_train_size
conf["steps"]["train"] = curr_train_size // conf["batch_size"]
conf["aug_mult"] = 0.2

# Model
conf["model"] = 'EfficientNetB0'
conf["dropout"] = 0.1

# From this round on a larger `limit` is passed to resample_and_combine.
conf["class_limit"] = 2500

In [None]:
# Rebuild the training pipeline from ds["combined_train"] under a name specific to this model.
train_ds_name = 'train_' + dir_name
ds["train"] = prepare_for_training(
    ds=ds["combined_train"],
    ds_name=train_ds_name,
    conf=conf,
    cache=True,
)

In [None]:
# Create a fresh model, callbacks and class weights from the updated config.
# `model` is rebound here, so the previously trained model is no longer reachable under that name.
model = create_model(conf)
callbacks = create_callbacks(conf) 
class_weights = get_class_weights(ds["train"], conf)

In [None]:
# Train the new model on the combined training set, validating after every epoch.
start_time = time.time()

fit_options = dict(
    steps_per_epoch=conf["steps"]["train"],
    epochs=conf["num_epochs"],
    validation_data=ds["val"],
    validation_steps=conf["steps"]["val"],
    validation_freq=1,
    class_weight=class_weights,
    callbacks=callbacks,
)
history = model.fit(ds["train"], **fit_options)

# Elapsed wall-clock time: rounded to whole seconds, then reported in minutes.
elapsed_seconds = np.round(time.time() - start_time)
print("Time spent on training: {:.2f} minutes.".format(elapsed_seconds / 60))

## Evaluate the model

In [None]:
# Evaluate the model trained in the previous cell, given its training history, the datasets and the config.
evaluate_model(model, history, ds, conf)

In [None]:
# Fresh accumulators for this pseudo-labelling round.
count = dict(findings=0, total=0)
pseudo = {key: [] for key in ("pred_list", "lab_list", "name_list")}

In [None]:
# Run `model` over the remaining unlabeled data; the predictions are collected in `pseudo`
# and the counters in `count` are returned updated.
pseudo, count = generate_labels(pseudo, count, ds["unlab"], model, conf)

### Inspect the classified images

In [None]:
# Sort in order of highest confidence to lowest
pseudo_sorted = custom_sort(pseudo)

# Inspect the sorted findings. show=False is passed explicitly
# (presumably suppresses inline image display — confirm in utils).
checkout_findings(pseudo_sorted, conf, show=False)

### Resample new findings, convert to tf.tensors and concatenate with original training data, and update unlab_ds

In [None]:
# Select pseudo-labelled samples, with conf["class_limit"] passed as `limit`.
# NOTE: this cell is not idempotent — every run consumes more of ds["unlab"].
datasets_bin, added_samples = resample_and_combine(
    ds,
    conf,
    pseudo,
    pseudo_sorted,
    datasets_bin,
    limit=conf["class_limit"],
)

# Remove the samples that were just added from the unlabeled dataset.
ds["unlab"] = reduce_dataset(ds["unlab"], remove=added_samples)

# Record the bookkeeping for this round and display the newest entry.
num_added = len(added_samples)
sanity, conf = update_sanity(sanity, num_added, datasets_bin, conf)
sanity[-1]

# 4th iteration - Part 2
# --- Student 4 ---

In [None]:
# Switch the run configuration over to the student of this iteration.
dir_name = "{}_student".format(iteration)
log_dir = "./logs/{}/{}".format(project_time, dir_name)

# Dataset: refresh the training-set size and the steps per epoch from the latest sanity entry.
conf["log_dir"] = log_dir
curr_train_size = sanity[-1]["curr_train_size"]
conf["ds_sizes"]["train"] = curr_train_size
conf["steps"]["train"] = curr_train_size // conf["batch_size"]
conf["aug_mult"] = 0.6

# Model
conf["model"] = 'EfficientNetB4'
conf["dropout"] = 0.2

In [None]:
# Rebuild the training pipeline from ds["combined_train"] under a name specific to this model.
train_ds_name = 'train_' + dir_name
ds["train"] = prepare_for_training(
    ds=ds["combined_train"],
    ds_name=train_ds_name,
    conf=conf,
    cache=True,
)

In [None]:
# Create a fresh model, callbacks and class weights from the updated config.
# `model` is rebound here, so the previously trained model is no longer reachable under that name.
model = create_model(conf)
callbacks = create_callbacks(conf) 
class_weights = get_class_weights(ds["train"], conf)

In [None]:
# Train the new model on the combined training set, validating after every epoch.
start_time = time.time()

fit_options = dict(
    steps_per_epoch=conf["steps"]["train"],
    epochs=conf["num_epochs"],
    validation_data=ds["val"],
    validation_steps=conf["steps"]["val"],
    validation_freq=1,
    class_weight=class_weights,
    callbacks=callbacks,
)
history = model.fit(ds["train"], **fit_options)

# Elapsed wall-clock time: rounded to whole seconds, then reported in minutes.
elapsed_seconds = np.round(time.time() - start_time)
print("Time spent on training: {:.2f} minutes.".format(elapsed_seconds / 60))

## Evaluate the model

In [None]:
# Evaluate the model trained in the previous cell, given its training history, the datasets and the config.
evaluate_model(model, history, ds, conf)

In [None]:
# Fresh accumulators for this pseudo-labelling round.
count = dict(findings=0, total=0)
pseudo = {key: [] for key in ("pred_list", "lab_list", "name_list")}

In [None]:
# Run `model` over the remaining unlabeled data; the predictions are collected in `pseudo`
# and the counters in `count` are returned updated.
pseudo, count = generate_labels(pseudo, count, ds["unlab"], model, conf)

### Inspect the classified images

In [None]:
# Sort in order of highest confidence to lowest
pseudo_sorted = custom_sort(pseudo)

# Inspect the sorted findings. show=False is passed explicitly
# (presumably suppresses inline image display — confirm in utils).
checkout_findings(pseudo_sorted, conf, show=False)

### Resample new findings, convert to tf.tensors and concatenate with original training data, and update unlab_ds

In [None]:
# Select pseudo-labelled samples, with conf["class_limit"] passed as `limit`.
# NOTE: this cell is not idempotent — every run consumes more of ds["unlab"].
datasets_bin, added_samples = resample_and_combine(
    ds,
    conf,
    pseudo,
    pseudo_sorted,
    datasets_bin,
    limit=conf["class_limit"],
)

# Remove the samples that were just added from the unlabeled dataset.
ds["unlab"] = reduce_dataset(ds["unlab"], remove=added_samples)

# Record the bookkeeping for this round and display the newest entry.
num_added = len(added_samples)
sanity, conf = update_sanity(sanity, num_added, datasets_bin, conf)
sanity[-1]