In [1]:
import yaml
import re
import os
from pathlib import Path
from dataclasses import dataclass
from miniutils.progress_bar import parallel_progbar

In [2]:
@dataclass
class Variable:
    name: str

@dataclass
class Reference:
    ref_node_name: str
    output_name: str
        
def var_constructor(loader, node):
    return Variable(name=node.value)

def ref_constructor(loader, node):
    ref_node_name_and_output_name = node.value.split(".")
    assert len(ref_node_name_and_output_name) == 2
    ref_node_name, output_name = ref_node_name_and_output_name
    return Reference(ref_node_name=ref_node_name, output_name=output_name)

class PyTorchToolboxLoader(yaml.SafeLoader):
    pass


# This tells the loader that when it sees "!path" it will pass the value proceeding the !path value into the path constructor
PyTorchToolboxLoader.add_constructor('!Var', var_constructor)
PyTorchToolboxLoader.add_constructor('!Ref', ref_constructor)

In [3]:
config = yaml.load(Path("test.yml").open("r"), Loader=PyTorchToolboxLoader)

In [4]:
def replace_variables(resources, variables):
    if isinstance(resources, dict):
        for name, resource in resources.items():
            resources[name] = replace_variables(resource, variables)
    elif isinstance(resources, list):
        for i, resource in enumerate(resources):
            resources[i] = replace_variables(resource, variables)
    elif isinstance(resources, Variable):
        resources = variables[resources.name]
    else:
        return resources
    return resources

In [5]:
replaced_resources = replace_variables(config['Resources'], config['Variables'])

In [6]:
replaced_resources

{'TestSingleVariableReplacement': {'single_variable': 'hello'},
 'TestListVariableReplacement': {'list_variable': ['foo', 'bar']},
 'TestVariableInListReplacement': ['hello', ['foo', 'bar']],
 'TestVariableInListOfDictionaryReplacement': [{'dict_1': 'hello'},
  {'dict_2': ['foo', 'bar']}],
 'MockReference': {'output': ['some_reference']},
 'TestFindReference': {'ref_var': Reference(ref_node_name='MockReference', output_name='some_reference')},
 'TestFindReferenceInList': [Reference(ref_node_name='MockReference', output_name='some_reference')],
 'TestFindReferenceInListOfDictionary': {'ref_var_in_list_of_dictionary': [{'key_1': Reference(ref_node_name='MockReference', output_name='some_reference')}]}}

In [7]:
def find_references(resource):
    references = []
    if isinstance(resource, dict):
        for _, value in resource.items():
            references.extend(find_references(value))
    elif isinstance(resource, list):
        for value in resource:
            references.extend(find_references(value))
    elif isinstance(resource, Reference):
        references.append(resource)
    else:
        pass
    return references

In [8]:
for name, resource in config['Resources'].items():
    print(name)
    print(find_references(resource))

TestSingleVariableReplacement
[]
TestListVariableReplacement
[]
TestVariableInListReplacement
[]
TestVariableInListOfDictionaryReplacement
[]
MockReference
[]
TestFindReference
[Reference(ref_node_name='MockReference', output_name='some_reference')]
TestFindReferenceInList
[Reference(ref_node_name='MockReference', output_name='some_reference')]
TestFindReferenceInListOfDictionary
[Reference(ref_node_name='MockReference', output_name='some_reference')]


#### Try on original configuration file

In [265]:
from collections import Counter
from typing import Optional, Dict, Any, Iterator, Iterable, Sequence, Union, Callable, Tuple, List, Any, Collection

import pandas as pd
import numpy as np

def listify(p=None, q=None):
    "Make `p` same length as `q`"
    if p is None:
        p = []
    elif isinstance(p, str):
        p = [p]
    elif not isinstance(p, Iterable):
        p = [p]
    n = q if type(q) == int else len(p) if q is None else len(q)
    if len(p) == 1: p = p * n
    assert len(p) == n, f'List len mismatch ({len(p)} vs {n})'
    return list(p)

def load_training_data(root_image_paths, root_label_paths, use_n_samples):
    train_df = load_training_data_df(root_image_paths, root_label_paths, use_n_samples)
    labels = train_df['Target'].values
    labels_one_hot = make_one_hot(labels, n_classes=28)
    return np.array(train_df['ImagePath'].values), np.array(labels), np.array(labels_one_hot)

def load_training_data_df(root_image_paths, root_label_paths, use_n_samples):
    labels_df = load_training_labels(root_label_paths)
    labels_df.sort_values(["Id"], ascending=[True], inplace=True)
    labels_df_sorted_by_id = labels_df

    # As some duplicate images were removed, we only use the images that have labels
    image_paths = load_training_images(root_image_paths)
    image_paths_with_labels = filter_image_paths_with_labels(image_paths, labels_df)

    # Sort by ID so that the labels and the image matches up
    train_df = labels_df_sorted_by_id
    image_paths_sorted_by_id = sorted(image_paths_with_labels, key=lambda path: path.stem)
    train_df["ImagePath"] = image_paths_sorted_by_id

    if use_n_samples:
        train_df = train_df.sample(use_n_samples)
    return train_df


def load_training_labels(training_labels_path):
    labels_df = pd.read_csv(training_labels_path)
    labels_df['Target'] = [[int(i) for i in s.split()] for s in labels_df['Target']]
    labels_df['TargetTuple'] = [tuple(t) for t in labels_df['Target']]
    return labels_df

def load_training_images(training_images_path):
    image_paths = []
    for p in listify(training_images_path):
        image_paths.extend(Path(p).glob("*"))
    return image_paths

def filter_image_paths_with_labels(image_paths, labels_df):
    # We use a Counter to filter in O(n) instead of O(n^2) time
    image_id_with_labels_lookup = Counter(labels_df['Id'])
    image_paths_used_for_training = [Path(p) for p in image_paths if
                                     image_id_with_labels_lookup.get(Path(p).stem) is not None]
    return np.array(image_paths_used_for_training)

def add_number_of_labels_column(train_df):
    train_df.sort_values(["TargetTuple"], ascending=[True])
    label_counts = Counter([tuple(l) for l in train_df['Target'].values])
    train_df['Count'] = [label_counts[tuple(l)] for l in train_df['Target']]
    return train_df


def add_one_hot_labels_index_column(train_df):
    target_tuple_to_label = {v: k for k, v in enumerate(train_df['TargetTuple'].unique())}
    train_df['OneHotLabelIndex'] = train_df['TargetTuple'].map(lambda x: target_tuple_to_label[x])
    return train_df


def load_testing_data(root_image_paths, use_n_samples=None):
    X = sorted(list(Path(root_image_paths).glob("*")), key=lambda p: p.stem)
    if use_n_samples is not None:
        X = X[:use_n_samples]
    return np.array(X)


def make_one_hot(labels, n_classes=28):
    one_hots = []
    for label in labels:
        one_hot = np.zeros(n_classes)
        for label_idx in label:
            one_hot[label_idx] = 1
        one_hots.append(one_hot.astype(np.float32))
    return one_hots

def calculate_mean_and_std_for_dataset(data_paths, some_var):
    print(some_var)
    flattened_data_paths = list(chain(*data_paths))
    means, stds = list(zip(*parallel_progbar(calculate_mean_and_std, flattened_data_paths)))
    mean = np.stack(means).mean(axis=0)
    std = np.stack(stds).mean(axis=0)
    logging.info(f"Mean of dataset is: {mean}")
    logging.info(f"Standard deviation of dataset is: {std}")
    return mean, std

lookup = {
    "load_testing_data": load_testing_data,
    "load_training_data": load_training_data,
    "calculate_mean_and_std_for_dataset": calculate_mean_and_std_for_dataset
}

In [328]:
config = yaml.load(Path("densenet121_two_input_fc_with_tta_template.yml").open("r"), Loader=PyTorchToolboxLoader)

In [329]:
config["Resources"] = replace_variables(config["Resources"], config["Variables"])

In [349]:
from functools import partial

class Node:
    def __init__(self, name, references, pointer, partial, arguments, output_names):
        self.name = name
        self.references = references
        self.pointer = pointer
        self.arguments = arguments
        self.output_names = output_names
        self.partial = partial
        self.reference_replaced_arguments = None
        self.output = None
        
    def create_output(self):
        if self.partial:
            assert len(self.output_names) == 1, "If the output of node: {self.name} is partial, then there should be one output, {len(self.output_names)} outputs are found"
            self.output = {self.output_names[0]: partial(self.pointer, **self.reference_replaced_arguments)}
        else:
#             print(self.name)
#             print("output names")
#             print(self.output_names)
#             print("output")
            print(self.pointer(**self.reference_replaced_arguments))
            self.output = {output_name: output_value for output_name, output_value in zip(self.output_names, self.pointer(**self.reference_replaced_arguments))}

In [350]:
flattened_resources = {}
for resources in config["Resources"].values():
    flattened_resources = {**flattened_resources, **resources}

In [351]:
import networkx as nx
pipeline_graph = nx.DiGraph()

def load_properties_with_default_values(properties):
    assert properties["pointer"] in lookup, f"There is no lookup called: {pointer}"
    return {
        "pointer": lookup[properties["pointer"]],
        "partial": properties.get("partial", False),
        "arguments": properties.get("arguments", {}),
        "output_names": properties.get("output_names")
    }

for name, resource in flattened_resources.items():
    references = find_references(resource)
    properties = load_properties_with_default_values(resource["properties"])
    node = Node(name=name, references=references, **properties)
    pipeline_graph.add_node(name, node=node)

In [352]:
list(pipeline_graph.nodes(data=True))[0][1]['node'].pointer

<function calculate_mean_and_std_for_dataset at 0x7f7b9966f378>

In [353]:
for name, node_wrapper in pipeline_graph.nodes(data=True):
    node = node_wrapper["node"]
    for reference in node.references:
        referenced_node_name = reference.ref_node_name
        assert referenced_node_name in pipeline_graph.nodes, f"The reference: {referenced_node_name} in node: {node.name} does not exist"
        pipeline_graph.add_edge(referenced_node_name, name)

In [354]:
for node in nx.algorithms.dag.topological_sort(pipeline_graph):
    print(node)

LoadTrainingData
LoadTestingData
CalculateMeanAndStdForDataset


In [359]:
from copy import deepcopy
def run_node(graph, node):
    replace_arguments(graph, node)
    node.create_output()

def replace_arguments(graph, node):
    try:
        arguments = node.arguments
        if arguments is not None:
            reference_replaced_arguments = replace_references(graph, deepcopy(arguments))
            print(reference_replaced_arguments)
            node.reference_replaced_arguments = reference_replaced_arguments
    except AttributeError:
        pass   


def replace_references(graph, arguments, depth=0):
    if isinstance(arguments, dict):
        for name, argument in arguments.items():
            arguments[name] = replace_references(graph, argument, depth+1)
    elif isinstance(arguments, list):
        for i, argument in enumerate(arguments):
            arguments[i] = replace_references(graph, argument, depth+1)
    elif isinstance(arguments, Reference):
        # reassign name it make the intent clearer
        reference = arguments
        ref_node = graph.nodes(data=True)[reference.ref_node_name]["node"]
        ref_node_outputs = ref_node.output
        assert ref_node_outputs is not None, f"Node: {reference.ref_node_name} has no output"
        assert reference.output_name in ref_node_outputs, f"Node: {reference_node_name} has no output named {reference.output_name}"
        arguments = ref_node_outputs[reference.output_name]
    else:
        return arguments
    return arguments

    

In [360]:
for node_name in nx.algorithms.dag.topological_sort(pipeline_graph):
    node = pipeline_graph.nodes(data=True)[node_name]["node"]
    run_node(pipeline_graph, node)

{'use_n_samples': 500, 'root_image_paths': ['/home/kevin/Documents/Kaggle/human-protein-image-classification/data/train_combined', '/home/kevin/Documents/Kaggle/human-protein-image-classification/data/train_combined_HPAv18'], 'root_label_paths': '/home/kevin/Documents/Kaggle/human-protein-image-classification/data/train_all_no_dupes.csv'}
LoadTrainingData
output names
['train_X', 'train_y', 'train_y_one_hot']
output
(array([PosixPath('/home/kevin/Documents/Kaggle/human-protein-image-classification/data/train_combined_HPAv18/2831_92_B6_2.npy'),
       PosixPath('/home/kevin/Documents/Kaggle/human-protein-image-classification/data/train_combined_HPAv18/19711_222_H1_2.npy'),
       PosixPath('/home/kevin/Documents/Kaggle/human-protein-image-classification/data/train_combined_HPAv18/27451_1207_F8_1.npy'),
       PosixPath('/home/kevin/Documents/Kaggle/human-protein-image-classification/data/train_combined/0f779cd4-bbb7-11e8-b2ba-ac1f6b6435d0.npy'),
       PosixPath('/home/kevin/Documents/K

In [357]:
pipeline_graph.nodes(data=True)["CalculateMeanAndStdForDataset"]["node"].arguments

{'data_paths': [Reference(ref_node_name='LoadTrainingData', output_name='train_X'), Reference(ref_node_name='LoadTestingData', output_name='test_X')]}

In [358]:
pipeline_graph.nodes(data=True)["CalculateMeanAndStdForDataset"]["node"].reference_replaced_arguments

{'data_paths': [array([PosixPath('/home/kevin/Documents/Kaggle/human-protein-image-classification/data/train_combined/381e246e-bbb2-11e8-b2ba-ac1f6b6435d0.npy'),
       PosixPath('/home/kevin/Documents/Kaggle/human-protein-image-classification/data/train_combined_HPAv18/2540_50_E12_2.npy'),
       PosixPath('/home/kevin/Documents/Kaggle/human-protein-image-classification/data/train_combined/147af4c8-bb9a-11e8-b2b9-ac1f6b6435d0.npy'),
       PosixPath('/home/kevin/Documents/Kaggle/human-protein-image-classification/data/train_combined/3f46eb96-bba2-11e8-b2b9-ac1f6b6435d0.npy'),
       PosixPath('/home/kevin/Documents/Kaggle/human-protein-image-classification/data/train_combined/56d92c4c-bbca-11e8-b2bc-ac1f6b6435d0.npy'),
       PosixPath('/home/kevin/Documents/Kaggle/human-protein-image-classification/data/train_combined/b44b8110-bb9b-11e8-b2b9-ac1f6b6435d0.npy'),
       PosixPath('/home/kevin/Documents/Kaggle/human-protein-image-classification/data/train_combined_HPAv18/42795_1034_D10_