# import data

In [1]:
from __future__ import annotations
from pathlib import Path
import yaml
from dataclasses import dataclass, field
from datetime import datetime as dt
import hashlib

from local.constants import WORKSPACE_ROOT

In [2]:
def str_hash(s):
    """Return the SHA-256 digest of the string `s` as a non-negative integer.

    The text is encoded as UTF-8 with the "replace" error handler before it
    is hashed, and the hexadecimal digest is parsed as a base-16 integer.
    """
    encoded = s.encode("utf-8", "replace")
    hex_digest = hashlib.sha256(encoded).hexdigest()
    return int(hex_digest, base=16)

@dataclass
class DataType:
    """A named data type described by a mapping of string properties.

    Two instances are equal when their `name` and `properties` are equal.
    The hash is derived from `properties` only and is cached on first use,
    so `properties` should not be mutated once the object has been hashed.
    """
    name: str
    properties: dict[str, str]
    # Back-reference to the owning library. It is excluded from comparison:
    # the library holds this object in turn, so comparing through it would
    # recurse without end for types that belong to two distinct libraries.
    library: DataTypeLibrary = field(compare=False)
    # Lazily computed hash cache. It is excluded from comparison so that
    # hashing one of two otherwise equal objects does not make them unequal.
    _hash: int | None = field(default=None, compare=False)

    def __hash__(self) -> int:
        """Return a hash derived from the "key=value" property strings."""
        if self._hash is None:
            # Sets have no defined iteration order, so the entries are sorted
            # to make the digest independent of that order.
            self._hash = str_hash(''.join(sorted(self.AsSet())))
        return self._hash

    @classmethod
    def SetFromDict(cls, raw: dict[str, str]):
        """Return the set of "key=value" strings for the items of `raw`."""
        return {f"{k}={v}" for k, v in raw.items()}

    def AsSet(self):
        """Return this type's properties as a set of "key=value" strings."""
        return self.SetFromDict(self.properties)
    
@dataclass
class DataTypeLibrary:
    """A collection of `DataType` definitions loaded from a YAML file.

    Supports `lib[name]` lookup and `name in lib` membership tests against
    the `types` mapping.
    """
    source: Path    # path the library was loaded from
    # NOTE(review): annotated as str, but YAML may parse a bare value such as
    # 0.3 into a float; confirm which one is intended.
    schema: str
    ontology: dict
    types: dict[str, DataType] = field(default_factory=dict)

    def __getitem__(self, key: str) -> DataType:
        """Return the type registered under `key`; raises KeyError if absent."""
        return self.types[key]

    def __contains__(self, key: str) -> bool:
        """Return True when a type named `key` is registered."""
        return key in self.types

    # `__in__` is not a Python special method, so the `in` operator never
    # called it. The name is kept as an alias for any explicit callers.
    __in__ = __contains__

    @classmethod
    def Load(cls, path: Path) -> DataTypeLibrary:
        """Build a library from the YAML file at `path`.

        The document must provide the keys "schema", "ontology" and "types";
        "types" maps each type name to its properties. A missing key raises
        KeyError, while an empty "types" section yields a library with no types.
        """
        with open(path) as f:
            d = yaml.safe_load(f)
        lib = cls(path, d["schema"], d["ontology"])
        # The library is created first so each type can reference its owner.
        # An empty "types:" section parses to None, hence the `or {}`.
        lib.types = {
            name: DataType(name=name, properties=props, library=lib)
            for name, props in (d["types"] or {}).items()
        }
        return lib

@dataclass
class DataInstance:
    """A concrete piece of data: a file path paired with its `DataType`.

    Equality compares `source` and `type`. The hash combines the resolved
    source path with the type's properties and is cached on first use.
    """
    source: Path
    type: DataType
    # Lazily computed hash cache. It is excluded from comparison so that
    # hashing one of two otherwise equal instances does not make them unequal.
    _hash: int | None = field(default=None, compare=False)

    def __hash__(self) -> int:
        """Return a hash of the resolved source path and the type's properties."""
        if self._hash is None:
            # Sets have no defined iteration order, so the property strings
            # are sorted to make the digest independent of that order.
            properties = ''.join(sorted(self.type.AsSet()))
            self._hash = str_hash(str(self.source.resolve()) + properties)
        return self._hash

    @classmethod
    def Register(cls, source: Path, type: DataType):
        """Create an instance for the data at `source` with the given `type`."""
        return cls(source, type)

    def Pack(self):
        """Return a plain-dict description of this instance.

        The result holds the source path as a string, the type name, and a
        copy of the type's properties, so changes to the returned dict do not
        reach the type itself.
        """
        return {
            "source": str(self.source),
            "type": self.type.name,
            "properties": dict(self.type.properties),
        }

@dataclass
class DataInstanceLibrary:
    """A described set of `DataInstance` objects tied to one type library.

    `manifest` maps a caller-chosen key to each instance. Both timestamps
    default to the moment of construction; `Dump` refreshes `time_modified`.
    """
    description: str
    types_library: DataTypeLibrary
    manifest: dict[str, DataInstance] = field(default_factory=dict)
    time_created: dt = field(default_factory=lambda: dt.now())
    time_modified: dt = field(default_factory=lambda: dt.now())

    @classmethod
    def Load(cls, path: Path):
        """Rebuild a library from the YAML file at `path`.

        The "types_library" entry is the path of the type library file, which
        is loaded first. Each manifest entry must provide "source" and "type";
        "type" is looked up by name in that type library. Other keys of a
        manifest entry are ignored.

        Raises:
            AssertionError: the file contains a top-level key that is not a
                field of this class.
            KeyError: "types_library" is missing, or a manifest entry names
                an unknown type or lacks "source"/"type".
        """
        with open(path) as f:
            raw = yaml.safe_load(f)

        TYPE_LIB = "types_library"
        MANIFEST = "manifest"
        types_library = DataTypeLibrary.Load(Path(raw[TYPE_LIB]))

        known_fields = set(cls.__annotations__)
        # Arguments are collected in a fresh dict so the parsed document is
        # not modified while it is being iterated.
        kwargs = {}
        for key, value in raw.items():
            if key not in known_fields:
                # Raised explicitly so the check also runs under `python -O`,
                # with the same exception type as the former `assert`.
                raise AssertionError(f"unexpected field [{key}]")
            if key == TYPE_LIB:
                value = types_library
            elif key == MANIFEST:
                # An empty "manifest:" section parses to None.
                value = {
                    name: DataInstance(
                        source=Path(entry["source"]),
                        type=types_library[entry["type"]],
                    )
                    for name, entry in (value or {}).items()
                }
            kwargs[key] = value
        return cls(**kwargs)

    def Dump(self, path: Path):
        """Write this library to `path` as YAML and refresh `time_modified`.

        The type library is stored as the path it was loaded from, and each
        manifest entry is stored in its `Pack()` form. Attributes whose name
        starts with "_" and callable attributes are skipped.
        """
        self.time_modified = dt.now()
        payload = {}
        for k, v in self.__dict__.items():
            if k.startswith("_") or callable(v):
                continue
            if k == "types_library":
                v = str(v.source)
            elif k == "manifest":
                v = {name: instance.Pack() for name, instance in v.items()}
            payload[k] = v
        # Serialize fully before opening the target: opening with "w"
        # truncates it, so a serialization failure must happen first to
        # leave an existing file intact.
        text = yaml.safe_dump(payload, indent=4)
        with open(path, "w") as f:
            f.write(text)

# Assemble a small test workspace: load the type library, register two input
# files against types from it, and collect them in an instance library.
lib = DataTypeLibrary.Load(WORKSPACE_ROOT/"main/local_mock/prototypes/metagenomics.yml")
given_contigs = DataInstance.Register(
    source=WORKSPACE_ROOT/"scratch/test_ws/data/local/example.fna",
    type=lib["contigs"],
)
given_ref = DataInstance.Register(
    source=WORKSPACE_ROOT/"scratch/test_ws/data/local/uniprot_sprot.dmnd",
    type=lib["diamond_protein_reference"],
)
ilib = DataInstanceLibrary(
    "test workspace",
    lib,
    {
        "contigs": given_contigs,
        "diamond_reference.uniprot_sprot": given_ref,
    },
)
ilib_path = Path("./cache/test.yml")
# Round-trip check, currently disabled: dump, reload, and dump again.
# ilib.Dump(ilib_path)
# ilib2 = DataInstanceLibrary.Load(ilib_path)
# ilib2.Dump(ilib_path.with_name("test2.yml"))

# generate workflow

In [3]:
from metasmith.solver import Namespace, Endpoint, Transform, Solve
from metasmith.models import TransformInstanceLibrary

In [4]:
# Load the transform library from ./transforms/; the call returns a pair that
# is unpacked into `errs` and `trlib`.
# NOTE(review): `errs` presumably lists the files that could not be loaded as
# transforms; confirm against TransformInstanceLibrary.Load.
errs, trlib = TransformInstanceLibrary.Load(Path("./transforms/"))
# Bare expression so the notebook shows `errs` as the cell output.
errs

[PosixPath('simple_1/_common.py')]

In [5]:
# Display the loaded transforms; in the recorded output the manifest is keyed
# by the transform script's path.
trlib.manifest

{PosixPath('simple_1/diamond.py'): TransformInstance(container='docker://bschiffthaler/diamond:2.0.14', protocol=<function protocol at 0x7f9c080d44a0>, input_signature={DataType(name='diamond_protein_reference', properties={'format': 'Binary format', 'data': 'database reference', 'reference_type': 'diamond db'}, library=DataTypeLibrary(path=PosixPath('/home/tony/workspace/tools/Metasmith/main/local_mock/prototypes/metagenomics.yml'), schema=0.3, ontology={'doi': 'https://doi.org/10.1093/bioinformatics/btt113', 'name': 'EDAM', 'version': 1.25, 'strict': False}, types={'contigs': DataType(name='contigs', properties={'format': 'FASTA', 'data': 'DNA sequence'}, library=...), 'orfs_gbk': DataType(name='orfs_gbk', properties={'format': 'GenBank', 'data': 'Protein features'}, library=...), 'orfs_faa': DataType(name='orfs_faa', properties={'format': 'FASTA', 'data': 'Amino acid sequence'}, library=...), 'diamond_protein_reference': ..., 'fastal_protein_reference': DataType(name='fastal_protein

In [6]:
# Take the input signature of the diamond transform and show the properties of
# one of its members.
# NOTE(review): the DataType class defined in this notebook exposes AsSet(),
# not AsProperties(); the objects here come from the metasmith package, so
# confirm that AsProperties() is the intended counterpart.
x = trlib.manifest[Path("simple_1/diamond.py")].input_signature
next(iter(x)).AsProperties()
# trlib.manifest[Path("dupe_test/diamond.py")].protocol(0)

{'data=database reference',
 'format=Binary format',
 'reference_type=diamond db'}

In [6]:
# Reload the instance library from the cached YAML file, replacing the `ilib`
# built earlier in the notebook, then display its manifest.
ilib_path = Path("./cache/test.yml")
ilib = DataInstanceLibrary.Load(ilib_path)
ilib.manifest

{'contigs': DataInstance(source=PosixPath('/home/tony/workspace/tools/Metasmith/scratch/test_ws/data/local/example.fna'), type=DataType(name='contigs', properties={'format': 'FASTA', 'data': 'DNA sequence'}, library=DataTypeLibrary(source=PosixPath('/home/tony/workspace/tools/Metasmith/main/local_mock/prototypes/metagenomics.yml'), schema=0.3, ontology={'doi': 'https://doi.org/10.1093/bioinformatics/btt113', 'name': 'EDAM', 'version': 1.25, 'strict': False}, types={'contigs': ..., 'orfs_gbk': DataType(name='orfs_gbk', properties={'format': 'GenBank', 'data': 'Protein features'}, library=..., _hash=None), 'orfs_faa': DataType(name='orfs_faa', properties={'format': 'FASTA', 'data': 'Amino acid sequence'}, library=..., _hash=None), 'diamond_protein_reference': DataType(name='diamond_protein_reference', properties={'format': 'Binary format', 'data': 'database reference', 'reference_type': 'diamond db'}, library=..., _hash=None), 'fastal_protein_reference': DataType(name='fastal_protein_ref

In [None]:
# Create a solver namespace.
NS = Namespace()

# Map each manifest key to the "key=value" property set of its instance's type.
given = {k:v.type.AsSet() for k, v in ilib.manifest.items()}

# trlib = ...