In [1]:
%run "00_path_config.ipynb"

import pathlib as pl
import hashlib as hl
import collections as col
import re

# Read type -> substrings that identify it in a (lower-cased) data path.
# Entry order is significant: DataFile.determine_read_type() walks the
# mapping front to back and stops at the first read type with a matching hint.
TYPE_KEYWORDS = col.OrderedDict(
    [
        ("hifi", ["hifi", "ccs", "q20"]),
        ("ont", ["nanopore", "ont"]),
        ("illwgs", ["short", "illumina"]),
    ]
)

# Sample identifier: prefix NA, HG or GM followed by exactly five digits.
RE_SAMPLE = re.compile("(NA|HG|GM)[0-9]{5}")

# Date stamp embedded in file or folder names; three alternatives, tried in
# this order at each position:
#   1. "20" + two digits + "_"
#   2. "20" + six digits
#   3. six digits + "_" where the first digit is 1 or 2 and the second is
#      one of 0, 1, 2, 3, 8, 9
# NOTE(review): presumably YYYY_, YYYYMMDD and YYMMDD_ respectively - confirm.
RE_DATE = re.compile(
    "(20[0-9]{2}_)"
    "|(20[0-9]{6})"
    "|([12][8920123][0-9]{4}_)"
)

class DataFile:
    """Metadata record for one data file, derived from its path.

    On construction the file's local and remote absolute paths, its path
    relative to ``REMOTE_DATA_ROOT``, the sample name, the read type and
    the year stamps of the file and of its source folder are determined.

    ``PATH_PREFIX`` and ``REMOTE_DATA_ROOT`` are not defined in this
    section of the file.
    NOTE(review): presumably they come from the ``%run "00_path_config.ipynb"``
    cell at the top - confirm.

    Instances order, compare and hash by ``str(data_rel_path)``.
    """

    # Number of parent folders inspected (closest first) when the file name
    # itself does not contain a sample identifier.
    _SAMPLE_SEARCH_DEPTH = 3

    # Placeholder year used when no date stamp can be found.
    _UNKNOWN_YEAR = "20XX"

    # Value reported in table rows for files without a matched entry.
    _UNMATCHED = 999999

    def __init__(self, abs_file_path, source_folder):
        """Collect all path-derived metadata for *abs_file_path*.

        Args:
            abs_file_path: Absolute path (str or path object) located below
                either ``PATH_PREFIX.local`` or ``PATH_PREFIX.remote``.
            source_folder: Folder the file was collected from; its string
                form is hashed (MD5) into ``file_group`` and searched for a
                date stamp to set ``group_date``.

        Raises:
            ValueError: If the path is below neither prefix, or if the
                sample name or the read type cannot be determined.
        """
        abs_file_path = pl.Path(abs_file_path)

        self.local_abs_path = self._get_abs_path(abs_file_path, "local")
        self.remote_abs_path = self._get_abs_path(abs_file_path, "remote")
        self.file_name = abs_file_path.name

        # path for FOFN files
        self.data_rel_path = self._get_data_rel_path(self.remote_abs_path)

        self.sample = self.determine_sample()
        self.read_type = self.determine_read_type()
        self.file_date = self.determine_file_date()
        # All files from the same source folder share this identifier.
        self.file_group = hl.md5(str(source_folder).encode("utf-8")).hexdigest()
        self.group_date = self.determine_group_data(source_folder)
        self.matched_entry = None
        self.curator = "unknown"
        self.sample_batch = -1

    def __lt__(self, other):
        """Order by the string form of ``data_rel_path``."""
        if not isinstance(other, DataFile):
            return NotImplemented
        return str(self.data_rel_path) < str(other.data_rel_path)

    def __eq__(self, other):
        """Two records are equal when their ``data_rel_path`` strings match."""
        if not isinstance(other, DataFile):
            # Let Python fall back to its default handling instead of
            # failing with AttributeError on foreign types (e.g. None).
            return NotImplemented
        return str(self.data_rel_path) == str(other.data_rel_path)

    def __hash__(self):
        """Hash consistently with ``__eq__``.

        Defining ``__eq__`` alone would make instances unhashable, i.e.
        unusable in sets and as dict keys.
        """
        return hash(str(self.data_rel_path))

    def __repr__(self):
        return f"{self.sample} - {self.read_type} - {self.file_date}\n{self.data_rel_path}\n"

    def _get_abs_path(self, file_path, which):
        """Return *file_path* expressed below the requested path prefix.

        Args:
            file_path: Path object below ``PATH_PREFIX.local`` or
                ``PATH_PREFIX.remote``.
            which: ``"local"`` or ``"remote"`` - the prefix the returned
                path must carry.

        Raises:
            ValueError: If *which* is not one of the two accepted values or
                if *file_path* is below neither prefix.
        """
        # Explicit check instead of ``assert`` so that the validation is
        # still performed when Python runs with -O.
        if which not in ("local", "remote"):
            raise ValueError(f"Unknown path location requested: {which}")

        prefixes = {
            "local": PATH_PREFIX.local,
            "remote": PATH_PREFIX.remote,
        }
        # The local prefix is tested first.
        for location, prefix in prefixes.items():
            if file_path.is_relative_to(prefix):
                if location == which:
                    return file_path
                return self.switch_path_prefix(file_path, prefix, prefixes[which])

        raise ValueError(f"Not an abs path: {file_path}")

    def _get_data_rel_path(self, abs_path):
        """Return *abs_path* relative to ``REMOTE_DATA_ROOT``."""
        return abs_path.relative_to(REMOTE_DATA_ROOT)

    def switch_path_prefix(self, file_path, old_prefix, new_prefix):
        """Re-root *file_path* from *old_prefix* to *new_prefix*.

        Only the leading *old_prefix* is exchanged; identical folder names
        further down the path are left untouched (a plain string
        replacement would remove those as well).

        Args:
            file_path: Path located below *old_prefix*.
            old_prefix: Prefix to remove.
            new_prefix: Path object to which the remainder is appended.

        Returns:
            ``new_prefix`` joined with the part of *file_path* that follows
            *old_prefix*.

        Raises:
            ValueError: If *file_path* is not located below *old_prefix*.
        """
        remainder = pl.Path(file_path).relative_to(old_prefix)
        return new_prefix.joinpath(remainder)

    def determine_sample(self):
        """Extract the sample identifier from the file name or its folders.

        The file name is searched first with ``RE_SAMPLE``; if that fails,
        the names of the closest parent folders of ``data_rel_path`` are
        searched, nearest folder first.

        Raises:
            ValueError: If no sample identifier is found.
        """
        match = RE_SAMPLE.search(self.file_name)
        if match is None:
            folder = self.data_rel_path.parent
            for _ in range(self._SAMPLE_SEARCH_DEPTH):
                match = RE_SAMPLE.search(folder.name)
                if match is not None:
                    break
                folder = folder.parent

        if match is None:
            raise ValueError(f"Cannot extract sample name: {self.data_rel_path}")
        return match.group(0)

    def determine_read_type(self):
        """Return the first read type whose keyword occurs in the path.

        Matching is a plain, case-insensitive substring test against the
        whole relative path, so a short hint such as "ont" also matches
        inside longer words. Read types are tried in the order given by
        ``TYPE_KEYWORDS``.

        Raises:
            ValueError: If no keyword of any read type occurs in the path.
        """
        # Lower-case the path once instead of once per keyword.
        lowered_path = str(self.data_rel_path).lower()
        for read_type, type_hints in TYPE_KEYWORDS.items():
            if any(hint in lowered_path for hint in type_hints):
                return read_type
        raise ValueError(f"Cannot determine read type: {self.data_rel_path}")

    @staticmethod
    def _extract_year(text):
        """Return the four-character year of the first date stamp in *text*.

        Returns "20XX" when ``RE_DATE`` does not match.
        """
        mobj = RE_DATE.search(text)
        if mobj is None:
            return DataFile._UNKNOWN_YEAR
        date_spec = mobj.group(0).strip("_")
        if len(date_spec) == 6:
            # Six-digit stamps carry no century; "20" is prepended.
            date_spec = "20" + date_spec
        return date_spec[:4]

    def determine_file_date(self):
        """Return the year found in the file name, or "20XX"."""
        return self._extract_year(self.file_name)

    def determine_group_data(self, source_folder):
        """Return the year found in *source_folder*, or "20XX"."""
        return self._extract_year(str(source_folder))

    def set_matched_entry(self, entry_index):
        """Record the entry this file was matched to (allowed only once).

        Raises:
            ValueError: If a matched entry has already been set.
        """
        if self.matched_entry is not None:
            raise ValueError(f"Multi-match: has {self.matched_entry} / gets {entry_index}")
        self.matched_entry = entry_index

    def set_curator(self, curator):
        """Set the curator (initially "unknown")."""
        self.curator = curator

    def set_sample_batch(self, sample_batch):
        """Set the sample batch; the value is converted with ``int``."""
        self.sample_batch = int(sample_batch)

    def get_table_row(self, batch_num=False):
        """Return this record as a tuple matching ``get_table_header``.

        Args:
            batch_num: If true, ``sample_batch`` is included as the third
                field.

        Returns:
            Tuple of field values; an unset ``matched_entry`` is reported
            as 999999.
        """
        if self.matched_entry is None:
            matched = self._UNMATCHED
        else:
            matched = self.matched_entry

        table_row = [
            self.sample, self.read_type, self.file_name,
            self.data_rel_path, matched, self.curator,
            self.file_group, self.group_date
        ]
        if batch_num:
            # Same position as "processing_batch" in the header.
            table_row.insert(2, self.sample_batch)

        return tuple(table_row)

    def get_table_header(self, batch_num=False):
        """Return the column names for rows built by ``get_table_row``.

        Args:
            batch_num: If true, "processing_batch" is included as the
                third column.
        """
        table_header = [
            "sample", "read_type", "file_name",
            "file_path", "matched", "curated_by",
            "file_group", "group_date"
        ]

        if batch_num:
            table_header.insert(2, "processing_batch")

        return table_header
            


class Sample:
    
    def __init__(self, sample, order_num, sex, batch_number=None):
        
        if sample.startswith("GM"):
            self.alt = sample.strip()
            self.name = "NA" + sample[2:].strip()
        elif sample.startswith("NA"):
            self.name = sample.strip()
            self.alt = "GM" + self.name[2:]
        else:
            self.name = sample.strip()
            self.alt = "no-alt-name"
        
        self.sin = f"SIN:{self.name[2:]}"
        
        self.hgsvc_num = int(order_num)
        
        self.batch_num = -1 if batch_number is None else int(batch_number)
        
        self.sex = sex.lower()
        assert self.sex in ["female", "male"]
        
        self.hifi_complete = False
        self.ont_complete = False
               
        self.sample_files = col.defaultdict(list)
        
        self.source_dates = col.defaultdict(list)
        
        return
        
    def __lt__(self, other):
        return self.name < other.name
    
    def __eq__(self, other):
        return self.name == other.name
    
    def __repr__(self):
        return (
            f"{self.hgsvc_num}: {self.name} "
            f"(HiFi complete: {self.hifi_complete} /"
            f" ONT complete: {self.ont_complete}) --- "
            f" (processing batch: {self.batch_num})"
        )

    def get_file_group_lca_path(self, file_paths):
        
        if len(file_paths) == 1:
            lca_path = file_paths[0].parent
        else:
            all_abs = [fp.is_absolute() for fp in file_paths]
            all_rel = [not is_abs for is_abs in all_abs]
            if not (all(all_abs) or all(all_rel)):
                raise ValuError(f"Incompatible paths: {file_paths}")
            if all(all_abs):
                lca_components = ["/"]
            else:
                lca_components = []
            all_components = [str(fp).strip("/").split("/") for fp in file_paths]
            for component in zip(*all_components):
                if len(set(component)) > 1:
                    break
                lca_components.append(component[0])
            lca_path = pl.Path(*tuple(lca_components))
        return lca_path
