In [1]:
example = """
Test3.CDS.1
Test3.CDS.2 K12524
Test3.CDS.2 K00928
"""

In [2]:
def parse_kofam(data: str):
    annotation = {}
    for line in data.split('\n'):
        if line:
            _parts = line.strip().split()
            if len(_parts) == 2:
                feature_id, ko = _parts
                if feature_id not in annotation:
                    annotation[feature_id] = set()
                annotation[feature_id].add(ko)

    return annotation

In [3]:
parse_kofam(example)

{'Test3.CDS.2': {'K00928', 'K12524'}}

In [8]:
example = """
SeqID CMSVM-_Localization CMSVM-_Details CytoSVM-_Localization CytoSVM-_Details ECSVM-_Localization ECSVM-_Details ModHMM-_Localization ModHMM-_Details Motif-_Localization Motif-_Details OMPMotif-_Localization OMPMotif-_Details OMSVM-_Localization OMSVM-_Details PPSVM-_Localization PPSVM-_Details Profile-_Localization Profile-_Details SCL-BLAST-_Localization SCL-BLAST-_Details SCL-BLASTe-_Localization SCL-BLASTe-_Details Signal-_Localization Signal-_Details Cytoplasmic_Score CytoplasmicMembrane_Score Periplasmic_Score OuterMembrane_Score Extracellular_Score Final_Localization Final_Localization_Details Final_Score Secondary_Localization PSortb_Version
Test3.CDS.1 Unknown Unknown Unknown Unknown No internal helices found Unknown No motifs found Unknown No motifs found Unknown Unknown Unknown No matches to profiles found Unknown No matches against database Unknown No matches against database Unknown No signal peptide detected 2.00 2.00 2.00 2.00 2.00 Unknown 2.00 PSORTb version 3.0
Test3.CDS.2 Unknown Cytoplasmic Unknown Unknown No internal helices found Unknown No motifs found Unknown No motifs found Unknown Unknown Unknown No matches to profiles found Unknown No matches against database Unknown No matches against database Unknown No signal peptide detected 8.96 0.51 0.26 0.01 0.26 Cytoplasmic 8.96 PSORTb version 3.0
"""
def parse_psortb(data: str):
    annotation = {}
    lines = data.split('\n')
    h = lines[0].strip()
    # skip header
    for line in lines[1:]:
        if line:
            r = line.strip().split('\t')
            d = {h[i]: r[i].strip() for i in range(len(h))}
            annotation[d['SeqID']] = d

    return annotation
parse_psortb(example)

SeqID CMSVM-_Localization CMSVM-_Details CytoSVM-_Localization CytoSVM-_Details ECSVM-_Localization ECSVM-_Details ModHMM-_Localization ModHMM-_Details Motif-_Localization Motif-_Details OMPMotif-_Localization OMPMotif-_Details OMSVM-_Localization OMSVM-_Details PPSVM-_Localization PPSVM-_Details Profile-_Localization Profile-_Details SCL-BLAST-_Localization SCL-BLAST-_Details SCL-BLASTe-_Localization SCL-BLASTe-_Details Signal-_Localization Signal-_Details Cytoplasmic_Score CytoplasmicMembrane_Score Periplasmic_Score OuterMembrane_Score Extracellular_Score Final_Localization Final_Localization_Details Final_Score Secondary_Localization PSortb_Version
['Test3.CDS.1 Unknown Unknown Unknown Unknown No internal helices found Unknown No motifs found Unknown No motifs found Unknown Unknown Unknown No matches to profiles found Unknown No matches against database Unknown No matches against database Unknown No signal peptide detected 2.00 2.00 2.00 2.00 2.00 Unknown 2.00 PSORTb version 3.0']


IndexError: list index out of range

In [16]:
from pathlib import Path

def human_size(size):
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if size < 1024:
            return f"{size:.1f}{unit}"
        size /= 1024
    return f"{size:.1f}PB"


def print_path(root: Path):
    if not root.exists():
        print(f"{root} does not exist")
        return

    print(root.name)
    _print_tree(root, prefix="")


def _print_tree(root: Path, prefix: str):
    entries = sorted(root.iterdir(), key=lambda p: (p.is_file(), p.name.lower()))
    for i, path in enumerate(entries):
        is_last = i == len(entries) - 1
        connector = "└── " if is_last else "├── "

        if path.is_file():
            size = human_size(path.stat().st_size)
            print(prefix + connector + f"{path.name} ({size})")
        else:
            print(prefix + connector + path.name)
            extension = "    " if is_last else "│   "
            _print_tree(path, prefix + extension)
print_path(Path('./').resolve())

notebooks
├── .ipynb_checkpoints
│   ├── genomes-checkpoint.ipynb (10.8KB)
│   ├── pangenomes-checkpoint.ipynb (34.1KB)
│   └── parse_annotation-checkpoint.ipynb (3.3KB)
├── genomes.ipynb (19.5KB)
├── pangenomes.ipynb (46.5KB)
├── parse_annotation.ipynb (12.8KB)
├── test_pipeline_steps.ipynb (29.2KB)
└── util.py (3.7KB)


In [11]:
print_path(Path('/kb/module/work/tmp/'))

tmp
├── ani
ani
│   ├── fitness_fast.out
│   ├── kepangenomes_fast.out
│   └── phenotypes_fast.out
├── assembly
assembly
│   └── user_Escherichia_coli_K-12_MG1655.fna
├── genome
genome
│   └── user_Escherichia_coli_K-12_MG1655.faa
├── library
library
│   └── user_genome.txt
└── pangenome
pangenome
    ├── GB_GCA_021307345.1
GB_GCA_021307345.1
    │   ├── assembly
assembly
    │   ├── genome
genome
    │   │   ├── GB_GCA_000208445.2.faa
    │   │   ├── GB_GCA_000208525.2.faa
    │   │   ├── GB_GCA_000208585.2.faa
    │   │   ├── GB_GCA_000398885.1.faa
    │   │   ├── GB_GCA_000407765.1.faa
    │   │   ├── GB_GCA_000459855.1.faa
    │   │   ├── GB_GCA_002110245.1.faa
    │   │   ├── GB_GCA_002965005.1.faa
    │   │   ├── GB_GCA_002965435.1.faa
    │   │   ├── GB_GCA_002965705.1.faa
    │   │   ├── GB_GCA_002965965.1.faa
    │   │   ├── GB_GCA_002965985.1.faa
    │   │   ├── GB_GCA_002966005.1.faa
    │   │   ├── GB_GCA_003027815.1.faa
    │   │   ├── GB_GCA_003046645.1.faa
    │   │   ├─