In [1]:
## Miscellaneous
import numpy as np
import os
import sys
import collections
import matplotlib.pyplot as plt
import loompy
from collections import Counter
from tqdm import tqdm
import pickle as pkl

import pybedtools
from pybedtools import BedTool

import subprocess

import chromograph
from chromograph.peak_calling.utils import *
from chromograph.pipeline.utils import *
from chromograph.plotting.marker_plot import marker_plot
from chromograph.plotting.peak_annotation_plot import plot_peak_annotation_wheel
from chromograph.pipeline import config
from chromograph.preprocessing.doublet_finder import doublet_finder

import cytograph as cg
from cytograph.plotting.colors import colorize
from cytograph.enrichment import FeatureSelectionByMultilevelEnrichment
from cytograph.species import Species

import glob

import fisher

from typing import *

import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import scipy.stats as stats
import scipy
from statsmodels.sandbox.stats.multicomp import multipletests

import logging
# Root logger; basicConfig makes INFO-and-above records print as "HH:MM:SS LEVEL    message".
logger = logging.getLogger()
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%H:%M:%S')

# NOTE: this rebinds the name `config` from the imported chromograph.pipeline.config module
# to whatever load_config() returns, so the module is no longer reachable as `config` afterwards.
config = config.load_config()

from chromograph.pipeline.TF_IDF import TF_IDF


In [28]:
import logging
import os
import re
from typing import List

import numpy as np
import yaml

import loompy


class Annotation:
    """
    One auto-annotation definition, read from the first YAML document of a file.

    The document must provide 'name', 'abbreviation' and 'definition'. The definition
    is split on whitespace: tokens starting with "+" become positive markers, tokens
    starting with "-" become negative markers, and any other token is ignored.
    'categories' is optional and is split on runs of non-word characters.
    """
    unknown_tags: set = set()

    def __init__(self, category: str, filename: str) -> None:
        """
        Args:
            category:   Accepted for interface compatibility; not used by this constructor
            filename:   Path of the definition file to read

        Raises:
            ValueError if 'name', 'abbreviation' or 'definition' is missing
        """
        with open(filename) as handle:
            documents = yaml.load_all(handle, Loader=yaml.SafeLoader)
            doc = next(documents)  # only the first YAML document is used

        if "name" not in doc:
            raise ValueError(os.path.basename(filename) + " did not contain a 'name' attribute, which is required.")
        self.name = doc["name"]

        if "abbreviation" not in doc:
            raise ValueError(os.path.basename(filename) + " did not contain an 'abbreviation' attribute, which is required.")
        self.abbreviation = doc["abbreviation"]

        if "definition" not in doc:
            raise ValueError(os.path.basename(filename) + " did not contain a 'definition' attribute, which is required.")
        self.definition = doc["definition"]
        tokens = self.definition.strip().split()
        plus_markers = []
        minus_markers = []
        for token in tokens:
            # Drop the sign character and keep the bare gene name
            if token.startswith("+"):
                plus_markers.append(token[1:])
            elif token.startswith("-"):
                minus_markers.append(token[1:])
        self.positives = plus_markers
        self.negatives = minus_markers

        has_categories = "categories" in doc and doc["categories"] is not None
        self.categories = re.split(r"\W+", doc["categories"].strip()) if has_categories else []

    def __str__(self) -> str:
        """Render as 'name (abbreviation; +POS ... -NEG ...)'."""
        signed_positives = ["+" + gene for gene in self.positives]
        text = self.name + " (" + self.abbreviation + "; " + " ".join(signed_positives)
        if len(self.negatives) == 0:
            return text + ")"
        signed_negatives = ["-" + gene for gene in self.negatives]
        return text + " " + " ".join(signed_negatives) + ")"


class AutoAnnotator(object):
    """
    Loads auto-annotation definitions from a directory tree and scores them against
    the "binary" layer of an aggregated loom file (one column per cluster).
    """
    def __init__(self, root: str, ds: "loompy.LoomConnection" = None) -> None:
        """
        Walk `root` and load every ".yaml" / ".md" file (except names ending in
        "README.md") as an Annotation.

        Args:
            root:   Directory searched recursively for definition files
            ds:     Optional loom connection. When given (and ds.ra.Gene is non-empty),
                    a definition that references a marker found in neither ds.ra.Gene
                    nor ds.ra.Accession is logged and skipped

        Remarks:
            Loading is best-effort: a file that fails to load or validate is logged
            and skipped; no exception is raised.
        """
        self.root = root
        self.definitions: List[Annotation] = []
        self.genes: List[str] = [] if ds is None else ds.ra.Gene
        self.accessions: List[str] = [] if ds is None else ds.ra.Accession
        self.annotations = None  # type: np.ndarray

        # Build the lookup once instead of scanning both arrays for every marker
        known = set(self.genes).union(self.accessions)
        validate = len(self.genes) > 0

        fileext = [".yaml", ".md"]
        root_len = len(self.root)
        for cur, _, files in os.walk(self.root):
            for file in files:
                if os.path.splitext(file)[-1] not in fileext or file[-9:] == "README.md":
                    continue
                try:
                    # The path below `root` is passed as the category
                    tag = Annotation(cur[root_len:], os.path.join(cur, file))
                    missing = [g for g in tag.positives + tag.negatives if validate and g not in known]
                    for gene in missing:
                        # Lazy %-formatting so a '%' in the filename cannot break the message
                        logging.error("%s: gene '%s' not found in file", file, gene)
                    if not missing:
                        self.definitions.append(tag)
                except Exception as e:
                    logging.error("%s: %s", file, e)

    def _marker_row(self, marker: str) -> int:
        """Return the row index of `marker`, looking in gene names first and accessions second; -1 if absent."""
        for labels in (self.genes, self.accessions):
            hits = np.flatnonzero(np.asarray(labels) == marker)
            if len(hits) > 0:
                return int(hits[0])
        return -1

    def fit(self, ds: "loompy.LoomConnection") -> np.ndarray:
        """
        Return the annotation for an already aggregated and binarized loom file

        The input file should have one column per cluster and a layer named "binary"

        For every definition and cluster the score is the fraction of markers that agree
        with the binary layer (positive marker set, negative marker unset). A marker that
        is missing from the file is logged and contributes nothing, but still counts in
        the denominator. A definition is assigned to a cluster when its score exceeds 0.5.

        Remarks:
            Stores the (definitions x clusters) score matrix in self.annotations.
            A definition without any markers scores 0 for every cluster.

        Returns:
            An array of strings giving the auto-annotation for each cluster
            (sorted abbreviations joined by a single space)
        """
        self.genes = ds.ra.Gene
        self.accessions = ds.ra.Accession
        binaries = ds.layers["binary"]
        n_clusters = binaries.shape[1]
        self.annotations = np.zeros((len(self.definitions), n_clusters))

        for ix, tag in enumerate(self.definitions):
            n_markers = len(tag.positives) + len(tag.negatives)
            if n_markers == 0:
                # Nothing to score; keep the zero row instead of dividing by zero
                continue
            signed_markers = [(gene, True) for gene in tag.positives] + [(gene, False) for gene in tag.negatives]
            score = np.zeros(n_clusters)
            for gene, is_positive in signed_markers:
                row = self._marker_row(gene)
                if row < 0:
                    logging.error(f"Auto-annotation gene {gene} (used for {tag}) not found in file")
                    continue
                # Resolve the row once and score all clusters together
                calls = np.asarray(binaries[row, :], dtype=float)
                if is_positive:
                    score += calls
                else:
                    score += 1 - calls
            self.annotations[ix, :] = score / n_markers

        attr = []
        for cluster in range(n_clusters):
            assigned = np.flatnonzero(self.annotations[:, cluster] > 0.5)
            tags = sorted(self.definitions[j].abbreviation for j in assigned)
            attr.append(" ".join(tags))

        return np.array(attr)

    def annotate(self, ds: "loompy.LoomConnection") -> None:
        """
        Annotate an aggregated and binarized loom file

        Remarks:
            Creates the following new column attributes:
                AutoAnnotation:     Space-separated list of auto-annotation labels

        The input file should have one column per cluster and a layer named "binary"
        """
        ds.ca.AutoAnnotation = self.fit(ds)

In [5]:
# from chromograph.peak_analysis.utils import KneeBinarization
# f = '/proj/DEV_ATAC/Build_20210312/All/All_prom.agg.loom'

# with loompy.connect(f) as ds:
#     ds['binary'], ds.ca.CPM_threshold = KneeBinarization(ds)

13:22:25 INFO     Binarize peak matrix
100%|██████████| 217/217 [00:11<00:00, 18.49it/s]


In [29]:
# NOTE(review): `f` is not assigned in any live code visible here; the only assignment in view
# is inside the commented-out cell above. Presumably it still lives in kernel state — confirm,
# otherwise this cell fails with NameError after Restart & Run All.
# Passing ds lets AutoAnnotator check each definition's markers against this file's
# Gene / Accession row attributes while the definitions are loaded.
with loompy.connect(f) as ds:
    Annotator = AutoAnnotator(root='/home/camiel/auto-annotation/Human/', ds=ds)

In [41]:
# One line per loaded definition: total marker count, number of positive markers,
# number of negative markers, and the negative marker names.
for x in Annotator.definitions:
    n_pos, n_neg = len(x.positives), len(x.negatives)
    tests = n_pos + n_neg
    print(tests, n_pos, n_neg, x.negatives)

2 2 0 []
1 1 0 []
2 2 0 []
1 1 0 []
1 1 0 []
5 5 0 []
3 3 0 []
2 2 0 []
3 3 0 []
1 1 0 []
2 2 0 []
1 1 0 []
1 1 0 []
2 2 0 []
2 2 0 []
1 1 0 []
1 1 0 []
1 1 0 []
1 1 0 []
1 1 0 []
1 1 0 []
1 1 0 []
1 1 0 []
4 4 0 []
2 2 0 []
3 3 0 []
3 2 1 ['SLC6A5']
2 2 0 []
3 3 0 []
2 2 0 []
1 1 0 []
1 1 0 []
1 1 0 []
3 3 0 []
2 2 0 []
2 2 0 []
1 1 0 []
1 1 0 []
2 2 0 []
2 2 0 []
2 2 0 []
1 1 0 []
3 1 2 ['PAX6', 'NKX2-1']
1 1 0 []
2 2 0 []
2 2 0 []
