In [67]:
import numpy as np
import os
import sys
import pybedtools
from pybedtools import BedTool
import collections
import csv
import matplotlib.pyplot as plt
import gzip
import loompy
import scipy.sparse as sparse
import json
import urllib.request
import logging
import pickle
import importlib
import multiprocessing as mp
import tqdm
import gc

from chromograph.pipeline import config
from chromograph.preprocessing.utils import *
from chromograph.features.feature_count import *
from chromograph.preprocessing.doublet_finder import doublet_finder

In [5]:
# Run directories on the shared project volume.
# `indir_rnaXatac` is the directory whose `outs/` files are read in the cells below.
# `indir_atac` is presumably an ATAC-only run; it is not referenced in the cells
# visible here -- TODO confirm whether it is still needed.
indir_rnaXatac = '/data/proj/chromium/10X290_1_ABCD_1'
indir_atac = '/data/proj/chromium/10X291_2_AB_1'

In [28]:
# Input files of the RNA+ATAC run, all located in its `outs/` directory:
#   fb -> per-barcode metrics table (CSV)
#   ff -> ATAC fragments file (gzip-compressed TSV)
#   fs -> run-level summary table (CSV)
fb = f'{indir_rnaXatac}/outs/per_barcode_metrics.csv'
ff = f'{indir_rnaXatac}/outs/atac_fragments.tsv.gz'
fs = f'{indir_rnaXatac}/outs/summary.csv'

In [None]:
# Reference only: column header of the per-barcode metrics CSV (presumably copied
# from `fb`). Kept as a comment because, as a bare expression, this line is a tuple
# of undefined names and raises NameError when the cell is executed.
# barcode,gex_barcode,atac_barcode,is_cell,excluded_reason,gex_raw_reads,gex_mapped_reads,gex_conf_intergenic_reads,gex_conf_exonic_reads,gex_conf_intronic_reads,gex_conf_exonic_unique_reads,gex_conf_exonic_antisense_reads,gex_conf_exonic_dup_reads,gex_exonic_umis,gex_conf_intronic_unique_reads,gex_conf_intronic_antisense_reads,gex_conf_intronic_dup_reads,gex_intronic_umis,gex_conf_txomic_unique_reads,gex_umis_count,gex_genes_count,atac_raw_reads,atac_unmapped_reads,atac_lowmapq,atac_dup_reads,atac_chimeric_reads,atac_mitochondrial_reads,atac_fragments,atac_TSS_fragments,atac_peak_region_fragments,atac_peak_region_cutsites

In [None]:
# Loader for an 18-column per-barcode layout: two leading lines are skipped and
# every column is given an explicit name and type.
# NOTE(review): this layout does not match the 31-column header of the per-barcode
# metrics file used elsewhere in this notebook; presumably it targets the
# ATAC-only per-barcode table -- confirm before running it against `fb`.
barcodes = np.genfromtxt(
    fb,
    delimiter=',',
    skip_header=2,
    dtype=[
        ('barcode', 'U18'),
        ('total', 'i8'),
        ('duplicate', 'i8'),
        ('chimeric', 'i8'),
        ('unmapped', 'i8'),
        ('lowmapq', 'i8'),
        ('mitochondrial', 'i8'),
        ('passed_filters', 'i8'),
        ('cell_id', 'U18'),
        ('is__cell_barcode', 'i8'),
        ('TSS_fragments', 'i8'),
        ('DNase_sensitive_region_fragments', 'i8'),
        ('enhancer_region_fragments', 'i8'),
        ('promoter_region_fragments', 'i8'),
        ('on_target_fragments', 'i8'),
        ('blacklist_region_fragments', 'i8'),
        ('peak_region_fragments', 'i8'),
        ('peak_region_cutsites', 'i8'),
    ],
)


In [51]:
# Parse the per-barcode metrics CSV into a structured array with 31 named fields.
# The header row is skipped and the columns are named explicitly: compared with
# the file header, `is_cell` becomes 'is__cell_barcode' and the `atac_*` columns
# get shorter names (e.g. `atac_raw_reads` -> 'total',
# `atac_fragments` -> 'passed_filters').
barcodes = np.genfromtxt(
    fb,
    delimiter=',',
    skip_header=1,
    dtype={
        'names': (
            # barcode identifiers and cell-calling columns
            'barcode', 'gex_barcode', 'atac_barcode', 'is__cell_barcode', 'excluded_reason',
            # gene-expression columns, named as in the header
            'gex_raw_reads', 'gex_mapped_reads', 'gex_conf_intergenic_reads',
            'gex_conf_exonic_reads', 'gex_conf_intronic_reads',
            'gex_conf_exonic_unique_reads', 'gex_conf_exonic_antisense_reads',
            'gex_conf_exonic_dup_reads', 'gex_exonic_umis',
            'gex_conf_intronic_unique_reads', 'gex_conf_intronic_antisense_reads',
            'gex_conf_intronic_dup_reads', 'gex_intronic_umis',
            'gex_conf_txomic_unique_reads', 'gex_umis_count', 'gex_genes_count',
            # ATAC columns under their shortened names
            'total', 'unmapped', 'lowmapq', 'duplicate', 'chimeric', 'mitochondrial',
            'passed_filters', 'TSS_fragments', 'peak_region_fragments',
            'peak_region_cutsites',
        ),
        # three 18-character string columns followed by 28 64-bit integer columns
        'formats': ('U18',) * 3 + ('i8',) * 28,
    },
)

# From here on the 'barcode' field carries the ATAC barcode of each row.
barcodes['barcode'] = barcodes['atac_barcode']
barcodes['barcode']

array(['ACAGCGGGTGTGTTAC-1', 'ACAGCGGGTCCTCCAT-1', 'ACAGCGGGTCATGGTT-1',
       ..., 'CGTACTTCAATACCGA-1', 'CGTACTTCAAGGCCAT-1',
       'CGTACTTCAACCCTTG-1'], dtype='<U18')

In [18]:
# Sanity check that the column-name list and the format list have equal length.
# X holds the column names as they appear in the per-barcode metrics header.
X = [
    # barcode identifiers and cell-calling columns
    'barcode', 'gex_barcode', 'atac_barcode', 'is_cell', 'excluded_reason',
    # gene-expression (gex_*) columns
    'gex_raw_reads', 'gex_mapped_reads', 'gex_conf_intergenic_reads',
    'gex_conf_exonic_reads', 'gex_conf_intronic_reads',
    'gex_conf_exonic_unique_reads', 'gex_conf_exonic_antisense_reads',
    'gex_conf_exonic_dup_reads', 'gex_exonic_umis',
    'gex_conf_intronic_unique_reads', 'gex_conf_intronic_antisense_reads',
    'gex_conf_intronic_dup_reads', 'gex_intronic_umis',
    'gex_conf_txomic_unique_reads', 'gex_umis_count', 'gex_genes_count',
    # ATAC (atac_*) columns
    'atac_raw_reads', 'atac_unmapped_reads', 'atac_lowmapq', 'atac_dup_reads',
    'atac_chimeric_reads', 'atac_mitochondrial_reads',
    'atac_fragments', 'atac_TSS_fragments', 'atac_peak_region_fragments',
    'atac_peak_region_cutsites',
]
# Y holds one format code per column: three strings, then 28 integers.
Y = ['U18'] * 3 + ['i8'] * 28
print(len(X), len(Y))

31 31


In [20]:
# Two sample triples of barcode strings (three 18-character values each).
# Fix: the first line used `-` instead of `=`, so `X1` was never assigned and the
# cell raised NameError.
X1 = ['AAACAGCCAAACAACA-1','AAACAGCCAAACAACA-1','ACAGCGGGTGTGTTAC-1']
X2 = ['AAACAGCCAAACAACA-1','AAACAGCCAAACAACA-1','ACAGCGGGTGTGTTAC-1']

['AAACAGCCAAACAACA-1', 'AAACAGCCAAACAACA-1', 'ACAGCGGGTGTGTTAC-1']

In [36]:
# Read the run summary CSV as an array of strings; no numeric conversion is done.
summary = np.genfromtxt(
    fs,
    delimiter=',',
    dtype=str,
)

In [65]:
# Display the parsed summary table; its first row holds the metric names.
summary

array([['Sample ID', 'Pipeline version', 'Genome',
        'Estimated number of cells',
        'ATAC Confidently mapped read pairs',
        'ATAC Fraction of genome in peaks',
        'ATAC Fraction of high-quality fragments in cells',
        'ATAC Fraction of high-quality fragments overlapping TSS',
        'ATAC Fraction of high-quality fragments overlapping peaks',
        'ATAC Fraction of transposition events in peaks in cells',
        'ATAC Mean raw read pairs per cell',
        'ATAC Median high-quality fragments per cell',
        'ATAC Non-nuclear read pairs', 'ATAC Number of peaks',
        'ATAC Percent duplicates', 'ATAC Q30 bases in barcode',
        'ATAC Q30 bases in read 1', 'ATAC Q30 bases in read 2',
        'ATAC Q30 bases in sample index i1', 'ATAC Sequenced read pairs',
        'ATAC TSS enrichment score', 'ATAC Unmapped read pairs',
        'ATAC Valid barcodes', 'Feature linkages detected',
        'GEX Fraction of transcriptomic reads in cells',
        'GEX

In [44]:
# Pair each metric name (first row of `summary`) with its value (second row),
# converting both to plain Python strings.
d = dict(zip(map(str, summary[0]), map(str, summary[1])))
# The bin size is added as an integer, unlike the string values above.
d['binsize'] = 5000

In [56]:
# Keep a barcode when 'is__cell_barcode' equals 1 and its 'passed_filters' count
# lies strictly between 5000 and 100000.
flagged_as_cell = barcodes['is__cell_barcode'] == 1
above_minimum = barcodes['passed_filters'] > 5000
below_maximum = barcodes['passed_filters'] < 100000
passed = flagged_as_cell & above_minimum & below_maximum

# One filtered array per field of the structured `barcodes` array.
meta = {key: barcodes[key][passed] for key in barcodes.dtype.names}

In [57]:
# Print the positions at which one barcode occurs among the filtered rows, first
# in the 'gex_barcode' field and then in the 'atac_barcode' field.
b = 'ACAGCGGGTGTCCTGA-1'

for field in ('gex_barcode', 'atac_barcode'):
    matches = np.where(meta[field] == b)
    print(matches[0])


[]
[0]


In [64]:
# Write a 'Tissue' column attribute with the value 'Cortex' into each sample's
# 5kb loom file, one entry per column of the file.
samples = ['10X291_2', '10X291_3']

for sample in samples:
    loom_name = f'{sample}_5kb.loom'
    loom_path = os.path.join('/datb/sl/camiel/scATAC/samples/', sample, loom_name)
    with loompy.connect(loom_path) as ds:
        n_columns = ds.shape[1]
        ds.ca.Tissue = np.repeat('Cortex', n_columns)