In [1]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import tables
import anndata
from typing import Dict, Optional
import numpy as np
import scipy.sparse as sp
from scipy import io
import glob
import os
import upsetplot
from scipy.io import mmread
import csv

import argparse

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
#### FUNCTIONS FROM CELLBENDER
def dict_from_h5(file: str) -> Dict[str, np.ndarray]:
    """Read in everything from an h5 file and put into a dictionary."""
    d = {}
    with tables.open_file(file) as f:
        # read in everything
        for array in f.walk_nodes("/", "Array"):
            d[array.name] = array.read()
    return d


def anndata_from_h5(file: str,
                    analyzed_barcodes_only: bool = True) -> 'anndata.AnnData':
    """Load an output h5 file into an AnnData object for downstream work.
    Args:
        file: The h5 file
        analyzed_barcodes_only: False to load all barcodes, so that the size of
            the AnnData object will match the size of the input raw count matrix.
            True to load a limited set of barcodes: only those analyzed by the
            algorithm. This allows relevant latent variables to be loaded
            properly into adata.obs and adata.obsm, rather than adata.uns.
    Returns:
        adata: The anndata object, populated with inferred latent variables
            and metadata.
    """

    d = dict_from_h5(file)
    X = sp.csc_matrix((d.pop('data'), d.pop('indices'), d.pop('indptr')),
                      shape=d.pop('shape')).transpose().tocsr()

    # check and see if we have barcode index annotations, and if the file is filtered
    barcode_key = [k for k in d.keys() if (('barcode' in k) and ('ind' in k))]
    if len(barcode_key) > 0:
        max_barcode_ind = d[barcode_key[0]].max()
        filtered_file = (max_barcode_ind >= X.shape[0])
    else:
        filtered_file = True

    if analyzed_barcodes_only:
        if filtered_file:
            # filtered file being read, so we don't need to subset
            print('Assuming we are loading a "filtered" file that contains only cells.')
            pass
        elif 'barcode_indices_for_latents' in d.keys():
            X = X[d['barcode_indices_for_latents'], :]
            d['barcodes'] = d['barcodes'][d['barcode_indices_for_latents']]
        elif 'barcodes_analyzed_inds' in d.keys():
            X = X[d['barcodes_analyzed_inds'], :]
            d['barcodes'] = d['barcodes'][d['barcodes_analyzed_inds']]
        else:
            print('Warning: analyzed_barcodes_only=True, but the key '
                  '"barcodes_analyzed_inds" or "barcode_indices_for_latents" '
                  'is missing from the h5 file. '
                  'Will output all barcodes, and proceed as if '
                  'analyzed_barcodes_only=False')

    # Construct the anndata object.
    adata = anndata.AnnData(X=X,
                            obs={'barcode': d.pop('barcodes').astype(str)},
                            var={'gene_name': (d.pop('gene_names') if 'gene_names' in d.keys()
                                               else d.pop('name')).astype(str)},
                            dtype=X.dtype)
    adata.obs.set_index('barcode', inplace=True)
    adata.var.set_index('gene_name', inplace=True)

    # For CellRanger v2 legacy format, "gene_ids" was called "genes"... rename this
    if 'genes' in d.keys():
        d['id'] = d.pop('genes')

    # For purely aesthetic purposes, rename "id" to "gene_id"
    if 'id' in d.keys():
        d['gene_id'] = d.pop('id')

    # If genomes are empty, try to guess them based on gene_id
    if 'genome' in d.keys():
        if np.array([s.decode() == '' for s in d['genome']]).all():
            if '_' in d['gene_id'][0].decode():
                print('Genome field blank, so attempting to guess genomes based on gene_id prefixes')
                d['genome'] = np.array([s.decode().split('_')[0] for s in d['gene_id']], dtype=str)

    # Add other information to the anndata object in the appropriate slot.
    _fill_adata_slots_automatically(adata, d)

    # Add a special additional field to .var if it exists.
    if 'features_analyzed_inds' in adata.uns.keys():
        adata.var['cellbender_analyzed'] = [True if (i in adata.uns['features_analyzed_inds'])
                                            else False for i in range(adata.shape[1])]

    if analyzed_barcodes_only:
        for col in adata.obs.columns[adata.obs.columns.str.startswith('barcodes_analyzed')
                                     | adata.obs.columns.str.startswith('barcode_indices')]:
            try:
                del adata.obs[col]
            except Exception:
                pass
    else:
        # Add a special additional field to .obs if all barcodes are included.
        if 'barcodes_analyzed_inds' in adata.uns.keys():
            adata.obs['cellbender_analyzed'] = [True if (i in adata.uns['barcodes_analyzed_inds'])
                                                else False for i in range(adata.shape[0])]

    return adata


def _fill_adata_slots_automatically(adata, d):
    """Add other information to the adata object in the appropriate slot."""

    # TODO: what about "features_analyzed_inds"?  If not all features are analyzed, does this work?

    for key, value in d.items():
        try:
            if value is None:
                continue
            value = np.asarray(value)
            if len(value.shape) == 0:
                adata.uns[key] = value
            elif value.shape[0] == adata.shape[0]:
                if (len(value.shape) < 2) or (value.shape[1] < 2):
                    adata.obs[key] = value
                else:
                    adata.obsm[key] = value
            elif value.shape[0] == adata.shape[1]:
                if value.dtype.name.startswith('bytes'):
                    adata.var[key] = value.astype(str)
                else:
                    adata.var[key] = value
            else:
                adata.uns[key] = value
        except Exception:
            print('Unable to load data into AnnData: ', key, value, type(value))


#### END FUNCTIONS FROM CELLBENDER
            
def cellbender_anndata_to_cell_probability(a):
    return a.obs.cell_probability


def cellbender_anndata_to_sparse_matrix(adata, min_cell_probability=0):
    barcodes = adata.obs[adata.obs.cell_probability>=min_cell_probability].index.to_list()
    features = adata.var.gene_id.to_list()
    matrix = adata[adata.obs.cell_probability>=min_cell_probability].X.transpose()
    return {'features': features, 'barcodes': barcodes, 'matrix': matrix}


def umi_count_after_decontamination(adata):
    x = cellbender_anndata_to_sparse_matrix(adata)
    return dict(zip(x['barcodes'], x['matrix'].sum(axis=0).tolist()[0]))


def barcode_rank_plot(metrics, ax):
    df = metrics.sort_values('rna_umis', ascending=False)
    df['barcode_rank'] = range(1, len(df) + 1)
    sns.scatterplot(x='barcode_rank', y='rna_umis', data=df, ax=ax, hue='pass_all_filters', palette={True: 'red', False: 'black'}, edgecolor=None, alpha=0.2)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('Barcode rank')
    ax.set_ylabel('UMIs')
    return ax


def rna_umis_vs_rna_mito_plot(metrics, ax):
    sns.scatterplot(x='rna_umis', y='rna_fraction_mitochondrial', data=metrics, ax=ax, hue='pass_all_filters', palette={True: 'red', False: 'black'}, edgecolor=None, alpha=0.02, s=3)
    ax.set_xscale('log')
    ax.set_xlabel('UMIs')
    ax.set_ylabel('Fraction mito. (RNA)')
    return ax


def rna_umis_vs_exon_to_full_gene_body_ratio(metrics, ax):
    sns.scatterplot(x='rna_umis', y='rna_exon_to_full_gene_body_ratio', data=metrics, ax=ax, hue='pass_all_filters', palette={True: 'red', False: 'black'}, edgecolor=None, alpha=0.02, s=3)
    ax.set_xscale('log')
    ax.set_xlabel('UMIs')
    ax.set_ylabel('Exon/full-gene-body ratio (RNA)')
    return ax


def cellbender_fraction_removed(metrics, ax):
    sns.scatterplot(x='rna_umis', y='fraction_cellbender_removed', data=metrics, ax=ax, hue='pass_all_filters', palette={True: 'red', False: 'black'}, edgecolor=None, alpha=0.05)
    ax.set_xscale('log')
    ax.set_xlabel('UMIs')
    ax.set_ylabel('Fraction ambient')
    return ax


def cellbender_cell_probabilities(metrics, ax):
    sns.histplot(x='cell_probability', data=metrics[(metrics.filter_rna_min_umi) & (metrics.filter_rna_max_mito)], ax=ax, bins=20)
    ax.set_xlabel('Cellbender cell prob.\nfor nuclei passing UMI and mito. thresholds')
    return ax

def cellbender_anndata_to_cell_probability(a):
    return a.obs.cell_probability


def cellbender_anndata_to_sparse_matrix(adata, min_cell_probability=0):
    barcodes = adata.obs[adata.obs.cell_probability>=min_cell_probability].index.to_list()
    features = adata.var.gene_id.to_list()
    matrix = adata[adata.obs.cell_probability>=min_cell_probability].X.transpose()
    return {'features': features, 'barcodes': barcodes, 'matrix': matrix}


def umi_count_after_decontamination(adata):
    x = cellbender_anndata_to_sparse_matrix(adata)
    return dict(zip(x['barcodes'], x['matrix'].sum(axis=0).tolist()[0]))


def barcode_rank_plot(metrics, ax):
    df = metrics.sort_values('rna_umis', ascending=False)
    df['barcode_rank'] = range(1, len(df) + 1)
    sns.scatterplot(x='barcode_rank', y='rna_umis', data=df, ax=ax, hue='pass_all_filters', palette={True: 'red', False: 'black'}, edgecolor=None, alpha=0.2)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('Barcode rank')
    ax.set_ylabel('UMIs')
    return ax

# HPAP samples

In [11]:
def get_metrics(donor): # for hpap samples
    passQC = "/nfs/turbo/umms-scjp-pank/1_HPAP/results/rna/gencode_v39/emptyDrops/results/pctMTusingBelowEndCliff_pctMtless30_FDR0.005/cellbender_default/" + donor + "_passQC_barcodes.csv"

    with open('/nfs/turbo/umms-scjp-pank/4_integration/data/202503_freeze/20250314_meta_runs.txt', 'r') as file:
        reader = csv.reader(file, delimiter='\t')
        for row in reader:
            if row[1] == donor:
                metadata = row

    if metadata[4] == 'V2':
        RNA_BARCODE_WHITELIST = "/nfs/turbo/umms-scjp-pank/1_HPAP/scripts/snRNAseq-NextFlow_v2/737K-august-2016.txt"
    else:
        RNA_BARCODE_WHITELIST = "/nfs/turbo/umms-scjp-pank/1_HPAP/scripts/snRNAseq-NextFlow_v3/3M-february-2018.txt"

    CELLBENDER = "/nfs/turbo/umms-scjp-pank/1_HPAP/results/rna/gencode_v39_private/cellbender/cellbender_optimized/" + donor + "-hg38.cellbender_FPR_0.05.h5"
    RNA_METRICS = "/nfs/turbo/umms-scjp-pank/1_HPAP/results/rna//gencode_v39/qc/" + donor + "-hg38.qc.txt"
    GENE_FULL_EXON_OVER_INTRON_COUNTS = "/nfs/turbo/umms-scjp-pank/1_HPAP/results/rna/gencode_v39/starsolo/" + donor + "-hg38/" + donor + "-hg38.Solo.out/" + '/GeneFull_ExonOverIntron/raw'
    GENE_COUNTS = "/nfs/turbo/umms-scjp-pank/1_HPAP/results/rna/gencode_v39/starsolo/" + donor + "-hg38/" + donor + "-hg38.Solo.out/" + '/Gene/raw'
    
    rna_barcodes = pd.read_csv(RNA_BARCODE_WHITELIST, header=None)[0].to_list()

    #load metrics df
    adata = anndata_from_h5(CELLBENDER, analyzed_barcodes_only=True)
    rna_metrics = pd.read_csv(RNA_METRICS, sep='\t')
    rna_metrics = rna_metrics[rna_metrics.barcode!='-']
    metrics = rna_metrics.set_index('barcode').rename(columns=lambda x: 'rna_' + x)

    metrics = metrics.reset_index()
    cell_probability = cellbender_anndata_to_cell_probability(adata)
    post_cellbender_umis = umi_count_after_decontamination(adata)

    metrics['cell_probability'] = metrics.barcode.map(lambda x: cell_probability[x] if x in cell_probability else np.nan)
    metrics['post_cellbender_umis'] = metrics.barcode.map(lambda x: post_cellbender_umis[x] if x in post_cellbender_umis else np.nan)
    metrics['pct_cellbender_removed'] = (metrics.rna_umis - metrics.post_cellbender_umis) / metrics.rna_umis * 100
    metrics['rna_pct_mitochondrial'] = metrics.rna_fraction_mitochondrial * 100
    
    bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
    metrics = metrics[metrics['barcode'].isin(bc)]

    return metrics

In [13]:
donor_list = ["HPAP-019", "HPAP-020", "HPAP-021", "HPAP-022", 
              "HPAP-023", "HPAP-024", "HPAP-026", "HPAP-028", 
              "HPAP-029", "HPAP-032", "HPAP-034", "HPAP-035", 
              "HPAP-036", "HPAP-037", "HPAP-038", "HPAP-039", 
              "HPAP-040", "HPAP-042", "HPAP-043", "HPAP-044", 
              "HPAP-045", "HPAP-047", "HPAP-049", "HPAP-050", 
              "HPAP-051", "HPAP-052", "HPAP-053", "HPAP-054", 
              "HPAP-055", "HPAP-056", "HPAP-057", "HPAP-058", 
              "HPAP-059", "HPAP-061", "HPAP-063", "HPAP-064", 
              "HPAP-065", "HPAP-070", "HPAP-071", "HPAP-072", 
              "HPAP-074", "HPAP-075", "HPAP-079", "HPAP-080", 
              "HPAP-081", "HPAP-082", "HPAP-083", "HPAP-084", 
              "HPAP-085", "HPAP-087", "HPAP-091", "HPAP-092", 
              "HPAP-095", "HPAP-096", "HPAP-097", "HPAP-099", 
              "HPAP-100", "HPAP-101", "HPAP-103", "HPAP-104", 
              "HPAP-105", "HPAP-106", "HPAP-107", "HPAP-108", 
              "HPAP-109", "HPAP-110", "HPAP-111", "HPAP-113", 
              "HPAP-114", "HPAP-117", "HPAP-118", "HPAP-119", 
              "HPAP-120", "HPAP-122", "HPAP-123", "HPAP-124", 
              "HPAP-126", "HPAP-077"]
donor_list.len

In [14]:
for donor in donor_list:
    metrics = get_metrics(donor)
    outmetrics = "/nfs/turbo/umms-scjp-pank/4_integration/data/202503_freeze/metrics/" + donor + "_metrics.csv"
    metrics.to_csv(outmetrics, index=False)  

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.re

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.re

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.re

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.re

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()


# IIDP samples

In [24]:
def get_metrics(donor): # for iidp samples
    passQC = "/nfs/turbo/umms-scjp-pank/2_IIDP/results/gencode_v39/emptyDrops/results/pctMTusingBelowEndCliff_pctMtless30_FDR0.005/cellbender_default/" + donor + "_passQC_barcodes.csv"

    with open('/nfs/turbo/umms-scjp-pank/4_integration/data/202503_freeze/20250314_meta_runs.txt', 'r') as file:
        reader = csv.reader(file, delimiter='\t')
        for row in reader:
            if row[1] == donor:
                metadata = row

    if metadata[4] == 'V2':
        RNA_BARCODE_WHITELIST = "/nfs/turbo/umms-scjp-pank/1_HPAP/scripts/snRNAseq-NextFlow_v2/737K-august-2016.txt"
    else:
        RNA_BARCODE_WHITELIST = "/nfs/turbo/umms-scjp-pank/1_HPAP/scripts/snRNAseq-NextFlow_v3/3M-february-2018.txt"

    CELLBENDER = "/nfs/turbo/umms-scjp-pank/2_IIDP/results/gencode_v39_private/cellbender/cellbender_optimized/" + donor + "-hg38.cellbender_FPR_0.05.h5"
    RNA_METRICS = "/nfs/turbo/umms-scjp-pank/2_IIDP/results/gencode_v39/qc/" + donor + "-hg38.qc.txt"
    GENE_FULL_EXON_OVER_INTRON_COUNTS = "/nfs/turbo/umms-scjp-pank/2_IIDP/results/gencode_v39/starsolo/" + donor + "-hg38/" + donor + "-hg38.Solo.out/" + '/GeneFull_ExonOverIntron/raw'
    GENE_COUNTS = "/nfs/turbo/umms-scjp-pank/2_IIDP/results/gencode_v39/starsolo/" + donor + "-hg38/" + donor + "-hg38.Solo.out/" + '/Gene/raw'
    
    rna_barcodes = pd.read_csv(RNA_BARCODE_WHITELIST, header=None)[0].to_list()

    #load metrics df
    adata = anndata_from_h5(CELLBENDER, analyzed_barcodes_only=True)
    rna_metrics = pd.read_csv(RNA_METRICS, sep='\t')
    rna_metrics = rna_metrics[rna_metrics.barcode!='-']
    metrics = rna_metrics.set_index('barcode').rename(columns=lambda x: 'rna_' + x)

    metrics = metrics.reset_index()
    cell_probability = cellbender_anndata_to_cell_probability(adata)
    post_cellbender_umis = umi_count_after_decontamination(adata)

    metrics['cell_probability'] = metrics.barcode.map(lambda x: cell_probability[x] if x in cell_probability else np.nan)
    metrics['post_cellbender_umis'] = metrics.barcode.map(lambda x: post_cellbender_umis[x] if x in post_cellbender_umis else np.nan)
    metrics['pct_cellbender_removed'] = (metrics.rna_umis - metrics.post_cellbender_umis) / metrics.rna_umis * 100
    metrics['rna_pct_mitochondrial'] = metrics.rna_fraction_mitochondrial * 100
    
    bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
    metrics = metrics[metrics['barcode'].isin(bc)]

    return metrics

In [25]:
donor_list = ["SRR10751487", "SRR10751488", "SRR10751489", "SRR10751490", "SRR10751492", "SRR10751493", 
              "SRR10751494", "SRR10751495", "SRR10751496", "SRR10751497", "SRR10751498", "SRR10751499", 
              "SRR10751500", "SRR10751501", "SRR15736867", "SRR15736868", "SRR15736869", "SRR15736870", 
              "SRR15736871", "SRR15736872", "SRR15736873", "SRR15736874", "SRR15736875", "SRR15736876", 
              "SRR15736877", "SRR15736878", "SRR15736879", "SRR15736880", "SRR15736881", "SRR15736894", 
              "SRR15736895", "SRR15736896", "SRR15736897", "SRR15736898", "SRR15736899", "SRR15736900", 
              "SRR15736901", "SRR15736902", "SRR15736903", "SRR15736904", "SRR15736905", "SRR15736960", 
              "SRR15736961", "SRR15736962", "SRR15736963", "SRR15736964", "SRR17168069", "SRR17168070", 
              "SRR17168071", "SRR17168076", "SRR17168077", 
              "SRR17168079", "SRR18858546", "SRR18858547", "SRR18858548", "SRR18858549", "SRR18858550", 
              "SRR18858551", "SRR18858552", "SRR18858553", "SRR18858554", "SRR18858555", "SRR18858556", 
              "SRR18858557", "SRR22775778", "SRR22775779", "SRR22775780", "SRR22775781", "SRR22775784", 
              "SRR22775785", "SRR22775786", "SRR22775787", "SRR22775788", "SRR22775791", "SRR22775792", 
              "SRR22775795", "SRR22775796", "SRR22775797", "SRR22775798", "SRR22775801", "SRR22775802", 
              "SRR22775805", "SRR22775806", "SRR22775807", "SRR22775808", "SRR22775809", "SRR22775810", 
              "SRR22775811", "SRR22775812", "SRR22775815", "SRR22775816", "SRR22775817", "SRR22775818", 
              "SRR22775819", "SRR22775820", "SRR22775821", "SRR22775822", "SRR22775823", "SRR22775824", 
              "SRR22775825", "SRR22775826", "SRR22775827", "SRR22775828", "SRR22775829", "SRR22775830", 
              "SRR22775831", "SRR22775832", "SRR22775833", "SRR22775834", "SRR22775835", "SRR22775836", 
              "SRR22775837", "SRR22775838", "SRR22775839", "SRR22775840"]

In [26]:
for donor in donor_list:
    metrics = get_metrics(donor)
    outmetrics = "/nfs/turbo/umms-scjp-pank/4_integration/data/202503_freeze/metrics/" + donor + "_metrics.csv"
    metrics.to_csv(outmetrics, index=False)  

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.re

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.re

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.re

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.re

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.re

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.re

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()


## HTO samples

In [27]:
def get_metrics(donor): # for iidp samples
    passQC = "/nfs/turbo/umms-scjp-pank/2_IIDP/results/gencode_v39/emptyDrops/results/pctMTusingBelowEndCliff_pctMtless30_FDR0.005/cellbender_default/" + donor + "_passQC_barcodes_demultiplexed.csv"

    with open('/nfs/turbo/umms-scjp-pank/4_integration/data/202503_freeze/20250314_meta_runs.txt', 'r') as file:
        reader = csv.reader(file, delimiter='\t')
        for row in reader:
            if row[1] == donor:
                metadata = row

    if metadata[4] == 'V2':
        RNA_BARCODE_WHITELIST = "/nfs/turbo/umms-scjp-pank/1_HPAP/scripts/snRNAseq-NextFlow_v2/737K-august-2016.txt"
    else:
        RNA_BARCODE_WHITELIST = "/nfs/turbo/umms-scjp-pank/1_HPAP/scripts/snRNAseq-NextFlow_v3/3M-february-2018.txt"

    CELLBENDER = "/nfs/turbo/umms-scjp-pank/2_IIDP/results/gencode_v39_private/cellbender/cellbender_optimized/" + donor + "-hg38.cellbender_FPR_0.05.h5"
    RNA_METRICS = "/nfs/turbo/umms-scjp-pank/2_IIDP/results/gencode_v39/qc/" + donor + "-hg38.qc.txt"
    GENE_FULL_EXON_OVER_INTRON_COUNTS = "/nfs/turbo/umms-scjp-pank/2_IIDP/results/gencode_v39/starsolo/" + donor + "-hg38/" + donor + "-hg38.Solo.out/" + '/GeneFull_ExonOverIntron/raw'
    GENE_COUNTS = "/nfs/turbo/umms-scjp-pank/2_IIDP/results/gencode_v39/starsolo/" + donor + "-hg38/" + donor + "-hg38.Solo.out/" + '/Gene/raw'
    
    rna_barcodes = pd.read_csv(RNA_BARCODE_WHITELIST, header=None)[0].to_list()

    #load metrics df
    adata = anndata_from_h5(CELLBENDER, analyzed_barcodes_only=True)
    rna_metrics = pd.read_csv(RNA_METRICS, sep='\t')
    rna_metrics = rna_metrics[rna_metrics.barcode!='-']
    metrics = rna_metrics.set_index('barcode').rename(columns=lambda x: 'rna_' + x)

    metrics = metrics.reset_index()
    cell_probability = cellbender_anndata_to_cell_probability(adata)
    post_cellbender_umis = umi_count_after_decontamination(adata)

    metrics['cell_probability'] = metrics.barcode.map(lambda x: cell_probability[x] if x in cell_probability else np.nan)
    metrics['post_cellbender_umis'] = metrics.barcode.map(lambda x: post_cellbender_umis[x] if x in post_cellbender_umis else np.nan)
    metrics['pct_cellbender_removed'] = (metrics.rna_umis - metrics.post_cellbender_umis) / metrics.rna_umis * 100
    metrics['rna_pct_mitochondrial'] = metrics.rna_fraction_mitochondrial * 100
    
    bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
    metrics = metrics[metrics['barcode'].isin(bc)]

    return metrics

In [28]:
donor_list = ["SRR27326986", "SRR27326987", "SRR27326992", "SRR27326993", "SRR27326994", "SRR27326995", "SRR27326996", "SRR27326997"]

In [29]:
for donor in donor_list:
    metrics = get_metrics(donor)
    outmetrics = "/nfs/turbo/umms-scjp-pank/4_integration/data/202503_freeze/metrics/" + donor + "_metrics.csv"
    metrics.to_csv(outmetrics, index=False)  

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()


# Prodo samples

In [18]:
def get_metrics(donor): # for prodo samples
    passQC = "/nfs/turbo/umms-scjp-pank/3_Prodo/results/gencode_v39/emptyDrops/results/pctMTusingBelowEndCliff_pctMtless30_FDR0.005/cellbender_default/" + donor + "_passQC_barcodes.csv"

    with open('/nfs/turbo/umms-scjp-pank/4_integration/data/202503_freeze/20250314_meta_runs.txt', 'r') as file:
        reader = csv.reader(file, delimiter='\t')
        for row in reader:
            if row[1] == donor:
                metadata = row

    if metadata[4] == 'V2':
        RNA_BARCODE_WHITELIST = "/nfs/turbo/umms-scjp-pank/1_HPAP/scripts/snRNAseq-NextFlow_v2/737K-august-2016.txt"
    else:
        RNA_BARCODE_WHITELIST = "/nfs/turbo/umms-scjp-pank/1_HPAP/scripts/snRNAseq-NextFlow_v3/3M-february-2018.txt"

    CELLBENDER = "/nfs/turbo/umms-scjp-pank/3_Prodo/results/gencode_v39_private/cellbender/cellbender_optimized/" + donor + "-hg38.cellbender_FPR_0.05.h5"
    RNA_METRICS = "/nfs/turbo/umms-scjp-pank/3_Prodo/results/gencode_v39/qc/" + donor + "-hg38.qc.txt"
    GENE_FULL_EXON_OVER_INTRON_COUNTS = "/nfs/turbo/umms-scjp-pank/3_Prodo/results/gencode_v39/starsolo/" + donor + "-hg38/" + donor + "-hg38.Solo.out/" + '/GeneFull_ExonOverIntron/raw'
    GENE_COUNTS = "/nfs/turbo/umms-scjp-pank/3_Prodo/results/gencode_v39/starsolo/" + donor + "-hg38/" + donor + "-hg38.Solo.out/" + '/Gene/raw'
    
    rna_barcodes = pd.read_csv(RNA_BARCODE_WHITELIST, header=None)[0].to_list()

    #load metrics df
    adata = anndata_from_h5(CELLBENDER, analyzed_barcodes_only=True)
    rna_metrics = pd.read_csv(RNA_METRICS, sep='\t')
    rna_metrics = rna_metrics[rna_metrics.barcode!='-']
    metrics = rna_metrics.set_index('barcode').rename(columns=lambda x: 'rna_' + x)

    metrics = metrics.reset_index()
    cell_probability = cellbender_anndata_to_cell_probability(adata)
    post_cellbender_umis = umi_count_after_decontamination(adata)

    metrics['cell_probability'] = metrics.barcode.map(lambda x: cell_probability[x] if x in cell_probability else np.nan)
    metrics['post_cellbender_umis'] = metrics.barcode.map(lambda x: post_cellbender_umis[x] if x in post_cellbender_umis else np.nan)
    metrics['pct_cellbender_removed'] = (metrics.rna_umis - metrics.post_cellbender_umis) / metrics.rna_umis * 100
    metrics['rna_pct_mitochondrial'] = metrics.rna_fraction_mitochondrial * 100
    
    bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
    metrics = metrics[metrics['barcode'].isin(bc)]

    return metrics

In [22]:
donor_list = ["SRR12831415", "SRR12831416", "SRR12831418", "SRR12831419", "SRR22266773", 
              "SRR22266774", "SRR22266775", "SRR27307488", "SRR27307489", "SRR27307490", "SRR27307491", 
              "SRR27307492", "SRR27307493", "SRR27307494", "SRR27307495", "SRR27307496", "SRR27307497", 
              "SRR27307498", "SRR27307499", "SRR27307500", "SRR27307501", "SRR27307502", "SRR27307503", 
              "SRR27307504", "SRR27307505", "SRR27307506", "SRR27307507", "SRR27307508", "SRR27307509", 
              "SRR27307510", "SRR27307511", "SRR27307512", "SRR27307513", "SRR27307514", "SRR27307515", 
              "SRR27307516", "SRR27307517", "SRR27307518", "SRR27307519", "SRR27307520", "SRR27307521", 
              "SRR27307522", "SRR27307523", "SRR27307524", "SRR27307525", "SRR27307526", "SRR27307527", 
              "SRR27307528", "SRR27307529", "SRR27307530", "SRR27307531", "SRR27307532", "SRR27307533", 
              "SRR27307534", "SRR27307535", "SRR27307536", "SRR27307537", "SRR27307538", "SRR27307539", 
              "SRR27307540", "SRR27307541", "SRR27307542", "SRR27307543", "SRR27307544", "SRR27307545", 
              "SRR27307546", "SRR27307547", "SRR27307548", "SRR27307549", "SRR27307550", "SRR27307551", 
              "SRR27307552", "SRR27307553", "SRR27307554", "SRR27307555", "SRR27307556", "SRR27307557", 
              "SRR27307558", "SRR27307559", "SRR27307560", "SRR27307561", "SRR27307562", "SRR27307563", 
              "SRR27307564", "SRR27307565", "SRR27307566", "SRR27307567", "SRR27307568", "SRR27307569", 
              "SRR27307570", "SRR27307571", "SRR27307572", "SRR27307573", "SRR27307574", "SRR27307575", 
              "SRR27307576", "SRR27307577", "SRR27307578", "SRR27307579"]

In [23]:
for donor in donor_list:
    metrics = get_metrics(donor)
    outmetrics = "/nfs/turbo/umms-scjp-pank/4_integration/data/202503_freeze/metrics/" + donor + "_metrics.csv"
    metrics.to_csv(outmetrics, index=False)  

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.re

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.re

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.re

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.re

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.re

  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
  bc = pd.read_csv(passQC, header=None, delim_whitespace="\t")[0].to_list()
