<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Purpose" data-toc-modified-id="Purpose-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Purpose</a></span></li><li><span><a href="#Input" data-toc-modified-id="Input-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Input</a></span></li><li><span><a href="#Output" data-toc-modified-id="Output-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Output</a></span></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Parameters</a></span><ul class="toc-item"><li><span><a href="#Not-through-papermill" data-toc-modified-id="Not-through-papermill-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Not through papermill</a></span></li></ul></li><li><span><a href="#Load-Cell-Metadata" data-toc-modified-id="Load-Cell-Metadata-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Load Cell Metadata</a></span></li></ul></div>

# Basic Data Walkthrough and Cell Mapping Metric Filtering

## Purpose
- Overview of basic QC metrics per brain region
- Apply simple cell filtering based on basic QC metrics

## Input
- Cell metadata table

## Output
- Selected cell metadata table

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
import seaborn as sns
from cemba_data.plot.preprocessing import cutoff_vs_cell_remain, plot_on_plate

In [3]:
# Base output location: an empty Path() refers to the current working directory.
result_dir = pathlib.Path()
# Figures are written to fig/cell_basic_qc, relative to result_dir.
fig_dir = result_dir.joinpath('fig', 'cell_basic_qc')
# Create the folder and any missing parents; do nothing if it already exists.
fig_dir.mkdir(parents=True, exist_ok=True)

## Parameters

In [4]:
# parameters cell
# NOTE(review): presumably these defaults are the values papermill overrides
# when the notebook is run for another study — confirm against the cell tags.

# Tab-separated cell metadata summary that is loaded further down.
cell_metadata_path = '/home/hanliu/project/mouse_rostral_brain/snm3C/summary.txt'

# Name of the study being processed.
study_name = 'snm3C'

### Not through papermill

In [5]:
# QC metric filtering: metadata column name -> pair of bounds for that column.
# NOTE(review): the code that applies these bounds is not in this section, so
# whether the bounds are inclusive cannot be told from here.
qc_metric = dict(
    CCC_Rate=(0, 0.05),
    CG_RateAdj=(0.5, 1),
    CH_RateAdj=(0., 0.15),
    FinalReads=(500000, 10000000),
    MappingRate=(0.5, 1),
)

# Cell metadata columns to plot: column name -> pair of numbers
# (presumably the value range shown on the plot; confirm where it is consumed).
metadata_distribution_plot = dict(
    CCC_Rate=(0., 0.1),
    CH_RateAdj=(0, 0.1),
    CG_RateAdj=(0.7, 0.9),
    InputReads=(0, 1e7),
    MappedReads=(0, 1e7),
    FinalReads=(0, 1e7),
    MappingRate=(0.5, 1),
)

# Plate-level view is switched off by default.
plate_view = False
# Name of the metadata column holding the final read count.
final_reads_column = 'FinalReads'

## Load Cell Metadata

In [6]:
# Read the tab-separated summary table. It is parsed without a header row, so
# columns get integer labels; the first column becomes the row index.
total_cell_meta = pd.read_csv(
    cell_metadata_path,
    sep='\t',
    header=None,
    index_col=0,
)
# Drop the first 20 characters of every index label.
# NOTE(review): presumably this strips a fixed-width prefix so the labels match
# the cell ids used in the MCDS files — confirm the prefix length is always 20.
trimmed_cell_ids = total_cell_meta.index.map(lambda cell_id: cell_id[20:])
total_cell_meta.index = trimmed_cell_ids

In [7]:
# Keep only the rows whose value in column 3 exceeds this threshold.
# NOTE(review): the table is read without a header, so the meaning of column 3
# is not visible here — presumably a read count; confirm against the summary file.
min_column_3_value = 100000
# .copy() makes the filtered table an independent frame: later cells add columns
# to it, which on a bare boolean-mask slice triggers SettingWithCopyWarning.
total_cell_meta = total_cell_meta.loc[total_cell_meta[3] > min_column_3_value].copy()

In [8]:
# Directory that holds the per-chunk MCDS files.
mcds_dataset_dir = '/home/hanliu/project/mouse_rostral_brain/snm3C/dataset'
# Chunk labels in the exact order the files are listed (chunk 500 comes after
# chunk 5000); the order is kept as is because the files are later concatenated
# in list order.
mcds_chunk_labels = [
    0, 1000, 1500, 2000, 2500, 3000, 3500,
    4000, 4500, 5000, 500, 5500, 6000,
]
mcds_path_list = [
    '{}/snm3C_chunk_{}.mcds'.format(mcds_dataset_dir, chunk_label)
    for chunk_label in mcds_chunk_labels
]

In [None]:
# Open all chunk files as a single dataset, joining them end to end along the
# 'cell' dimension in the order given by mcds_path_list.
mcds = xr.open_mfdataset(
    mcds_path_list,
    concat_dim='cell',
    combine='nested',
)

In [11]:
# Build a smaller dataset holding only the two data variables used below.
kept_variable_names = ['geneslop2k_da', 'chrom100k_da']
kept_arrays = {}
for variable_name in kept_variable_names:
    kept_arrays[variable_name] = mcds[variable_name]
new_mcds = xr.Dataset(kept_arrays)

In [12]:
# Cells present both in the dataset's 'cell' index and in the filtered metadata
# table. Index.intersection is used instead of the `&` operator: using `&` as a
# set operation on pandas Index objects is deprecated and, in pandas 2.x, is an
# element-wise logical operation rather than a set intersection.
use_cells = new_mcds.get_index('cell').intersection(total_cell_meta.index)

In [13]:
# Write the dataset restricted to the shared cells to a netCDF file in the
# working directory. The file name is derived from the study_name parameter
# instead of a hard-coded 'snm3C', so a run for another study does not reuse
# the same output name; with the default study_name the path is unchanged.
clustering_mcds_path = '{}.for_clustering.mcds'.format(study_name)
new_mcds.sel(cell=use_cells).to_netcdf(clustering_mcds_path)

In [16]:
# For each methylation context, select its slice of the chrom100k array, sum
# over the 'chrom100k' dimension and drop any length-1 dimensions left over.
chrom100k_counts = new_mcds['chrom100k_da']
mch_data, mcg_data = (
    chrom100k_counts.sel(mc_type=context).sum('chrom100k').squeeze()
    for context in ('CHN', 'CGN')
)


In [20]:
# Per-cell rate = 'mc' count divided by 'cov' count, stored as a new metadata
# column. The result is converted to pandas first, so the assignment aligns on
# the index labels of total_cell_meta.
for rate_column, context_counts in (('mCH_rate', mch_data), ('mCG_rate', mcg_data)):
    mc_counts = context_counts.sel(count_type='mc')
    cov_counts = context_counts.sel(count_type='cov')
    total_cell_meta[rate_column] = (mc_counts / cov_counts).to_pandas()

In [21]:
# Save the metadata table, including the rate columns added above, as CSV in
# the working directory.
total_cell_meta.to_csv(path_or_buf='cell_meta.csv')