In [None]:
import pathlib

import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from ALLCools.integration import calculate_diagonal_score

from wmb import *

## Parameters

In [None]:
# `level` / `deep_level` name the integration directories ('../<level>/Neuron')
# and `category_key` / `deep_category_key` name the cluster resolution whose
# '<key>.overlap_score.hdf' files are read from those directories.
level, deep_level = 'L2', 'L4'
category_key, deep_category_key = 'L3', 'L4'

## Load Annotations

In [None]:
# Merged AnnData read from the L1/Neuron directory; it is only used below to
# inspect how many cells each modality contributes.
adata_merge = anndata.read_h5ad('../L1/Neuron/final_with_coords.h5ad')

In [None]:
# Number of cells per modality in the merged object.
adata_merge.obs['Modality'].value_counts()

In [None]:
# Fetch the mC cell annotation and keep only cells whose L1 annotation is
# NOT in the exclusion list below.
excluded_mc_l1 = ['ODC', 'OPC', 'ASC', 'MGC', 'CB', 'CBX', 'DG']
mc_annot = cemba.get_mc_annot()
mc_neurons = ~mc_annot['L1_annot'].isin(excluded_mc_l1)
mc_annot = mc_annot.sel(cell=mc_neurons)

In [None]:
# Fetch the ATAC cell annotation; it is subset to the integrated cells in the
# next cell.
atac_annot = cemba_atac.get_atac_annot()

In [None]:
# Mask of cells whose L2 annotation is not in the exclusion list.
# NOTE: this mask is computed but not applied in this cell (see below).
excluded_atac_l2 = [
    'VPIA', 'VLMC', 'MGL', 'PER', 'VEC', 'RGL', 'ASC', 'EPEN', 'BERG', 'OPC',
    'IOL', 'OGC', 'GRC', 'GRANGL'
]
atac_neurons = ~atac_annot['L2_annot'].isin(excluded_atac_l2)

# Because atac_annot is inconsistent, the cells are selected from the L1
# integration-group table instead of from the annotation mask above.
atac_inte_group = pd.read_csv('../L1/Neuron/atac_integration_group.csv.gz',
                              index_col=0)
atac_annot = atac_annot.sel(cell=atac_inte_group.index)

In [None]:
# Cluster labels present at `category_key` resolution in each modality
# (taken from the index of the per-cluster cell counts).
mc_cluster_counts = mc_annot[category_key].to_pandas().value_counts()
atac_cluster_counts = atac_annot[category_key].to_pandas().value_counts()
all_mc_cluster = mc_cluster_counts.index
all_atac_cluster = atac_cluster_counts.index

## Get L1 Confusion

In [None]:
def extend_confusion_matrix(matrix, mc_in_level, mc_out_level, atac_in_level,
                            atac_out_level):
    """Expand a confusion matrix from a coarse cluster level to a finer one.

    Every fine cluster receives the row/column of the coarse cluster it
    belongs to. The cluster memberships are read from the notebook-level
    ``mc_annot`` (matrix columns) and ``atac_annot`` (matrix rows).

    Parameters
    ----------
    matrix
        DataFrame with ATAC clusters of ``atac_in_level`` on the index and
        mC clusters of ``mc_in_level`` on the columns.
    mc_in_level, atac_in_level
        Annotation keys of the resolution ``matrix`` is currently at.
    mc_out_level, atac_out_level
        Annotation keys of the finer resolution to expand to.

    Returns
    -------
    DataFrame indexed by the fine ATAC clusters with the fine mC clusters as
    columns; fine clusters whose coarse cluster is absent from ``matrix``
    are left out.
    """

    def _fine_to_coarse(annot, in_level, out_level, coarse_labels):
        # Per-cell (fine -> coarse) records; to_dict() keeps one entry per
        # fine cluster.
        per_cell = pd.Series(annot[in_level], index=annot[out_level])
        lookup = {}
        for fine, coarse in per_cell.to_dict().items():
            # skip fine clusters whose coarse cluster is not in the matrix
            if coarse in coarse_labels:
                lookup[fine] = coarse
        return pd.Series(lookup)

    mc_fine_to_coarse = _fine_to_coarse(mc_annot, mc_in_level, mc_out_level,
                                        matrix.columns)
    atac_fine_to_coarse = _fine_to_coarse(atac_annot, atac_in_level,
                                          atac_out_level, matrix.index)

    # Repeat each coarse column/row once per fine cluster, then relabel the
    # axes with the fine cluster names.
    expanded = matrix.reindex(mc_fine_to_coarse.values, axis=1)
    expanded = expanded.reindex(atac_fine_to_coarse.values)
    expanded.columns = mc_fine_to_coarse.index
    expanded.index = atac_fine_to_coarse.index
    return expanded

In [None]:
# L1 integration overlap scores (L2 clusters), expanded to L4 clusters.
l1_level_kwargs = dict(mc_in_level='L2',
                       mc_out_level='L4',
                       atac_in_level='L2',
                       atac_out_level='L4')
l1_confusion_matrix = extend_confusion_matrix(
    pd.read_hdf('../L1/Neuron/L2.overlap_score.hdf'), **l1_level_kwargs)

In [None]:
# Persist the L1 confusion matrix at L4 cluster resolution.
# NOTE(review): the 'matrx' spelling in the file name is left untouched because
# other notebooks may read this file by name — confirm before renaming.
l1_confusion_matrix.to_hdf('L1_confusion_matrx.L4_clusters.hdf', key='data')

In [None]:
# The expanded matrix must not contain any missing value.
assert not l1_confusion_matrix.isna().values.any()

In [None]:
# Display the expanded L1 confusion matrix for a visual check.
l1_confusion_matrix

## Get Inte L2 Confusion

In [None]:
def get_inte_groups(path, annot, category):
    """Read a per-cell integration-group table and return a cluster-to-group map.

    Parameters
    ----------
    path
        Path of a ``[mc|atac]_integration_group.csv.gz`` file; the first
        column holds the cell ids and becomes the index.
    annot
        Annotation container; ``annot[category].to_pandas()`` is used to map
        each cell id to its cluster label.
    category
        Annotation key of the cluster resolution to report.

    Returns
    -------
    pd.Series
        Index: cluster label of ``category``; value: integration group.
    """
    # Squeeze the column axis only: a bare squeeze() turns a one-row table
    # into a scalar, which has no index to relabel below.
    groups = pd.read_csv(path, index_col=0).squeeze('columns')  # cell to group
    groups.index = groups.index.map(
        annot[category].to_pandas())  # category to group
    # to_dict() deduplicates: one entry per category (the last cell wins)
    groups = pd.Series(groups.to_dict())
    return groups  # each category to its integration group

In [None]:
# Preview the integration groups that have overlap scores for `category_key`.
# The directory layout is '../<level>/Neuron/InteGroup*/', matching the cell
# that loads the matrices.
inte_group_names = sorted(
    path.parent.name
    for path in pathlib.Path(f'../{level}/Neuron').glob(
        f'InteGroup*/{category_key}.overlap_score.hdf'))
inte_group_names

In [None]:
# Per integration group: the reordered overlap-score matrix and the
# cluster -> integration-group maps for its rows (ATAC) and columns (mC).
confusion_matrix_dict = {}
row_groups = {}
col_groups = {}

overlap_glob = f'InteGroup*/{category_key}.overlap_score.hdf'
for score_path in pathlib.Path(f'../{level}/Neuron').glob(overlap_glob):
    group_dir = score_path.parent
    group = group_dir.name

    # integration group
    # from leiden clustering on confusion matrix
    # may be manually merged in 07.ipynb
    row_group = get_inte_groups(group_dir / 'atac_integration_group.csv.gz',
                                annot=atac_annot,
                                category=category_key)
    col_group = get_inte_groups(group_dir / 'mc_integration_group.csv.gz',
                                annot=mc_annot,
                                category=category_key)
    row_groups[group] = row_group
    col_groups[group] = col_group

    # confusion matrix and its diagonal score (original note: mean score)
    scores = pd.read_hdf(score_path)
    diag_score = calculate_diagonal_score(scores,
                                          col_group=col_group,
                                          row_group=row_group)
    print(f'{group} diag score: {diag_score:.2f}')

    scores.index.name = f'atac.{category_key}'
    scores.columns.name = f'mC.{category_key}'
    # order rows and columns by integration group so matched clusters sit
    # next to each other
    row_order = row_group.sort_values().index
    col_order = col_group.sort_values().index
    confusion_matrix_dict[group] = scores.loc[row_order, col_order].copy()

# stack the per-group matrices row-wise
all_confusion = pd.concat(confusion_matrix_dict.values())

In [None]:
# Display the concatenated per-group confusion matrices.
all_confusion

In [None]:
# The other outputs of this notebook are named
# '<integration level>_confusion_matrx.<cluster level>_clusters.hdf'.
# `all_confusion` comes from the `level` integration at `category_key`
# resolution, so the name is built from those parameters instead of the
# previous 'L1_..._L3_clusters' literal.
# NOTE(review): update any reader that still expects the old file name.
all_confusion.to_hdf(f'{level}_confusion_matrx.{category_key}_clusters.hdf',
                     key='data')

In [None]:
# Expand the L3-resolution matrix to L4 clusters and persist it.
l2_level_kwargs = dict(mc_in_level='L3',
                       mc_out_level='L4',
                       atac_in_level='L3',
                       atac_out_level='L4')
all_confusion_l4 = extend_confusion_matrix(all_confusion, **l2_level_kwargs)
all_confusion_l4.to_hdf('L2_confusion_matrx.L4_clusters.hdf', key='data')

## Check Missing Clusters

In [None]:
# The matrix axes must be as long as the annotated cluster lists ...
assert len(all_mc_cluster) == len(all_confusion.columns)
assert len(all_atac_cluster) == len(all_confusion.index)

# ... and no cluster label may occur twice on either axis.
assert all_confusion.index.is_unique
assert all_confusion.columns.is_unique

## Get L3/4 Integration

### Get atac L4 to Inte L2

In [None]:
# Prefix each ATAC cluster's integration-group id with the name of the
# integration directory it came from, then merge all groups into one Series.
l3_group_names = [
    inte_group + '_' + cluster_groups.astype(str)
    for inte_group, cluster_groups in row_groups.items()
]
atac_l3_to_inte_l2 = pd.concat(l3_group_names)

In [None]:
# Per-cell L3 label indexed by the cell's L4 label; to_dict() collapses it to
# one L3 entry per L4 cluster.
atac_l3_by_l4_cells = pd.DataFrame(atac_annot['L3'], index=atac_annot['L4'])
atac_l4_to_atac_l3 = pd.Series(atac_l3_by_l4_cells.squeeze().to_dict())
# L4 cluster -> L3 cluster -> integration group
atac_l4_to_inte_l2 = atac_l4_to_atac_l3.map(atac_l3_to_inte_l2)

In [None]:
# Display the ATAC L4 cluster -> integration group map.
atac_l4_to_inte_l2

### Get mC L4 to Inte L2

In [None]:
# Prefix each mC cluster's integration-group id with the name of the
# integration directory it came from, then merge all groups into one Series.
l3_group_names = [
    inte_group + '_' + cluster_groups.astype(str)
    for inte_group, cluster_groups in col_groups.items()
]
mc_l3_to_inte_l2 = pd.concat(l3_group_names)

In [None]:
# Per-cell L3 label indexed by the cell's L4 label; to_dict() collapses it to
# one L3 entry per L4 cluster.
mc_l3_by_l4_cells = pd.DataFrame(mc_annot['L3'], index=mc_annot['L4'])
mc_l4_to_mc_l3 = pd.Series(mc_l3_by_l4_cells.squeeze().to_dict())
# L4 cluster -> L3 cluster -> integration group
mc_l4_to_inte_l2 = mc_l4_to_mc_l3.map(mc_l3_to_inte_l2)

In [None]:
# Display the mC L4 cluster -> integration group map.
mc_l4_to_inte_l2

## Get Inte L4 Confusion

In [None]:
# Per deep-level integration group: the reordered overlap-score matrix and the
# cluster -> integration-group maps for its rows (ATAC) and columns (mC).
l4_confusion_matrix_dict = {}
l4_row_groups = {}
l4_col_groups = {}

for path in pathlib.Path(f'../{deep_level}/Neuron').glob(
        f'InteGroup*/{deep_category_key}.overlap_score.hdf'):
    group = path.parent.name

    # integration group
    # from leiden clustering on confusion matrix
    # may be manually merged in 07.ipynb
    row_group = get_inte_groups(path.parent / 'atac_integration_group.csv.gz',
                                annot=atac_annot,
                                category=deep_category_key)
    col_group = get_inte_groups(path.parent / 'mc_integration_group.csv.gz',
                                annot=mc_annot,
                                category=deep_category_key)
    l4_row_groups[group] = row_group
    l4_col_groups[group] = col_group

    # confusion matrix
    df = pd.read_hdf(path)
    diag_score = calculate_diagonal_score(df,
                                          col_group=col_group,
                                          row_group=row_group)
    print(f'{group} diag score: {diag_score:.2f}')

    # Name the axes after the resolution loaded here (`deep_category_key`);
    # the previous code reused `category_key` and mislabeled them.
    df.index.name = f'atac.{deep_category_key}'
    df.columns.name = f'mC.{deep_category_key}'
    # reorder row and col based on inte groups
    df = df.loc[row_group.sort_values().index,
                col_group.sort_values().index].copy()
    l4_confusion_matrix_dict[group] = df

## Create L4 Patch on L2 Confusion Matrix

In [None]:
# Work on a copy so that all_confusion_l4 itself is not modified when the
# patches are written in the next cell.
l2_with_l4_patch = all_confusion_l4.copy()

In [None]:
# Overwrite the sub-block covered by each deep-level matrix with its scores.
for patch in l4_confusion_matrix_dict.values():
    patch_rows, patch_cols = patch.index, patch.columns
    l2_with_l4_patch.loc[patch_rows, patch_cols] = patch

In [None]:
# Persist the patched matrix (L4 cluster resolution).
l2_with_l4_patch.to_hdf('L4_confusion_matrx.L4_clusters.hdf', key='data')

## Deal with Cluster Match

In [None]:
# Prefix each ATAC cluster's deep-level group id with the name of the
# integration directory it came from, then merge all groups into one Series.
l4_group_names = [
    inte_group + '_' + cluster_groups.astype(str)
    for inte_group, cluster_groups in l4_row_groups.items()
]
atac_l4_to_inte_l4 = pd.concat(l4_group_names)

In [None]:
# Prefix each mC cluster's deep-level group id with the name of the
# integration directory it came from, then merge all groups into one Series.
l4_group_names = [
    inte_group + '_' + cluster_groups.astype(str)
    for inte_group, cluster_groups in l4_col_groups.items()
]
mc_l4_to_inte_l4 = pd.concat(l4_group_names)

## Final Cluster Map

In [None]:
# Final group per ATAC L4 cluster: the deep-level group when the cluster has
# one, otherwise the group from the shallower integration.
atac_final_by_cluster = {}
for atac_cluster, l2_inte_group in atac_l4_to_inte_l2.items():
    final_group = l2_inte_group
    if atac_cluster in atac_l4_to_inte_l4.index:
        final_group = atac_l4_to_inte_l4[atac_cluster]
        # the deep-level name must extend the shallower group's name
        assert final_group.startswith(l2_inte_group)
    atac_final_by_cluster[atac_cluster] = final_group
atac_l4_to_final_group = pd.Series(atac_final_by_cluster)

In [None]:
# Write the ATAC L4 cluster -> final integration group map.
atac_l4_to_final_group.to_csv('atac_l4_to_integration_group.csv')

In [None]:
# Final group per mC L4 cluster: the deep-level group when the cluster has
# one, otherwise the group from the shallower integration.
mc_final_by_cluster = {}
for mc_cluster, l2_inte_group in mc_l4_to_inte_l2.items():
    final_group = l2_inte_group
    if mc_cluster in mc_l4_to_inte_l4.index:
        final_group = mc_l4_to_inte_l4[mc_cluster]
        # the deep-level name must extend the shallower group's name
        assert final_group.startswith(l2_inte_group)
    mc_final_by_cluster[mc_cluster] = final_group
mc_l4_to_final_group = pd.Series(mc_final_by_cluster)

In [None]:
# Write the mC L4 cluster -> final integration group map.
mc_l4_to_final_group.to_csv('mc_l4_to_integration_group.csv')

In [None]:
# Display the final ATAC cluster -> integration group map.
atac_l4_to_final_group

In [None]:
# Display the final mC cluster -> integration group map.
mc_l4_to_final_group