# [PMC6620049](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6620049/)

> [GSE127465](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE127465)

|sp|tissue|status|
|:-:|:-:|:-:|
|h|blood|tumor|
|h|liver|tumor|
|m|liver|tumor|
|m|liver|healthy|


+ immue cell 大类群
+ immue cell 亚类
    + Neutrophil
    + Dendritic Cells
    + Monocyte

In [1]:
import sys
from pathlib import Path
p_link = Path("/public/workspace/licanchengup/link")
p_publish = p_link.joinpath("res_publish")
None if str(p_publish) in sys.path else sys.path.append(str(p_publish))

In [None]:
from func import *

In [None]:
p_root = p_cache.joinpath('disease','LungCancer_GSE127465')

def limite_func(adata, key, value):
    adata = adata.copy()
    adata = adata[adata.obs[key].str.match(value)]
    return adata


map_limite_func = {

    "allImm": lambda adata: adata,
    
    "DendriticCells": lambda adata: limite_func(
        adata, key="sub_cell_type", value="p?(Mono)?DC\\d?"
    ),

    "Monocyte": lambda adata: limite_func(
        adata, key="sub_cell_type", value="Mono\\d"
    ),
    "Neutrophils": lambda adata: limite_func(
        adata, key="cell_type", value="^Neutrophils$"
    ),
}

  from .autonotebook import tqdm as notebook_tqdm
2024-03-21 20:43:08.036023: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
df_path = pd.DataFrame({
    'path': p_root.joinpath('GSE127465_RAW').iterdir()
})
df_path['name'] = df_path['path'].apply(lambda x: x.name)
df_path = df_path[df_path['path'].apply(lambda x: x.match('*raw_counts*'))]
df_path = df_path.join(df_path['name'].str.extract(
    '^(?P<gsm_id>GSM\\d+)_(?P<sp>[^_]+)_(?P<sample>.+)_raw_counts.tsv.gz'))
display(df_path.head(2), df_path.shape)

Unnamed: 0,path,name,gsm_id,sp,sample
0,/public/workspace/licanchengup/link/res_publis...,GSM3635310_mouse_t_1_2_raw_counts.tsv.gz,GSM3635310,mouse,t_1_2
1,/public/workspace/licanchengup/link/res_publis...,GSM3635304_mouse_h_1_1_raw_counts.tsv.gz,GSM3635304,mouse,h_1_1


(40, 5)

In [4]:
html_table = pd.read_csv(
    'GSE127465_html_table.csv',
    index_col=None,
    header=None,
    names=[
        'gsm_id',
        'tag'])
html_table = html_table.join(html_table['tag'].str.extract(
    '^(?P<sample_fullname>\\w+) \\[(?P<sp>[^_]+)_(?P<sample>\\w+)]'))
display(html_table.head(2), html_table.shape)
[print('{}\t{} is unique'.format(html_table[_].is_unique, _))
 for _ in html_table.columns]

df_path = df_path.merge(html_table.loc[:, 'gsm_id,sample_fullname'.split(
    ',')], on='gsm_id').sort_values(['sp', 'sample'])

Unnamed: 0,gsm_id,tag,sample_fullname,sp,sample
0,GSM3635278,patient_1_tumor [human_p1t1],patient_1_tumor,human,p1t1
1,GSM3635279,patient_1_tumor [human_p1t2],patient_1_tumor,human,p1t2


(40, 5)

True	gsm_id is unique
True	tag is unique
False	sample_fullname is unique
False	sp is unique
True	sample is unique


In [5]:
df_path['tissue'] = 'lung'
df_path['tissue'] = df_path['tissue'].mask(
    df_path['sample_fullname'].str.contains('blood'), 'blood')
df_path['status'] = ''
df_path['status'] = df_path['status'].mask(
    df_path['sample_fullname'].str.contains('healthy'), 'healthy')
df_path['status'] = df_path['status'].mask(
    df_path['sample_fullname'].str.contains('patient'), 'tumor')
df_path['status'] = df_path['status'].mask(
    df_path['sample_fullname'].str.contains('tumor'), 'tumor')
df_path['individual'] = df_path['sample_fullname']\
    .str.extract('([mousepaint]+_\\d)', expand=False)\
    .str.replace('patient', 'human', regex=False)\
    .apply(lambda x: '{}{}'.format(x[0], x[-1]))
df_path['repeat'] = df_path['sample'].str.extract('(\\d)$', expand=False)
df_path['_batch'] = df_path.apply(lambda row: '{}{}{}_{}'.format(
    row['individual'], row['status'][0], row['tissue'][0], row['repeat']
), axis=1)

In [6]:
df_path.loc[:, '_batch,sp,individual,status,tissue,repeat'.split(
    ',')].head(2)

Unnamed: 0,_batch,sp,individual,status,tissue,repeat
26,h1tb_1,human,h1,tumor,blood,1
2,h1tb_2,human,h1,tumor,blood,2


# human

In [7]:
df_meta_h = pd.read_csv(
    p_root.joinpath('GSE127465_human_cell_metadata_54773x25.tsv.gz'),
    sep='\t',
    index_col=None)

print(*df_meta_h.columns, sep='\n')
df_meta_h = df_meta_h.loc[:,
                          'Barcode,Library,Patient,Major cell type,Minor subset,used_in_NSCLC_immune,x_NSCLC_immune,y_NSCLC_immune'.split(',')]
df_meta_h = pd.merge(
    df_path.loc[:, 'sample,_batch,sp,individual,status,tissue,repeat'.split(',')],
    df_meta_h,
    left_on='sample', right_on='Library'
)

_temp = df_meta_h.apply(
    lambda row: '{Barcode};{Library}'.format(
        **row), axis=1)
assert _temp.is_unique, '[Error] not unique'
df_meta_h.index = _temp.to_numpy()
del _temp
display(df_meta_h.head(2), df_meta_h.shape)

df_path_h = df_path.query("sp == 'human'")

Patient
Tissue
Barcoding emulsion
Library
Barcode
Total counts
Percent counts from mitochondrial genes
Most likely LM22 cell type
Major cell type
Minor subset
used_in_NSCLC_all_cells
x_NSCLC_all_cells
y_NSCLC_all_cells
used_in_NSCLC_and_blood_immune
x_NSCLC_and_blood_immune
y_NSCLC_and_blood_immune
used_in_NSCLC_immune
x_NSCLC_immune
y_NSCLC_immune
used_in_NSCLC_non_immune
x_NSCLC_non_immune
y_NSCLC_non_immune
used_in_blood
x_blood
y_blood


Unnamed: 0,sample,_batch,sp,individual,status,tissue,repeat,Barcode,Library,Patient,Major cell type,Minor subset,used_in_NSCLC_immune,x_NSCLC_immune,y_NSCLC_immune
bcCSBJ;p1b1,p1b1,h1tb_1,human,h1,tumor,blood,1,bcCSBJ,p1b1,p1,bT cells,bCD4T1,False,,
bcAVUP;p1b1,p1b1,h1tb_1,human,h1,tumor,blood,1,bcAVUP,p1b1,p1,bMonocytes,bMono2,False,,


(54773, 15)

In [8]:
df_path_h.index = np.arange(df_path_h.shape[0])
p_temp_h = p_cache.joinpath('disease', 'temp_LC_h.h5ad')
adata_h = None
if p_temp_h.exists():
    adata_h = sc.read_h5ad(p_temp_h)
else:
    res_h = {}
    for _i, _row in df_path_h.iterrows():
        print('\r[{}/{}]{}'.format(_i+1,
                                   df_path_h.shape[0],
                                   _row['sample']).ljust(75, '-'), end='')
        _adata = None
        _adata = sc.read_csv(_row['path'], delimiter='\t')
        res_h.update({
            _row['sample']: _adata
        })
    print('sample\tobs_unique\tvar_unique')
    for k, v in res_h.items():
        print(
            '{}\t{}\t{}'.format(
                k,
                v.obs.index.is_unique,
                v.var.index.is_unique))

    adata_h = sc.concat(res_h, index_unique=';')
    adata_h.X = csr_matrix(adata_h.X)
    display(type(adata_h.X))
    del res_h
    display(
        adata_h.obs.head(2),
        adata_h.obs.shape,
        adata_h.obs.index.is_unique,
        adata_h.var.head(2),
        adata_h.var.shape,
        adata_h.var.index.is_unique)

    display(
        pd.Series(
            df_meta_h.index.isin(
                adata_h.obs.index)).value_counts(), pd.Series(
            adata_h.obs.index.isin(
                df_meta_h.index)).value_counts())
    adata_h = adata_h[adata_h.obs.index.isin(df_meta_h.index)]
    adata_h.obs = adata_h.obs.loc[:, []].join(df_meta_h)

    adata_h.write_h5ad(p_temp_h)
    print('[out] {}'.format(p_temp_h.name))
adata_h.obs = adata_h.obs.apply(lambda x: x.astype(str) if x.dtype.name == 'category' else x)
display(adata_h, adata_h.obs.head(2))

AnnData object with n_obs × n_vars = 54773 × 41861
    obs: 'sample', '_batch', 'sp', 'individual', 'status', 'tissue', 'repeat', 'Barcode', 'Library', 'Patient', 'Major cell type', 'Minor subset', 'used_in_NSCLC_immune', 'x_NSCLC_immune', 'y_NSCLC_immune'

Unnamed: 0,sample,_batch,sp,individual,status,tissue,repeat,Barcode,Library,Patient,Major cell type,Minor subset,used_in_NSCLC_immune,x_NSCLC_immune,y_NSCLC_immune
bcCSBJ;p1b1,p1b1,h1tb_1,human,h1,tumor,blood,1,bcCSBJ,p1b1,p1,bT cells,bCD4T1,False,,
bcAVUP;p1b1,p1b1,h1tb_1,human,h1,tumor,blood,1,bcAVUP,p1b1,p1,bMonocytes,bMono2,False,,


In [9]:
adata_h.obs['barcode'] = adata_h.obs.index.str.extract(
    '(\\w+);', expand=False)
_temp = adata_h.obs.apply(
    lambda row: '{barcode};{_batch}'.format(
        **row), axis=1)
assert _temp.is_unique, '[Error] not unique'
adata_h.obs.index = _temp
display(adata_h.obs.head(2), adata_h.obs.shape)
del _temp

Unnamed: 0,sample,_batch,sp,individual,status,tissue,repeat,Barcode,Library,Patient,Major cell type,Minor subset,used_in_NSCLC_immune,x_NSCLC_immune,y_NSCLC_immune,barcode
bcCSBJ;h1tb_1,p1b1,h1tb_1,human,h1,tumor,blood,1,bcCSBJ,p1b1,p1,bT cells,bCD4T1,False,,,bcCSBJ
bcAVUP;h1tb_1,p1b1,h1tb_1,human,h1,tumor,blood,1,bcAVUP,p1b1,p1,bMonocytes,bMono2,False,,,bcAVUP


(54773, 16)

## human blood and lung cancer

In [10]:
adata_hb = adata_h[adata_h.obs['tissue'] == 'blood', :]
adata_hl = adata_h[adata_h.obs.index.isin(
    adata_h.obs.query("used_in_NSCLC_immune & tissue == 'lung'").index), :]
display(adata_hb, adata_hl)

View of AnnData object with n_obs × n_vars = 14411 × 41861
    obs: 'sample', '_batch', 'sp', 'individual', 'status', 'tissue', 'repeat', 'Barcode', 'Library', 'Patient', 'Major cell type', 'Minor subset', 'used_in_NSCLC_immune', 'x_NSCLC_immune', 'y_NSCLC_immune', 'barcode'

View of AnnData object with n_obs × n_vars = 34558 × 41861
    obs: 'sample', '_batch', 'sp', 'individual', 'status', 'tissue', 'repeat', 'Barcode', 'Library', 'Patient', 'Major cell type', 'Minor subset', 'used_in_NSCLC_immune', 'x_NSCLC_immune', 'y_NSCLC_immune', 'barcode'

In [11]:
display(adata_hl.obs['Major cell type'].unique())
adata_hl.obs['cell_type'] = adata_hl.obs['Major cell type'].str.replace(
    '^t', '', regex=True)
display(adata_hl.obs['cell_type'].unique())
display(
    group_agg(
        adata_hl.obs, [
            'Major cell type', 'cell_type'], {
                'cell_type': ['count']}))
assert not adata_hl.obs['cell_type'].isna().any(), '[Error] nan'

display(adata_hl.obs['Minor subset'].unique())
adata_hl.obs['sub_cell_type'] = adata_hl.obs['Minor subset'].str.replace(
    '^t', '', regex=True)
display(adata_hl.obs['sub_cell_type'].unique())
display(
    group_agg(
        adata_hl.obs, [
            'Minor subset', 'sub_cell_type'], {
                'sub_cell_type': ['count']}))
assert not adata_hl.obs['sub_cell_type'].isna().any(), '[Error] nan'

array(['tMoMacDC', 'tMast cells', 'tT cells', 'tNeutrophils', 'tNK cells',
       'tPlasma cells', 'tB cells', 'tpDC', 'tRBC'], dtype=object)

  adata_hl.obs['cell_type'] = adata_hl.obs['Major cell type'].str.replace(


array(['MoMacDC', 'Mast cells', 'T cells', 'Neutrophils', 'NK cells',
       'Plasma cells', 'B cells', 'pDC', 'RBC'], dtype=object)

Unnamed: 0,Major cell type,cell_type,cell_type_count
0,tB cells,B cells,5218
1,tMast cells,Mast cells,732
2,tMoMacDC,MoMacDC,9372
3,tNK cells,NK cells,1116
4,tNeutrophils,Neutrophils,2728
5,tPlasma cells,Plasma cells,2197
6,tRBC,RBC,108
7,tT cells,T cells,12776
8,tpDC,pDC,311


array(['tMac2', 'tMono1', 'tMast1', 'tMac6', 'tMacCycl', 'tT2', 'tT4',
       'tT1', 'tN5', 'tT3', 'tNK1', 'tMonoDC', 'tPC6', 'tDC2', 'tMac7',
       'tMac3', 'tMac1', 'tMac4', 'tMac8', 'tN3', 'tMono2', 'tT6', 'tN4',
       'tPC3', 'tMac9', 'tPC1', 'tB', 'tpDC', 'tMono3', 'tDC3', 'tN2',
       'tMac5', 'tPC4', 'tT7', 'tDC1', 'tN1', 'tT5', 'tNK2', 'tMast2',
       'tPC2', 'tPC5', 'tRBC'], dtype=object)

array(['Mac2', 'Mono1', 'Mast1', 'Mac6', 'MacCycl', 'T2', 'T4', 'T1',
       'N5', 'T3', 'NK1', 'MonoDC', 'PC6', 'DC2', 'Mac7', 'Mac3', 'Mac1',
       'Mac4', 'Mac8', 'N3', 'Mono2', 'T6', 'N4', 'PC3', 'Mac9', 'PC1',
       'B', 'pDC', 'Mono3', 'DC3', 'N2', 'Mac5', 'PC4', 'T7', 'DC1', 'N1',
       'T5', 'NK2', 'Mast2', 'PC2', 'PC5', 'RBC'], dtype=object)

Unnamed: 0,Minor subset,sub_cell_type,sub_cell_type_count
0,tB,B,5218
1,tDC1,DC1,171
2,tDC2,DC2,330
3,tDC3,DC3,101
4,tMac1,Mac1,949
5,tMac2,Mac2,919
6,tMac3,Mac3,2554
7,tMac4,Mac4,418
8,tMac5,Mac5,165
9,tMac6,Mac6,284


In [12]:
adata_hl.obs = adata_hl.obs.loc[:,
                                '_batch,cell_type,sub_cell_type,sp,individual,status,tissue,repeat,sample,Major cell type,Minor subset,used_in_NSCLC_immune,x_NSCLC_immune,y_NSCLC_immune'.split(',')]
display(adata_hl.obs.head(2), adata_hl.obs.shape)

Unnamed: 0,_batch,cell_type,sub_cell_type,sp,individual,status,tissue,repeat,sample,Major cell type,Minor subset,used_in_NSCLC_immune,x_NSCLC_immune,y_NSCLC_immune
bcHTNA;h1tl_1,h1tl_1,MoMacDC,Mac2,human,h1,tumor,lung,1,p1t1,tMoMacDC,tMac2,True,1654.548012,-1145.346928
bcHNVA;h1tl_1,h1tl_1,MoMacDC,Mac2,human,h1,tumor,lung,1,p1t1,tMoMacDC,tMac2,True,1651.629433,-1211.471793


(34558, 14)

In [13]:
for k, v in map_limite_func.items():
    print(k.ljust(75, '-'))
    _adata = v(adata_hl)
    h5ad_to_mtx(_adata, p_cache.joinpath('disease', 'LC_h_{}'.format(k)))
    display(_adata.obs['cell_type' if k == 'allImm' else 'sub_cell_type'].value_counts())
print('\n[finish]\n'.center(100, '-'))

allImm---------------------------------------------------------------------
frist 10 data.X nonzero elements:
 [[2 1 1 1 1 1 1 1 1 1]]
[out] /public/workspace/licanchengup/link/res_publish/run/cache/disease/LC_h_allImm


cell_type
T cells         12776
MoMacDC          9372
B cells          5218
Neutrophils      2728
Plasma cells     2197
NK cells         1116
Mast cells        732
pDC               311
RBC               108
Name: count, dtype: int64

DendriticCells-------------------------------------------------------------


  adata.var["gene_names"] = adata.var_names.to_numpy()


frist 10 data.X nonzero elements:
 [[1 2 1 3 1 6 1 1 1 1]]
[out] /public/workspace/licanchengup/link/res_publish/run/cache/disease/LC_h_DendriticCells


sub_cell_type
MonoDC    1097
DC2        330
pDC        311
DC1        171
DC3        101
Name: count, dtype: int64

Monocyte-------------------------------------------------------------------


  adata.var["gene_names"] = adata.var_names.to_numpy()


frist 10 data.X nonzero elements:
 [[1 1 1 1 1 1 3 1 1 1]]
[out] /public/workspace/licanchengup/link/res_publish/run/cache/disease/LC_h_Monocyte


sub_cell_type
Mono1    698
Mono2    252
Mono3    226
Name: count, dtype: int64

Neutrophils----------------------------------------------------------------
frist 10 data.X nonzero elements:
 [[1 1 1 1 1 1 1 2 1 1]]
[out] /public/workspace/licanchengup/link/res_publish/run/cache/disease/LC_h_Neutrophils


  adata.var["gene_names"] = adata.var_names.to_numpy()


sub_cell_type
N5    902
N4    789
N1    526
N3    425
N2     86
Name: count, dtype: int64

---------------------------------------------
[finish]
---------------------------------------------


In [14]:
del adata_h,adata_hb,adata_hl

# mouse

In [15]:
df_meta_m = pd.read_csv(
    p_root.joinpath('GSE127465_mouse_cell_metadata_15939x12.tsv.gz'),
    sep='\t',
    index_col=None)
print(*df_meta_m.columns, sep='\n')
df_meta_m = df_meta_m.loc[:,
                          'Barcode,Library,Major cell type,Minor subset,x,y'.split(',')]
df_meta_m = pd.merge(
    df_path.loc[:, 'sample,_batch,sp,individual,status,tissue,repeat'.split(',')],
    df_meta_m,
    left_on='sample', right_on='Library'
)

_temp = df_meta_m.apply(
    lambda row: '{Barcode};{Library}'.format(
        **row), axis=1)
assert _temp.is_unique, '[Error] not unique'
df_meta_m.index = _temp.to_numpy()
del _temp
display(df_meta_m.head(2), df_meta_m.shape)

df_path_m = df_path.query("sp == 'mouse'")

Tumor or healthy
Biological replicate
Library
Barcode
Library prep batch
Total counts
Percent counts from mitochondrial genes
Most likely Immgen cell type
Major cell type
Minor subset
x
y


Unnamed: 0,sample,_batch,sp,individual,status,tissue,repeat,Barcode,Library,Major cell type,Minor subset,x,y
bc0001;h_1_1,h_1_1,m1hl_1,mouse,m1,healthy,lung,1,bc0001,h_1_1,Neutrophils,N1,262.182566,-776.218781
bc0002;h_1_1,h_1_1,m1hl_1,mouse,m1,healthy,lung,1,bc0002,h_1_1,Neutrophils,N1,141.317902,-766.314486


(15939, 13)

In [16]:
df_path_m.index = np.arange(df_path_m.shape[0])
p_temp_m = p_cache.joinpath('disease', 'temp_LC_m.h5ad')
adata_m = None
if p_temp_m.exists():
    adata_m = sc.read_h5ad(p_temp_m)
else:
    res_m = {}
    for _i, _row in df_path_m.iterrows():
        print('\r[{}/{}]{}'.format(_i+1,
                                   df_path_m.shape[0],
                                   _row['sample']).ljust(75, '-'), end='')
        _adata = None
        _adata = sc.read_csv(_row['path'], delimiter='\t')
        res_m.update({
            _row['sample']: _adata
        })
    print('sample\tobs_unique\tvar_unique')
    for k, v in res_m.items():
        print(
            '{}\t{}\t{}'.format(
                k,
                v.obs.index.is_unique,
                v.var.index.is_unique))
    adata_m = sc.concat(res_m, index_unique=';')
    adata_m.X = csr_matrix(adata_m.X)
    display(type(adata_m.X))
    del res_m
    display(
        adata_m.obs.head(2),
        adata_m.obs.shape,
        adata_m.obs.index.is_unique,
        adata_m.var.head(2),
        adata_m.var.shape,
        adata_m.var.index.is_unique)
    
    display(pd.Series(df_meta_m.index.isin(adata_m.obs.index)).value_counts(),
            pd.Series(adata_m.obs.index.isin(df_meta_m.index)).value_counts())
    adata_m = adata_m[adata_m.obs.index.isin(df_meta_m.index)]
    adata_m.obs = adata_m.obs.loc[:, []].join(df_meta_m)
    adata_m.write_h5ad(p_temp_m)
    print('[out] {}'.format(p_temp_h.name))
adata_m.obs = adata_m.obs.apply(lambda x: x.astype(str) if x.dtype.name == 'category' else x)
display(adata_m,adata_m.obs.head(2))

AnnData object with n_obs × n_vars = 15939 × 28205
    obs: 'sample', '_batch', 'sp', 'individual', 'status', 'tissue', 'repeat', 'Barcode', 'Library', 'Major cell type', 'Minor subset', 'x', 'y'

Unnamed: 0,sample,_batch,sp,individual,status,tissue,repeat,Barcode,Library,Major cell type,Minor subset,x,y
bc0001;h_1_1,h_1_1,m1hl_1,mouse,m1,healthy,lung,1,bc0001,h_1_1,Neutrophils,N1,262.182566,-776.218781
bc0002;h_1_1,h_1_1,m1hl_1,mouse,m1,healthy,lung,1,bc0002,h_1_1,Neutrophils,N1,141.317902,-766.314486


In [17]:
adata_m.obs['barcode'] = adata_m.obs.index.str.extract(
    '(\\w+);', expand=False)
_temp = adata_m.obs.apply(
    lambda row: '{barcode};{_batch}'.format(
        **row), axis=1)
assert _temp.is_unique, '[Error] not unique'
adata_m.obs.index = _temp
display(adata_m.obs.head(2), adata_m.obs.shape)
del _temp

Unnamed: 0,sample,_batch,sp,individual,status,tissue,repeat,Barcode,Library,Major cell type,Minor subset,x,y,barcode
bc0001;m1hl_1,h_1_1,m1hl_1,mouse,m1,healthy,lung,1,bc0001,h_1_1,Neutrophils,N1,262.182566,-776.218781,bc0001
bc0002;m1hl_1,h_1_1,m1hl_1,mouse,m1,healthy,lung,1,bc0002,h_1_1,Neutrophils,N1,141.317902,-766.314486,bc0002


(15939, 14)

In [18]:
adata_m.obs['cell_type'] = adata_m.obs['Major cell type']
adata_m.obs['sub_cell_type'] = adata_m.obs['Minor subset']
adata_m.obs = adata_m.obs.loc[:,
                                '_batch,cell_type,sub_cell_type,sp,individual,status,tissue,repeat,sample,Major cell type,Minor subset,x,y'.split(',')]
display(adata_m.obs.head(2), adata_m.obs.shape)

Unnamed: 0,_batch,cell_type,sub_cell_type,sp,individual,status,tissue,repeat,sample,Major cell type,Minor subset,x,y
bc0001;m1hl_1,m1hl_1,Neutrophils,N1,mouse,m1,healthy,lung,1,h_1_1,Neutrophils,N1,262.182566,-776.218781
bc0002;m1hl_1,m1hl_1,Neutrophils,N1,mouse,m1,healthy,lung,1,h_1_1,Neutrophils,N1,141.317902,-766.314486


(15939, 13)

## mouse heathly and lung canner

In [19]:
display(adata_m.obs['status'].value_counts())
adata_mt = adata_m[adata_m.obs['status'] == 'tumor', :]
adata_mh = adata_m[adata_m.obs['status'] == 'healthy', :]
display(adata_mt,adata_mh)

status
tumor      9201
healthy    6738
Name: count, dtype: int64

View of AnnData object with n_obs × n_vars = 9201 × 28205
    obs: '_batch', 'cell_type', 'sub_cell_type', 'sp', 'individual', 'status', 'tissue', 'repeat', 'sample', 'Major cell type', 'Minor subset', 'x', 'y'

View of AnnData object with n_obs × n_vars = 6738 × 28205
    obs: '_batch', 'cell_type', 'sub_cell_type', 'sp', 'individual', 'status', 'tissue', 'repeat', 'sample', 'Major cell type', 'Minor subset', 'x', 'y'

In [20]:
for k, v in map_limite_func.items():
    print(k.ljust(75, '-'))
    _adata = v(adata_mt)
    h5ad_to_mtx(_adata, p_cache.joinpath('disease', 'LC_m_{}'.format(k)))
    display(_adata.obs.shape,_adata.obs['cell_type' if k == 'allImm' else 'sub_cell_type'].value_counts())
print('\n[finish]\n'.center(100, '-'))

allImm---------------------------------------------------------------------


  adata.var["gene_names"] = adata.var_names.to_numpy()


frist 10 data.X nonzero elements:
 [[1 1 1 2 1 1 1 2 1 3]]
[out] /public/workspace/licanchengup/link/res_publish/run/cache/disease/LC_m_allImm


(9201, 13)

cell_type
Neutrophils    3593
B cells        1874
MoMacDC        1768
T cells        1491
NK cells        400
pDC              52
Basophils        23
Name: count, dtype: int64

DendriticCells-------------------------------------------------------------
frist 10 data.X nonzero elements:
 [[1 1 1 1 1 5 2 2 1 1]]
[out] /public/workspace/licanchengup/link/res_publish/run/cache/disease/LC_m_DendriticCells


  adata.var["gene_names"] = adata.var_names.to_numpy()


(646, 13)

sub_cell_type
DC3       293
DC1       119
DC2       106
MonoDC     76
pDC        52
Name: count, dtype: int64

Monocyte-------------------------------------------------------------------


  adata.var["gene_names"] = adata.var_names.to_numpy()


frist 10 data.X nonzero elements:
 [[1 5 1 1 1 1 1 2 1 2]]
[out] /public/workspace/licanchengup/link/res_publish/run/cache/disease/LC_m_Monocyte


(771, 13)

sub_cell_type
Mono1    407
Mono3    210
Mono2    154
Name: count, dtype: int64

Neutrophils----------------------------------------------------------------


  adata.var["gene_names"] = adata.var_names.to_numpy()


frist 10 data.X nonzero elements:
 [[1 1 1 2 1 3 1 1 1 1]]
[out] /public/workspace/licanchengup/link/res_publish/run/cache/disease/LC_m_Neutrophils


(3593, 13)

sub_cell_type
N4    1782
N1     968
N5     580
N3     130
N6     109
N2      24
Name: count, dtype: int64

---------------------------------------------
[finish]
---------------------------------------------


In [21]:
del adata_m,adata_mh,adata_mt,_adata

# info and parameters

In [22]:
info = pd.DataFrame({
    'path': p_cache.joinpath('disease').iterdir()
})
info['name'] = info['path'].apply(lambda x: x.name)
info = info[info['path'].apply(lambda x: x.match('*/LC_*'))]
info = info.join(info['name'].str.extract(
    "LC_(?P<sp_simple>[hm])_(?P<tag>\\w+)"))
info['name'] = info.apply(lambda row: 'LC{}{}'.format(row['sp_simple'],
                                       {
    'allImm': 'all',
    'Monocyte': 'Mono',
    'Neutrophils': 'Neu',
    'DendriticCells': 'DCs'
}.setdefault(row['tag'], '')), axis=1)
info['sp'] = info['sp_simple'].map(map_sp)
info['tissue'] = 'LC'
info= info.loc[:,'tissue,sp,path,name,sp_simple,tag'.split(',')]
info

Unnamed: 0,tissue,sp,path,name,sp_simple,tag
0,LC,mouse,/public/workspace/licanchengup/link/res_publis...,LCmDCs,m,DendriticCells
1,LC,mouse,/public/workspace/licanchengup/link/res_publis...,LCmMono,m,Monocyte
2,LC,human,/public/workspace/licanchengup/link/res_publis...,LChDCs,h,DendriticCells
4,LC,human,/public/workspace/licanchengup/link/res_publis...,LChNeu,h,Neutrophils
6,LC,human,/public/workspace/licanchengup/link/res_publis...,LChall,h,allImm
7,LC,mouse,/public/workspace/licanchengup/link/res_publis...,LCmNeu,m,Neutrophils
8,LC,human,/public/workspace/licanchengup/link/res_publis...,LChMono,h,Monocyte
10,LC,mouse,/public/workspace/licanchengup/link/res_publis...,LCmall,m,allImm


In [25]:
df_para = pd.merge(
    info.query("sp == 'human'"),
info.query("sp == 'mouse'"),
on=['tissue','tag'],suffixes=('_ref','_que')
)
df_para['key_cell_type'] = df_para['tag'].apply(lambda x:"cell_type" if x == "allImm" else "sub_cell_type")
df_para = df_para.drop(columns=['tag'])
df_para['tissue'] = 'LC'
display(df_para)
print(*df_para.columns,sep='\n')

Unnamed: 0,tissue,sp_ref,path_ref,name_ref,sp_simple_ref,sp_que,path_que,name_que,sp_simple_que,key_cell_type
0,LC,human,/public/workspace/licanchengup/link/res_publis...,LChDCs,h,mouse,/public/workspace/licanchengup/link/res_publis...,LCmDCs,m,sub_cell_type
1,LC,human,/public/workspace/licanchengup/link/res_publis...,LChNeu,h,mouse,/public/workspace/licanchengup/link/res_publis...,LCmNeu,m,sub_cell_type
2,LC,human,/public/workspace/licanchengup/link/res_publis...,LChall,h,mouse,/public/workspace/licanchengup/link/res_publis...,LCmall,m,cell_type
3,LC,human,/public/workspace/licanchengup/link/res_publis...,LChMono,h,mouse,/public/workspace/licanchengup/link/res_publis...,LCmMono,m,sub_cell_type


tissue
sp_ref
path_ref
name_ref
sp_simple_ref
sp_que
path_que
name_que
sp_simple_que
key_cell_type


In [26]:
df_para.to_csv(p_cache.joinpath('parameter_LC.csv'),index=False)
print("\n[finish]\n".center(100,"-"))

---------------------------------------------
[finish]
---------------------------------------------
