# disease_LC_0

整理GSE127465数据集,最后输出`cache/parameter_LC.csv`

更新时间 2024年3月26日

# [PMC6620049](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6620049/)

> [GSE127465](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE127465)

|sp|tissue|status|
|:-:|:-:|:-:|
|h|blood|tumor|
|h|lung|tumor|
|m|lung|tumor|
|m|lung|healthy|


+ immune cell 大类群
+ immune cell 亚类
    + Neutrophil
    + Dendritic Cells
    + Monocyte

In [1]:
import sys
from pathlib import Path
# Project root added to the import path; '~' is expanded to the user's home.
# Presumably this is where the `func` module imported by the next cell lives.
p_root = Path('~/link/res_publish').expanduser()
# Register the root only once so re-running the cell does not add duplicates.
if str(p_root) not in sys.path:
    sys.path.append(str(p_root))

In [None]:
from func import *

In [None]:
# Folder with the GSE127465 inputs read by the cells below
# (assumes `p_cache` is a pathlib.Path provided by `func`).
p_root_item = p_cache / 'disease' / 'LC_GSE127465'

def limite_func(adata, key, value):
    """Return the subset of ``adata`` whose ``obs[key]`` matches ``value``.

    Args:
        adata: AnnData-like object exposing ``.obs`` and boolean row indexing.
        key: Name of the ``obs`` column holding the labels to test.
        value: Regular expression applied with ``Series.str.match``, i.e.
            anchored at the start of each label.

    Returns:
        An independent copy of the matching rows; ``adata`` is left untouched.
    """
    # Missing labels count as "no match" so the mask is strictly boolean.
    keep = adata.obs[key].str.match(value, na=False)
    # Filter first, copy second: only the selected rows are duplicated.
    return adata[keep].copy()


def _subset_by(key, value):
    """Build a selector keeping the cells whose ``obs[key]`` matches ``value``."""
    return lambda adata: limite_func(adata, key=key, value=value)


# One selector per exported population; "allImm" passes the data through as is.
map_limite_func = {
    "allImm": lambda adata: adata,
    "DendriticCells": _subset_by("sub_cell_type", "p?(Mono)?DC\\d?"),
    "Monocyte": _subset_by("sub_cell_type", "Mono\\d"),
    "Neutrophils": _subset_by("cell_type", "^Neutrophils$"),
}

In [None]:
# List the files in GSE127465_RAW and keep only the raw-count matrices.
_raw_dir = p_root_item.joinpath('GSE127465_RAW')
df_path = pd.DataFrame({'path': _raw_dir.iterdir()})
df_path['name'] = df_path['path'].apply(lambda x: x.name)
_is_raw_counts = df_path['path'].apply(lambda x: x.match('*raw_counts*'))
df_path = df_path[_is_raw_counts]

# The file name encodes the GSM accession, the species and the sample name.
_name_fields = df_path['name'].str.extract(
    '^(?P<gsm_id>GSM\\d+)_(?P<sp>[^_]+)_(?P<sample>.+)_raw_counts.tsv.gz')
df_path = df_path.join(_name_fields)
display(df_path.head(2), df_path.shape)

In [None]:
# Two-column table (GSM accession, free-text tag); presumably copied from the
# GEO series page. NOTE: the path is relative to the current working directory.
html_table = pd.read_csv(
    'GSE127465_html_table.csv',
    index_col=None,
    header=None,
    names=[
        'gsm_id',
        'tag'])
# Split tags shaped like "<sample_fullname> [<sp>_<sample>]" into named columns.
html_table = html_table.join(html_table['tag'].str.extract(
    '^(?P<sample_fullname>\\w+) \\[(?P<sp>[^_]+)_(?P<sample>\\w+)]'))
display(html_table.head(2), html_table.shape)
# Report per-column uniqueness; a plain loop because printing is a side effect.
for _col in html_table.columns:
    print('{}\t{} is unique'.format(html_table[_col].is_unique, _col))

# Attach the descriptive sample name to each raw-count file (inner join on GSM id).
df_path = df_path.merge(html_table.loc[:, 'gsm_id,sample_fullname'.split(
    ',')], on='gsm_id').sort_values(['sp', 'sample'])

In [None]:
# Derive tissue / status / individual / replicate labels from the sample names.
_fullname = df_path['sample_fullname']

# Every sample is lung unless its name mentions blood.
df_path['tissue'] = 'lung'
df_path['tissue'] = df_path['tissue'].mask(
    _fullname.str.contains('blood'), 'blood')

# Keywords are applied in this order; a later match overwrites an earlier one.
df_path['status'] = ''
for _keyword, _label in (('healthy', 'healthy'),
                         ('patient', 'tumor'),
                         ('tumor', 'tumor')):
    df_path['status'] = df_path['status'].mask(
        _fullname.str.contains(_keyword), _label)


def _short_individual(x):
    # First character plus last character, e.g. "mouse_1" -> "m1".
    return '{}{}'.format(x[0], x[-1])


def _batch_label(row):
    # individual + first letter of status + first letter of tissue + "_" + repeat
    return '{}{}{}_{}'.format(
        row['individual'], row['status'][0], row['tissue'][0], row['repeat'])


_individual = _fullname.str.extract('([mousepaint]+_\\d)', expand=False)
_individual = _individual.str.replace('patient', 'human', regex=False)
df_path['individual'] = _individual.apply(_short_individual)
# Replicate number = trailing digit of the sample name.
df_path['repeat'] = df_path['sample'].str.extract('(\\d)$', expand=False)
df_path['_batch'] = df_path.apply(_batch_label, axis=1)

In [None]:
# Preview the derived annotation columns (last expression, so the cell shows it).
_preview_cols = '_batch,sp,individual,status,tissue,repeat'.split(',')
df_path.loc[:, _preview_cols].head(2)

# human

In [None]:
# Per-cell annotation table shipped with the human samples.
df_meta_h = pd.read_csv(
    p_root_item.joinpath('GSE127465_human_cell_metadata_54773x25.tsv.gz'),
    sep='\t', index_col=None)
print(*df_meta_h.columns, sep='\n')

# Keep the columns used downstream and attach the sample-level annotation.
_meta_cols_h = 'Barcode,Library,Patient,Major cell type,Minor subset,used_in_NSCLC_immune,x_NSCLC_immune,y_NSCLC_immune'.split(',')
_sample_cols_h = 'sample,_batch,sp,individual,status,tissue,repeat'.split(',')
df_meta_h = df_path.loc[:, _sample_cols_h].merge(
    df_meta_h.loc[:, _meta_cols_h],
    left_on='sample', right_on='Library')

# Index every cell by "<Barcode>;<Library>"; the key has to be unique.
_temp = df_meta_h.apply(
    lambda row: '{Barcode};{Library}'.format(**row), axis=1)
assert _temp.is_unique, '[Error] not unique'
df_meta_h.index = _temp.to_numpy()
del _temp
display(df_meta_h.head(2), df_meta_h.shape)

# Raw-count files belonging to the human samples.
df_path_h = df_path.query("sp == 'human'")

In [None]:
# Build the combined human AnnData once and cache it as h5ad; later runs
# load the cache instead of re-reading every sample.
df_path_h.index = np.arange(df_path_h.shape[0])
p_temp_h = p_cache.joinpath('disease', 'temp_LC_h.h5ad')
adata_h = None
if not p_temp_h.exists():
    # One AnnData per sample, keyed by the sample name.
    res_h = {}
    for _i, _row in df_path_h.iterrows():
        _progress = '\r[{}/{}]{}'.format(
            _i+1, df_path_h.shape[0], _row['sample'])
        print(_progress.ljust(75, '-'), end='')
        res_h[_row['sample']] = sc.read_csv(_row['path'], delimiter='\t')

    # Sanity report: are obs / var names unique within each sample?
    print('sample\tobs_unique\tvar_unique')
    for _name, _sample_adata in res_h.items():
        print('{}\t{}\t{}'.format(
            _name,
            _sample_adata.obs.index.is_unique,
            _sample_adata.var.index.is_unique))

    # Concatenate all samples; the dict key is appended to each obs name with ';'.
    adata_h = sc.concat(res_h, index_unique=';')
    adata_h.X = csr_matrix(adata_h.X)
    display(type(adata_h.X))
    del res_h
    display(
        adata_h.obs.head(2),
        adata_h.obs.shape,
        adata_h.obs.index.is_unique,
        adata_h.var.head(2),
        adata_h.var.shape,
        adata_h.var.index.is_unique)

    # Overlap between annotated cells and measured cells, in both directions.
    _meta_in_adata = pd.Series(
        df_meta_h.index.isin(adata_h.obs.index)).value_counts()
    _adata_in_meta = pd.Series(
        adata_h.obs.index.isin(df_meta_h.index)).value_counts()
    display(_meta_in_adata, _adata_in_meta)

    # Keep annotated cells only and replace obs by the annotation table.
    adata_h = adata_h[adata_h.obs.index.isin(df_meta_h.index)]
    adata_h.obs = adata_h.obs.loc[:, []].join(df_meta_h)

    adata_h.write_h5ad(p_temp_h)
    print('[out] {}'.format(p_temp_h.name))
else:
    adata_h = sc.read_h5ad(p_temp_h)

# Categorical obs columns are turned back into plain strings.
adata_h.obs = adata_h.obs.apply(
    lambda x: x.astype(str) if x.dtype.name == 'category' else x)
display(adata_h, adata_h.obs.head(2))

In [None]:
# Re-key the cells as "<barcode>;<_batch>".
# The barcode is the word directly in front of the ';' in the current obs name.
adata_h.obs['barcode'] = adata_h.obs.index.str.extract('(\\w+);', expand=False)
_temp = adata_h.obs.apply(
    lambda row: '{barcode};{_batch}'.format(**row),
    axis=1)
assert _temp.is_unique, '[Error] not unique'
adata_h.obs.index = _temp
del _temp
display(adata_h.obs.head(2), adata_h.obs.shape)

## human blood and lung cancer

In [None]:
# Blood cells on one side, lung cells flagged `used_in_NSCLC_immune` on the other.
_is_blood = adata_h.obs['tissue'] == 'blood'
adata_hb = adata_h[_is_blood, :]

_lung_immune_idx = adata_h.obs.query(
    "used_in_NSCLC_immune & tissue == 'lung'").index
adata_hl = adata_h[adata_h.obs.index.isin(_lung_immune_idx), :]
display(adata_hb, adata_hl)

In [None]:
# Derive `cell_type` / `sub_cell_type` from the published labels by removing a
# leading "t", and show before/after values plus the counts for each mapping.
for _src, _dst in (('Major cell type', 'cell_type'),
                   ('Minor subset', 'sub_cell_type')):
    display(adata_hl.obs[_src].unique())
    adata_hl.obs[_dst] = adata_hl.obs[_src].str.replace(
        '^t', '', regex=True)
    display(adata_hl.obs[_dst].unique())
    display(group_agg(adata_hl.obs, [_src, _dst], {_dst: ['count']}))
    assert not adata_hl.obs[_dst].isna().any(), '[Error] nan'

In [None]:
# Fix the set and order of the obs columns that get exported.
_obs_cols_h = '_batch,cell_type,sub_cell_type,sp,individual,status,tissue,repeat,sample,Major cell type,Minor subset,used_in_NSCLC_immune,x_NSCLC_immune,y_NSCLC_immune'.split(',')
adata_hl.obs = adata_hl.obs.loc[:, _obs_cols_h]
display(adata_hl.obs.head(2), adata_hl.obs.shape)

In [None]:
# Export one dataset per population selector: cache/disease/LC_h_<tag>.
for _tag, _select in map_limite_func.items():
    print(_tag.ljust(75, '-'))
    _adata = _select(adata_hl)
    _target = p_cache.joinpath('disease', 'LC_h_{}'.format(_tag))
    h5ad_to_mtx(_adata, _target)
    # The full immune set is summarised by major type, subsets by minor type.
    _label_col = 'cell_type' if _tag == 'allImm' else 'sub_cell_type'
    display(_adata.obs[_label_col].value_counts())
print('\n[finish]\n'.center(100, '-'))

In [None]:
# Drop the human objects before the mouse section starts.
del adata_h, adata_hb, adata_hl

# mouse

In [None]:
# Per-cell annotation table shipped with the mouse samples.
df_meta_m = pd.read_csv(
    p_root_item.joinpath('GSE127465_mouse_cell_metadata_15939x12.tsv.gz'),
    sep='\t', index_col=None)
print(*df_meta_m.columns, sep='\n')

# Keep the columns used downstream and attach the sample-level annotation.
_meta_cols_m = 'Barcode,Library,Major cell type,Minor subset,x,y'.split(',')
_sample_cols_m = 'sample,_batch,sp,individual,status,tissue,repeat'.split(',')
df_meta_m = df_path.loc[:, _sample_cols_m].merge(
    df_meta_m.loc[:, _meta_cols_m],
    left_on='sample', right_on='Library')

# Index every cell by "<Barcode>;<Library>"; the key has to be unique.
_temp = df_meta_m.apply(
    lambda row: '{Barcode};{Library}'.format(**row), axis=1)
assert _temp.is_unique, '[Error] not unique'
df_meta_m.index = _temp.to_numpy()
del _temp
display(df_meta_m.head(2), df_meta_m.shape)

# Raw-count files belonging to the mouse samples.
df_path_m = df_path.query("sp == 'mouse'")

In [None]:
# Build the combined mouse AnnData once and cache it as h5ad; later runs load
# the cache instead of re-reading every sample.
# Renumber rows 0..n-1 so that `_i+1` below is a running sample count.
df_path_m.index = np.arange(df_path_m.shape[0])
p_temp_m = p_cache.joinpath('disease', 'temp_LC_m.h5ad')
adata_m = None
if p_temp_m.exists():
    adata_m = sc.read_h5ad(p_temp_m)
else:
    # One AnnData per sample, keyed by the sample name.
    res_m = {}
    for _i, _row in df_path_m.iterrows():
        print('\r[{}/{}]{}'.format(_i+1,
                                   df_path_m.shape[0],
                                   _row['sample']).ljust(75, '-'), end='')
        _adata = None
        _adata = sc.read_csv(_row['path'], delimiter='\t')
        res_m.update({
            _row['sample']: _adata
        })
    # Sanity report: are obs / var names unique within each sample?
    print('sample\tobs_unique\tvar_unique')
    for k, v in res_m.items():
        print(
            '{}\t{}\t{}'.format(
                k,
                v.obs.index.is_unique,
                v.var.index.is_unique))
    # Concatenate all samples; the dict key is appended to each obs name with ';'.
    adata_m = sc.concat(res_m, index_unique=';')
    adata_m.X = csr_matrix(adata_m.X)
    display(type(adata_m.X))
    del res_m
    display(
        adata_m.obs.head(2),
        adata_m.obs.shape,
        adata_m.obs.index.is_unique,
        adata_m.var.head(2),
        adata_m.var.shape,
        adata_m.var.index.is_unique)

    # Overlap between annotated cells and measured cells, in both directions.
    display(pd.Series(df_meta_m.index.isin(adata_m.obs.index)).value_counts(),
            pd.Series(adata_m.obs.index.isin(df_meta_m.index)).value_counts())
    # Keep annotated cells only and replace obs by the annotation table.
    adata_m = adata_m[adata_m.obs.index.isin(df_meta_m.index)]
    adata_m.obs = adata_m.obs.loc[:, []].join(df_meta_m)
    adata_m.write_h5ad(p_temp_m)
    # Report the mouse cache file that was just written (the original printed
    # the human cache name here).
    print('[out] {}'.format(p_temp_m.name))
# Categorical obs columns are turned back into plain strings.
adata_m.obs = adata_m.obs.apply(lambda x: x.astype(str) if x.dtype.name == 'category' else x)
display(adata_m,adata_m.obs.head(2))

In [None]:
# Re-key the cells as "<barcode>;<_batch>".
# The barcode is the word directly in front of the ';' in the current obs name.
adata_m.obs['barcode'] = adata_m.obs.index.str.extract('(\\w+);', expand=False)
_temp = adata_m.obs.apply(
    lambda row: '{barcode};{_batch}'.format(**row),
    axis=1)
assert _temp.is_unique, '[Error] not unique'
adata_m.obs.index = _temp
del _temp
display(adata_m.obs.head(2), adata_m.obs.shape)

In [None]:
# Mouse labels are copied unchanged into the harmonised column names.
for _src, _dst in (('Major cell type', 'cell_type'),
                   ('Minor subset', 'sub_cell_type')):
    adata_m.obs[_dst] = adata_m.obs[_src]

# Fix the set and order of the obs columns that get exported.
_obs_cols_m = '_batch,cell_type,sub_cell_type,sp,individual,status,tissue,repeat,sample,Major cell type,Minor subset,x,y'.split(',')
adata_m.obs = adata_m.obs.loc[:, _obs_cols_m]
display(adata_m.obs.head(2), adata_m.obs.shape)

## mouse healthy and lung cancer

In [None]:
# Split the mouse cells by disease status.
_status_m = adata_m.obs['status']
display(_status_m.value_counts())
adata_mt = adata_m[_status_m == 'tumor', :]
adata_mh = adata_m[_status_m == 'healthy', :]
display(adata_mt, adata_mh)

In [None]:
# Export one dataset per population selector from the tumor cells:
# cache/disease/LC_m_<tag>.
for _tag, _select in map_limite_func.items():
    print(_tag.ljust(75, '-'))
    _adata = _select(adata_mt)
    _target = p_cache.joinpath('disease', 'LC_m_{}'.format(_tag))
    h5ad_to_mtx(_adata, _target)
    # The full immune set is summarised by major type, subsets by minor type.
    _label_col = 'cell_type' if _tag == 'allImm' else 'sub_cell_type'
    display(_adata.obs.shape, _adata.obs[_label_col].value_counts())
print('\n[finish]\n'.center(100, '-'))

In [None]:
# Drop the mouse objects and the last exported subset.
del adata_m, adata_mh, adata_mt, _adata

# info and parameters

In [None]:
# Catalogue the exported LC_<sp>_<tag> entries found under cache/disease.
info = pd.DataFrame({
    'path': p_cache.joinpath('disease').iterdir()
})
info['name'] = info['path'].apply(lambda x: x.name)
# The raw-input folder also starts with "LC_" and must not be listed.
info = info[info['name'] != 'LC_GSE127465']
info = info[info['path'].apply(lambda x: x.match('*/LC_*'))]
# Entry names encode the species letter (h/m) and the population tag.
info = info.join(info['name'].str.extract(
    "LC_(?P<sp_simple>[hm])_(?P<tag>\\w+)"))

# Short suffix per population tag; built once rather than once per row.
_tag_suffix = {
    'allImm': 'all',
    'Monocyte': 'Mono',
    'Neutrophils': 'Neu',
    'DendriticCells': 'DCs',
}
# Read-only lookup: tags without an entry get an empty suffix.
info['name'] = info.apply(
    lambda row: 'LC{}{}'.format(
        row['sp_simple'], _tag_suffix.get(row['tag'], '')),
    axis=1)
info['sp'] = info['sp_simple'].map(map_sp)
info['tissue'] = 'LC'
info = info.loc[:, 'tissue,sp,path,name,sp_simple,tag'.split(',')]
# Store paths relative to the cache root.
info['path'] = info['path'].apply(lambda x: x.relative_to(p_cache))
info.to_csv(p_cache.joinpath('info_LC.csv'), index=False)
info

In [None]:
# Pair every human entry (reference, "_ref") with the mouse entry (query,
# "_que") that shares the same tissue and population tag.
_info_ref = info.query("sp == 'human'")
_info_que = info.query("sp == 'mouse'")
df_para = _info_ref.merge(
    _info_que,
    on=['tissue', 'tag'],
    suffixes=('_ref', '_que'))


def _label_column(x):
    # The full immune set is labelled by major type, subsets by minor type.
    if x == "allImm":
        return "cell_type"
    return "sub_cell_type"


df_para['key_cell_type'] = df_para['tag'].apply(_label_column)
df_para = df_para.drop(columns=['tag'])
df_para['tissue'] = 'LC'
display(df_para)
print(*df_para.columns, sep='\n')

In [None]:
# Final output of the notebook: the parameter table in the cache root.
_p_parameter = p_cache.joinpath('parameter_LC.csv')
df_para.to_csv(_p_parameter, index=False)
print("\n[finish]\n".center(100, "-"))