In [32]:
import os 
import pandas as pd 
import subprocess as sp
bgzip = '/mnt/BioApps/tabix/tabix-0.2.6/bgzip'
tabix = '/mnt/BioApps/tabix/tabix-0.2.6/tabix'
os.chdir("/mnt/BioHome/jreyna/jreyna/projects/dchallenge/")

## Cell type Accessibility 

In [33]:
coacc = 'results/refs/chiou_et_al_2021/Supplemental4.celltype.cCREs.41586_2021_3552_MOESM7_ESM.xlsx'
coacc = pd.read_excel(coacc, skiprows=2)

In [34]:
cell_dict = {
    'activated CD4 T': 'tbd',
    'activated CD4 T, naive T': 'CD4N',
    'memory B, naive B': 'tbd',
    'INShi beta, INSlo beta': 'tbd',
    'INShi beta': 'tbd',
    'nonclassical monocyte': 'NCM',
    'GCGlo alpha': 'tbd',
    'ductal': 'tbd',
    'cytotoxic NK': 'NK',
    'adaptive NK': 'tbd',
    'adaptive NK, cytotoxic NK': 'tbd',
    'pancreatic CD8 T': 'tbd',
    'quiescent stellate': 'tbd',
    'regulatory T': 'TREGMEM',
    'memory CD8 T': 'tbd',
    'naive B': 'NB',
    'acinar': 'tbd',
    'SSThi delta': 'tbd',
    'naive T': 'tbd',
    'conventional dendritic': 'tbd',
    'cytotoxic CD8 T, cytotoxic NK': 'tbd',
    'pancreatic macrophage': 'tbd',
    'plasmacytoid dendritic': 'tbd',
    'endothelial': 'tbd',
    'memory B': 'tbd',
    'cytotoxic CD8 T, memory B': 'tbd',
    'GCGhi alpha': 'tbd',
    'cytotoxic CD8 T': 'tbd',
    'SSTlo delta': 'tbd',
    'gamma': 'tbd',
    'activated stellate': 'tbd',
    'SSTlo delta, quiescent stellate': 'tbd',
    'activated CD4 T, regulatory T': 'tbd',
    'classical monocyte': 'CM',
    'INSlo beta': 'tbd',
    'activated stellate, quiescent stellate': 'tbd',
    'acinar, ductal': 'tbd',
    'conventional dendritic, plasmacytoid dendritic': 'tbd',
    'megakaryocyte': 'tbd',
    'activated CD4 T, memory CD8 T': 'tbd',
    'conventional dendritic, pancreatic macrophage': 'tbd',
    'naive T, plasmacytoid dendritic': 'tbd',
    'memory CD8 T, plasmacytoid dendritic': 'tbd',
    'adaptive NK, endothelial': 'tbd',
    'cytotoxic CD8 T, pancreatic CD8 T': 'tbd',
    'classical monocyte, nonclassical monocyte': 'tbd',
    'GCGhi alpha, INShi beta': 'tbd',
    'adaptive NK, megakaryocyte': 'tbd',
    'memory CD8 T, pancreatic CD8 T': 'tbd',
    'megakaryocyte, regulatory T': 'tbd',
    'activated CD4 T, cytotoxic CD8 T': 'tbd',
    'endothelial, memory CD8 T': 'tbd',
    'SSThi delta, gamma': 'tbd',
    'SSThi delta, SSTlo delta': 'tbd',
    'cytotoxic CD8 T, memory CD8 T': 'tbd',
    'megakaryocyte, plasmacytoid dendritic': 'tbd',
    'INShi beta, acinar': 'tbd',
    'GCGhi alpha, GCGlo alpha': 'tbd',
    'cytotoxic NK, memory CD8 T': 'tbd',
    'cytotoxic CD8 T, adaptive NK': 'tbd',
    'cytotoxic CD8 T, naive T': 'tbd',
    'ductal, memory CD8 T': 'tbd',
    'ductal, megakaryocyte': 'tbd',
    'cytotoxic CD8 T, regulatory T': 'tbd',
    'GCGlo alpha, INSlo beta': 'tbd',
    'activated CD4 T, pancreatic macrophage': 'tbd',
    'GCGlo alpha, SSTlo delta': 'tbd',
    'adaptive NK, pancreatic CD8 T': 'tbd',
    'nonclassical monocyte, pancreatic macrophage': 'tbd',
    'cytotoxic NK, ductal': 'tbd',
    'gamma, pancreatic CD8 T': 'tbd',
    'adaptive NK, ductal': 'tbd',
    'classical monocyte, conventional dendritic': 'tbd',
    'cytotoxic CD8 T, endothelial': 'tbd',
    'adaptive NK, memory CD8 T': 'tbd',
    'SSThi delta, memory CD8 T': 'tbd',
    'SSTlo delta, memory CD8 T': 'tbd',
    'acinar, adaptive NK': 'tbd',
    'activated CD4 T, memory CD8 T, regulatory T': 'tbd',
    'pancreatic CD8 T, plasmacytoid dendritic': 'tbd',
    'acinar, memory CD8 T': 'tbd',
    'memory CD8 T, quiescent stellate': 'tbd',
    'GCGhi alpha, cytotoxic NK': 'tbd',
    'adaptive NK, pancreatic macrophage': 'tbd',
    'activated CD4 T, ductal': 'tbd'}

In [35]:
cell_dict = {k: v for k, v in cell_dict.items() if v != 'tbd'}

In [36]:
cell_dict

{'activated CD4 T, naive T': 'CD4N',
 'nonclassical monocyte': 'NCM',
 'cytotoxic NK': 'NK',
 'regulatory T': 'TREGMEM',
 'naive B': 'NB',
 'classical monocyte': 'CM'}

In [37]:
cl_grps = coacc.groupby('clusters')

In [47]:
for chiou_cl, dice_cl in cell_dict.items():
    df = cl_grps.get_group(chiou_cl)
    df = df.iloc[:, 0:3]
    df.iloc[:, 0] = 'chr' + df.iloc[:, 0].astype(str)
    dy = 'results/main/coacc/{}/'.format(dice_cl)
    os.makedirs(dy, exist_ok=True)
    
    fn = os.path.join(dy, '{}.bed'.format(dice_cl))
    df.to_csv(fn, sep='\t', index=False, header=False)
    
    # run bgzip
    cmd = '{} {}'.format(bgzip, fn)
    print(cmd)
    job = sp.Popen(cmd, stderr=sp.PIPE,stdout=sp.PIPE, shell=True)

    out, err = job.communicate()
    print('out:', out.decode())
    print('err:', err.decode())
    
    # run tabix
    lrange_gzfn = fn + '.gz'
    cmd = '{} -f {}'.format(tabix, lrange_gzfn)
    print(cmd)
    job = sp.Popen(cmd, stderr=sp.PIPE,stdout=sp.PIPE, shell=True)

    out, err = job.communicate()
    print('out:', out.decode())
    print('err:', err.decode())

    print('Created the gzfn: {}'.format(fn + '.gz'))
    print('Created the tabix: {}'.format(fn + '.gz.tbi'))    

/mnt/BioApps/tabix/tabix-0.2.6/bgzip results/main/coacc/CD4N/CD4N.bed
out: 
err: 
/mnt/BioApps/tabix/tabix-0.2.6/tabix -f results/main/coacc/CD4N/CD4N.bed.gz
out: 
err: 
Created the gzfn: results/main/coacc/CD4N/CD4N.bed.gz
Created the tabix: results/main/coacc/CD4N/CD4N.bed.gz.tbi
/mnt/BioApps/tabix/tabix-0.2.6/bgzip results/main/coacc/NCM/NCM.bed
out: 
err: 
/mnt/BioApps/tabix/tabix-0.2.6/tabix -f results/main/coacc/NCM/NCM.bed.gz
out: 
err: 
Created the gzfn: results/main/coacc/NCM/NCM.bed.gz
Created the tabix: results/main/coacc/NCM/NCM.bed.gz.tbi
/mnt/BioApps/tabix/tabix-0.2.6/bgzip results/main/coacc/NK/NK.bed
out: 
err: 
/mnt/BioApps/tabix/tabix-0.2.6/tabix -f results/main/coacc/NK/NK.bed.gz
out: 
err: 
Created the gzfn: results/main/coacc/NK/NK.bed.gz
Created the tabix: results/main/coacc/NK/NK.bed.gz.tbi
/mnt/BioApps/tabix/tabix-0.2.6/bgzip results/main/coacc/TREGMEM/TREGMEM.bed
out: 
err: 
/mnt/BioApps/tabix/tabix-0.2.6/tabix -f results/main/coacc/TREGMEM/TREGMEM.bed.gz
out: 

## Merged Accessibility 

In [51]:
#coacc_all = 'results/refs/chiou_et_al_2021/Supplemental3.cCREs.41586_2021_3552_MOESM6_ESM.xlsx'
#coacc_all = pd.read_excel(coacc_all, skiprows=2)

In [52]:
df = coacc_all.iloc[:, 0:3]
df.iloc[:, 0] = 'chr' + df.iloc[:, 0].astype(str)
dy = 'results/main/coacc/ALL/'.format(dice_cl)
os.makedirs(dy, exist_ok=True)

In [54]:
fn = os.path.join(dy, 'combined-loops.bed'.format(dice_cl))
df.to_csv(fn, sep='\t', index=False, header=False)

# run bgzip
cmd = '{} {}'.format(bgzip, fn)
print(cmd)
job = sp.Popen(cmd, stderr=sp.PIPE,stdout=sp.PIPE, shell=True)

out, err = job.communicate()
print('out:', out.decode())
print('err:', err.decode())

# run tabix
lrange_gzfn = fn + '.gz'
cmd = '{} -f {}'.format(tabix, lrange_gzfn)
print(cmd)
job = sp.Popen(cmd, stderr=sp.PIPE,stdout=sp.PIPE, shell=True)

out, err = job.communicate()
print('out:', out.decode())
print('err:', err.decode())

print('Created the gzfn: {}'.format(fn + '.gz'))
print('Created the tabix: {}'.format(fn + '.gz.tbi'))    

/mnt/BioApps/tabix/tabix-0.2.6/bgzip results/main/coacc/ALL/combined-loops.bed
out: 
err: 
/mnt/BioApps/tabix/tabix-0.2.6/tabix -f results/main/coacc/ALL/combined-loops.bed.gz
out: 
err: 
Created the gzfn: results/main/coacc/ALL/combined-loops.bed.gz
Created the tabix: results/main/coacc/ALL/combined-loops.bed.gz.tbi
