# 初步探索poisson table
- log2_FC: log2(treat_count.norm/ctrl_count.norm), 如果ctrl_count.norm不存在，就用 chr 突变背景
- log2_FC_mut: log2(treat_mut_count.norm/ctrl_mut_count.norm), 如果ctrl_mut_count.norm不存在，就用 chr 突变背景
- region_block_state: B-Blocked, S-SNV, N-Non-SNV

In [1]:
import os
import sys

import numpy as np
import pandas as pd

In [2]:
# Gzipped, tab-separated Poisson result table; the path is relative to this notebook.
file = '../poisson_res/poisson_res_all.tsv.gz'

In [3]:
# Load the tab-separated table: the first line supplies the column names and
# no column is promoted to the index.
read_options = {'sep': '\t', 'header': 0, 'index_col': None}
df = pd.read_csv(file, **read_options)
df.head()

Unnamed: 0,<sample>,chr_name,region_start,region_end,mpmat_index,region_site_num,region_block_site_num,region_mut_site_num,region_site_index,region_block_state,...,ctrl_count.norm,treat_count.norm,ctrl_mut_count.norm,treat_mut_count.norm,count_info,log2_FC,log2_FC_mut,test_state,p_value,FDR
0,DetectSeq_ATP8-DddA11_REP-1,chr1,31029,31037,chr1_31029_31037,4,1,3,"chr1_31029_GA,chr1_31030_G.,chr1_31031_GA,chr1...",B-N-N-N,...,0.005063,0.035765,0.0,0.005961,"0,1,2,3,4 2,0,0,0,0 5,0,1,0,0",2.820479,-0.442445,TestOK,0.582932,0.714251
1,DetectSeq_ATP8-DddA11_REP-1,chr1,56485,56490,chr1_56485_56490,4,1,3,"chr1_56485_CT,chr1_56487_CT,chr1_56488_C.,chr1...",B-N-N-N,...,0.025315,0.041726,0.0,0.005961,"0,1,2,3,4 10,0,0,0,0 6,0,1,0,0",0.720943,-0.442445,TestOK,0.582932,0.714251
2,DetectSeq_ATP8-DddA11_REP-1,chr1,56638,56647,chr1_56638_56647,3,0,3,"chr1_56638_CT,chr1_56641_C.,chr1_56647_CT",N-N-N,...,0.040504,0.077491,0.0,0.0,"0,1,2,3 16,0,0,0 3,10,0,0",0.935956,,TestOK,0.841752,0.841765
3,DetectSeq_ATP8-DddA11_REP-1,chr1,63226,63239,chr1_63226_63239,5,1,4,"chr1_63226_CT,chr1_63227_C.,chr1_63230_CT,chr1...",N-N-N-N-B,...,0.053162,0.065569,0.0,0.005961,"0,1,2,3,4,5 21,0,0,0,0,0 10,0,0,1,0,0",0.302631,-0.442445,TestOK,0.582932,0.714251
4,DetectSeq_ATP8-DddA11_REP-1,chr1,70291,70300,chr1_70291_70300,3,0,3,"chr1_70291_CT,chr1_70292_CA,chr1_70300_CT",N-N-N,...,0.040504,0.113256,0.0,0.011922,"0,1,2,3 15,1,0,0 11,6,2,0",1.483444,0.557555,TestOK,0.37762,0.580319


In [4]:
# Column dtypes and non-null counts of the loaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1562657 entries, 0 to 1562656
Data columns (total 28 columns):
 #   Column                         Non-Null Count    Dtype  
---  ------                         --------------    -----  
 0   <sample>                       1562657 non-null  object 
 1   chr_name                       1562657 non-null  object 
 2   region_start                   1562657 non-null  int64  
 3   region_end                     1562657 non-null  int64  
 4   mpmat_index                    1562657 non-null  object 
 5   region_site_num                1562657 non-null  int64  
 6   region_block_site_num          1562657 non-null  int64  
 7   region_mut_site_num            1562657 non-null  int64  
 8   region_site_index              1562657 non-null  object 
 9   region_block_state             1562657 non-null  object 
 10  region_highest_site_index      1546820 non-null  object 
 11  region_highest_site_mut_num    1562657 non-null  int64  
 12  region_highest

In [5]:
# Number of missing values in each column.
df.isna().sum()

<sample>                              0
chr_name                              0
region_start                          0
region_end                            0
mpmat_index                           0
region_site_num                       0
region_block_site_num                 0
region_mut_site_num                   0
region_site_index                     0
region_block_state                    0
region_highest_site_index         15837
region_highest_site_mut_num           0
region_highest_site_cover_num         0
region_highest_site_mut_ratio         0
ctrl_count                            0
treat_count                           0
ctrl_mut_count                        0
treat_mut_count                       0
ctrl_count.norm                       0
treat_count.norm                      0
ctrl_mut_count.norm                   0
treat_mut_count.norm                  0
count_info                            0
log2_FC                               0
log2_FC_mut                      229694


In [6]:
# # Inspect the rows behind the NA values in these two columns
# test_df = df.query('region_highest_site_index.isna()')
# test2_df = df.query('log2_FC_mut.isna()')

In [7]:
# BED-style name: "<mpmat_index>_highest_<region_highest_site_index>".
# Rows whose region_highest_site_index is missing end up with a missing bed_name.
highest_site_label = df['mpmat_index'] + '_highest_' + df['region_highest_site_index']
df = df.assign(bed_name=highest_site_label, strand='.')
df

Unnamed: 0,<sample>,chr_name,region_start,region_end,mpmat_index,region_site_num,region_block_site_num,region_mut_site_num,region_site_index,region_block_state,...,ctrl_mut_count.norm,treat_mut_count.norm,count_info,log2_FC,log2_FC_mut,test_state,p_value,FDR,bed_name,strand
0,DetectSeq_ATP8-DddA11_REP-1,chr1,31029,31037,chr1_31029_31037,4,1,3,"chr1_31029_GA,chr1_31030_G.,chr1_31031_GA,chr1...",B-N-N-N,...,0.0,0.005961,"0,1,2,3,4 2,0,0,0,0 5,0,1,0,0",2.820479,-0.442445,TestOK,0.582932,0.714251,chr1_31029_31037_highest_chr1_31031_GA,.
1,DetectSeq_ATP8-DddA11_REP-1,chr1,56485,56490,chr1_56485_56490,4,1,3,"chr1_56485_CT,chr1_56487_CT,chr1_56488_C.,chr1...",B-N-N-N,...,0.0,0.005961,"0,1,2,3,4 10,0,0,0,0 6,0,1,0,0",0.720943,-0.442445,TestOK,0.582932,0.714251,chr1_56485_56490_highest_chr1_56487_CT,.
2,DetectSeq_ATP8-DddA11_REP-1,chr1,56638,56647,chr1_56638_56647,3,0,3,"chr1_56638_CT,chr1_56641_C.,chr1_56647_CT",N-N-N,...,0.0,0.000000,"0,1,2,3 16,0,0,0 3,10,0,0",0.935956,,TestOK,0.841752,0.841765,chr1_56638_56647_highest_chr1_56638_CT,.
3,DetectSeq_ATP8-DddA11_REP-1,chr1,63226,63239,chr1_63226_63239,5,1,4,"chr1_63226_CT,chr1_63227_C.,chr1_63230_CT,chr1...",N-N-N-N-B,...,0.0,0.005961,"0,1,2,3,4,5 21,0,0,0,0,0 10,0,0,1,0,0",0.302631,-0.442445,TestOK,0.582932,0.714251,chr1_63226_63239_highest_chr1_63226_CT,.
4,DetectSeq_ATP8-DddA11_REP-1,chr1,70291,70300,chr1_70291_70300,3,0,3,"chr1_70291_CT,chr1_70292_CA,chr1_70300_CT",N-N-N,...,0.0,0.011922,"0,1,2,3 15,1,0,0 11,6,2,0",1.483444,0.557555,TestOK,0.377620,0.580319,chr1_70291_70300_highest_chr1_70300_CT,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562652,DetectSeq_SIRT6-DddA11_REP-1,chrX,155948218,155948220,chrX_155948218_155948220,2,0,2,"chrX_155948218_CT,chrX_155948220_CT",N-N,...,0.0,0.000000,"0,1,2 0,0,0 5,0,4",-0.208028,,TestOK,0.806282,0.846502,chrX_155948218_155948220_highest_chrX_15594821...,.
1562653,DetectSeq_SIRT6-DddA11_REP-1,chrX,155948229,155948244,chrX_155948229_155948244,4,0,4,"chrX_155948229_GA,chrX_155948233_GA,chrX_15594...",N-N-N-N,...,0.0,0.000000,"0,1,2,3,4 0,0,0,0,0 5,0,0,4,0",-0.208028,,TestOK,0.806282,0.846502,chrX_155948229_155948244_highest_chrX_15594822...,.
1562654,DetectSeq_SIRT6-DddA11_REP-1,chrX,155974406,155974427,chrX_155974406_155974427,7,0,7,"chrX_155974406_CT,chrX_155974413_CT,chrX_15597...",N-N-N-N-N-N-N,...,0.0,0.016684,"0,1,2,3,4,5,6,7 0,0,0,0,0,0,0,0 17,4,1,2,0,0,0,0",1.207009,1.310806,TestOK,0.192759,0.318252,chrX_155974406_155974427_highest_chrX_15597442...,.
1562655,DetectSeq_SIRT6-DddA11_REP-1,chrX,156022047,156022062,chrX_156022047_156022062,4,0,4,"chrX_156022047_GA,chrX_156022048_GA,chrX_15602...",N-N-N-N,...,0.0,0.016684,"0,1,2,3,4 0,0,0,0,0 4,6,3,0,0",0.322486,1.310806,TestOK,0.192759,0.318252,chrX_156022047_156022062_highest_chrX_15602206...,.


# find significant region

## strict selection 

In [8]:
# Strict candidate filter. The conditions are applied one after another, in
# this order; the trailing number is presumably the row count observed after
# that step (the last one matches the total printed below).
strict_conditions = [
    'FDR <= 0.0001',  # 10911
    'log2_FC_mut >= 2',  # 10911
    'ctrl_mut_count <= 1',  # 9551
    '`treat_mut_count.norm` * 100 >= 10',  # 9551
    'treat_mut_count >= 20',  # 9551
    'treat_mut_count / treat_count >= 0.15',  # 6080
    'region_block_site_num <= 1',  # 5720
    'region_highest_site_mut_ratio >= 0.35',  # 2929
]

df_sign_strict = df
for condition in strict_conditions:
    df_sign_strict = df_sign_strict.query(condition)

print(df_sign_strict.shape[0])
df_sign_strict.groupby('<sample>').describe()

2929


Unnamed: 0_level_0,region_start,region_start,region_start,region_start,region_start,region_start,region_start,region_start,region_end,region_end,...,p_value,p_value,FDR,FDR,FDR,FDR,FDR,FDR,FDR,FDR
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
<sample>,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
DetectSeq_ATP8-DddA11_REP-1,160.0,72326520.0,54172810.0,630957.0,27081692.75,60138169.5,114205900.0,242043276.0,160.0,72326580.0,...,5.548434e-09,4.032488e-07,160.0,6e-06,1.5e-05,0.0,1.1452059999999999e-20,3.628154e-09,2e-06,8.6e-05
DetectSeq_ATP8-DddA6_REP-1,369.0,73238340.0,52822040.0,630440.0,30522027.0,60296487.0,112191500.0,230513936.0,369.0,73238410.0,...,2.354367e-08,5.105593e-07,369.0,1e-05,2.3e-05,0.0,1.200588e-11,1.471017e-07,6e-06,9.8e-05
DetectSeq_ATP8-DddAwt_REP-1,88.0,53567810.0,42595210.0,631391.0,22923202.25,49321013.5,60677990.0,201232409.0,88.0,53567870.0,...,8.644648e-09,1.531386e-07,88.0,9e-06,1.8e-05,0.0,1.705089e-11,9.436538e-08,5e-06,6.7e-05
DetectSeq_JAK2-DddA11_REP-1,603.0,73516320.0,53503460.0,686172.0,32317164.5,60225127.0,108211700.0,246696632.0,603.0,73516370.0,...,1.114112e-08,6.48217e-07,603.0,9e-06,2.2e-05,1.1726140000000001e-99,4.837212e-18,5.682422e-10,2e-06,9.9e-05
DetectSeq_SIRT6-DddA11_REP-1,1709.0,75217790.0,54574690.0,242415.0,31532755.0,60917089.0,110310900.0,248905893.0,1709.0,75217850.0,...,1.577425e-08,1.137115e-06,1709.0,7e-06,1.6e-05,6.405502e-146,3.677355e-16,2.551968e-09,2e-06,9.1e-05


In [9]:
# Total number of missing values remaining after the strict filter.
df_sign_strict.isna().sum().sum()

0

## lenient selection

In [10]:
# Lenient candidate filter. The conditions are applied one after another, in
# this order; the trailing number is presumably the row count observed after
# that step (the last one matches the total printed below).
lenient_conditions = [
    'FDR <= 0.01',  # 25118
    'log2_FC_mut >= 2',  # 25118
    'ctrl_mut_count <= 1',  # 22009
    '`treat_mut_count.norm` * 100 >= 5',  # 22009
    'treat_mut_count >= 10',  # 22009
    'treat_mut_count / treat_count >= 0.15',  # 14476
    'region_block_site_num <= 1',  # 13522
    'region_highest_site_mut_ratio >= 0.30',  # 7001
]

df_sign_lenient = df
for condition in lenient_conditions:
    df_sign_lenient = df_sign_lenient.query(condition)

print(df_sign_lenient.shape[0])
df_sign_lenient.groupby('<sample>').describe()

7001


Unnamed: 0_level_0,region_start,region_start,region_start,region_start,region_start,region_start,region_start,region_start,region_end,region_end,...,p_value,p_value,FDR,FDR,FDR,FDR,FDR,FDR,FDR,FDR
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
<sample>,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
DetectSeq_ATP8-DddA11_REP-1,390.0,71404890.0,52684470.0,630957.0,26449309.5,59933966.5,109372023.5,242043276.0,390.0,71404950.0,...,1.3e-05,7.1e-05,390.0,0.001397,0.002228,0.0,7.914552e-09,7.8e-05,0.00188,0.008006
DetectSeq_ATP8-DddA6_REP-1,1062.0,78209190.0,54454230.0,58741.0,34931723.0,65550265.5,113457006.0,246387176.0,1062.0,78209250.0,...,2.2e-05,0.000111,1062.0,0.001802,0.002871,0.0,3.674352e-07,0.000188,0.002568,0.009767
DetectSeq_ATP8-DddAwt_REP-1,198.0,56648610.0,45054480.0,631391.0,22815892.0,50126483.5,62436712.5,228622163.0,198.0,56648660.0,...,7e-06,4e-05,198.0,0.001615,0.00253,0.0,1.808493e-07,0.000127,0.002013,0.00913
DetectSeq_JAK2-DddA11_REP-1,1122.0,70747300.0,52719370.0,497136.0,28627039.75,58011844.5,103544116.5,246696632.0,1122.0,70747350.0,...,1e-05,0.000131,1122.0,0.001164,0.002306,1.1726140000000001e-99,1.105413e-11,5e-06,0.001209,0.009687
DetectSeq_SIRT6-DddA11_REP-1,4229.0,75982110.0,55105850.0,49953.0,31975522.0,62469941.0,111973236.0,248905893.0,4229.0,75982160.0,...,3.3e-05,0.000308,4229.0,0.001592,0.002827,6.405502e-146,2.534612e-09,3.2e-05,0.001646,0.009766


In [11]:
# Total number of missing values remaining after the lenient filter.
df_sign_lenient.isna().sum().sum()

0

In [12]:
# Group the lenient candidates by sample and keep each sample's mpmat_index column.
dt_mpmat_index = {
    sample_name: sample_rows.mpmat_index
    for sample_name, sample_rows in df_sign_lenient.groupby('<sample>')
}
    
# dt_mpmat_index

In [None]:
def check_mpmat_index_in_others(x):
    """Report, per sample, whether a row's region is among that sample's candidates.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row exposing an ``'mpmat_index'`` entry.

    Returns
    -------
    dict
        Maps every key of the module-level ``dt_mpmat_index`` to ``True`` when
        ``x['mpmat_index']`` occurs among the values stored for that sample,
        and to ``False`` otherwise.
    """
    target = x['mpmat_index']
    dt_mpmat_index_isin = dict()

    for sample, sample_mpmat_index in dt_mpmat_index.items():
        # `value in Series` tests the index labels, not the stored values, so
        # the membership test runs against the underlying array instead.
        dt_mpmat_index_isin[sample] = bool(target in sample_mpmat_index.values)

    return dt_mpmat_index_isin

In [None]:
# Columns used for the IGV snapshots: region coordinates, BED name and log2_FC_mut.
df_igv = df_sign_lenient[['chr_name', 'region_start', 'region_end', 'bed_name', 'log2_FC_mut']]

# IGV 截图脚本

In [None]:
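# In theory, what we get here is the candidate list of all samples.
# Because the mpmat files were merged first and the sites were called afterwards, there should be no partially overlapping regions: between samples a region is either shared at exactly the same position or not shared at all.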

In [None]:
# Preview the first rows of the IGV table.
df_igv.head()

In [None]:
# Arrange the DataFrame in BED-file layout (this cell only prints it).
print(df_igv)
# NOTE(review): the sample below shows only three unnamed coordinate columns,
# whereas df_igv is built from five named columns — it looks like output kept
# from an earlier version of this cell; confirm or refresh.
#        0          1          2
# 0   chr5   69093805   69093830
# 1   chr8   37153384   37153424
# 2  chr15   57559994   57560017
# 3  chr15   68651256   68651277
# 4  chr10  119445511  119445546
# 5   chr8   20184990   20185028
# 6  chr19   45187694   45187712
# 7  chr15   81265992   81266016
# 8   chr2  201232409  201232430
# 9   chr9   98034893   98034930

In [None]:
# Fill in the run-specific settings used by the snapshot script.

path_out = '/Volumes/zhaohn_HD/3.project/2022_DdCBE-3D-Genome_topic/2022-09-30_Detect-seq_batch-1/igv'
date = 20221020
format_ = "png"
height = 1500

# Build the script header: panel height and the directory snapshots are written to.
text = f"maxPanelHeight {height}\nsnapshotDirectory {path_out}/off-targets_{date}\n"
print(text)

In [None]:
# Keep the first five columns, which are unpacked below as
# chromosome, start, end, BED name and score.
df_snapshot = df_igv.iloc[:, 0:5]

# One "goto" + "snapshot" command pair per row, appended to the script text
# that already holds the header.
snapshot_commands = []
for chrom, start, stop, bed_name, score in df_snapshot.itertuples(index=False, name=None):
    path_out_png = f'{score}_{bed_name}.snapshot.{format_}'
    middle = int((start + stop) / 2)
    # The viewed window spans 100 bp on either side of the region midpoint.
    snapshot_commands.append(
        f"goto {chrom}:{middle - 100}-{middle + 100}\nsnapshot {path_out_png}\n"
    )
text += ''.join(snapshot_commands)
print(text[:1000])

In [None]:
# Save the assembled command text next to the snapshot directory.
script_path = f'{path_out}/off-targets_{date}_snapshot.igv_shot_script'
with open(script_path, 'wt') as script_file:
    script_file.write(text)

In [None]:
# DetectSeq_JAK2
# DetectSeq_SIRT6
# IND share?