# 待办事项

# 依赖包和环境设置

import python packages

In [1]:
# !pip install matplotlib matplotlib_venn pandas numpy seaborn pandarallel pybedtools plotly upsetplotly --upgrade
# !pip install git+https://github.com/ponnhide/pyCircos.git --upgrade

In [73]:
import collections
import os
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib_venn import venn2
from pybedtools import BedTool
from pycircos import (  # pip install git+https://github.com/ponnhide/pyCircos.git
    Garc,
    Gcircle,
)

In [74]:
from pandarallel import pandarallel

pd.set_option("max_colwidth", 35)  # maximum width of a displayed column
pd.set_option("display.width", 200)  # display width used when printing a DataFrame
pd.set_option("display.max_columns", None)  # maximum number of columns shown (None = no cap)
pd.set_option("display.max_rows", 50)  # maximum number of rows shown
pandarallel.initialize()  # parallel setup; no nb_workers passed, so the library default is used (original note: all cores)

INFO: Pandarallel will run on 20 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


enable rpy2

In [75]:
# enables the %%R magic, not necessary if you’ve already done this
# %load_ext rpy2.ipython
# %reload_ext rpy2.ipython

import R libraries

In [76]:
# %%R
# install.packages('ggpubr')
# install.packages('ggrepel')
# install.packages('ggupset')
# install.packages('UpSetR')

In [77]:
# %%R
# 可以再%%R后面放光标 cmd + i
# library(tidyverse)
# library(ggpubr)
# library(ggrepel)
# library(ggupset)
# library(UpSetR)

# QC 步骤
查看测序和 Mapping 质量

## 搜集 MultiQC 信息

In [78]:
# Parse the MultiQC HTML report and keep the first table found in it.
df_qc = pd.read_html('../qc/multiqc/multiqc_report.html')[0]

# Drop the last three characters of every sample name so that rows differing
# only in that suffix share one name and are averaged together below
# (presumably the per-mate suffix, e.g. "_R1"/"_R2" -- TODO confirm).
df_qc['Sample Name'] = df_qc['Sample Name'].str[:-3]

# These columns are read as text with one trailing character; strip it and
# convert to float.
for pct_col in ('% Dups', '% GC', '% Failed'):
    df_qc[pct_col] = df_qc[pct_col].str[:-1].astype(float)

# Read length is read as text with a three-character suffix (presumably
# " bp" -- TODO confirm); strip it and convert to int.
df_qc['Read Length'] = df_qc['Read Length'].str[:-3].astype(int)

# Average the rows of each sample.  The aggregation is named by string rather
# than passed as ``np.mean``: recent pandas releases warn about the callable
# form and will change how it is applied.
df_qc = df_qc.groupby('Sample Name').agg('mean')

# Double the averaged read length and sequence count (presumably to report
# both mates of a pair together -- TODO confirm).
df_qc['Read Length'] = df_qc['Read Length'] * 2
df_qc['M Seqs'] = df_qc['M Seqs'] * 2

# Turn 'Sample Name' back into a regular column and label the count as raw.
df_qc = df_qc.reset_index().rename(columns={'M Seqs': 'M Seqs <raw fq>'})
df_qc

Unnamed: 0,Sample Name,% Dups,% GC,Read Length,% Failed,M Seqs <raw fq>
0,DetectSeq_ATP8-DddA11_REP-1,62.8,42.0,300.0,10.0,568.4
1,DetectSeq_ATP8-DddA6_REP-1,66.2,41.0,300.0,10.0,612.8
2,DetectSeq_ATP8-DddAwt_REP-1,55.95,40.0,300.0,10.0,583.0
3,DetectSeq_JAK2-DddA11_REP-1,60.15,43.0,300.0,10.0,521.4
4,DetectSeq_JAK2-DddA11_REP-2,49.7,43.0,300.0,15.0,355.8
5,DetectSeq_SIRT6-DddA11_REP-1,57.4,41.5,300.0,10.0,528.0
6,DetectSeq_SIRT6-DddA11_REP-2,40.75,42.0,300.0,5.0,427.0
7,test,8.75,42.0,300.0,0.0,0.0


## 搜集 Hisat-3N Mapping 信息

In [79]:
# Collect the mapping ratio reported on the last line of every Hisat-3N log.
ls = sorted(glob('../bam/*_hisat3n.hisat3n.log'))
tmpls = []

for log_path in ls:
    with open(log_path, 'rt') as handle:
        last_line = handle.readlines()[-1]
    # Sample name = file name without directory and without the log suffix.
    sample = log_path.rsplit('/', 1)[-1].replace('_hisat3n.hisat3n.log', '')
    # The ratio is the number in front of the first '%' on the last line.
    tmpls.append([sample, float(last_line.split('%')[0])])

df_3n = pd.DataFrame(tmpls, columns=['Sample Name', '% Hisat-3n'])
df_3n

Unnamed: 0,Sample Name,% Hisat-3n
0,DetectSeq_ATP8-DddA11_REP-1,66.9
1,DetectSeq_ATP8-DddA6_REP-1,71.75
2,DetectSeq_ATP8-DddAwt_REP-1,79.26
3,DetectSeq_JAK2-DddA11_REP-1,61.5
4,DetectSeq_JAK2-DddA11_REP-2,61.79
5,DetectSeq_SIRT6-DddA11_REP-1,69.31
6,DetectSeq_SIRT6-DddA11_REP-2,74.18
7,test,66.62


## 搜集 Final Mapping 信息 （Hisat3N mapping + BWA remapping）

In [80]:
# Method 1 (kept for reference, disabled): read the counts from flagstats files.
# ls = sorted(glob('../bam/*_final_rmdup.bam.flagstats.tsv'))
# tmpls = []

# for file in ls:
#     with open(file, 'rt') as f:
#         sname = file.split('/')[-1].replace('_final_rmdup.bam.flagstats.tsv', '')
#         ratio = float(f.readlines()[6].split('\t')[0])
#         tmpls.append([sname, ratio])
# df_final_map_flagstats = pd.DataFrame(tmpls, columns=['Sample Name', 'Seqs'])
# df_final_map_flagstats['']
# df_final_map_flagstats

# Method 2: generate a small shell script that, for every *_rmdup.bam file,
# sums column 3 of `samtools idxstats` and appends "<bam path> <sum/1e6>M reads"
# to all_final_mapped_reads.txt.
# Notes on the script text:
#   * this is a non-raw Python string, so each backslash-newline is consumed by
#     Python and the whole pipeline is written to the .sh file as one line;
#   * touch + rm removes any previous result file without failing when absent;
#   * the `for` loop has no do/done -- it relies on zsh accepting that short
#     form, so the script must keep being run with zsh.
text = """\
touch ../bam/all_final_mapped_reads.txt
rm ../bam/all_final_mapped_reads.txt

for i in `ls ../bam/*_rmdup.bam`
    samtools idxstats $i | \
    	awk '{sum += $3} END {print sum/1000000 "M reads"}' | \
    	xargs echo "$i" \
    	>> ../bam/all_final_mapped_reads.txt
"""
with open('../bam/all_final_mapped_reads.sh', 'wt') as f:
    f.write(text)

# Run the script and fail loudly on a non-zero status.  An explicit check is
# used instead of `assert`, because assert statements (including the call
# inside them) are removed when Python runs with -O.
exit_status = os.system('zsh ../bam/all_final_mapped_reads.sh')
if exit_status != 0:
    raise RuntimeError(f'all_final_mapped_reads.sh failed with status {exit_status}')

# Each output line is "<bam path> <count>M reads": keep the first two fields.
df_final_map = pd.read_csv(
    '../bam/all_final_mapped_reads.txt',
    sep=' ',
    header=None,
    names=['Sample Name', 'M Seqs', '_'],
    usecols=[0, 1],
)
df_final_map

Unnamed: 0,Sample Name,M Seqs
0,../bam/DetectSeq_ATP8-DddA11_RE...,176.744M
1,../bam/DetectSeq_ATP8-DddA6_REP...,163.108M
2,../bam/DetectSeq_ATP8-DddAwt_RE...,219.36M
3,../bam/DetectSeq_JAK2-DddA11_RE...,163.946M
4,../bam/DetectSeq_JAK2-DddA11_RE...,143.23M
5,../bam/DetectSeq_SIRT6-DddA11_R...,180.379M
6,../bam/DetectSeq_SIRT6-DddA11_R...,215.754M
7,../bam/test_final_rmdup.bam,0.030129M


In [81]:
# Reduce "../bam/<sample>_final_rmdup.bam" to "<sample>".
# regex=False makes both replacements literal: without it, older pandas
# versions interpret the pattern as a regular expression (so '.' matches any
# character) and emit a FutureWarning about the changing default.
df_final_map['Sample Name'] = (
    df_final_map['Sample Name']
    .str.replace('../bam/', '', regex=False)
    .str.replace('_final_rmdup.bam', '', regex=False)
)
# Label the count column as the final, mapped, duplicate-removed read count.
df_final_map.rename(columns={'M Seqs': 'M Seqs <final mapped rm dup>'}, inplace=True)
df_final_map

  df_final_map['Sample Name'] = df_final_map['Sample Name'].str.replace('../bam/', '').str.replace('_final_rmdup.bam', '')


Unnamed: 0,Sample Name,M Seqs <final mapped rm dup>
0,DetectSeq_ATP8-DddA11_REP-1,176.744M
1,DetectSeq_ATP8-DddA6_REP-1,163.108M
2,DetectSeq_ATP8-DddAwt_REP-1,219.36M
3,DetectSeq_JAK2-DddA11_REP-1,163.946M
4,DetectSeq_JAK2-DddA11_REP-2,143.23M
5,DetectSeq_SIRT6-DddA11_REP-1,180.379M
6,DetectSeq_SIRT6-DddA11_REP-2,215.754M
7,test,0.030129M


## 汇总 QC 信息

In [82]:
# Join the three QC tables (default inner join on the columns they share).
df_qc_all = pd.merge(df_qc, df_3n)
df_qc_all = pd.merge(df_qc_all, df_final_map)
df_qc_all

Unnamed: 0,Sample Name,% Dups,% GC,Read Length,% Failed,M Seqs <raw fq>,% Hisat-3n,M Seqs <final mapped rm dup>
0,DetectSeq_ATP8-DddA11_REP-1,62.8,42.0,300.0,10.0,568.4,66.9,176.744M
1,DetectSeq_ATP8-DddA6_REP-1,66.2,41.0,300.0,10.0,612.8,71.75,163.108M
2,DetectSeq_ATP8-DddAwt_REP-1,55.95,40.0,300.0,10.0,583.0,79.26,219.36M
3,DetectSeq_JAK2-DddA11_REP-1,60.15,43.0,300.0,10.0,521.4,61.5,163.946M
4,DetectSeq_JAK2-DddA11_REP-2,49.7,43.0,300.0,15.0,355.8,61.79,143.23M
5,DetectSeq_SIRT6-DddA11_REP-1,57.4,41.5,300.0,10.0,528.0,69.31,180.379M
6,DetectSeq_SIRT6-DddA11_REP-2,40.75,42.0,300.0,5.0,427.0,74.18,215.754M
7,test,8.75,42.0,300.0,0.0,0.0,66.62,0.030129M


In [83]:
raw_col = 'M Seqs <raw fq>'
hisat_col = 'M Seqs <Hisat-3n mapped>'
final_col = 'M Seqs <final mapped rm dup>'

df_qc_all['Read Length'] = df_qc_all['Read Length'].astype(int)

# Reads mapped by Hisat-3N = raw reads * mapping percentage, to 2 decimals.
hisat_mapped = df_qc_all[raw_col] * df_qc_all['% Hisat-3n'] / 100
df_qc_all[hisat_col] = hisat_mapped.map(lambda v: round(v, 2))

# Final counts are text ending in one extra character ("M"); strip it,
# convert to float and round to 2 decimals.
final_mapped = df_qc_all[final_col].str[:-1].astype(float)
df_qc_all[final_col] = final_mapped.map(lambda v: round(v, 2))

# Drop helper columns, swap the last two columns so the Hisat-3N count comes
# before the final count, and remove the 'test' sample.
df_qc_all = df_qc_all.drop(columns=['% Failed', '% Hisat-3n'])
df_qc_all = df_qc_all.iloc[:, [0, 1, 2, 3, 4, 6, 5]].copy()
df_qc_all = df_qc_all.loc[df_qc_all['Sample Name'] != 'test'].copy()

# Share of raw reads that survive to the final deduplicated mapping.
effective_pct = df_qc_all[final_col] / df_qc_all[raw_col] * 100
df_qc_all['% Effective Seqs'] = effective_pct.map(lambda v: round(v, 2))
df_qc_all

Unnamed: 0,Sample Name,% Dups,% GC,Read Length,M Seqs <raw fq>,M Seqs <Hisat-3n mapped>,M Seqs <final mapped rm dup>,% Effective Seqs
0,DetectSeq_ATP8-DddA11_REP-1,62.8,42.0,300,568.4,380.26,176.74,31.09
1,DetectSeq_ATP8-DddA6_REP-1,66.2,41.0,300,612.8,439.68,163.11,26.62
2,DetectSeq_ATP8-DddAwt_REP-1,55.95,40.0,300,583.0,462.09,219.36,37.63
3,DetectSeq_JAK2-DddA11_REP-1,60.15,43.0,300,521.4,320.66,163.95,31.44
4,DetectSeq_JAK2-DddA11_REP-2,49.7,43.0,300,355.8,219.85,143.23,40.26
5,DetectSeq_SIRT6-DddA11_REP-1,57.4,41.5,300,528.0,365.96,180.38,34.16
6,DetectSeq_SIRT6-DddA11_REP-2,40.75,42.0,300,427.0,316.75,215.75,50.53


# [TEST] 测试 call 点
搞一个老的 TAS-independent 的 mpmat 文件

generate old TAS-independent off-target mpmat to call regions

In [84]:
# Load the region ID and the three per-target off-target ID columns from the
# earlier final list.
# Columns are renamed by name rather than by assigning `.columns`
# positionally: read_csv returns `usecols` columns in file order, not in the
# order listed, so a positional rename could silently mislabel them.
old_list_columns = {
    'region_id': 'mpmat_index',
    'off_target_id.V4.ND4': 'id_ND4',
    'off_target_id.V4.ND5.1': 'id_ND5.1',
    'off_target_id.V4.ND6': 'id_ND6',
}
df_old_final_list = pd.read_csv(
    "../tables/20220312-DdCBE-off_target_type.FinallistV4.CheckPrimer.AddV4ID.tsv",
    sep='\t',
    header=0,
    usecols=list(old_list_columns),
).rename(columns=old_list_columns)
# Fix the column order explicitly so later cells see a stable layout.
df_old_final_list = df_old_final_list[list(old_list_columns.values())]
df_old_final_list.head()

Unnamed: 0,mpmat_index,id_ND4,id_ND5.1,id_ND6
0,chr1_1471366_1471410,,,
1,chr1_1693068_1693084,,,
2,chr1_2044988_2044998,,ND5.1-TAS.IND-1,
3,chr1_5806517_5806533,,,
4,chr1_5983069_5983085,,,


In [85]:
def query_ind(x):
    """Return True if any of the three ID columns of row *x* contains 'IND'.

    Missing values are treated as empty strings, so they never match.
    The columns are checked in the order id_ND4, id_ND5.1, id_ND6.
    """
    row = x.fillna('')
    return any('IND' in row[col] for col in ('id_ND4', 'id_ND5.1', 'id_ND6'))


# Number of rows for which query_ind is True, i.e. rows with 'IND' in at
# least one of the three ID columns.
df_old_final_list.apply(query_ind, axis=1).sum()

650

In [86]:
# Keep only the rows flagged by query_ind and renumber them from 0.
is_ind = df_old_final_list.apply(query_ind, axis=1)
df_old_share = df_old_final_list.loc[is_ind].reset_index(drop=True)
df_old_share

Unnamed: 0,mpmat_index,id_ND4,id_ND5.1,id_ND6
0,chr1_2044988_2044998,,ND5.1-TAS.IND-1,
1,chr1_9702414_9702449,,ND5.1-TAS.IND-2,ND6-TAS.IND-1
2,chr1_12618165_12618174,,ND5.1-TAS.IND-3,
3,chr1_13019998_13020019,,,ND6-TAS.IND-2
4,chr1_13354705_13354738,,,ND6-TAS.IND-3
...,...,...,...,...
645,chrX_136925330_136925338,,,ND6-TAS.IND-538
646,chrX_138128211_138128237,,,ND6-TAS.IND-539
647,chrX_150683735_150683758,,ND5.1-TAS.IND-452,ND6-TAS.IND-540
648,chrX_153669436_153669453,,ND5.1-TAS.IND-453,ND6-TAS.IND-541


In [87]:
# Build an mpmat from the 650 old shared sites, to be used for calling the
# corresponding positions in these five samples.
df_mpmat = pd.read_csv(
    '../tables/293T-DdCBE-merge_hg38.MAPQ20.C6_M4_R1_T10.sort.V4.mpmat.gz',
    header=None,
    sep='\t',
)
# Region key "<col0>_<col1>_<col2>", the same format as the region IDs above.
chrom = df_mpmat[0]
start = df_mpmat[1].astype(str)
end = df_mpmat[2].astype(str)
df_mpmat['mpmat_index'] = chrom + '_' + start + '_' + end
df_mpmat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,mpmat_index
0,chr1,48137,48144,3,2,0,"chr1_48137_CT,chr1_48139_CT,chr...",424,191919,"0.21053,0.10526,0.21053","False,False,False",000,"Pass,Filter,Pass",chr1_48137_48144
1,chr1,127550,127558,2,2,0,"chr1_127550_CT,chr1_127558_CT",55,2221,"0.22727,0.2381","False,False",00,"Pass,Pass",chr1_127550_127558
2,chr1,261533,261543,2,2,0,"chr1_261533_GA,chr1_261543_GA",44,1417,"0.28571,0.23529","False,False",00,"Pass,Pass",chr1_261533_261543
3,chr1,629090,629115,9,4,0,"chr1_629090_CT,chr1_629091_CT,c...",27352314824290,"716,807,1057,1448,1633,1884,209...","0.03771,0.00372,0.00473,0.01588...","False,False,False,False,False,F...",000000000,"Pass,Filter,Filter,Pass,Filter,...",chr1_629090_629115
4,chr1,629136,629148,6,2,0,"chr1_629136_CT,chr1_629138_CT,c...",10010190968,357935713591360436143617,"0.02794,0.0028,0.00529,0.0,0.00...","False,False,False,False,False,F...",000000,"Pass,Filter,Filter,Filter,Filte...",chr1_629136_629148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103310,chrM,16218,16261,20,18,1,"chrM_16218_CT,chrM_16221_CT,chr...","142,102,109,61,179,94,147,120,1...","7817,7756,7847,7891,7860,7832,7...","0.01817,0.01315,0.01389,0.00773...","False,False,False,False,False,F...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...","Pass,Pass,Pass,Filter,Pass,Pass...",chrM_16218_16261
103311,chrM,16255,16274,3,3,0,"chrM_16255_GA,chrM_16273_GA,chr...",110145337,753676207678,"0.0146,0.01903,0.04389","False,False,False",000,"Pass,Pass,Pass",chrM_16255_16274
103312,chrM,16259,16279,11,8,1,"chrM_16259_CT,chrM_16260_CT,chr...","104,85,87,58,129,107,93,103,712...","7715,7730,7801,7811,7836,7758,7...","0.01348,0.011,0.01115,0.00743,0...","False,False,False,False,False,F...",00000000000,"Pass,Pass,Pass,Filter,Pass,Pass...",chrM_16259_16279
103313,chrM,16329,16373,7,7,0,"chrM_16329_GA,chrM_16336_GA,chr...",131147246421466656106,7673775577467663779777647859,"0.01707,0.01896,0.03176,0.05494...","False,False,False,False,False,F...",0000000,"Pass,Pass,Pass,Pass,Pass,Pass,Pass",chrM_16329_16373


In [88]:
# Right join: one output row per old shared site, carrying the matching mpmat
# columns where the region key is found.
df_merge = df_mpmat.merge(df_old_share, how='right', on='mpmat_index')

# Drop the last four columns (the join key and the three ID columns) so only
# the original mpmat columns remain.
df_aim_mpmat = df_merge.iloc[:, :-4]
df_aim_mpmat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,chr1,2044988,2044998,7,6,0,"chr1_2044988_CT,chr1_2044989_CT...",1411711085,25252523222223,"0.56,0.44,0.28,0.47826,0.0,0.36...","False,False,False,False,False,F...",0000000,"Pass,Pass,Pass,Pass,Filter,Pass..."
1,chr1,9702414,9702449,11,9,0,"chr1_9702414_GA,chr1_9702415_GA...",4138771810201232,"103,100,96,100,94,97,91,91,93,9...","0.03883,0.01,0.03125,0.08,0.074...","False,False,False,False,False,F...",00000000000,"Pass,Filter,Filter,Pass,Pass,Pa..."
2,chr1,12618165,12618174,5,5,0,"chr1_12618165_CT,chr1_12618166_...",1141187,2629293126,"0.42308,0.13793,0.37931,0.25806...","False,False,False,False,False",00000,"Pass,Pass,Pass,Pass,Pass"
3,chr1,13019998,13020019,9,8,0,"chr1_13019998_GA,chr1_13020006_...",72755117719,384240424241424142,"0.18421,0.04762,0.175,0.11905,0...","False,False,False,False,False,F...",000000000,"Pass,Filter,Pass,Pass,Pass,Pass..."
4,chr1,13354705,13354738,11,11,0,"chr1_13354705_CT,chr1_13354706_...",379282313161241045,6361666867686568687169,"0.5873,0.14754,0.42424,0.33824,...","False,False,False,False,False,F...",00000000000,"Pass,Pass,Pass,Pass,Pass,Pass,P..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
645,chrX,136925330,136925338,5,5,0,"chrX_136925330_CT,chrX_13692533...",1061589,3131303231,"0.32258,0.19355,0.5,0.25,0.29032","False,False,False,False,False",00000,"Pass,Pass,Pass,Pass,Pass"
646,chrX,138128211,138128237,9,7,0,"chrX_138128211_GA,chrX_13812821...",54344316926,373838383834373735,"0.13514,0.10526,0.07895,0.10526...","False,False,False,False,False,F...",000000000,"Pass,Pass,Filter,Pass,Pass,Filt..."
647,chrX,150683735,150683758,11,11,0,"chrX_150683735_GA,chrX_15068373...",81213153541755752909,"217,211,211,209,207,204,211,203...","0.03687,0.05687,0.06161,0.07177...","False,False,False,False,False,F...",00000000000,"Pass,Pass,Pass,Pass,Pass,Pass,P..."
648,chrX,153669436,153669453,10,8,0,"chrX_153669436_CT,chrX_15366943...",28222511473737,62646166646667666567,"0.45161,0.34375,0.40984,0.16667...","False,False,False,False,False,F...",0000000000,"Pass,Pass,Pass,Pass,Pass,Pass,F..."


In [89]:
# Write the selected regions as a tab-separated file without header or index.
os.makedirs('../mpmat', exist_ok=True)
df_aim_mpmat.to_csv(
    '../mpmat/2022-10-21_nature_4_5.1_6_share_650-off-targets.mpmat',
    sep='\t',
    header=False,
    index=False,
)

## analysis with old TAS-independent list (650 sites) -> df_aim_mpmat

### Detect-seq signal comparison for DddA\_wt/6/11

In [90]:
# Poisson test results obtained by calling with the 650-site mpmat.
# The first line of the file is the header; no column is used as the index.
df_old_site_new_signal = pd.read_csv(
    '../poisson_res_use_old650/poisson_res_all_use650.tsv.gz',
    sep='\t',
)
df_old_site_new_signal

Unnamed: 0,<sample>,chr_name,region_start,region_end,mpmat_index,region_site_num,region_block_site_num,region_mut_site_num,region_site_index,region_block_state,region_highest_site_index,region_highest_site_mut_num,region_highest_site_cover_num,region_highest_site_mut_ratio,ctrl_count,treat_count,ctrl_mut_count,treat_mut_count,ctrl_count.norm,treat_count.norm,ctrl_mut_count.norm,treat_mut_count.norm,count_info,log2_FC,log2_FC_mut,test_state,p_value,FDR
0,DetectSeq_ATP8-DddA6_REP-1,chr1,2044988,2044998,chr1_2044988_2044998,7,0,7,"chr1_2044988_CT,chr1_2044989_CT...",N-N-N-N-N-N-N,chr1_2044989_CT,4,6,0.666667,9,6,0,3,0.022784,0.038561,0.000000,0.019281,"0,1,2,3,4,5,6,7 9,0,0,0,0,0,0,0...",0.759147,1.251110,TestOK,2.153850e-01,4.105579e-01
1,DetectSeq_ATP8-DddA6_REP-1,chr1,9702414,9702449,chr1_9702414_9702449,11,0,11,"chr1_9702414_GA,chr1_9702415_GA...",N-N-N-N-N-N-N-N-N-N-N,chr1_9702414_GA,1,10,0.100000,18,13,0,2,0.045567,0.083549,0.000000,0.012854,"0,1,2,3,4,5,6,7,8,9,10,11 16,2,...",0.874624,0.666147,TestOK,3.559715e-01,5.418769e-01
2,DetectSeq_ATP8-DddA6_REP-1,chr1,12618165,12618174,chr1_12618165_12618174,5,0,5,"chr1_12618165_CT,chr1_12618166_...",N-N-N-N-N,chr1_12618166_CT,2,8,0.250000,10,8,0,2,0.025315,0.051415,0.000000,0.012854,"0,1,2,3,4,5 10,0,0,0,0,0 4,2,2,...",1.022181,0.666147,TestOK,3.559715e-01,5.418769e-01
3,DetectSeq_ATP8-DddA6_REP-1,chr1,13019998,13020019,chr1_13019998_13020019,9,0,9,"chr1_13019998_GA,chr1_13020006_...",N-N-N-N-N-N-N-N-N,chr1_13019998_GA,0,2,0.000000,4,4,0,0,0.010126,0.025707,0.000000,0.000000,"0,1,2,3,4,5,6,7,8,9 4,0,0,0,0,0...",1.344109,,TestOK,8.274602e-01,8.274602e-01
4,DetectSeq_ATP8-DddA6_REP-1,chr1,13354705,13354738,chr1_13354705_13354738,11,0,11,"chr1_13354705_CT,chr1_13354706_...",N-N-N-N-N-N-N-N-N-N-N,chr1_13354705_CT,0,0,0.000000,24,0,0,0,0.060756,0.000000,0.000000,0.000000,"0,1,2,3,4,5,6,7,8,9,10,11 24,0,...",,,TestOK,8.274602e-01,8.274602e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3245,DetectSeq_SIRT6-DddA11_REP-1,chrX,136925330,136925338,chrX_136925330_136925338,5,0,5,"chrX_136925330_CT,chrX_13692533...",N-N-N-N-N,chrX_136925330_CT,42,103,0.407767,43,129,0,59,0.108855,0.717403,0.000000,0.328114,"0,1,2,3,4,5 43,0,0,0,0,0 27,43,...",2.720374,5.608487,TestOK,2.433039e-20,9.702301e-20
3246,DetectSeq_SIRT6-DddA11_REP-1,chrX,138128211,138128237,chrX_138128211_138128237,9,1,8,"chrX_138128211_GA,chrX_13812821...",N-N-N-B-N-N-N-N-N,chrX_138128237_GA,12,28,0.428571,33,37,1,13,0.083540,0.205767,0.002532,0.072296,"0,1,2,3,4,5,6,7,8,9 32,0,1,0,0,...",1.300471,4.835851,TestOK,2.889936e-04,4.053103e-04
3247,DetectSeq_SIRT6-DddA11_REP-1,chrX,150683735,150683758,chrX_150683735_150683758,11,0,11,"chrX_150683735_GA,chrX_15068373...",N-N-N-N-N-N-N-N-N-N-N,chrX_150683756_GA,93,200,0.465000,37,269,2,188,0.093666,1.495979,0.005063,1.045517,"0,1,2,3,4,5,6,7,8,9,10,11 35,0,...",3.997421,7.690000,TestOK,3.003205e-70,1.626736e-68
3248,DetectSeq_SIRT6-DddA11_REP-1,chrX,153669436,153669453,chrX_153669436_153669453,10,0,10,"chrX_153669436_CT,chrX_15366943...",N-N-N-N-N-N-N-N-N-N,chrX_153669436_CT,7,22,0.318182,22,26,0,11,0.055693,0.144593,0.000000,0.061174,"0,1,2,3,4,5,6,7,8,9,10 22,0,0,0...",1.376420,3.185275,TestOK,1.185269e-03,1.578739e-03


In [91]:
# Print the column names, non-null counts and dtypes of the results table.
df_old_site_new_signal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3250 entries, 0 to 3249
Data columns (total 28 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   <sample>                       3250 non-null   object 
 1   chr_name                       3250 non-null   object 
 2   region_start                   3250 non-null   int64  
 3   region_end                     3250 non-null   int64  
 4   mpmat_index                    3250 non-null   object 
 5   region_site_num                3250 non-null   int64  
 6   region_block_site_num          3250 non-null   int64  
 7   region_mut_site_num            3250 non-null   int64  
 8   region_site_index              3250 non-null   object 
 9   region_block_state             3250 non-null   object 
 10  region_highest_site_index      3250 non-null   object 
 11  region_highest_site_mut_num    3250 non-null   int64  
 12  region_highest_site_cover_num  3250 non-null   i

In [92]:
# Total number of missing cells across all columns of the results table.
df_old_site_new_signal.isna().sum().sum()

370

In [93]:
# Add a name column "<mpmat_index>_highest_<highest site index>" and a
# constant '.' strand column.
bed_names = (
    df_old_site_new_signal['mpmat_index']
    + '_highest_'
    + df_old_site_new_signal['region_highest_site_index']
)
df_old_site_new_signal = df_old_site_new_signal.assign(bed_name=bed_names, strand='.')
df_old_site_new_signal

Unnamed: 0,<sample>,chr_name,region_start,region_end,mpmat_index,region_site_num,region_block_site_num,region_mut_site_num,region_site_index,region_block_state,region_highest_site_index,region_highest_site_mut_num,region_highest_site_cover_num,region_highest_site_mut_ratio,ctrl_count,treat_count,ctrl_mut_count,treat_mut_count,ctrl_count.norm,treat_count.norm,ctrl_mut_count.norm,treat_mut_count.norm,count_info,log2_FC,log2_FC_mut,test_state,p_value,FDR,bed_name,strand
0,DetectSeq_ATP8-DddA6_REP-1,chr1,2044988,2044998,chr1_2044988_2044998,7,0,7,"chr1_2044988_CT,chr1_2044989_CT...",N-N-N-N-N-N-N,chr1_2044989_CT,4,6,0.666667,9,6,0,3,0.022784,0.038561,0.000000,0.019281,"0,1,2,3,4,5,6,7 9,0,0,0,0,0,0,0...",0.759147,1.251110,TestOK,2.153850e-01,4.105579e-01,chr1_2044988_2044998_highest_ch...,.
1,DetectSeq_ATP8-DddA6_REP-1,chr1,9702414,9702449,chr1_9702414_9702449,11,0,11,"chr1_9702414_GA,chr1_9702415_GA...",N-N-N-N-N-N-N-N-N-N-N,chr1_9702414_GA,1,10,0.100000,18,13,0,2,0.045567,0.083549,0.000000,0.012854,"0,1,2,3,4,5,6,7,8,9,10,11 16,2,...",0.874624,0.666147,TestOK,3.559715e-01,5.418769e-01,chr1_9702414_9702449_highest_ch...,.
2,DetectSeq_ATP8-DddA6_REP-1,chr1,12618165,12618174,chr1_12618165_12618174,5,0,5,"chr1_12618165_CT,chr1_12618166_...",N-N-N-N-N,chr1_12618166_CT,2,8,0.250000,10,8,0,2,0.025315,0.051415,0.000000,0.012854,"0,1,2,3,4,5 10,0,0,0,0,0 4,2,2,...",1.022181,0.666147,TestOK,3.559715e-01,5.418769e-01,chr1_12618165_12618174_highest_...,.
3,DetectSeq_ATP8-DddA6_REP-1,chr1,13019998,13020019,chr1_13019998_13020019,9,0,9,"chr1_13019998_GA,chr1_13020006_...",N-N-N-N-N-N-N-N-N,chr1_13019998_GA,0,2,0.000000,4,4,0,0,0.010126,0.025707,0.000000,0.000000,"0,1,2,3,4,5,6,7,8,9 4,0,0,0,0,0...",1.344109,,TestOK,8.274602e-01,8.274602e-01,chr1_13019998_13020019_highest_...,.
4,DetectSeq_ATP8-DddA6_REP-1,chr1,13354705,13354738,chr1_13354705_13354738,11,0,11,"chr1_13354705_CT,chr1_13354706_...",N-N-N-N-N-N-N-N-N-N-N,chr1_13354705_CT,0,0,0.000000,24,0,0,0,0.060756,0.000000,0.000000,0.000000,"0,1,2,3,4,5,6,7,8,9,10,11 24,0,...",,,TestOK,8.274602e-01,8.274602e-01,chr1_13354705_13354738_highest_...,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3245,DetectSeq_SIRT6-DddA11_REP-1,chrX,136925330,136925338,chrX_136925330_136925338,5,0,5,"chrX_136925330_CT,chrX_13692533...",N-N-N-N-N,chrX_136925330_CT,42,103,0.407767,43,129,0,59,0.108855,0.717403,0.000000,0.328114,"0,1,2,3,4,5 43,0,0,0,0,0 27,43,...",2.720374,5.608487,TestOK,2.433039e-20,9.702301e-20,chrX_136925330_136925338_highes...,.
3246,DetectSeq_SIRT6-DddA11_REP-1,chrX,138128211,138128237,chrX_138128211_138128237,9,1,8,"chrX_138128211_GA,chrX_13812821...",N-N-N-B-N-N-N-N-N,chrX_138128237_GA,12,28,0.428571,33,37,1,13,0.083540,0.205767,0.002532,0.072296,"0,1,2,3,4,5,6,7,8,9 32,0,1,0,0,...",1.300471,4.835851,TestOK,2.889936e-04,4.053103e-04,chrX_138128211_138128237_highes...,.
3247,DetectSeq_SIRT6-DddA11_REP-1,chrX,150683735,150683758,chrX_150683735_150683758,11,0,11,"chrX_150683735_GA,chrX_15068373...",N-N-N-N-N-N-N-N-N-N-N,chrX_150683756_GA,93,200,0.465000,37,269,2,188,0.093666,1.495979,0.005063,1.045517,"0,1,2,3,4,5,6,7,8,9,10,11 35,0,...",3.997421,7.690000,TestOK,3.003205e-70,1.626736e-68,chrX_150683735_150683758_highes...,.
3248,DetectSeq_SIRT6-DddA11_REP-1,chrX,153669436,153669453,chrX_153669436_153669453,10,0,10,"chrX_153669436_CT,chrX_15366943...",N-N-N-N-N-N-N-N-N-N,chrX_153669436_CT,7,22,0.318182,22,26,0,11,0.055693,0.144593,0.000000,0.061174,"0,1,2,3,4,5,6,7,8,9,10 22,0,0,0...",1.376420,3.185275,TestOK,1.185269e-03,1.578739e-03,chrX_153669436_153669453_highes...,.


In [94]:
# Strict significance filters, applied one after another; a row must pass
# every one of them to be kept.
strict_filters = [
    'FDR <= 0.0001',
    'log2_FC_mut >= 2',
    'ctrl_mut_count <= 1',
    '`treat_mut_count.norm` * 100 >= 10',
    'treat_mut_count >= 20',
    'treat_mut_count / treat_count >= 0.15',
    'region_block_site_num <= 1',
    'region_highest_site_mut_ratio >= 0.35',
]

df_sign_strict = df_old_site_new_signal
for condition in strict_filters:
    df_sign_strict = df_sign_strict.query(condition)

# Number of rows that pass all filters, then per-sample summary statistics.
print(len(df_sign_strict))
df_sign_strict.groupby('<sample>').describe()

617


Unnamed: 0_level_0,region_start,region_start,region_start,region_start,region_start,region_start,region_start,region_start,region_end,region_end,region_end,region_end,region_end,region_end,region_end,region_end,region_site_num,region_site_num,region_site_num,region_site_num,region_site_num,region_site_num,region_site_num,region_site_num,region_block_site_num,region_block_site_num,region_block_site_num,region_block_site_num,region_block_site_num,region_block_site_num,region_block_site_num,region_block_site_num,region_mut_site_num,region_mut_site_num,region_mut_site_num,region_mut_site_num,region_mut_site_num,region_mut_site_num,region_mut_site_num,region_mut_site_num,region_highest_site_mut_num,region_highest_site_mut_num,region_highest_site_mut_num,region_highest_site_mut_num,region_highest_site_mut_num,region_highest_site_mut_num,region_highest_site_mut_num,region_highest_site_mut_num,region_highest_site_cover_num,region_highest_site_cover_num,region_highest_site_cover_num,region_highest_site_cover_num,region_highest_site_cover_num,region_highest_site_cover_num,region_highest_site_cover_num,region_highest_site_cover_num,region_highest_site_mut_ratio,region_highest_site_mut_ratio,region_highest_site_mut_ratio,region_highest_site_mut_ratio,region_highest_site_mut_ratio,region_highest_site_mut_ratio,region_highest_site_mut_ratio,region_highest_site_mut_ratio,ctrl_count,ctrl_count,ctrl_count,ctrl_count,ctrl_count,ctrl_count,ctrl_count,ctrl_count,treat_count,treat_count,treat_count,treat_count,treat_count,treat_count,treat_count,treat_count,ctrl_mut_count,ctrl_mut_count,ctrl_mut_count,ctrl_mut_count,ctrl_mut_count,ctrl_mut_count,ctrl_mut_count,ctrl_mut_count,treat_mut_count,treat_mut_count,treat_mut_count,treat_mut_count,treat_mut_count,treat_mut_count,treat_mut_count,treat_mut_count,ctrl_count.norm,ctrl_count.norm,ctrl_count.norm,ctrl_count.norm,ctrl_count.norm,ctrl_count.norm,ctrl_count.norm,ctrl_count.norm,treat_count.norm,treat_count.norm,treat_count.norm,treat_coun
t.norm,treat_count.norm,treat_count.norm,treat_count.norm,treat_count.norm,ctrl_mut_count.norm,ctrl_mut_count.norm,ctrl_mut_count.norm,ctrl_mut_count.norm,ctrl_mut_count.norm,ctrl_mut_count.norm,ctrl_mut_count.norm,ctrl_mut_count.norm,treat_mut_count.norm,treat_mut_count.norm,treat_mut_count.norm,treat_mut_count.norm,treat_mut_count.norm,treat_mut_count.norm,treat_mut_count.norm,treat_mut_count.norm,log2_FC,log2_FC,log2_FC,log2_FC,log2_FC,log2_FC,log2_FC,log2_FC,log2_FC_mut,log2_FC_mut,log2_FC_mut,log2_FC_mut,log2_FC_mut,log2_FC_mut,log2_FC_mut,log2_FC_mut,p_value,p_value,p_value,p_value,p_value,p_value,p_value,p_value,FDR,FDR,FDR,FDR,FDR,FDR,FDR,FDR
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
<sample>,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2,Unnamed: 61_level_2,Unnamed: 62_level_2,Unnamed: 63_level_2,Unnamed: 64_level_2,Unnamed: 65_level_2,Unnamed: 66_level_2,Unnamed: 67_level_2,Unnamed: 68_level_2,Unnamed: 69_level_2,Unnamed: 70_level_2,Unnamed: 71_level_2,Unnamed: 72_level_2,Unnamed: 73_level_2,Unnamed: 74_level_2,Unnamed: 75_level_2,Unnamed: 76_level_2,Unnamed: 77_level_2,Unnamed: 78_level_2,Unnamed: 79_level_2,Unnamed: 80_level_2,Unnamed: 81_level_2,Unnamed: 82_level_2,Unnamed: 83_level_2,Unnamed: 84_level_2,Unnamed: 85_level_2,Unnamed: 86_level_2,Unnamed: 87_level_2,Unnamed: 88_level_2,Unnamed: 89_level_2,Unnamed: 90_level_2,Unnamed: 91_level_2,Unnamed: 92_level_2,Unnamed: 93_level_2,Unnamed: 94_level_2,Unnamed: 95_level_2,Unnamed: 96_level_2,Unnamed: 97_level_2,Unnamed: 98_level_2,Unnamed: 99_level_2,Unnamed: 
100_level_2,Unnamed: 101_level_2,Unnamed: 102_level_2,Unnamed: 103_level_2,Unnamed: 104_level_2,Unnamed: 105_level_2,Unnamed: 106_level_2,Unnamed: 107_level_2,Unnamed: 108_level_2,Unnamed: 109_level_2,Unnamed: 110_level_2,Unnamed: 111_level_2,Unnamed: 112_level_2,Unnamed: 113_level_2,Unnamed: 114_level_2,Unnamed: 115_level_2,Unnamed: 116_level_2,Unnamed: 117_level_2,Unnamed: 118_level_2,Unnamed: 119_level_2,Unnamed: 120_level_2,Unnamed: 121_level_2,Unnamed: 122_level_2,Unnamed: 123_level_2,Unnamed: 124_level_2,Unnamed: 125_level_2,Unnamed: 126_level_2,Unnamed: 127_level_2,Unnamed: 128_level_2,Unnamed: 129_level_2,Unnamed: 130_level_2,Unnamed: 131_level_2,Unnamed: 132_level_2,Unnamed: 133_level_2,Unnamed: 134_level_2,Unnamed: 135_level_2,Unnamed: 136_level_2,Unnamed: 137_level_2,Unnamed: 138_level_2,Unnamed: 139_level_2,Unnamed: 140_level_2,Unnamed: 141_level_2,Unnamed: 142_level_2,Unnamed: 143_level_2,Unnamed: 144_level_2,Unnamed: 145_level_2,Unnamed: 146_level_2,Unnamed: 147_level_2,Unnamed: 148_level_2,Unnamed: 149_level_2,Unnamed: 150_level_2,Unnamed: 151_level_2,Unnamed: 152_level_2,Unnamed: 153_level_2,Unnamed: 154_level_2,Unnamed: 155_level_2,Unnamed: 156_level_2,Unnamed: 157_level_2,Unnamed: 158_level_2,Unnamed: 159_level_2,Unnamed: 160_level_2
DetectSeq_ATP8-DddA11_REP-1,31.0,86007340.0,49946550.0,2220193.0,50522662.5,78617702.0,113368600.0,201232409.0,31.0,86007370.0,49946550.0,2220214.0,50522683.0,78617723.0,113368600.0,201232430.0,31.0,10.83871,3.32763,4.0,9.0,11.0,13.0,18.0,31.0,0.064516,0.249731,0.0,0.0,0.0,0.0,1.0,31.0,10.774194,3.263022,4.0,9.0,11.0,13.0,18.0,31.0,21.225806,12.227591,10.0,13.0,18.0,24.5,68.0,31.0,50.645161,30.638264,25.0,31.0,41.0,59.0,174.0,31.0,0.422565,0.05308,0.35,0.378354,0.411765,0.453463,0.545455,31.0,22.580645,10.760341,9.0,14.0,20.0,28.5,49.0,31.0,66.290323,40.377546,32.0,40.5,48.0,80.5,230.0,31.0,0.129032,0.340777,0.0,0.0,0.0,0.0,1.0,31.0,35.290323,16.925707,20.0,23.0,27.0,45.5,88.0,31.0,0.057163,0.02724,0.022784,0.035441,0.05063,0.072148,0.124044,31.0,0.395146,0.240684,0.190747,0.241414,0.286121,0.479848,1.370995,31.0,0.000327,0.000863,0.0,0.0,0.0,0.0,0.002532,31.0,0.21036,0.100892,0.119217,0.137099,0.160943,0.271219,0.524554,31.0,2.763185,0.87947,1.369818,2.050195,2.735088,3.345861,5.031376,31.0,5.091027,0.768367,3.879483,4.619327,4.952822,5.559368,7.44497,31.0,4.192761e-07,8.614629e-07,1.866343e-31,2.807248e-15,5.548434e-09,1.776817e-07,3.152999e-06,31.0,4.793318e-06,9.426111e-06,1.213123e-28,1.330349e-13,1.092873e-07,2.531712e-06,3.3e-05
DetectSeq_ATP8-DddA6_REP-1,14.0,73200220.0,54234430.0,1056219.0,35163961.75,76116851.5,99221390.0,201232409.0,14.0,73200250.0,54234430.0,1056232.0,35163998.0,76116884.5,99221410.0,201232430.0,14.0,11.214286,3.400873,6.0,8.25,11.0,13.75,18.0,14.0,0.142857,0.363137,0.0,0.0,0.0,0.0,1.0,14.0,11.071429,3.269069,6.0,8.25,10.5,13.75,17.0,14.0,15.357143,5.865245,9.0,11.5,13.5,18.75,31.0,14.0,35.285714,14.44048,24.0,27.5,31.0,37.25,80.0,14.0,0.438128,0.066487,0.36,0.406539,0.416667,0.450779,0.633333,14.0,20.571429,5.720601,12.0,17.25,20.5,25.5,29.0,14.0,49.5,19.60671,32.0,36.25,43.0,53.75,103.0,14.0,0.142857,0.363137,0.0,0.0,0.0,0.0,1.0,14.0,27.857143,8.908571,20.0,21.25,24.5,31.5,48.0,14.0,0.052077,0.014482,0.030378,0.043669,0.051896,0.064554,0.073414,14.0,0.318129,0.126009,0.205659,0.232973,0.276354,0.345443,0.661965,14.0,0.000362,0.000919,0.0,0.0,0.0,0.0,0.002532,14.0,0.179033,0.057254,0.128537,0.13657,0.157458,0.202446,0.308488,14.0,2.582972,0.705075,1.759147,2.125168,2.384784,2.869809,4.445647,14.0,4.982211,0.677993,4.324449,4.423941,4.875736,5.258613,6.701661,14.0,2.372577e-07,3.271692e-07,1.269741e-16,1.086194e-09,7.676918e-08,4.19162e-07,1.064301e-06,14.0,7.77342e-06,9.629707e-06,8.253317e-14,8.472959e-08,3.623942e-06,1.412051e-05,3e-05
DetectSeq_ATP8-DddAwt_REP-1,11.0,69478660.0,57117130.0,15721070.0,27271003.5,45187694.0,101870400.0,201232409.0,11.0,69478680.0,57117130.0,15721094.0,27271029.5,45187712.0,101870400.0,201232430.0,11.0,11.181818,3.429816,6.0,9.0,11.0,13.0,18.0,11.0,0.090909,0.301511,0.0,0.0,0.0,0.0,1.0,11.0,11.090909,3.238967,6.0,9.0,11.0,13.0,17.0,11.0,18.636364,5.661673,11.0,15.5,18.0,21.0,29.0,11.0,41.181818,10.571832,31.0,33.0,36.0,48.0,63.0,11.0,0.452721,0.079768,0.352941,0.381818,0.460317,0.494444,0.606061,11.0,25.272727,9.696297,12.0,19.5,26.0,28.5,49.0,11.0,55.272727,12.554608,40.0,45.0,54.0,61.0,82.0,11.0,0.181818,0.40452,0.0,0.0,0.0,0.0,1.0,11.0,29.545455,6.137811,22.0,24.0,30.0,33.0,40.0,11.0,0.063978,0.024546,0.030378,0.049365,0.065819,0.072148,0.124044,11.0,0.257347,0.058454,0.186238,0.209518,0.251422,0.284013,0.381788,11.0,0.00046,0.001024,0.0,0.0,0.0,0.0,0.002532,11.0,0.137562,0.028577,0.102431,0.111743,0.139679,0.153647,0.186238,11.0,2.064729,0.611259,0.787929,1.865594,2.108559,2.396889,3.201005,11.0,4.662551,0.658438,3.724677,4.401319,4.533561,4.790711,6.127005,11.0,1.719746e-07,3.022964e-07,8.904081e-13,1.325083e-10,1.429707e-09,1.884336e-07,8.404122e-07,11.0,8.711978e-06,1.450839e-05,2.893826e-10,1.722608e-08,1.327586e-07,1.125794e-05,3.9e-05
DetectSeq_JAK2-DddA11_REP-1,274.0,81858660.0,56911110.0,2044988.0,34589019.5,73660869.5,121537600.0,246696638.0,274.0,81858680.0,56911110.0,2044998.0,34589037.5,73660888.5,121537600.0,246696654.0,274.0,9.277372,2.459131,4.0,8.0,9.0,11.0,18.0,274.0,0.040146,0.196661,0.0,0.0,0.0,0.0,1.0,274.0,9.237226,2.450671,4.0,8.0,9.0,10.75,18.0,274.0,39.218978,27.998356,8.0,21.0,32.0,47.0,196.0,274.0,79.452555,50.68165,20.0,45.0,66.5,96.0,337.0,274.0,0.486151,0.092111,0.35,0.413949,0.47249,0.549172,0.885714,274.0,22.354015,9.339479,5.0,16.0,22.0,27.0,56.0,274.0,106.054745,69.047581,27.0,59.0,86.0,132.75,443.0,274.0,0.105839,0.308195,0.0,0.0,0.0,0.0,1.0,274.0,61.357664,42.646642,20.0,31.0,49.0,73.0,263.0,274.0,0.05659,0.023643,0.012658,0.040504,0.055693,0.068351,0.141765,274.0,0.649492,0.422856,0.165351,0.361323,0.526674,0.812977,2.712986,274.0,0.000268,0.00078,0.0,0.0,0.0,0.0,0.002532,274.0,0.375762,0.261173,0.122482,0.189848,0.300082,0.447061,1.610644,274.0,3.406059,0.876955,1.011468,2.809669,3.30555,3.93429,6.213102,274.0,5.757354,0.960699,4.055972,4.957687,5.704771,6.353241,9.022695,274.0,4.608038e-08,1.849436e-07,2.0874399999999997e-100,7.771788e-26,1.587117e-16,2.028121e-10,1.617993e-06,274.0,8.688948e-08,3.436207e-07,1.356836e-97,5.874026e-25,5.997826e-16,4.90136e-10,3e-06
DetectSeq_SIRT6-DddA11_REP-1,287.0,80157250.0,57445660.0,242415.0,33921669.5,70133094.0,119310900.0,246696638.0,287.0,80157270.0,57445660.0,242448.0,33921685.5,70133130.0,119310900.0,246696654.0,287.0,9.219512,2.56194,4.0,7.5,9.0,11.0,19.0,287.0,0.034843,0.183703,0.0,0.0,0.0,0.0,1.0,287.0,9.184669,2.541414,4.0,7.5,9.0,11.0,19.0,287.0,44.634146,31.882355,9.0,24.0,36.0,54.5,269.0,287.0,88.874564,57.708136,23.0,51.5,72.0,108.5,503.0,287.0,0.495395,0.091579,0.35,0.421683,0.5,0.550716,0.934783,287.0,21.867596,9.096793,5.0,16.0,21.0,27.0,56.0,287.0,117.261324,77.076916,31.0,68.0,93.0,143.0,669.0,287.0,0.108014,0.31094,0.0,0.0,0.0,0.0,1.0,287.0,68.397213,47.691415,20.0,36.0,54.0,84.5,341.0,287.0,0.055358,0.023029,0.012658,0.040504,0.053162,0.068351,0.141765,287.0,0.652121,0.428645,0.172399,0.378166,0.517197,0.79526,3.720484,287.0,0.000273,0.000787,0.0,0.0,0.0,0.0,0.002532,287.0,0.380375,0.265224,0.111225,0.200205,0.300308,0.469927,1.89639,287.0,3.45102,0.829845,1.800227,2.847506,3.41552,4.009866,6.004406,287.0,5.775047,0.969127,3.849767,5.047696,5.658974,6.392643,8.651111,287.0,5.600235e-08,2.546377e-07,8.941194999999999e-130,3.7173710000000003e-29,3.908722e-18,8.759697e-12,1.938486e-06,287.0,9.222598e-08,4.131651e-07,5.811777e-127,2.598162e-28,1.365208e-17,1.997826e-11,3e-06


In [95]:
# Lenient significance filter: a row is kept only if it satisfies every
# threshold below.  The trailing numbers are the row counts the original
# author recorded after each successive threshold.
# NOTE(review): the last recorded count (7001) disagrees with the 938 rows
# printed by this cell, so those counts look stale -- re-check before use.
df_sign_lenient = df_old_site_new_signal.loc[
    lambda d: (
        (d['FDR'] <= 0.01)  # 25118
        & (d['log2_FC_mut'] >= 2)  # 25118
        & (d['ctrl_mut_count'] <= 1)  # 22009
        & (d['treat_mut_count.norm'] * 100 >= 5)  # 22009
        & (d['treat_mut_count'] >= 10)  # 22009
        & (d['treat_mut_count'] / d['treat_count'] >= 0.15)  # 14476
        & (d['region_block_site_num'] <= 1)  # 13522
        & (d['region_highest_site_mut_ratio'] >= 0.30)  # 7001
    )
]

# Number of rows that survive the filter.
print(df_sign_lenient.shape[0])
# Per-sample summary statistics of the numeric columns.
df_sign_lenient.groupby('<sample>').describe()

938


Unnamed: 0_level_0,region_start,region_start,region_start,region_start,region_start,region_start,region_start,region_start,region_end,region_end,region_end,region_end,region_end,region_end,region_end,region_end,region_site_num,region_site_num,region_site_num,region_site_num,region_site_num,region_site_num,region_site_num,region_site_num,region_block_site_num,region_block_site_num,region_block_site_num,region_block_site_num,region_block_site_num,region_block_site_num,region_block_site_num,region_block_site_num,region_mut_site_num,region_mut_site_num,region_mut_site_num,region_mut_site_num,region_mut_site_num,region_mut_site_num,region_mut_site_num,region_mut_site_num,region_highest_site_mut_num,region_highest_site_mut_num,region_highest_site_mut_num,region_highest_site_mut_num,region_highest_site_mut_num,region_highest_site_mut_num,region_highest_site_mut_num,region_highest_site_mut_num,region_highest_site_cover_num,region_highest_site_cover_num,region_highest_site_cover_num,region_highest_site_cover_num,region_highest_site_cover_num,region_highest_site_cover_num,region_highest_site_cover_num,region_highest_site_cover_num,region_highest_site_mut_ratio,region_highest_site_mut_ratio,region_highest_site_mut_ratio,region_highest_site_mut_ratio,region_highest_site_mut_ratio,region_highest_site_mut_ratio,region_highest_site_mut_ratio,region_highest_site_mut_ratio,ctrl_count,ctrl_count,ctrl_count,ctrl_count,ctrl_count,ctrl_count,ctrl_count,ctrl_count,treat_count,treat_count,treat_count,treat_count,treat_count,treat_count,treat_count,treat_count,ctrl_mut_count,ctrl_mut_count,ctrl_mut_count,ctrl_mut_count,ctrl_mut_count,ctrl_mut_count,ctrl_mut_count,ctrl_mut_count,treat_mut_count,treat_mut_count,treat_mut_count,treat_mut_count,treat_mut_count,treat_mut_count,treat_mut_count,treat_mut_count,ctrl_count.norm,ctrl_count.norm,ctrl_count.norm,ctrl_count.norm,ctrl_count.norm,ctrl_count.norm,ctrl_count.norm,ctrl_count.norm,treat_count.norm,treat_count.norm,treat_count.norm,treat_coun
t.norm,treat_count.norm,treat_count.norm,treat_count.norm,treat_count.norm,ctrl_mut_count.norm,ctrl_mut_count.norm,ctrl_mut_count.norm,ctrl_mut_count.norm,ctrl_mut_count.norm,ctrl_mut_count.norm,ctrl_mut_count.norm,ctrl_mut_count.norm,treat_mut_count.norm,treat_mut_count.norm,treat_mut_count.norm,treat_mut_count.norm,treat_mut_count.norm,treat_mut_count.norm,treat_mut_count.norm,treat_mut_count.norm,log2_FC,log2_FC,log2_FC,log2_FC,log2_FC,log2_FC,log2_FC,log2_FC,log2_FC_mut,log2_FC_mut,log2_FC_mut,log2_FC_mut,log2_FC_mut,log2_FC_mut,log2_FC_mut,log2_FC_mut,p_value,p_value,p_value,p_value,p_value,p_value,p_value,p_value,FDR,FDR,FDR,FDR,FDR,FDR,FDR,FDR
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
<sample>,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2,Unnamed: 61_level_2,Unnamed: 62_level_2,Unnamed: 63_level_2,Unnamed: 64_level_2,Unnamed: 65_level_2,Unnamed: 66_level_2,Unnamed: 67_level_2,Unnamed: 68_level_2,Unnamed: 69_level_2,Unnamed: 70_level_2,Unnamed: 71_level_2,Unnamed: 72_level_2,Unnamed: 73_level_2,Unnamed: 74_level_2,Unnamed: 75_level_2,Unnamed: 76_level_2,Unnamed: 77_level_2,Unnamed: 78_level_2,Unnamed: 79_level_2,Unnamed: 80_level_2,Unnamed: 81_level_2,Unnamed: 82_level_2,Unnamed: 83_level_2,Unnamed: 84_level_2,Unnamed: 85_level_2,Unnamed: 86_level_2,Unnamed: 87_level_2,Unnamed: 88_level_2,Unnamed: 89_level_2,Unnamed: 90_level_2,Unnamed: 91_level_2,Unnamed: 92_level_2,Unnamed: 93_level_2,Unnamed: 94_level_2,Unnamed: 95_level_2,Unnamed: 96_level_2,Unnamed: 97_level_2,Unnamed: 98_level_2,Unnamed: 99_level_2,Unnamed: 
100_level_2,Unnamed: 101_level_2,Unnamed: 102_level_2,Unnamed: 103_level_2,Unnamed: 104_level_2,Unnamed: 105_level_2,Unnamed: 106_level_2,Unnamed: 107_level_2,Unnamed: 108_level_2,Unnamed: 109_level_2,Unnamed: 110_level_2,Unnamed: 111_level_2,Unnamed: 112_level_2,Unnamed: 113_level_2,Unnamed: 114_level_2,Unnamed: 115_level_2,Unnamed: 116_level_2,Unnamed: 117_level_2,Unnamed: 118_level_2,Unnamed: 119_level_2,Unnamed: 120_level_2,Unnamed: 121_level_2,Unnamed: 122_level_2,Unnamed: 123_level_2,Unnamed: 124_level_2,Unnamed: 125_level_2,Unnamed: 126_level_2,Unnamed: 127_level_2,Unnamed: 128_level_2,Unnamed: 129_level_2,Unnamed: 130_level_2,Unnamed: 131_level_2,Unnamed: 132_level_2,Unnamed: 133_level_2,Unnamed: 134_level_2,Unnamed: 135_level_2,Unnamed: 136_level_2,Unnamed: 137_level_2,Unnamed: 138_level_2,Unnamed: 139_level_2,Unnamed: 140_level_2,Unnamed: 141_level_2,Unnamed: 142_level_2,Unnamed: 143_level_2,Unnamed: 144_level_2,Unnamed: 145_level_2,Unnamed: 146_level_2,Unnamed: 147_level_2,Unnamed: 148_level_2,Unnamed: 149_level_2,Unnamed: 150_level_2,Unnamed: 151_level_2,Unnamed: 152_level_2,Unnamed: 153_level_2,Unnamed: 154_level_2,Unnamed: 155_level_2,Unnamed: 156_level_2,Unnamed: 157_level_2,Unnamed: 158_level_2,Unnamed: 159_level_2,Unnamed: 160_level_2
DetectSeq_ATP8-DddA11_REP-1,95.0,86066970.0,53767010.0,1056219.0,43608049.5,76681403.0,118859900.0,226608903.0,95.0,86066990.0,53767010.0,1056232.0,43608069.0,76681420.0,118859940.0,226608925.0,95.0,10.168421,2.76241,4.0,8.0,10.0,11.5,18.0,95.0,0.031579,0.175804,0.0,0.0,0.0,0.0,1.0,95.0,10.136842,2.731263,4.0,8.0,10.0,11.5,18.0,95.0,14.189474,9.837288,4.0,8.0,11.0,16.0,68.0,95.0,37.073684,25.218721,11.0,23.0,29.0,42.5,174.0,95.0,0.383037,0.062045,0.3,0.333333,0.37037,0.419872,0.545455,95.0,22.252632,9.492403,6.0,17.0,21.0,27.0,56.0,95.0,47.852632,32.786533,15.0,29.0,36.0,56.5,230.0,95.0,0.136842,0.345504,0.0,0.0,0.0,0.0,1.0,95.0,23.715789,15.158498,10.0,13.5,18.0,27.5,88.0,95.0,0.056333,0.02403,0.015189,0.043036,0.053162,0.068351,0.141765,95.0,0.285242,0.195435,0.089413,0.172865,0.21459,0.336788,1.370995,95.0,0.000346,0.000875,0.0,0.0,0.0,0.0,0.002532,95.0,0.141366,0.090357,0.059608,0.080471,0.107295,0.163923,0.524554,95.0,2.248123,0.871279,0.598087,1.584011,2.083514,2.820479,5.031376,95.0,4.430105,0.873532,3.016986,3.694221,4.336251,5.034626,7.44497,95.0,0.000314,0.00065,1.866343e-31,4.502563e-09,6.046509e-06,0.0002415574,0.002439,95.0,0.001436,0.002767,1.213123e-28,9.088318e-08,6.046509e-05,0.001388242,0.009909
DetectSeq_ATP8-DddA6_REP-1,45.0,75002170.0,53499800.0,819163.0,34500821.0,70144413.0,108063300.0,219416797.0,45.0,75002190.0,53499800.0,819192.0,34500856.0,70144447.0,108063381.0,219416815.0,45.0,10.222222,2.770698,3.0,8.0,10.0,12.0,18.0,45.0,0.044444,0.208409,0.0,0.0,0.0,0.0,1.0,45.0,10.177778,2.70764,3.0,8.0,10.0,12.0,17.0,45.0,11.2,4.827007,5.0,9.0,10.0,13.0,31.0,45.0,27.311111,11.908405,12.0,20.0,24.0,31.0,80.0,45.0,0.419022,0.091167,0.3,0.357143,0.407407,0.45,0.666667,45.0,21.666667,8.546663,6.0,17.0,21.0,27.0,49.0,45.0,37.466667,16.435272,17.0,27.0,34.0,44.0,103.0,45.0,0.088889,0.287799,0.0,0.0,0.0,0.0,1.0,45.0,20.111111,8.296281,11.0,14.0,18.0,22.0,48.0,45.0,0.054849,0.021636,0.015189,0.043036,0.053162,0.068351,0.124044,45.0,0.240792,0.105627,0.109256,0.173525,0.218513,0.282781,0.661965,45.0,0.000225,0.000729,0.0,0.0,0.0,0.0,0.002532,45.0,0.129251,0.053319,0.070695,0.089976,0.115683,0.141391,0.308488,45.0,2.13888,0.758834,0.90801,1.528534,2.040717,2.692033,4.445647,45.0,4.37565,0.688075,3.25111,3.860816,4.293545,4.710541,6.701661,45.0,0.000101,0.000218,1.269741e-16,2.221841e-07,4.900206e-06,9.911734e-05,0.000883,45.0,0.001184,0.00227,8.253317e-14,9.026229e-06,0.0001137548,0.001370772,0.00897
DetectSeq_ATP8-DddAwt_REP-1,30.0,87863070.0,57042400.0,8410441.0,40895779.0,76076125.5,119366500.0,219416797.0,30.0,87863100.0,57042400.0,8410462.0,40895795.5,76076167.0,119366554.0,219416815.0,30.0,10.566667,2.920597,6.0,9.0,10.5,12.75,18.0,30.0,0.033333,0.182574,0.0,0.0,0.0,0.0,1.0,30.0,10.533333,2.837353,6.0,9.0,10.5,12.75,17.0,30.0,13.0,5.699365,5.0,9.0,12.0,16.25,29.0,30.0,29.666667,11.771776,12.0,22.0,27.5,33.0,63.0,30.0,0.443145,0.089441,0.3,0.363636,0.443198,0.522059,0.606061,30.0,22.6,10.591473,8.0,14.75,21.5,27.0,49.0,30.0,39.433333,15.475639,18.0,29.0,35.0,45.5,82.0,30.0,0.166667,0.379049,0.0,0.0,0.0,0.0,1.0,30.0,20.6,8.084041,13.0,14.0,17.5,24.5,40.0,30.0,0.057212,0.026812,0.020252,0.03734,0.054428,0.068351,0.124044,30.0,0.1836,0.072054,0.083807,0.135023,0.162958,0.211846,0.381788,30.0,0.000422,0.00096,0.0,0.0,0.0,0.0,0.002532,30.0,0.095913,0.037639,0.060527,0.065183,0.081479,0.114071,0.186238,30.0,1.734376,0.640308,0.608988,1.186891,1.879077,2.142457,3.201005,30.0,4.061864,0.73122,3.00847,3.521385,3.860675,4.568028,6.127005,30.0,0.000147,0.0002,8.904081e-13,1.279882e-07,1.958503e-05,0.0002776883,0.000605,30.0,0.002414,0.003014,2.893826e-10,8.068019e-06,0.0005551199,0.00522016,0.008736
DetectSeq_JAK2-DddA11_REP-1,362.0,80306590.0,57490510.0,819163.0,34041683.0,69810892.5,121537600.0,246696638.0,362.0,80306610.0,57490510.0,819192.0,34041698.0,69810913.0,121537589.5,246696654.0,362.0,9.138122,2.477094,3.0,7.0,9.0,11.0,20.0,362.0,0.035912,0.186327,0.0,0.0,0.0,0.0,1.0,362.0,9.10221,2.465956,3.0,7.0,9.0,10.0,20.0,362.0,33.121547,27.035867,4.0,15.0,25.5,41.0,196.0,362.0,69.566298,49.87305,10.0,36.0,56.0,86.75,337.0,362.0,0.464163,0.103841,0.3,0.380238,0.45049,0.538316,0.885714,362.0,21.616022,8.973588,5.0,15.0,21.0,26.0,56.0,362.0,92.734807,67.660316,15.0,46.0,73.0,115.0,443.0,362.0,0.110497,0.313942,0.0,0.0,0.0,0.0,1.0,362.0,52.59116,41.301858,10.0,25.25,40.0,65.75,263.0,362.0,0.054721,0.022717,0.012658,0.037973,0.053162,0.065819,0.141765,362.0,0.567919,0.41436,0.091862,0.28171,0.447061,0.704274,2.712986,362.0,0.00028,0.000795,0.0,0.0,0.0,0.0,0.002532,362.0,0.322075,0.252938,0.061241,0.154634,0.244965,0.402661,1.610644,362.0,3.191582,0.927139,0.933465,2.530578,3.156033,3.77627,6.213102,362.0,5.46001,1.106517,2.918468,4.722598,5.382253,6.163651,9.022695,362.0,7.4e-05,0.000358,2.0874399999999997e-100,7.959148e-23,2.506066e-13,2.201052e-08,0.003405,362.0,0.000112,0.000534,1.356836e-97,4.703133e-22,7.65792e-13,4.534517e-08,0.00503
DetectSeq_SIRT6-DddA11_REP-1,406.0,81204400.0,58506080.0,242415.0,33993359.75,70429299.5,122032500.0,246696638.0,406.0,81204420.0,58506080.0,242448.0,33993378.75,70429323.5,122032497.5,246696654.0,406.0,9.024631,2.481414,3.0,7.0,9.0,11.0,19.0,406.0,0.039409,0.194806,0.0,0.0,0.0,0.0,1.0,406.0,8.985222,2.472521,3.0,7.0,9.0,10.0,19.0,406.0,36.724138,30.22578,5.0,17.0,28.5,46.75,269.0,406.0,77.41133,55.624853,12.0,41.0,61.0,97.75,503.0,406.0,0.460399,0.105733,0.3,0.375,0.444878,0.534861,0.934783,406.0,21.842365,9.103779,5.0,16.0,21.0,26.0,56.0,406.0,101.763547,74.13893,16.0,52.0,81.0,128.0,669.0,406.0,0.125616,0.331825,0.0,0.0,0.0,0.0,1.0,406.0,57.278325,45.86856,10.0,26.0,44.0,74.0,341.0,406.0,0.055294,0.023046,0.012658,0.040504,0.053162,0.065819,0.141765,406.0,0.565934,0.412306,0.08898,0.289186,0.450462,0.711841,3.720484,406.0,0.000318,0.00084,0.0,0.0,0.0,0.0,0.002532,406.0,0.31854,0.255087,0.055613,0.144593,0.244695,0.411533,1.89639,406.0,3.183387,0.930393,0.971913,2.532717,3.166664,3.806076,6.004406,406.0,5.438947,1.137137,2.773504,4.62807,5.419245,6.213994,8.651111,406.0,6.4e-05,0.00035,8.941194999999999e-130,8.315924e-26,1.286622e-14,2.34181e-08,0.004223,406.0,8.6e-05,0.000458,5.811777e-127,4.547063e-25,3.587809e-14,4.29027e-08,0.005458


In [96]:
# Display the leniently filtered rows (all samples together).
df_sign_lenient

Unnamed: 0,<sample>,chr_name,region_start,region_end,mpmat_index,region_site_num,region_block_site_num,region_mut_site_num,region_site_index,region_block_state,region_highest_site_index,region_highest_site_mut_num,region_highest_site_cover_num,region_highest_site_mut_ratio,ctrl_count,treat_count,ctrl_mut_count,treat_mut_count,ctrl_count.norm,treat_count.norm,ctrl_mut_count.norm,treat_mut_count.norm,count_info,log2_FC,log2_FC_mut,test_state,p_value,FDR,bed_name,strand
8,DetectSeq_ATP8-DddA6_REP-1,chr1,18907111,18907133,chr1_18907111_18907133,9,0,9,"chr1_18907111_GA,chr1_18907119_...",N-N-N-N-N-N-N-N-N,chr1_18907133_GA,11,18,0.611111,28,24,0,17,0.070882,0.154244,0.000000,0.109256,"0,1,2,3,4,5,6,7,8,9 28,0,0,0,0,...",1.121717,3.753610,TestOK,2.206580e-05,4.395162e-04,chr1_18907111_18907133_highest_...,.
36,DetectSeq_ATP8-DddA6_REP-1,chr1,67532557,67532593,chr1_67532557_67532593,10,0,10,"chr1_67532557_GA,chr1_67532565_...",N-N-N-N-N-N-N-N-N-N,chr1_67532592_GA,15,44,0.340909,49,60,0,33,0.124044,0.385610,0.000000,0.212086,"0,1,2,3,4,5,6,7,8,9,10 46,3,0,0...",1.636290,4.710541,TestOK,1.009651e-10,9.375327e-09,chr1_67532557_67532593_highest_...,.
56,DetectSeq_ATP8-DddA6_REP-1,chr1,153223231,153223253,chr1_153223231_153223253,12,0,12,"chr1_153223231_CT,chr1_15322323...",N-N-N-N-N-N-N-N-N-N-N-N,chr1_153223231_CT,8,12,0.666667,23,17,0,12,0.058225,0.109256,0.000000,0.077122,"0,1,2,3,4,5,6,7,8,9,10,11,12 20...",0.908010,3.251110,TestOK,7.761196e-04,8.270126e-03,chr1_153223231_153223253_highes...,.
60,DetectSeq_ATP8-DddA6_REP-1,chr1,161209983,161210006,chr1_161209983_161210006,12,0,12,"chr1_161209983_GA,chr1_16120998...",N-N-N-N-N-N-N-N-N-N-N-N,chr1_161210002_GA,10,17,0.588235,24,22,1,16,0.060756,0.141391,0.002532,0.102829,"0,1,2,3,4,5,6,7,8,9,10,11,12 19...",1.218579,5.344109,TestOK,4.566141e-05,7.933340e-04,chr1_161209983_161210006_highes...,.
75,DetectSeq_ATP8-DddA6_REP-1,chr1,219416797,219416815,chr1_219416797_219416815,8,0,8,"chr1_219416797_CT,chr1_21941679...",N-N-N-N-N-N-N-N,chr1_219416798_CT,8,23,0.347826,12,31,0,16,0.030378,0.199232,0.000000,0.102829,"0,1,2,3,4,5,6,7,8 12,0,0,0,0,0,...",2.713343,3.666147,TestOK,4.566141e-05,7.933340e-04,chr1_219416797_219416815_highes...,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3243,DetectSeq_SIRT6-DddA11_REP-1,chrX,132292961,132292983,chrX_132292961_132292983,9,0,9,"chrX_132292961_GA,chrX_13229296...",N-N-N-N-N-N-N-N-N,chrX_132292983_GA,8,26,0.307692,40,40,1,14,0.101261,0.222450,0.002532,0.077858,"0,1,2,3,4,5,6,7,8,9 39,0,1,0,0,...",1.135412,4.942767,TestOK,1.408338e-04,2.007500e-04,chrX_132292961_132292983_highes...,.
3245,DetectSeq_SIRT6-DddA11_REP-1,chrX,136925330,136925338,chrX_136925330_136925338,5,0,5,"chrX_136925330_CT,chrX_13692533...",N-N-N-N-N,chrX_136925330_CT,42,103,0.407767,43,129,0,59,0.108855,0.717403,0.000000,0.328114,"0,1,2,3,4,5 43,0,0,0,0,0 27,43,...",2.720374,5.608487,TestOK,2.433039e-20,9.702301e-20,chrX_136925330_136925338_highes...,.
3246,DetectSeq_SIRT6-DddA11_REP-1,chrX,138128211,138128237,chrX_138128211_138128237,9,1,8,"chrX_138128211_GA,chrX_13812821...",N-N-N-B-N-N-N-N-N,chrX_138128237_GA,12,28,0.428571,33,37,1,13,0.083540,0.205767,0.002532,0.072296,"0,1,2,3,4,5,6,7,8,9 32,0,1,0,0,...",1.300471,4.835851,TestOK,2.889936e-04,4.053103e-04,chrX_138128211_138128237_highes...,.
3248,DetectSeq_SIRT6-DddA11_REP-1,chrX,153669436,153669453,chrX_153669436_153669453,10,0,10,"chrX_153669436_CT,chrX_15366943...",N-N-N-N-N-N-N-N-N-N,chrX_153669436_CT,7,22,0.318182,22,26,0,11,0.055693,0.144593,0.000000,0.061174,"0,1,2,3,4,5,6,7,8,9,10 22,0,0,0...",1.376420,3.185275,TestOK,1.185269e-03,1.578739e-03,chrX_153669436_153669453_highes...,.


In [97]:
# Summarize the region identifiers: total rows, number of distinct regions,
# and the most frequent identifier with its frequency.
df_sign_lenient['mpmat_index'].describe()

count                        938
unique                       439
top       chr1_18907111_18907133
freq                           5
Name: mpmat_index, dtype: object

In [98]:
# Plain alias (not a copy) of the lenient set.
# NOTE(review): "439" presumably refers to the 439 unique mpmat_index values
# reported by describe() -- confirm.
df_old_439 = df_sign_lenient

### data processing

In [99]:
# Keep only the identifying columns and the two normalized counts needed for
# the score, then add two columns:
#   sample           - the '<sample>' column under a plain name
#   detect_seq_score - (treat_mut_count.norm / treat_count.norm) ** 2
#                      * treat_mut_count.norm
# The score uses vectorized column arithmetic rather than a row-wise
# apply(axis=1), which runs a Python lambda per row.  assign() returns a new
# frame, so the frame selected from is left unmodified.
df_old_439 = (
    df_old_439[['<sample>', 'mpmat_index',
                'treat_mut_count.norm', 'treat_count.norm']]
    .assign(
        sample=lambda d: d['<sample>'],
        detect_seq_score=lambda d: (
            (d['treat_mut_count.norm'] / d['treat_count.norm']) ** 2
        ) * d['treat_mut_count.norm'],
    )
)

df_old_439

Unnamed: 0,<sample>,mpmat_index,treat_mut_count.norm,treat_count.norm,sample,detect_seq_score
8,DetectSeq_ATP8-DddA6_REP-1,chr1_18907111_18907133,0.109256,0.154244,DetectSeq_ATP8-DddA6_REP-1,0.054818
36,DetectSeq_ATP8-DddA6_REP-1,chr1_67532557_67532593,0.212086,0.385610,DetectSeq_ATP8-DddA6_REP-1,0.064156
56,DetectSeq_ATP8-DddA6_REP-1,chr1_153223231_153223253,0.077122,0.109256,DetectSeq_ATP8-DddA6_REP-1,0.038428
60,DetectSeq_ATP8-DddA6_REP-1,chr1_161209983_161210006,0.102829,0.141391,DetectSeq_ATP8-DddA6_REP-1,0.054389
75,DetectSeq_ATP8-DddA6_REP-1,chr1_219416797_219416815,0.102829,0.199232,DetectSeq_ATP8-DddA6_REP-1,0.027393
...,...,...,...,...,...,...
3243,DetectSeq_SIRT6-DddA11_REP-1,chrX_132292961_132292983,0.077858,0.222450,DetectSeq_SIRT6-DddA11_REP-1,0.009538
3245,DetectSeq_SIRT6-DddA11_REP-1,chrX_136925330_136925338,0.328114,0.717403,DetectSeq_SIRT6-DddA11_REP-1,0.068636
3246,DetectSeq_SIRT6-DddA11_REP-1,chrX_138128211_138128237,0.072296,0.205767,DetectSeq_SIRT6-DddA11_REP-1,0.008925
3248,DetectSeq_SIRT6-DddA11_REP-1,chrX_153669436_153669453,0.061174,0.144593,DetectSeq_SIRT6-DddA11_REP-1,0.010950


In [100]:
# Reduce the table to region id, sample and score ...
df_detect_seq_score = df_old_439.loc[
    :, ['mpmat_index', 'sample', 'detect_seq_score']
].copy()
# ... and keep an independent copy of it (same three columns, same order)
# under the name df_share_old.
df_share_old = df_detect_seq_score.copy()
df_share_old

Unnamed: 0,mpmat_index,sample,detect_seq_score
8,chr1_18907111_18907133,DetectSeq_ATP8-DddA6_REP-1,0.054818
36,chr1_67532557_67532593,DetectSeq_ATP8-DddA6_REP-1,0.064156
56,chr1_153223231_153223253,DetectSeq_ATP8-DddA6_REP-1,0.038428
60,chr1_161209983_161210006,DetectSeq_ATP8-DddA6_REP-1,0.054389
75,chr1_219416797_219416815,DetectSeq_ATP8-DddA6_REP-1,0.027393
...,...,...,...
3243,chrX_132292961_132292983,DetectSeq_SIRT6-DddA11_REP-1,0.009538
3245,chrX_136925330_136925338,DetectSeq_SIRT6-DddA11_REP-1,0.068636
3246,chrX_138128211_138128237,DetectSeq_SIRT6-DddA11_REP-1,0.008925
3248,chrX_153669436_153669453,DetectSeq_SIRT6-DddA11_REP-1,0.010950


# 初步过滤 poisson_res -> df_pois 并保存备用
- log2_FC: log2(treat_count.norm/ctrl_count.norm), 如果ctrl_count.norm不存在，就用 chr 突变背景
- log2_FC_mut: log2(treat_mut_count.norm/ctrl_mut_count.norm), 如果ctrl_mut_count.norm不存在，就用 chr 突变背景
- region_block_state: B-Blocked, S-SNV, N-Non-SNV

## data processing

In [101]:
# Relative path of the gzipped, tab-separated Poisson-test result table
# that is loaded in the next cell.
file = '../poisson_res/poisson_res_all.tsv.gz'

In [33]:
# Load the tab-separated result table; the first line is the header and no
# column is used as the row index.
df = pd.read_csv(file, sep='\t', header=0, index_col=None)
# Preview the first rows.
df.head()

Unnamed: 0,<sample>,chr_name,region_start,region_end,mpmat_index,region_site_num,region_block_site_num,region_mut_site_num,region_site_index,region_block_state,region_highest_site_index,region_highest_site_mut_num,region_highest_site_cover_num,region_highest_site_mut_ratio,ctrl_count,treat_count,ctrl_mut_count,treat_mut_count,ctrl_count.norm,treat_count.norm,ctrl_mut_count.norm,treat_mut_count.norm,count_info,log2_FC,log2_FC_mut,test_state,p_value,FDR
0,DetectSeq_ATP8-DddA11_REP-1,chr1,16395,16418,chr1_16395_16418,11,0,11,"chr1_16395_CT,chr1_16397_C.,chr...",N-N-N-N-N-N-N-N-N-N-N,chr1_16395_CT,3,16,0.1875,57,19,0,2,0.144296,0.113256,0.0,0.011922,"0,1,2,3,4,5,6,7,8,9,10,11 53,4,...",-0.349446,0.557555,TestOK,0.37762,0.842234
1,DetectSeq_ATP8-DddA11_REP-1,chr1,16534,16540,chr1_16534_16540,4,1,3,"chr1_16534_CT,chr1_16538_CT,chr...",B-N-N-N,chr1_16538_CT,0,7,0.0,36,8,0,0,0.091135,0.047687,0.0,0.0,"0,1,2,3,4 36,0,0,0,0 8,0,0,0,0",-0.934408,,TestOK,0.841752,0.842234
2,DetectSeq_ATP8-DddA11_REP-1,chr1,20272,20284,chr1_20272_20284,5,0,5,"chr1_20272_GA,chr1_20275_GA,chr...",N-N-N-N-N,chr1_20283_GA,2,7,0.285714,12,9,0,2,0.030378,0.053648,0.0,0.011922,"0,1,2,3,4,5 11,1,0,0,0,0 5,2,0,...",0.820479,0.557555,TestOK,0.37762,0.842234
3,DetectSeq_ATP8-DddA11_REP-1,chr1,31029,31037,chr1_31029_31037,4,1,3,"chr1_31029_GA,chr1_31030_G.,chr...",B-N-N-N,chr1_31031_GA,1,6,0.166667,2,6,0,1,0.005063,0.035765,0.0,0.005961,"0,1,2,3,4 2,0,0,0,0 5,0,1,0,0",2.820479,-0.442445,TestOK,0.582932,0.842234
4,DetectSeq_ATP8-DddA11_REP-1,chr1,54043,54047,chr1_54043_54047,3,0,3,"chr1_54043_CT,chr1_54044_C.,chr...",N-N-N,chr1_54043_CT,4,4,1.0,0,4,0,0,0.0,0.023843,0.0,0.0,"0,1,2,3 0,0,0,0 0,4,0,0",-1.456561,,TestOK,0.841752,0.842234


Unnamed: 0,<sample>,chr_name,region_start,region_end,mpmat_index,region_site_num,region_block_site_num,region_mut_site_num,region_site_index,region_block_state,region_highest_site_index,region_highest_site_mut_num,region_highest_site_cover_num,region_highest_site_mut_ratio,ctrl_count,treat_count,ctrl_mut_count,treat_mut_count,ctrl_count.norm,treat_count.norm,ctrl_mut_count.norm,treat_mut_count.norm,count_info,log2_FC,log2_FC_mut,test_state,p_value,FDR
0,DetectSeq_ATP8-DddA11_REP-1,chr1,16395,16418,chr1_16395_16418,11,0,11,"chr1_16395_CT,chr1_16397_C.,chr...",N-N-N-N-N-N-N-N-N-N-N,chr1_16395_CT,3,16,0.1875,57,19,0,2,0.144296,0.113256,0.0,0.011922,"0,1,2,3,4,5,6,7,8,9,10,11 53,4,...",-0.349446,0.557555,TestOK,0.37762,0.842234
1,DetectSeq_ATP8-DddA11_REP-1,chr1,16534,16540,chr1_16534_16540,4,1,3,"chr1_16534_CT,chr1_16538_CT,chr...",B-N-N-N,chr1_16538_CT,0,7,0.0,36,8,0,0,0.091135,0.047687,0.0,0.0,"0,1,2,3,4 36,0,0,0,0 8,0,0,0,0",-0.934408,,TestOK,0.841752,0.842234
2,DetectSeq_ATP8-DddA11_REP-1,chr1,20272,20284,chr1_20272_20284,5,0,5,"chr1_20272_GA,chr1_20275_GA,chr...",N-N-N-N-N,chr1_20283_GA,2,7,0.285714,12,9,0,2,0.030378,0.053648,0.0,0.011922,"0,1,2,3,4,5 11,1,0,0,0,0 5,2,0,...",0.820479,0.557555,TestOK,0.37762,0.842234
3,DetectSeq_ATP8-DddA11_REP-1,chr1,31029,31037,chr1_31029_31037,4,1,3,"chr1_31029_GA,chr1_31030_G.,chr...",B-N-N-N,chr1_31031_GA,1,6,0.166667,2,6,0,1,0.005063,0.035765,0.0,0.005961,"0,1,2,3,4 2,0,0,0,0 5,0,1,0,0",2.820479,-0.442445,TestOK,0.582932,0.842234
4,DetectSeq_ATP8-DddA11_REP-1,chr1,54043,54047,chr1_54043_54047,3,0,3,"chr1_54043_CT,chr1_54044_C.,chr...",N-N-N,chr1_54043_CT,4,4,1.0,0,4,0,0,0.0,0.023843,0.0,0.0,"0,1,2,3 0,0,0,0 0,4,0,0",-1.456561,,TestOK,0.841752,0.842234


In [35]:
# List the column names of the loaded table.
df.columns

Index(['<sample>', 'chr_name', 'region_start', 'region_end', 'mpmat_index', 'region_site_num', 'region_block_site_num', 'region_mut_site_num', 'region_site_index', 'region_block_state',
       'region_highest_site_index', 'region_highest_site_mut_num', 'region_highest_site_cover_num', 'region_highest_site_mut_ratio', 'ctrl_count', 'treat_count', 'ctrl_mut_count', 'treat_mut_count',
       'ctrl_count.norm', 'treat_count.norm', 'ctrl_mut_count.norm', 'treat_mut_count.norm', 'count_info', 'log2_FC', 'log2_FC_mut', 'test_state', 'p_value', 'FDR'],
      dtype='object')

Index(['<sample>', 'chr_name', 'region_start', 'region_end', 'mpmat_index', 'region_site_num', 'region_block_site_num', 'region_mut_site_num', 'region_site_index', 'region_block_state',
       'region_highest_site_index', 'region_highest_site_mut_num', 'region_highest_site_cover_num', 'region_highest_site_mut_ratio', 'ctrl_count', 'treat_count', 'ctrl_mut_count', 'treat_mut_count',
       'ctrl_count.norm', 'treat_count.norm', 'ctrl_mut_count.norm', 'treat_mut_count.norm', 'count_info', 'log2_FC', 'log2_FC_mut', 'test_state', 'p_value', 'FDR'],
      dtype='object')

In [36]:
# Remove the two per-site string columns in place.
# NOTE(review): presumably dropped to save memory on this large table, since
# the cells shown after this one do not reference them -- confirm.
df.drop(columns=['region_site_index', 'region_block_state'], inplace=True)

In [37]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()
# Per-column count of missing values.
print(df.isna().sum())
# NOTE: the NaNs presumably come from the 'test' sample, which is filtered
# out in a later cell.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12279920 entries, 0 to 12279919
Data columns (total 26 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   <sample>                       object 
 1   chr_name                       object 
 2   region_start                   int64  
 3   region_end                     int64  
 4   mpmat_index                    object 
 5   region_site_num                int64  
 6   region_block_site_num          int64  
 7   region_mut_site_num            int64  
 8   region_highest_site_index      object 
 9   region_highest_site_mut_num    int64  
 10  region_highest_site_cover_num  int64  
 11  region_highest_site_mut_ratio  float64
 12  ctrl_count                     int64  
 13  treat_count                    int64  
 14  ctrl_mut_count                 int64  
 15  treat_mut_count                int64  
 16  ctrl_count.norm                float64
 17  treat_count.norm               float64
 18  

In [38]:
# List the distinct sample labels present in the table.
print(df['<sample>'].unique())

['DetectSeq_ATP8-DddA11_REP-1' 'DetectSeq_ATP8-DddA6_REP-1'
 'DetectSeq_ATP8-DddAwt_REP-1' 'DetectSeq_JAK2-DddA11_REP-1'
 'DetectSeq_JAK2-DddA11_REP-2' 'DetectSeq_SIRT6-DddA11_REP-1'
 'DetectSeq_SIRT6-DddA11_REP-2' 'test']


In [39]:
# Add two columns, then drop the rows belonging to the 'test' sample:
#   bed_name - '<mpmat_index>_highest_<region_highest_site_index>'
#   strand   - constant '.'
df = (
    df
    .assign(
        bed_name=lambda d: (
            d['mpmat_index'] + '_highest_' + d['region_highest_site_index']
        ),
        strand='.',
    )
    .loc[lambda d: d['<sample>'] != 'test']
)
df

Unnamed: 0,<sample>,chr_name,region_start,region_end,mpmat_index,region_site_num,region_block_site_num,region_mut_site_num,region_highest_site_index,region_highest_site_mut_num,region_highest_site_cover_num,region_highest_site_mut_ratio,ctrl_count,treat_count,ctrl_mut_count,treat_mut_count,ctrl_count.norm,treat_count.norm,ctrl_mut_count.norm,treat_mut_count.norm,count_info,log2_FC,log2_FC_mut,test_state,p_value,FDR,bed_name,strand
0,DetectSeq_ATP8-DddA11_REP-1,chr1,16395,16418,chr1_16395_16418,11,0,11,chr1_16395_CT,3,16,0.187500,57,19,0,2,0.144296,0.113256,0.0,0.011922,"0,1,2,3,4,5,6,7,8,9,10,11 53,4,...",-0.349446,0.557555,TestOK,0.377620,0.842234,chr1_16395_16418_highest_chr1_1...,.
1,DetectSeq_ATP8-DddA11_REP-1,chr1,16534,16540,chr1_16534_16540,4,1,3,chr1_16538_CT,0,7,0.000000,36,8,0,0,0.091135,0.047687,0.0,0.000000,"0,1,2,3,4 36,0,0,0,0 8,0,0,0,0",-0.934408,,TestOK,0.841752,0.842234,chr1_16534_16540_highest_chr1_1...,.
2,DetectSeq_ATP8-DddA11_REP-1,chr1,20272,20284,chr1_20272_20284,5,0,5,chr1_20283_GA,2,7,0.285714,12,9,0,2,0.030378,0.053648,0.0,0.011922,"0,1,2,3,4,5 11,1,0,0,0,0 5,2,0,...",0.820479,0.557555,TestOK,0.377620,0.842234,chr1_20272_20284_highest_chr1_2...,.
3,DetectSeq_ATP8-DddA11_REP-1,chr1,31029,31037,chr1_31029_31037,4,1,3,chr1_31031_GA,1,6,0.166667,2,6,0,1,0.005063,0.035765,0.0,0.005961,"0,1,2,3,4 2,0,0,0,0 5,0,1,0,0",2.820479,-0.442445,TestOK,0.582932,0.842234,chr1_31029_31037_highest_chr1_3...,.
4,DetectSeq_ATP8-DddA11_REP-1,chr1,54043,54047,chr1_54043_54047,3,0,3,chr1_54043_CT,4,4,1.000000,0,4,0,0,0.000000,0.023843,0.0,0.000000,"0,1,2,3 0,0,0,0 0,4,0,0",-1.456561,,TestOK,0.841752,0.842234,chr1_54043_54047_highest_chr1_5...,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10744925,DetectSeq_SIRT6-DddA11_REP-2,chrX,155984850,155984857,chrX_155984850_155984857,2,0,2,chrX_155984850_CT,4,6,0.666667,0,7,0,0,0.000000,0.032510,0.0,0.000000,"0,1,2 0,0,0 2,5,0",-0.830575,,TestOK,0.838398,0.897853,chrX_155984850_155984857_highes...,.
10744926,DetectSeq_SIRT6-DddA11_REP-2,chrX,155984970,155984999,chrX_155984970_155984999,7,0,7,chrX_155984970_CT,0,8,0.000000,0,10,0,0,0.000000,0.046442,0.0,0.000000,"0,1,2,3,4,5,6,7 0,0,0,0,0,0,0,0...",-0.316002,,TestOK,0.838398,0.897853,chrX_155984970_155984999_highes...,.
10744927,DetectSeq_SIRT6-DddA11_REP-2,chrX,155985095,155985105,chrX_155985095_155985105,3,0,3,chrX_155985095_GA,4,4,1.000000,0,4,0,2,0.000000,0.018577,0.0,0.009288,"0,1,2,3 0,0,0,0 1,1,2,0",-1.637930,0.465867,TestOK,0.372391,0.897853,chrX_155985095_155985105_highes...,.
10744928,DetectSeq_SIRT6-DddA11_REP-2,chrX,155988525,155988531,chrX_155988525_155988531,4,0,4,chrX_155988525_CT,0,14,0.000000,0,15,0,0,0.000000,0.069663,0.0,0.000000,"0,1,2,3,4 0,0,0,0,0 15,0,0,0,0",0.268960,,TestOK,0.838398,0.897853,chrX_155988525_155988531_highes...,.


In [40]:
# Summarise dtypes / memory usage, then count the missing values per column.
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() would append a stray "None" line to the output.
df.info()
print(df.isna().sum())
# After filtering out the 'test' sample there are far fewer NaN values.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10744930 entries, 0 to 10744929
Data columns (total 28 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   <sample>                       object 
 1   chr_name                       object 
 2   region_start                   int64  
 3   region_end                     int64  
 4   mpmat_index                    object 
 5   region_site_num                int64  
 6   region_block_site_num          int64  
 7   region_mut_site_num            int64  
 8   region_highest_site_index      object 
 9   region_highest_site_mut_num    int64  
 10  region_highest_site_cover_num  int64  
 11  region_highest_site_mut_ratio  float64
 12  ctrl_count                     int64  
 13  treat_count                    int64  
 14  ctrl_mut_count                 int64  
 15  treat_mut_count                int64  
 16  ctrl_count.norm                float64
 17  treat_count.norm               float64
 18  

In [41]:
# 这里mpmat_index unique，即 frequency == 1 才合理，因为前期 call 点的时候，把所有的 region 放在一起考虑的
df.groupby('<sample>').mpmat_index.describe()
# call 点没问题
# 继续后续分析

Unnamed: 0_level_0,count,unique,top,freq
<sample>,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DetectSeq_ATP8-DddA11_REP-1,1534990,1534990,chr1_16395_16418,1
DetectSeq_ATP8-DddA6_REP-1,1534990,1534990,chr1_16395_16418,1
DetectSeq_ATP8-DddAwt_REP-1,1534990,1534990,chr1_16395_16418,1
DetectSeq_JAK2-DddA11_REP-1,1534990,1534990,chr1_16395_16418,1
DetectSeq_JAK2-DddA11_REP-2,1534990,1534990,chr1_16395_16418,1
DetectSeq_SIRT6-DddA11_REP-1,1534990,1534990,chr1_16395_16418,1
DetectSeq_SIRT6-DddA11_REP-2,1534990,1534990,chr1_16395_16418,1


## Find significant regions

In [42]:
# Display the filtered per-sample region table.
df

Unnamed: 0,<sample>,chr_name,region_start,region_end,mpmat_index,region_site_num,region_block_site_num,region_mut_site_num,region_highest_site_index,region_highest_site_mut_num,region_highest_site_cover_num,region_highest_site_mut_ratio,ctrl_count,treat_count,ctrl_mut_count,treat_mut_count,ctrl_count.norm,treat_count.norm,ctrl_mut_count.norm,treat_mut_count.norm,count_info,log2_FC,log2_FC_mut,test_state,p_value,FDR,bed_name,strand
0,DetectSeq_ATP8-DddA11_REP-1,chr1,16395,16418,chr1_16395_16418,11,0,11,chr1_16395_CT,3,16,0.187500,57,19,0,2,0.144296,0.113256,0.0,0.011922,"0,1,2,3,4,5,6,7,8,9,10,11 53,4,...",-0.349446,0.557555,TestOK,0.377620,0.842234,chr1_16395_16418_highest_chr1_1...,.
1,DetectSeq_ATP8-DddA11_REP-1,chr1,16534,16540,chr1_16534_16540,4,1,3,chr1_16538_CT,0,7,0.000000,36,8,0,0,0.091135,0.047687,0.0,0.000000,"0,1,2,3,4 36,0,0,0,0 8,0,0,0,0",-0.934408,,TestOK,0.841752,0.842234,chr1_16534_16540_highest_chr1_1...,.
2,DetectSeq_ATP8-DddA11_REP-1,chr1,20272,20284,chr1_20272_20284,5,0,5,chr1_20283_GA,2,7,0.285714,12,9,0,2,0.030378,0.053648,0.0,0.011922,"0,1,2,3,4,5 11,1,0,0,0,0 5,2,0,...",0.820479,0.557555,TestOK,0.377620,0.842234,chr1_20272_20284_highest_chr1_2...,.
3,DetectSeq_ATP8-DddA11_REP-1,chr1,31029,31037,chr1_31029_31037,4,1,3,chr1_31031_GA,1,6,0.166667,2,6,0,1,0.005063,0.035765,0.0,0.005961,"0,1,2,3,4 2,0,0,0,0 5,0,1,0,0",2.820479,-0.442445,TestOK,0.582932,0.842234,chr1_31029_31037_highest_chr1_3...,.
4,DetectSeq_ATP8-DddA11_REP-1,chr1,54043,54047,chr1_54043_54047,3,0,3,chr1_54043_CT,4,4,1.000000,0,4,0,0,0.000000,0.023843,0.0,0.000000,"0,1,2,3 0,0,0,0 0,4,0,0",-1.456561,,TestOK,0.841752,0.842234,chr1_54043_54047_highest_chr1_5...,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10744925,DetectSeq_SIRT6-DddA11_REP-2,chrX,155984850,155984857,chrX_155984850_155984857,2,0,2,chrX_155984850_CT,4,6,0.666667,0,7,0,0,0.000000,0.032510,0.0,0.000000,"0,1,2 0,0,0 2,5,0",-0.830575,,TestOK,0.838398,0.897853,chrX_155984850_155984857_highes...,.
10744926,DetectSeq_SIRT6-DddA11_REP-2,chrX,155984970,155984999,chrX_155984970_155984999,7,0,7,chrX_155984970_CT,0,8,0.000000,0,10,0,0,0.000000,0.046442,0.0,0.000000,"0,1,2,3,4,5,6,7 0,0,0,0,0,0,0,0...",-0.316002,,TestOK,0.838398,0.897853,chrX_155984970_155984999_highes...,.
10744927,DetectSeq_SIRT6-DddA11_REP-2,chrX,155985095,155985105,chrX_155985095_155985105,3,0,3,chrX_155985095_GA,4,4,1.000000,0,4,0,2,0.000000,0.018577,0.0,0.009288,"0,1,2,3 0,0,0,0 1,1,2,0",-1.637930,0.465867,TestOK,0.372391,0.897853,chrX_155985095_155985105_highes...,.
10744928,DetectSeq_SIRT6-DddA11_REP-2,chrX,155988525,155988531,chrX_155988525_155988531,4,0,4,chrX_155988525_CT,0,14,0.000000,0,15,0,0,0.000000,0.069663,0.0,0.000000,"0,1,2,3,4 0,0,0,0,0 15,0,0,0,0",0.268960,,TestOK,0.838398,0.897853,chrX_155988525_155988531_highes...,.


### strict selection 

In [146]:
# df_sign_strict = (
#     df
#     .query('FDR <= 0.0001')
#     .query('log2_FC_mut >= 2')
#     .query('ctrl_mut_count <= 1')
#     .query('`treat_mut_count.norm` * 100 >= 10')
#     .query('treat_mut_count >= 20')
#     .query('treat_mut_count / treat_count >= 0.15')
#     .query('region_block_site_num <= 1')
#     .query('region_highest_site_mut_ratio >= 0.35')  # 1892
# )

# print(df_sign_strict.shape[0])
# df_sign_strict.groupby('<sample>').mpmat_index.count()

In [147]:
# df_sign_strict.isna().sum().sum()

### lenient selection

In [148]:
# Lenient cut-offs. The earlier "nature conditions" variant of this filter
# (previously kept here as commented-out code, annotated "# 4007") was identical
# except that it required treat_mut_count / treat_count >= 0.15 and
# region_highest_site_mut_ratio >= 0.30; both thresholds are relaxed below.
lenient_conditions = [
    'FDR <= 0.01',
    'log2_FC_mut >= 2',
    'ctrl_mut_count <= 1',
    '`treat_mut_count.norm` * 100 >= 5',
    'treat_mut_count >= 10',
    'treat_mut_count / treat_count >= 0.10',  # TODO
    'region_block_site_num <= 1',
    'region_highest_site_mut_ratio >= 0.25',  # TODO
]

# Apply the conditions one after another, i.e. the same as chaining .query().
df_sign_lenient = df
for condition in lenient_conditions:
    df_sign_lenient = df_sign_lenient.query(condition)

# Total number of selected rows, then the number of selected regions per sample.
print(len(df_sign_lenient))
df_sign_lenient.groupby('<sample>')['mpmat_index'].count()

12349


<sample>
DetectSeq_ATP8-DddA11_REP-1      177
DetectSeq_ATP8-DddA6_REP-1       589
DetectSeq_ATP8-DddAwt_REP-1       57
DetectSeq_JAK2-DddA11_REP-1      628
DetectSeq_JAK2-DddA11_REP-2      667
DetectSeq_SIRT6-DddA11_REP-1    2801
DetectSeq_SIRT6-DddA11_REP-2    7430
Name: mpmat_index, dtype: int64

In [109]:
# Total number of missing values over all columns of the lenient selection.
df_sign_lenient.isna().sum().sum()

0

In [110]:
# df = df_sign_lenient
# Report the (rows, columns) shape of the lenient selection and preview it.
print(df_sign_lenient.shape)
df_sign_lenient.head()

(12349, 28)


Unnamed: 0,<sample>,chr_name,region_start,region_end,mpmat_index,region_site_num,region_block_site_num,region_mut_site_num,region_highest_site_index,region_highest_site_mut_num,region_highest_site_cover_num,region_highest_site_mut_ratio,ctrl_count,treat_count,ctrl_mut_count,treat_mut_count,ctrl_count.norm,treat_count.norm,ctrl_mut_count.norm,treat_mut_count.norm,count_info,log2_FC,log2_FC_mut,test_state,p_value,FDR,bed_name,strand
83,DetectSeq_ATP8-DddA11_REP-1,chr1,630967,631005,chr1_630967_631005,7,1,6,chr1_630992_GA,2975,4635,0.641855,38,6608,1,780,0.096198,39.389272,0.002532,4.64946,"0,1,2,3,4,5,6,7 35,2,1,0,0,0,0,...",8.677587,10.842847,TestOK,2.511695e-310,5.507753e-305,chr1_630967_631005_highest_chr1...,.
84,DetectSeq_ATP8-DddA11_REP-1,chr1,631016,631037,chr1_631016_631037,6,0,6,chr1_631036_CT,1732,4662,0.371514,33,6236,0,1189,0.08354,37.171837,0.0,7.087446,"0,1,2,3,4,5,6 33,0,0,0,0,0,0 31...",8.797528,9.773088,TestOK,0.0,0.0,chr1_631016_631037_highest_chr1...,.
89,DetectSeq_ATP8-DddA11_REP-1,chr1,631778,631791,chr1_631778_631791,3,0,3,chr1_631791_CT,733,1147,0.639058,40,1483,0,547,0.101261,8.839935,0.0,3.260583,"0,1,2,3 40,0,0,0 534,278,413,258",6.447891,8.652952,TestOK,2.915445e-214,4.972421e-209,chr1_631778_631791_highest_chr1...,.
95,DetectSeq_ATP8-DddA11_REP-1,chr1,632382,632387,chr1_632382_632387,2,0,2,chr1_632387_GA,4739,6544,0.724175,25,8609,0,1032,0.063288,51.316925,0.0,6.151593,"0,1,2 24,1,0 1601,5305,1703",9.66329,9.568782,TestOK,0.0,0.0,chr1_632382_632387_highest_chr1...,.
100,DetectSeq_ATP8-DddA11_REP-1,chr1,633067,633147,chr1_633067_633147,11,0,11,chr1_633100_GA,696,1232,0.564935,33,2479,0,572,0.08354,14.776938,0.0,3.409604,"0,1,2,3,4,5,6,7,8,9,10,11 33,0,...",7.466665,8.717426,TestOK,1.5963609999999997e-224,3.0629980000000005e-219,chr1_633067_633147_highest_chr1...,.


### 把 Nature 中的 region 对应过来，使用 Nature 中的 idx, fix mpmat_index (add old into it) -> df_pois

In [111]:
# Load only the region_id column of the older "FinallistV4" off-target table
# and summarise it (count / unique / top / freq).
old_table_path = "../tables/20220312-DdCBE-off_target_type.FinallistV4.CheckPrimer.AddV4ID.tsv"
df_old_id = pd.read_csv(old_table_path, sep='\t', header=0, usecols=['region_id'])
df_old_id['region_id'].describe()

count                     6881
unique                    6881
top       chr1_1471366_1471410
freq                         1
Name: region_id, dtype: object

In [112]:
# Turn every distinct region_id ("<chrom>_<start>_<end>") into BED-like
# chrom / start / end columns.
# - rsplit(n=2) splits from the right, so a contig name that itself contains
#   underscores stays in one piece and exactly three columns are produced.
# - drop_duplicates() + reset_index() replaces the index-aligned assignment of
#   a unique() Series, which would leave NaN rows (and make the int cast fail)
#   as soon as region_id contained a repeated value.
old_region_parts = (
    df_old_id['region_id']
    .drop_duplicates()
    .str.rsplit('_', n=2, expand=True)
    .reset_index(drop=True)
)
old_region_parts.columns = ['chrom', 'start', 'end']
df_old_id = old_region_parts.astype({'start': int, 'end': int})
df_old_id.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6881 entries, 0 to 6880
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   chrom   6881 non-null   object
 1   start   6881 non-null   int64 
 2   end     6881 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 161.4+ KB


In [113]:
# Peek at the first two parsed old regions.
df_old_id.head(2)

Unnamed: 0,chrom,start,end
0,chr1,1471366,1471410
1,chr1,1693068,1693084


In [114]:
# One row per distinct selected region, split into its three coordinate parts
# (columns 0, 1, 2 = chrom, start, end).
# rsplit(n=2) splits from the right so the result always has exactly three
# columns, even when the chromosome / contig name contains underscores.
df_new_id = (
    pd.Series(df_sign_lenient['mpmat_index'].unique())
    .str.rsplit('_', n=2, expand=True)
)
df_new_id

Unnamed: 0,0,1,2
0,chr1,630967,631005
1,chr1,631016,631037
2,chr1,631778,631791
3,chr1,632382,632387
4,chr1,633067,633147
...,...,...,...
9961,chrX,154031921,154031945
9962,chrX,154313332,154313399
9963,chrX,154679199,154679238
9964,chrX,154817582,154817611


In [115]:
# Name the three split columns and cast the coordinates to integers.
df_new_id.columns = ['chrom', 'start', 'end']
df_new_id[['start', 'end']] = df_new_id[['start', 'end']].astype(int)

# DataFrame.info() prints its own report and returns None, so it is not wrapped
# in print() (that only added a trailing "None" line to the cell output).
df_new_id.info()
df_new_id.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9966 entries, 0 to 9965
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   chrom   9966 non-null   object
 1   start   9966 non-null   int64 
 2   end     9966 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 233.7+ KB
None


Unnamed: 0,chrom,start,end
0,chr1,630967,631005
1,chr1,631016,631037


In [116]:
# Wrap both coordinate tables as BedTool objects: bed_new holds the newly
# selected regions, bed_nat the regions parsed from the old table.
bed_new, bed_nat = (
    BedTool.from_dataframe(frame) for frame in (df_new_id, df_old_id)
)

> https://bedtools.readthedocs.io/en/latest/content/tools/intersect.html

![](https://tva1.sinaimg.cn/large/008vxvgGly1h7jy4v629sj30ld0j9wg7.jpg)

In [117]:
# Left outer join of the new regions against the old ones: every new region is
# reported, paired with each old region it overlaps; when nothing overlaps the
# old-side columns are filled with '.', -1, -1 (as seen in the output below).
loj_columns = ['chrom', 'start', 'end', 'chrom2', 'start2', 'end2']
new_vs_old = bed_new.intersect(bed_nat, loj=True)
df_bed_to_fix = new_vs_old.to_dataframe()
df_bed_to_fix.columns = loj_columns

df_bed_to_fix

Unnamed: 0,chrom,start,end,chrom2,start2,end2
0,chr1,630967,631005,.,-1,-1
1,chr1,631016,631037,.,-1,-1
2,chr1,631778,631791,.,-1,-1
3,chr1,632382,632387,.,-1,-1
4,chr1,633067,633147,.,-1,-1
...,...,...,...,...,...,...
9963,chrX,154031921,154031945,.,-1,-1
9964,chrX,154313332,154313399,.,-1,-1
9965,chrX,154679199,154679238,.,-1,-1
9966,chrX,154817582,154817611,.,-1,-1


In [121]:
# Rows where a new region overlaps an old one (end2 == -1 means "no overlap").
# Original note: 434 of the new regions overlap an old region.
df_bed_to_fix.query('end2 != -1')

Unnamed: 0,chrom,start,end,chrom2,start2,end2
9,chr1,18907073,18907133,chr1,18907111,18907133
11,chr1,29332389,29332405,chr1,29332374,29332405
12,chr1,67532549,67532593,chr1,67532557,67532593
13,chr1,70133031,70133115,chr1,70133094,70133130
14,chr1,78617681,78617723,chr1,78617702,78617723
...,...,...,...,...,...,...
8156,chr12,117022857,117022923,chr12,117022835,117022888
8255,chr13,49219804,49219857,chr13,49219804,49219836
8933,chr16,78770741,78770746,chr16,78770741,78770752
8995,chr17,9576625,9576657,chr17,9576625,9576648


In [122]:
# Inspect df_old_share: one row per old region with its ND4 / ND5.1 / ND6 IDs.
df_old_share

Unnamed: 0,mpmat_index,id_ND4,id_ND5.1,id_ND6
0,chr1_2044988_2044998,,ND5.1-TAS.IND-1,
1,chr1_9702414_9702449,,ND5.1-TAS.IND-2,ND6-TAS.IND-1
2,chr1_12618165_12618174,,ND5.1-TAS.IND-3,
3,chr1_13019998_13020019,,,ND6-TAS.IND-2
4,chr1_13354705_13354738,,,ND6-TAS.IND-3
...,...,...,...,...
645,chrX_136925330_136925338,,,ND6-TAS.IND-538
646,chrX_138128211_138128237,,,ND6-TAS.IND-539
647,chrX_150683735_150683758,,ND5.1-TAS.IND-452,ND6-TAS.IND-540
648,chrX_153669436_153669453,,ND5.1-TAS.IND-453,ND6-TAS.IND-541


In [123]:
# Check how many of the overlapped old regions are ND4 / ND5.1 / ND6 "IND" sites.
# df_old_share.mpmat_index.unique()
# The old-side columns are selected by name rather than with .iloc[:, -3:]:
# a later cell appends an 'mpmat_index' column to df_bed_to_fix, after which
# "the last three columns" would no longer be chrom2 / start2 / end2 if this
# cell were run again.
tmpdf = (
    df_bed_to_fix
    .loc[df_bed_to_fix['end2'] != -1, ['chrom2', 'start2', 'end2']]
    .drop_duplicates(keep='first')
)
tmpdf

Unnamed: 0,chrom2,start2,end2
9,chr1,18907111,18907133
11,chr1,29332374,29332405
12,chr1,67532557,67532593
13,chr1,70133094,70133130
14,chr1,78617702,78617723
...,...,...,...
8156,chr12,117022835,117022888
8255,chr13,49219804,49219836
8933,chr16,78770741,78770752
8995,chr17,9576625,9576648


In [124]:
# Build "<chrom>_<start>_<end>" IDs for the overlapped old regions and keep the
# distinct ones.
# The columns are concatenated in one vectorised expression: the previous
# row-wise lambda indexed each row with x[0] / x[1] / x[2], i.e. positional
# access on a label-indexed Series (deprecated in recent pandas), and started
# parallel workers for only a few hundred rows.
_id_parts = tmpdf.astype(str)
tmpls = list(set(
    _id_parts.iloc[:, 0] + '_' + _id_parts.iloc[:, 1] + '_' + _id_parts.iloc[:, 2]
))
print(len(tmpls))

433


In [125]:
# Number of distinct old regions listed in df_old_share (NaN, if present, is
# counted as one value, exactly like len(unique())).
df_old_share['mpmat_index'].nunique(dropna=False)

650

In [126]:
# Count how many of the overlapped old region IDs also appear in df_old_share.
# The lookup set is built once; previously unique() was recomputed and scanned
# linearly for every single element of tmpls.
old_share_ids = set(df_old_share['mpmat_index'])
print(sum(region_id in old_share_ids for region_id in tmpls))

362


In [127]:
# Build the BED columns for the old ND4 / ND5.1 / ND6 regions.
# id_merge joins the three ID columns at positions 1-3 with '_'; a missing ID
# shows up as the text 'nan' (see the displayed table).
df_old_share['id_merge'] = df_old_share.iloc[:, 1:4].astype(str).agg('_'.join, axis=1)
# a / b / c = chrom / start / end. rsplit(n=2) splits from the right so that
# exactly three columns come back even if the contig name contains underscores.
df_old_share[list('abc')] = df_old_share['mpmat_index'].str.rsplit('_', n=2, expand=True)
df_old_share

Unnamed: 0,mpmat_index,id_ND4,id_ND5.1,id_ND6,id_merge,a,b,c
0,chr1_2044988_2044998,,ND5.1-TAS.IND-1,,nan_ND5.1-TAS.IND-1_nan,chr1,2044988,2044998
1,chr1_9702414_9702449,,ND5.1-TAS.IND-2,ND6-TAS.IND-1,nan_ND5.1-TAS.IND-2_ND6-TAS.IND-1,chr1,9702414,9702449
2,chr1_12618165_12618174,,ND5.1-TAS.IND-3,,nan_ND5.1-TAS.IND-3_nan,chr1,12618165,12618174
3,chr1_13019998_13020019,,,ND6-TAS.IND-2,nan_nan_ND6-TAS.IND-2,chr1,13019998,13020019
4,chr1_13354705_13354738,,,ND6-TAS.IND-3,nan_nan_ND6-TAS.IND-3,chr1,13354705,13354738
...,...,...,...,...,...,...,...,...
645,chrX_136925330_136925338,,,ND6-TAS.IND-538,nan_nan_ND6-TAS.IND-538,chrX,136925330,136925338
646,chrX_138128211_138128237,,,ND6-TAS.IND-539,nan_nan_ND6-TAS.IND-539,chrX,138128211,138128237
647,chrX_150683735_150683758,,ND5.1-TAS.IND-452,ND6-TAS.IND-540,nan_ND5.1-TAS.IND-452_ND6-TAS.I...,chrX,150683735,150683758
648,chrX_153669436_153669453,,ND5.1-TAS.IND-453,ND6-TAS.IND-541,nan_ND5.1-TAS.IND-453_ND6-TAS.I...,chrX,153669436,153669453


In [128]:
# Export the old regions as a headerless, tab-separated 4-column BED file:
# chrom (a), start (b), end (c) and the merged ID string.
old_bed_columns = ['a', 'b', 'c', 'id_merge']
df_old_share[old_bed_columns].to_csv(
    '../tables/2022-11-25_old_650_ND456.bed',
    sep='\t',
    header=False,
    index=False,
)

In [129]:
# end2 == -1 : no old region overlaps this new region -> keep the new coordinates.
# end2 != -1 : new and old regions overlap -> use the old region's coordinates.

# Unique identifier of the new region, built from its own coordinates.
df_bed_to_fix['mpmat_index'] = (
    df_bed_to_fix['chrom']
    + '_' + df_bed_to_fix['start'].astype(str)
    + '_' + df_bed_to_fix['end'].astype(str)
)

fixed_columns = ['mpmat_index', 'chr_name', 'region_start', 'region_end']
has_old_region = df_bed_to_fix['end2'] != -1

# Part 1: regions without an old counterpart keep their own coordinates.
df_bed_to_fix_part1 = df_bed_to_fix.loc[
    ~has_old_region, ['mpmat_index', 'chrom', 'start', 'end']
].copy()
df_bed_to_fix_part1.columns = fixed_columns

# Part 2: regions with an old counterpart take over the old coordinates.
df_bed_to_fix_part2 = df_bed_to_fix.loc[
    has_old_region, ['mpmat_index', 'chrom2', 'start2', 'end2']
].copy()
df_bed_to_fix_part2.columns = fixed_columns

# Stack both parts; the original row labels are kept.
df_bed_fixed_coordinate = pd.concat(
    [df_bed_to_fix_part1, df_bed_to_fix_part2],
    axis=0,
)
df_bed_fixed_coordinate

Unnamed: 0,mpmat_index,chr_name,region_start,region_end
0,chr1_630967_631005,chr1,630967,631005
1,chr1_631016_631037,chr1,631016,631037
2,chr1_631778_631791,chr1,631778,631791
3,chr1_632382_632387,chr1,632382,632387
4,chr1_633067_633147,chr1,633067,633147
...,...,...,...,...
8156,chr12_117022857_117022923,chr12,117022835,117022888
8255,chr13_49219804_49219857,chr13,49219804,49219836
8933,chr16_78770741_78770746,chr16,78770741,78770752
8995,chr17_9576625_9576657,chr17,9576625,9576648


In [130]:
# Two mpmat_index values turned out to be duplicated; print them to see why.
# print(df_bed_fixed_coordinate[df_bed_fixed_coordinate.mpmat_index.duplicated()])

for region_id in ('chr10_22989043_22989070', 'chr17_67963461_67963466'):
    print(df_bed_fixed_coordinate[df_bed_fixed_coordinate['mpmat_index'] == region_id])


#                  mpmat_index chr_name  region_start  region_end
# 864  chr10_22989043_22989070    chr10      22989049    22989070
# 865  chr10_22989043_22989070    chr10      22989052    22989071
#                   mpmat_index chr_name  region_start  region_end
# 2931  chr17_67963461_67963466    chr17      67963444    67963466
# 2932  chr17_67963461_67963466    chr17      67963458    67963466

# Each of these new regions is paired with two different old coordinates; per
# the original note this is because the final old list had not been settled,
# which produced the extra rows. For convenience only the first row is kept -
# remember to debug this later!
# DEBUG

df_bed_fixed_coordinate.drop_duplicates(
    subset='mpmat_index',
    keep='first',
    inplace=True,
)

df_bed_fixed_coordinate.head(2)

                 mpmat_index chr_name  region_start  region_end
947  chr10_22989043_22989070    chr10      22989049    22989070
948  chr10_22989043_22989070    chr10      22989052    22989071
                  mpmat_index chr_name  region_start  region_end
3491  chr17_67963461_67963466    chr17      67963444    67963466
3492  chr17_67963461_67963466    chr17      67963458    67963466


Unnamed: 0,mpmat_index,chr_name,region_start,region_end
0,chr1_630967_631005,chr1,630967,631005
1,chr1_631016_631037,chr1,631016,631037


In [131]:
# Attach the per-sample statistics of the selected regions to the corrected
# coordinates. The coordinate columns are removed from df_sign_lenient before
# merging, so only the corrected chr_name / region_start / region_end remain.
coordinate_columns = ['chr_name', 'region_start', 'region_end']
df_bed_fixed_coordinate2 = df_bed_fixed_coordinate.merge(
    df_sign_lenient.drop(columns=coordinate_columns),
    on='mpmat_index',
    how='left',
)

# Take the two columns out and re-insert them at fixed positions:
# '<sample>' first, 'mpmat_index' right after the three coordinate columns.
sample_names = df_bed_fixed_coordinate2.pop('<sample>')
mpmat_indexes = df_bed_fixed_coordinate2.pop('mpmat_index')
df_bed_fixed_coordinate2.insert(0, '<sample>', sample_names)
df_bed_fixed_coordinate2.insert(4, 'mpmat_index', mpmat_indexes)

# Fix mpmat_index: rebuild it from the corrected coordinates.
df_bed_fixed_coordinate2['mpmat_index'] = (
    df_bed_fixed_coordinate2['chr_name']
    + '_' + df_bed_fixed_coordinate2['region_start'].astype(str)
    + '_' + df_bed_fixed_coordinate2['region_end'].astype(str)
)

df_bed_fixed_coordinate2

Unnamed: 0,<sample>,chr_name,region_start,region_end,mpmat_index,region_site_num,region_block_site_num,region_mut_site_num,region_highest_site_index,region_highest_site_mut_num,region_highest_site_cover_num,region_highest_site_mut_ratio,ctrl_count,treat_count,ctrl_mut_count,treat_mut_count,ctrl_count.norm,treat_count.norm,ctrl_mut_count.norm,treat_mut_count.norm,count_info,log2_FC,log2_FC_mut,test_state,p_value,FDR,bed_name,strand
0,DetectSeq_ATP8-DddA11_REP-1,chr1,630967,631005,chr1_630967_631005,7,1,6,chr1_630992_GA,2975,4635,0.641855,38,6608,1,780,0.096198,39.389272,0.002532,4.649460,"0,1,2,3,4,5,6,7 35,2,1,0,0,0,0,...",8.677587,10.842847,TestOK,2.511695e-310,5.507753e-305,chr1_630967_631005_highest_chr1...,.
1,DetectSeq_ATP8-DddA11_REP-1,chr1,631016,631037,chr1_631016_631037,6,0,6,chr1_631036_CT,1732,4662,0.371514,33,6236,0,1189,0.083540,37.171837,0.000000,7.087446,"0,1,2,3,4,5,6 33,0,0,0,0,0,0 31...",8.797528,9.773088,TestOK,0.000000e+00,0.000000e+00,chr1_631016_631037_highest_chr1...,.
2,DetectSeq_ATP8-DddA6_REP-1,chr1,631016,631037,chr1_631016_631037,6,0,6,chr1_631036_CT,1546,3564,0.433782,33,4901,0,770,0.083540,31.497951,0.000000,4.948668,"0,1,2,3,4,5,6 33,0,0,0,0,0,0 24...",8.558576,9.254862,TestOK,4.024876e-307,1.029691e-301,chr1_631016_631037_highest_chr1...,.
3,DetectSeq_ATP8-DddA11_REP-1,chr1,631778,631791,chr1_631778_631791,3,0,3,chr1_631791_CT,733,1147,0.639058,40,1483,0,547,0.101261,8.839935,0.000000,3.260583,"0,1,2,3 40,0,0,0 534,278,413,258",6.447891,8.652952,TestOK,2.915445e-214,4.972421e-209,chr1_631778_631791_highest_chr1...,.
4,DetectSeq_ATP8-DddA6_REP-1,chr1,631778,631791,chr1_631778_631791,3,0,3,chr1_631787_CT,731,1055,0.692891,40,1447,0,467,0.101261,9.299640,0.000000,3.001335,"0,1,2,3 40,0,0,0 530,320,528,69",6.521031,8.533426,TestOK,3.072057e-182,6.736538e-177,chr1_631778_631791_highest_chr1...,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12344,DetectSeq_SIRT6-DddA11_REP-2,chr12,117022835,117022888,chr12_117022835_117022888,11,0,11,chr12_117022887_CT,47,96,0.489583,26,158,0,38,0.065819,0.733786,0.000000,0.176480,"0,1,2,3,4,5,6,7,8,9,10,11 25,1,...",3.478776,4.888107,TestOK,2.478798e-12,2.231631e-09,chr12_117022857_117022923_highe...,.
12345,DetectSeq_SIRT6-DddA11_REP-2,chr13,49219804,49219836,chr13_49219804_49219836,15,0,15,chr13_49219811_CT,15,55,0.272727,13,85,0,37,0.032910,0.394758,0.000000,0.171836,"0,1,2,3,4,5,6,7,8,9,10,11,12,13...",3.584386,4.401052,TestOK,2.131671e-11,1.643442e-08,chr13_49219804_49219857_highest...,.
12346,DetectSeq_SIRT6-DddA11_REP-2,chr16,78770741,78770752,chr16_78770741_78770752,4,0,4,chr16_78770741_CT,31,44,0.704545,12,49,0,22,0.030378,0.227567,0.000000,0.102173,"0,1,2,3,4 12,0,0,0,0 5,22,19,3,0",2.905182,3.927556,TestOK,7.988221e-07,2.144053e-04,chr16_78770741_78770746_highest...,.
12347,DetectSeq_SIRT6-DddA11_REP-2,chr17,9576625,9576648,chr17_9576625_9576648,12,0,12,chr17_9576625_CT,16,61,0.262295,23,85,0,18,0.058225,0.394758,0.000000,0.083596,"0,1,2,3,4,5,6,7,8,9,10,11,12 20...",2.761264,3.655176,TestOK,1.286750e-05,2.301233e-03,chr17_9576625_9576657_highest_c...,.


In [132]:
# A region should not be repeated within one sample.
# If it is, look carefully into the reason.
for sample, _df in df_bed_fixed_coordinate2.groupby('<sample>'):
    is_repeat = _df['mpmat_index'].duplicated()
    dup_num = is_repeat.sum()
    print(dup_num)

    if not dup_num:
        continue

    # Show the repeated rows (first six columns only) for this sample.
    print(f'sample={sample}\tdup_num={dup_num}')
    print(_df[is_repeat].iloc[:, 0:6])
    print()

0
0
0
1
sample=DetectSeq_JAK2-DddA11_REP-1	dup_num=1
                          <sample> chr_name  region_start  region_end              mpmat_index  region_site_num
11921  DetectSeq_JAK2-DddA11_REP-1    chr12      81451345    81451359  chr12_81451345_81451359                7

0
1
sample=DetectSeq_SIRT6-DddA11_REP-1	dup_num=1
                           <sample> chr_name  region_start  region_end              mpmat_index  region_site_num
11922  DetectSeq_SIRT6-DddA11_REP-1    chr12      81451345    81451359  chr12_81451345_81451359                7

1
sample=DetectSeq_SIRT6-DddA11_REP-2	dup_num=1
                           <sample> chr_name  region_start  region_end              mpmat_index  region_site_num
11923  DetectSeq_SIRT6-DddA11_REP-2    chr12      81451345    81451359  chr12_81451345_81451359                7



In [134]:
# This region was called twice because it had been treated as two separate
# regions: chr12_81451345_81451359.
# It is kept as it is in the downstream processing; no further de-duplication.
for region_id in ('chr1_9702414_9702449', 'chr12_81451345_81451359'):
    is_region = df_bed_fixed_coordinate2['mpmat_index'] == region_id
    print(df_bed_fixed_coordinate2[is_region].iloc[:, 0:7])


df_bed_fixed_coordinate2

                           <sample> chr_name  region_start  region_end           mpmat_index  region_site_num  region_block_site_num
11324   DetectSeq_JAK2-DddA11_REP-1     chr1       9702414     9702449  chr1_9702414_9702449                8                      0
11325   DetectSeq_JAK2-DddA11_REP-2     chr1       9702414     9702449  chr1_9702414_9702449                8                      0
11326  DetectSeq_SIRT6-DddA11_REP-1     chr1       9702414     9702449  chr1_9702414_9702449                8                      0
11327  DetectSeq_SIRT6-DddA11_REP-2     chr1       9702414     9702449  chr1_9702414_9702449                8                      0
                           <sample> chr_name  region_start  region_end              mpmat_index  region_site_num  region_block_site_num
11918   DetectSeq_JAK2-DddA11_REP-1    chr12      81451345    81451359  chr12_81451345_81451359               16                      0
11919  DetectSeq_SIRT6-DddA11_REP-1    chr12      81451345    8

Unnamed: 0,<sample>,chr_name,region_start,region_end,mpmat_index,region_site_num,region_block_site_num,region_mut_site_num,region_highest_site_index,region_highest_site_mut_num,region_highest_site_cover_num,region_highest_site_mut_ratio,ctrl_count,treat_count,ctrl_mut_count,treat_mut_count,ctrl_count.norm,treat_count.norm,ctrl_mut_count.norm,treat_mut_count.norm,count_info,log2_FC,log2_FC_mut,test_state,p_value,FDR,bed_name,strand
0,DetectSeq_ATP8-DddA11_REP-1,chr1,630967,631005,chr1_630967_631005,7,1,6,chr1_630992_GA,2975,4635,0.641855,38,6608,1,780,0.096198,39.389272,0.002532,4.649460,"0,1,2,3,4,5,6,7 35,2,1,0,0,0,0,...",8.677587,10.842847,TestOK,2.511695e-310,5.507753e-305,chr1_630967_631005_highest_chr1...,.
1,DetectSeq_ATP8-DddA11_REP-1,chr1,631016,631037,chr1_631016_631037,6,0,6,chr1_631036_CT,1732,4662,0.371514,33,6236,0,1189,0.083540,37.171837,0.000000,7.087446,"0,1,2,3,4,5,6 33,0,0,0,0,0,0 31...",8.797528,9.773088,TestOK,0.000000e+00,0.000000e+00,chr1_631016_631037_highest_chr1...,.
2,DetectSeq_ATP8-DddA6_REP-1,chr1,631016,631037,chr1_631016_631037,6,0,6,chr1_631036_CT,1546,3564,0.433782,33,4901,0,770,0.083540,31.497951,0.000000,4.948668,"0,1,2,3,4,5,6 33,0,0,0,0,0,0 24...",8.558576,9.254862,TestOK,4.024876e-307,1.029691e-301,chr1_631016_631037_highest_chr1...,.
3,DetectSeq_ATP8-DddA11_REP-1,chr1,631778,631791,chr1_631778_631791,3,0,3,chr1_631791_CT,733,1147,0.639058,40,1483,0,547,0.101261,8.839935,0.000000,3.260583,"0,1,2,3 40,0,0,0 534,278,413,258",6.447891,8.652952,TestOK,2.915445e-214,4.972421e-209,chr1_631778_631791_highest_chr1...,.
4,DetectSeq_ATP8-DddA6_REP-1,chr1,631778,631791,chr1_631778_631791,3,0,3,chr1_631787_CT,731,1055,0.692891,40,1447,0,467,0.101261,9.299640,0.000000,3.001335,"0,1,2,3 40,0,0,0 530,320,528,69",6.521031,8.533426,TestOK,3.072057e-182,6.736538e-177,chr1_631778_631791_highest_chr1...,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12344,DetectSeq_SIRT6-DddA11_REP-2,chr12,117022835,117022888,chr12_117022835_117022888,11,0,11,chr12_117022887_CT,47,96,0.489583,26,158,0,38,0.065819,0.733786,0.000000,0.176480,"0,1,2,3,4,5,6,7,8,9,10,11 25,1,...",3.478776,4.888107,TestOK,2.478798e-12,2.231631e-09,chr12_117022857_117022923_highe...,.
12345,DetectSeq_SIRT6-DddA11_REP-2,chr13,49219804,49219836,chr13_49219804_49219836,15,0,15,chr13_49219811_CT,15,55,0.272727,13,85,0,37,0.032910,0.394758,0.000000,0.171836,"0,1,2,3,4,5,6,7,8,9,10,11,12,13...",3.584386,4.401052,TestOK,2.131671e-11,1.643442e-08,chr13_49219804_49219857_highest...,.
12346,DetectSeq_SIRT6-DddA11_REP-2,chr16,78770741,78770752,chr16_78770741_78770752,4,0,4,chr16_78770741_CT,31,44,0.704545,12,49,0,22,0.030378,0.227567,0.000000,0.102173,"0,1,2,3,4 12,0,0,0,0 5,22,19,3,0",2.905182,3.927556,TestOK,7.988221e-07,2.144053e-04,chr16_78770741_78770746_highest...,.
12347,DetectSeq_SIRT6-DddA11_REP-2,chr17,9576625,9576648,chr17_9576625_9576648,12,0,12,chr17_9576625_CT,16,61,0.262295,23,85,0,18,0.058225,0.394758,0.000000,0.083596,"0,1,2,3,4,5,6,7,8,9,10,11,12 20...",2.761264,3.655176,TestOK,1.286750e-05,2.301233e-03,chr17_9576625_9576657_highest_c...,.


In [135]:
# Take an independent copy of the coordinate-corrected table as df_pois.
df_pois = df_bed_fixed_coordinate2.copy()
df_pois.head()

# Every region that also exists in the Nature data now has the same coordinates
# and mpmat_index as in Nature, so the Nature names can later be recovered by
# simply merging with the v4 table.

Unnamed: 0,<sample>,chr_name,region_start,region_end,mpmat_index,region_site_num,region_block_site_num,region_mut_site_num,region_highest_site_index,region_highest_site_mut_num,region_highest_site_cover_num,region_highest_site_mut_ratio,ctrl_count,treat_count,ctrl_mut_count,treat_mut_count,ctrl_count.norm,treat_count.norm,ctrl_mut_count.norm,treat_mut_count.norm,count_info,log2_FC,log2_FC_mut,test_state,p_value,FDR,bed_name,strand
0,DetectSeq_ATP8-DddA11_REP-1,chr1,630967,631005,chr1_630967_631005,7,1,6,chr1_630992_GA,2975,4635,0.641855,38,6608,1,780,0.096198,39.389272,0.002532,4.64946,"0,1,2,3,4,5,6,7 35,2,1,0,0,0,0,...",8.677587,10.842847,TestOK,2.511695e-310,5.507753e-305,chr1_630967_631005_highest_chr1...,.
1,DetectSeq_ATP8-DddA11_REP-1,chr1,631016,631037,chr1_631016_631037,6,0,6,chr1_631036_CT,1732,4662,0.371514,33,6236,0,1189,0.08354,37.171837,0.0,7.087446,"0,1,2,3,4,5,6 33,0,0,0,0,0,0 31...",8.797528,9.773088,TestOK,0.0,0.0,chr1_631016_631037_highest_chr1...,.
2,DetectSeq_ATP8-DddA6_REP-1,chr1,631016,631037,chr1_631016_631037,6,0,6,chr1_631036_CT,1546,3564,0.433782,33,4901,0,770,0.08354,31.497951,0.0,4.948668,"0,1,2,3,4,5,6 33,0,0,0,0,0,0 24...",8.558576,9.254862,TestOK,4.0248759999999995e-307,1.029691e-301,chr1_631016_631037_highest_chr1...,.
3,DetectSeq_ATP8-DddA11_REP-1,chr1,631778,631791,chr1_631778_631791,3,0,3,chr1_631791_CT,733,1147,0.639058,40,1483,0,547,0.101261,8.839935,0.0,3.260583,"0,1,2,3 40,0,0,0 534,278,413,258",6.447891,8.652952,TestOK,2.915445e-214,4.972421e-209,chr1_631778_631791_highest_chr1...,.
4,DetectSeq_ATP8-DddA6_REP-1,chr1,631778,631791,chr1_631778_631791,3,0,3,chr1_631787_CT,731,1055,0.692891,40,1447,0,467,0.101261,9.29964,0.0,3.001335,"0,1,2,3 40,0,0,0 530,320,528,69",6.521031,8.533426,TestOK,3.072057e-182,6.736538e-177,chr1_631778_631791_highest_chr1...,.


In [136]:
# Total number of missing values over all columns of df_pois.
df_pois.isna().sum().sum()

0

In [137]:
# df_pois.to_csv('/Users/zhaohuanan/Downloads/2022-11-25_for_home_use.csv.gz', index=None)
# Cache the preprocessed table on disk without the row index.
# index=False is the documented boolean for omitting the index; the previous
# index=None only had that effect because None is falsy.
df_pois.to_csv('../tables/2022-11-25_for_home_use.csv.gz', index=False)

In [138]:
# del df, df_mpmat, df_bed_fixed_coordinate2

In [140]:
# %reset

# 重载预处理好的数据 reload data -> df_pois

In [149]:
# df_pois = pd.read_csv('/Users/zhaohuanan/Downloads/2022-11-25_for_home_use.csv.gz', header=0, index_col=None)
# Reload the cached, preprocessed table: first line is the header, no index column.
pois_cache_path = '../tables/2022-11-25_for_home_use.csv.gz'
df_pois = pd.read_csv(pois_cache_path, index_col=None, header=0)

In [150]:
# Display the reloaded table.
df_pois

Unnamed: 0,<sample>,chr_name,region_start,region_end,mpmat_index,region_site_num,region_block_site_num,region_mut_site_num,region_highest_site_index,region_highest_site_mut_num,region_highest_site_cover_num,region_highest_site_mut_ratio,ctrl_count,treat_count,ctrl_mut_count,treat_mut_count,ctrl_count.norm,treat_count.norm,ctrl_mut_count.norm,treat_mut_count.norm,count_info,log2_FC,log2_FC_mut,test_state,p_value,FDR,bed_name,strand
0,DetectSeq_ATP8-DddA11_REP-1,chr1,630967,631005,chr1_630967_631005,7,1,6,chr1_630992_GA,2975,4635,0.641855,38,6608,1,780,0.096198,39.389272,0.002532,4.649460,"0,1,2,3,4,5,6,7 35,2,1,0,0,0,0,...",8.677587,10.842847,TestOK,2.511695e-310,5.507753e-305,chr1_630967_631005_highest_chr1...,.
1,DetectSeq_ATP8-DddA11_REP-1,chr1,631016,631037,chr1_631016_631037,6,0,6,chr1_631036_CT,1732,4662,0.371514,33,6236,0,1189,0.083540,37.171837,0.000000,7.087446,"0,1,2,3,4,5,6 33,0,0,0,0,0,0 31...",8.797528,9.773088,TestOK,0.000000e+00,0.000000e+00,chr1_631016_631037_highest_chr1...,.
2,DetectSeq_ATP8-DddA6_REP-1,chr1,631016,631037,chr1_631016_631037,6,0,6,chr1_631036_CT,1546,3564,0.433782,33,4901,0,770,0.083540,31.497951,0.000000,4.948668,"0,1,2,3,4,5,6 33,0,0,0,0,0,0 24...",8.558576,9.254862,TestOK,4.024876e-307,1.029691e-301,chr1_631016_631037_highest_chr1...,.
3,DetectSeq_ATP8-DddA11_REP-1,chr1,631778,631791,chr1_631778_631791,3,0,3,chr1_631791_CT,733,1147,0.639058,40,1483,0,547,0.101261,8.839935,0.000000,3.260583,"0,1,2,3 40,0,0,0 534,278,413,258",6.447891,8.652952,TestOK,2.915445e-214,4.972421e-209,chr1_631778_631791_highest_chr1...,.
4,DetectSeq_ATP8-DddA6_REP-1,chr1,631778,631791,chr1_631778_631791,3,0,3,chr1_631787_CT,731,1055,0.692891,40,1447,0,467,0.101261,9.299640,0.000000,3.001335,"0,1,2,3 40,0,0,0 530,320,528,69",6.521031,8.533426,TestOK,3.072057e-182,6.736538e-177,chr1_631778_631791_highest_chr1...,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12344,DetectSeq_SIRT6-DddA11_REP-2,chr12,117022835,117022888,chr12_117022835_117022888,11,0,11,chr12_117022887_CT,47,96,0.489583,26,158,0,38,0.065819,0.733786,0.000000,0.176480,"0,1,2,3,4,5,6,7,8,9,10,11 25,1,...",3.478776,4.888107,TestOK,2.478798e-12,2.231631e-09,chr12_117022857_117022923_highe...,.
12345,DetectSeq_SIRT6-DddA11_REP-2,chr13,49219804,49219836,chr13_49219804_49219836,15,0,15,chr13_49219811_CT,15,55,0.272727,13,85,0,37,0.032910,0.394758,0.000000,0.171836,"0,1,2,3,4,5,6,7,8,9,10,11,12,13...",3.584386,4.401052,TestOK,2.131671e-11,1.643442e-08,chr13_49219804_49219857_highest...,.
12346,DetectSeq_SIRT6-DddA11_REP-2,chr16,78770741,78770752,chr16_78770741_78770752,4,0,4,chr16_78770741_CT,31,44,0.704545,12,49,0,22,0.030378,0.227567,0.000000,0.102173,"0,1,2,3,4 12,0,0,0,0 5,22,19,3,0",2.905182,3.927556,TestOK,7.988221e-07,2.144053e-04,chr16_78770741_78770746_highest...,.
12347,DetectSeq_SIRT6-DddA11_REP-2,chr17,9576625,9576648,chr17_9576625_9576648,12,0,12,chr17_9576625_CT,16,61,0.262295,23,85,0,18,0.058225,0.394758,0.000000,0.083596,"0,1,2,3,4,5,6,7,8,9,10,11,12 20...",2.761264,3.655176,TestOK,1.286750e-05,2.301233e-03,chr17_9576625_9576657_highest_c...,.


In [151]:
df_pois.groupby('<sample>').mpmat_index.count()

<sample>
DetectSeq_ATP8-DddA11_REP-1      177
DetectSeq_ATP8-DddA6_REP-1       589
DetectSeq_ATP8-DddAwt_REP-1       57
DetectSeq_JAK2-DddA11_REP-1      628
DetectSeq_JAK2-DddA11_REP-2      667
DetectSeq_SIRT6-DddA11_REP-1    2801
DetectSeq_SIRT6-DddA11_REP-2    7430
Name: mpmat_index, dtype: int64

In [None]:
# 看一下有没有重合的
# 有的话去上一步 fix 掉
df_pois.groupby('<sample>').mpmat_index.describe()

In [None]:
# 这里 fix 一个连起来的点chr12_81451345_81451359
df_pois[df_pois['mpmat_index']!='chr12_81451345_81451359'].groupby('<sample>').mpmat_index.describe()

# [TODO] share analysis of different DdCBE treatment -> upset plot

## [TODO]plot upset-plot: share info

In [None]:
# %%R
# 使用ggpubr包中的ggarrange()函数来排版多个图形：
# ggarrange(bxp, dp, bp + rremove("x.text"), labels=c('A', 'B', 'C'), ncol=2, nrow=2)

# 这个简书写的太好了！
# https://www.jianshu.com/p/c154ca35530b
# 注释排版的图形
# 对齐绘图区
# 更改图形的行列跨度
# 使用cowplot包中的draw_plot_label()函数注释图形


# %%R -i df2r
# df = as.data.frame(df2r)
# # head(df, 10)

# g_a = ggplot(data=df) +
#     geom_point(
#         mapping=aes(
#             x=ATP8_DddAwt_1,
#             y=ATP8_DddA6_1,
#             # alpha=0.99,
#             # size=1,
#             color=color,
#             fill=color,
#         )
#     ) +
#     scale_color_manual(values=c("#ff9f1c", "#447B9D", "#e36414")) +
#     scale_x_continuous(
#         name="Detect-seq_signals of ATP8_DddAwt_1",
#         limits=c(0, 0.5),
#     ) +
#     scale_y_continuous(
#         name="Detect-seq_signals of ATP8_DddA6_1",
#         limits=c(0, 0.5),
#     ) +
#     geom_abline(intercept=0, slope=1, linetype="dashed", color="#333333") +
#     annotate("text", x=40, y=9, hjust=0, parse=F, colour="black", label=sprintf("N = %i", dim(df)[1])) +
#     ggtitle("ATP8_DddAwt_1 v.s. ATP8_DddA6_1") +
#     theme_classic() +
#     theme(plot.title=element_text(hjust=0.5))

# ggsave('2022-10-30_Detect-seq_scatter-plot_ATP8_DddAwt_1v.s.ATP8_DddA6_1.pdf', plot=g_a, width=4.6, height=4)


# df = df %>%
#     group_by(color) %>% summarise(count=n()) %>%
#     mutate(
#         group=color,
#         csum=rev(cumsum(rev(count))),
#         total=sum(count),
#         ratio=count/total*100,
#         pos=count/2 + lead(csum, 1),
#         pos=if_else(is.na(pos), count/2, pos)
#     )
# g_b = ggplot(data=df, mapping=aes(x="", y=count, fill=color, group=fct_inorder(color))) +
#     geom_col(color="white") +
#     geom_text(
#         aes(label=sprintf("%i\n%.0f%%", count, ratio)),  # 标注
#         position=position_stack(vjust=0.5),
#         size=3
#     ) +
#     coord_polar(theta="y") +
#     scale_fill_brewer(palette="Pastel2") +
#     ggtitle("Pie: ATP8_DddAwt_1 v.s. ATP8_DddA6_1") +
#     theme_void()

# ggsave('2022-10-30_Detect-seq_pie-plot_ATP8_DddAwt_1v.s.ATP8_DddA6_1.pdf', plot=g_b, width=5.2, height=4)
# print(g_a)
# print(g_b)

In [None]:
# df_upset = df_wide_score.set_index('mpmat_index').applymap(lambda x: True if x >= 0.001 else False)
# # .T
# df_upset

In [None]:
# df_upset

In [None]:
# %%R -i df_upset

# df_upset = tibble(df_upset)

# g = df_upset %>%
#     group_by(OTs) %>%
#     summarize(Treatments=list(Treatment)) %>%
#     ggplot(aes(x=Treatments)) +
#         geom_bar() +
#         scale_x_upset() +
#         scale_y_log10(
#             breaks=c(10, 20, 30, 40, 50, 100, 200, 300, 400, 500, 2000, 2500)
#         ) +
#         ylab('log10_OTs\' count') +
#         ggtitle('Off-target sharing of different DdCBEs')
#         theme_bw()
# ggsave('2022-10-30_Detect-seq_upset-plot_all.pdf', plot=g, width=8, height=9)
# g

In [None]:
# %%R
# g = ggplot(df_upset) +
#     geom_bar(
#         aes(x=Treatment, fill=Treatment, group=Treatment),
#         stat='count',
#         width=0.8
#     ) +
#     scale_y_continuous(
#         limits=c(0, 2800),
#         breaks=c(0, 50, 100, 200, 500, 1000, 2000, 2650)
#     ) +
#     ylab("total counts") +
#     ggtitle("Total OTs induced by DdCBEs") +
#     coord_flip() +
#     theme_classic() +
#     theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1, size=6))

# ggsave('2022-10-30_Detect-seq_bar-plot_all.pdf', plot=g, width=8, height=4)
# g

In [None]:
# %%R
# df_upset %>%
#     group_by(OTs) %>%
#     filter(grepl("ATP8", Treatment))

In [None]:
# %%R
# g = df_upset %>%
#     group_by(OTs) %>%
#     filter(grepl("ATP8", Treatment)) %>%
#     summarize(Treatments=list(Treatment)) %>%
#     ggplot(aes(x=Treatments)) +
#         geom_bar() +
#         scale_x_upset() +
#         scale_y_continuous(
#             breaks=seq(0, 400, 50)
#         ) +
#         ggtitle('Off-target sharing of different ATP8-DdCBEs')
#         theme_bw()
# ggsave('2022-10-30_Detect-seq_upset-plot_all_ATP8.pdf', plot=g, width=8, height=9)
# g

# Detect-seq signal comparison -> scatter plot

## data processing

In [None]:
df_pois

In [None]:
df_pois.info()

In [None]:
df_pois['test_state'].describe()

In [None]:
df_pois_sel = df_pois[['<sample>', 'mpmat_index',
                       'treat_mut_count.norm', 'treat_count.norm']]
df_pois_sel

In [None]:
# 计算detect_seq_score
def calculate_detect_seq_score(x):
    """Return the Detect-seq score for one region row.

    The score is ``(mut / total) ** 2 * mut * 100`` where ``mut`` is
    ``x['treat_mut_count.norm']`` and ``total`` is ``x['treat_count.norm']``.

    Args:
        x: Mapping-like row (e.g. a pandas Series passed by ``apply`` with
            ``axis=1``) providing the keys ``'treat_mut_count.norm'`` and
            ``'treat_count.norm'``.

    Returns:
        The numeric score for the row.

    Raises:
        ValueError: If ``x['treat_count.norm']`` is 0, because the mutation
            ratio is undefined; the offending row is included in the message.
    """
    treat_mut_count_norm = x['treat_mut_count.norm']
    treat_count_norm = x['treat_count.norm']

    # Validate before dividing so a zero denominator is reported explicitly
    # instead of producing inf/nan (NumPy) or ZeroDivisionError (plain floats).
    if treat_count_norm == 0:
        raise ValueError(
            f"treat_count.norm is 0; cannot compute detect_seq_score for row:\n{x}"
        )

    treat_mut_ratio = treat_mut_count_norm / treat_count_norm
    return (treat_mut_ratio ** 2) * treat_mut_count_norm * 100


# Score every region row in parallel (pandarallel), then attach the scores
# together with a plainly named copy of the '<sample>' column.
_detect_seq_scores = df_pois_sel.parallel_apply(calculate_detect_seq_score, axis=1)

df_pois_sel = df_pois_sel.assign(
    sample=df_pois_sel['<sample>'],
    detect_seq_score=_detect_seq_scores,
)

df_pois_sel

In [None]:
df_detect_seq_score = df_pois_sel[['mpmat_index', 'sample', 'detect_seq_score']].copy()
df_detect_seq_score

In [None]:
df_detect_seq_score.detect_seq_score.describe()

In [None]:
df_detect_seq_score['cut'] = pd.cut(df_detect_seq_score.detect_seq_score, bins=range(0,180, 10))
df_detect_seq_score

In [None]:
df_detect_seq_score.groupby(['sample', 'cut']).count().reset_index()

In [None]:
# 画个图了解一下 Detect-seq signal 的分布情况
# Draw one bar chart per sample to get a feel for how the Detect-seq
# scores are distributed across the score bins.
_bin_counts = df_detect_seq_score.groupby(['sample', 'cut']).count().reset_index()
for sample, _df in _bin_counts.groupby('sample'):
    _df = _df.reset_index()
    # Label each bin by its upper edge, e.g. (0, 10] -> 10
    _df['cut'] = _df['cut'].map(lambda interval: interval.right).astype(int)
    ax = _df.plot(kind='bar', x='cut', y='detect_seq_score', title=sample)
    ax.set_xlabel('score')
    ax.set_ylabel('count')

In [None]:
_df.plot?

In [None]:
# Build a wide table: one row per unique region (mpmat_index) and one
# `score_<sample>` column per sample holding that sample's Detect-seq score.
df_wide = pd.DataFrame(
    df_detect_seq_score.mpmat_index.unique(),
    columns=['mpmat_index'])


for _dfn, _df in df_detect_seq_score.groupby('sample'):
    print(_dfn)
    # Pick the needed columns by name rather than by position, so that extra
    # columns on df_detect_seq_score (such as the 'cut' bin column added in an
    # earlier cell) neither break the rename nor shift the selection.
    _df_score = _df[['mpmat_index', 'detect_seq_score']].rename(
        columns={'detect_seq_score': f'score_{_dfn}'})
    # Left merge: regions not called in this sample keep NaN in its column.
    df_wide = pd.merge(
        left=df_wide, right=_df_score,
        on='mpmat_index', how='left')

# Missing scores are deliberately left as NaN here (alternative: fill with 0)
# df_wide_score = df_wide.fillna(0)
df_wide_score = df_wide
df_wide_score
# df_wide

In [None]:
# 重命名列名
# Rename the columns to short sample labels.
# The assignment is positional: the first name is the region key and the
# remaining seven are applied to the per-sample score columns in their
# existing left-to-right order.
# NOTE(review): this presumably relies on the score columns having been added
# in sorted sample-name order by the groupby loop that builds df_wide —
# confirm the order before trusting the labels.
df_wide_score.columns = [
    "mpmat_index",
    "ATP8_DddA11_1",
    "ATP8_DddA6_1",
    "ATP8_DddAwt_1",
    "JAK2_DddA11_1",
    "JAK2_DddA11_2",
    "SIRT6_DddA11_1",
    "SIRT6_DddA11_2",
]

df_wide_score

In [None]:
# df_wide_score = df_wide_score[df_wide_score.iloc[:,1:].sum(axis=1) > 0.1].copy()
# df_wide_score
df_wide_score.query('mpmat_index=="chr2_59759874_59759954"')

In [None]:
df_pois.query('mpmat_index=="chr2_59759874_59759954"')

In [None]:
df.query('mpmat_index=="chr2_59759874_59759954"')

In [None]:
def map_color(x, threshold=0.01):
    """Classify a row by which of its two score columns reach the threshold.

    The row is read positionally: position 0 is skipped (the region id in
    the callers of this notebook), position 1 is sample "a" and position 2
    is sample "b".

    Args:
        x: Row (pandas Series) with at least three positions.
        threshold: Minimum score for a sample to count as present.
            Defaults to 0.01.

    Returns:
        ``'a'``, ``'b'`` or ``'ab'`` depending on which scores are
        ``>= threshold``; ``np.nan`` when neither is (a NaN score never
        passes the comparison).
    """
    color = ''
    if x.iloc[1] >= threshold:
        color += 'a'
    if x.iloc[2] >= threshold:
        color += 'b'

    # np.nan (lower case) is the spelling that exists in every NumPy release;
    # the np.NaN alias was removed in NumPy 2.0.
    return color if color else np.nan

## plot comparison between different DddA variants

### {scatter plot} ATP8_DddAwt_1 v.s. ATP8_DddA6_1

In [None]:
# load data in R env
df2r = df_wide_score[['mpmat_index', 'ATP8_DddAwt_1', 'ATP8_DddA6_1']].copy()
df2r['color'] = df2r.parallel_apply(map_color, axis=1)
df2r = df2r.query('color.notnull()')
df2r

In [None]:
df2r.color.value_counts()

In [None]:
df2r.query('mpmat_index=="chr2_88803448_88803454"')

In [None]:
# 有可疑的点在这里 check
# df_pois.query('mpmat_index=="chr22_13878035_13878043"')

In [None]:
# 因为图放不下，所以临时 fix 以下
# df2r.iloc[5, 1:3] = df2r.iloc[5, 1:3] / 2

In [None]:
# df2r.iloc[5, 1:3]

In [None]:
%%R -i df2r -w 500 -h 500
df = tibble(df2r)
# head(df, 10)

# Scatter plot of the per-region Detect-seq scores of the two samples,
# coloured by the `color` class column of df2r; the dashed line is y = x.
g = ggplot(data=df) +
    geom_point(
        mapping=aes(
            x=ATP8_DddAwt_1,
            y=ATP8_DddA6_1,
            # size=1,
            color=color,
            fill=color,
        ),
        # A constant transparency is a fixed setting, so it belongs outside
        # aes(); inside aes() it was treated as data and added an "alpha" legend.
        alpha=0.99
    ) +
    scale_color_manual(values=c("#ff9f1c", "#447B9D", "#e36414")) +
    scale_x_continuous(
        name="Detect-seq_signals of ATP8_DddAwt_1",
        # limits=c(0, 66),
    ) +
    scale_y_continuous(
        name="Detect-seq_signals of ATP8_DddA6_1",
        # limits=c(0, 66),
    ) +
    geom_abline(intercept=0, slope=1, linetype="dashed", color="#333333") +
    annotate("text", x=40, y=9, hjust=0, parse=F, colour="black", label=sprintf("N = %i", dim(df)[1])) +
    ggtitle("ATP8_DddAwt_1 v.s. ATP8_DddA6_1") +
    theme_classic() +
    theme(plot.title=element_text(hjust=0.5))

ggsave('2022-10-30_Detect-seq_scatter-plot_ATP8_DddAwt_1v.s.ATP8_DddA6_1.pdf', plot=g, width=4.6, height=4)
g

In [None]:
# %%R

# g = ggplot(data=df) +
#     geom_point(
#         mapping=aes(
#             x=ATP8_DddAwt_1,
#             y=ATP8_DddA6_1,
#             # alpha=0.99,
#             # size=1,
#             color=color,
#             fill=color,
#             # shape=color,
#             # group=color,
#         )
#     ) +
#     scale_color_manual(values=c("#ff9f1c", "#447B9D", "#e36414")) +
#     scale_x_log10(
#         name="log10_Detect-seq_signals of ATP8_DddAwt_1",
#         limits=c(0.01, 1.5),
#         # breaks=c(0.0001, 0.001, 0.01, 0.1, 1),
#     ) +
#     scale_y_log10(
#         name="log10_Detect-seq_signals of ATP8_DddA6_1",
#         limits=c(0.01, 1.5),
#         # breaks=c(0.01, 0.1, 1, 10, 100, 1000),
#     ) +
#     geom_abline(intercept=0, slope=1, linetype="dashed", color="#333333") +
#     annotate("text", x=40, y=9, hjust=0, parse=F, colour="black", label=sprintf("N = %i", dim(df)[1])) +
#     ggtitle("log10_ATP8_DddAwt_1 v.s. ATP8_DddA6_1") +
#     theme_classic() +
#     theme(plot.title=element_text(hjust=0.5))

# ggsave('2022-10-30_Detect-seq_scatter-plot_log10_ATP8_DddAwt_1v.s.ATP8_DddA6_1.pdf', plot=g, width=4.6, height=4)
# g

#### 问题：关于 wt 独立的点需要 check 一下 IGV

In [None]:
df_igv = df2r.query('color=="a"').copy()
df_igv[list('abc')] = df_igv.mpmat_index.str.split('_', expand=True)
print(df_igv.shape[0])
df_igv

In [None]:
df_pois.query('mpmat_index=="chr2_59759874_59759954"')

In [None]:
df_igv = df_igv.merge(df_old_share, how='left')
df_igv

In [None]:
df_igv = df_igv[['a', 'b', 'c', 'id_merge']].fillna('nan').copy()
df_igv[['b', 'c']] = df_igv[['b', 'c']].astype(int)
df_igv

In [None]:
df_igv

In [None]:
# Run-specific settings for the IGV batch script

path_out = '/Volumes/Data-a/Bio/3.project/2022_DdCBE-3D-Genome_topic/2022-09-30_Detect-seq_batch-1/igv'
date = 20221125
format_ = "png"
height = 1500

df_snapshot = df_igv.iloc[:, 0:4]

# Script header first, then one goto/snapshot stanza per region:
# a 200 bp window centred on the region midpoint.
_script_parts = [f"maxPanelHeight {height}\nsnapshotDirectory {path_out}/off-targets_{date}\n\n"]
for chrom, start, stop, bed_name in df_snapshot.itertuples(index=False, name=None):
    path_out_png = f'{chrom}_{start}_{stop}_{bed_name}.snapshot.{format_}'
    middle = int((start + stop) / 2)
    _script_parts.append(
        f"goto {chrom}:{middle - 100}-{middle + 100}\nsort position\nexpand\nviewaspairs\nsnapshot {path_out_png}\n\n"
    )

text = "".join(_script_parts)
print(text[:1000])


# Write the batch script next to the snapshot directory
with open(f'{path_out}/{date}_off-targets_snapshot.igv_shot_script', 'wt') as f:
    f.write(text)

### {scatter plot} ATP8_DddAwt_1 vs ATP8_DddA11_1

In [None]:
# load data in R env
df2r = df_wide_score[['mpmat_index', 'ATP8_DddAwt_1', 'ATP8_DddA11_1']].copy()
df2r['color'] = df2r.apply(map_color, axis=1)
df2r = df2r.query('color.notnull()')
df2r

In [None]:
%%R -i df2r -w 500 -h 500
df = as.data.frame(df2r)
# head(df, 10)

# Scatter plot of the per-region Detect-seq scores of the two samples,
# coloured by the `color` class column of df2r; the dashed line is y = x.
g = ggplot(data=df) +
    geom_point(
        mapping=aes(
            x=ATP8_DddAwt_1,
            y=ATP8_DddA11_1,
            # alpha=0.99,
            # size=1,
            color=color,
            fill=color,
            # shape=color,
            # group=color,
        )
    ) +
    scale_color_manual(values=c("#ff9f1c", "#447B9D", "#e36414")) +
    scale_x_continuous(
        name="Detect-seq_signals of ATP8_DddAwt_1",
        limits=c(0, 0.5),
    ) +
    scale_y_continuous(
        name="Detect-seq_signals of ATP8_DddA11_1",
        limits=c(0, 0.5),
    ) +
    geom_abline(intercept=0, slope=1, linetype="dashed", color="#333333") +
    # Anchor the N label to the top-left panel corner: the former position
    # (x=40, y=9) lies outside the 0-0.5 axis limits, so the label was dropped.
    annotate("text", x=-Inf, y=Inf, hjust=-0.1, vjust=1.5, parse=F, colour="black", label=sprintf("N = %i", dim(df)[1])) +
    ggtitle("ATP8_DddAwt_1 v.s. ATP8_DddA11_1") +
    theme_classic() +
    theme(plot.title=element_text(hjust=0.5))

ggsave('2022-10-30_Detect-seq_scatter-plot_ATP8_DddAwt_1v.s.ATP8_DddA11_1.pdf', plot=g, width=4.6, height=4)
g

In [None]:
# %%R

# g = ggplot(data=df) +
#     geom_point(
#         mapping=aes(
#             x=ATP8_DddAwt_1,
#             y=ATP8_DddA11_1,
#             # alpha=0.99,
#             # size=1,
#             color=color,
#             fill=color,
#             # shape=color,
#             # group=color,
#         )
#     ) +
#     scale_color_manual(values=c("#ff9f1c", "#447B9D", "#e36414")) +
#     scale_x_log10(
#         name="log10_Detect-seq_signals of ATP8_DddAwt_1",
#         limits=c(0.01, 1.5),
#         # breaks=c(0.0001, 0.001, 0.01, 0.1, 1),
#     ) +
#     scale_y_log10(
#         name="log10_Detect-seq_signals of ATP8_DddA11_1",
#         limits=c(0.01, 1.5),
#         # breaks=c(0.01, 0.1, 1, 10, 100, 1000),
#     ) +
#     geom_abline(intercept=0, slope=1, linetype="dashed", color="#333333") +
#     annotate("text", x=40, y=9, hjust=0, parse=F, colour="black", label=sprintf("N = %i", dim(df)[1])) +
#     ggtitle("log10_ATP8_DddAwt_1 v.s. ATP8_DddA11_1") +
#     theme_classic() +
#     theme(plot.title=element_text(hjust=0.5))

# ggsave('2022-10-30_Detect-seq_scatter-plot_log10_ATP8_DddAwt_1v.s.ATP8_DddA11_1.pdf', plot=g, width=4.6, height=4)
# g

### {scatter plot} ATP8_DddA6_1 vs ATP8_DddA11_1

In [None]:
# load data in R env
df2r = df_wide_score[['mpmat_index', 'ATP8_DddA6_1', 'ATP8_DddA11_1']].copy()
df2r['color'] = df2r.apply(map_color, axis=1)
df2r = df2r.query('color.notnull()')
df2r

In [None]:
%%R -i df2r
df = as.data.frame(df2r)
# head(df, 10)

# Scatter plot of the per-region Detect-seq scores of the two samples,
# coloured by the `color` class column of df2r; the dashed line is y = x.
g = ggplot(data=df) +
    geom_point(
        mapping=aes(
            x=ATP8_DddA6_1,
            y=ATP8_DddA11_1,
            # alpha=0.99,
            # size=1,
            color=color,
            fill=color,
            # shape=color,
            # group=color,
        )
    ) +
    scale_color_manual(values=c("#ff9f1c", "#447B9D", "#e36414")) +
    scale_x_continuous(
        name="Detect-seq_signals of ATP8_DddA6_1",
        limits=c(0, 0.5),
    ) +
    scale_y_continuous(
        name="Detect-seq_signals of ATP8_DddA11_1",
        limits=c(0, 0.5),
    ) +
    geom_abline(intercept=0, slope=1, linetype="dashed", color="#333333") +
    # Anchor the N label to the top-left panel corner: the former position
    # (x=40, y=9) lies outside the 0-0.5 axis limits, so the label was dropped.
    annotate("text", x=-Inf, y=Inf, hjust=-0.1, vjust=1.5, parse=F, colour="black", label=sprintf("N = %i", dim(df)[1])) +
    ggtitle("ATP8_DddA6_1 v.s. ATP8_DddA11_1") +
    theme_classic() +
    theme(plot.title=element_text(hjust=0.5))

ggsave('2022-10-30_Detect-seq_scatter-plot_ATP8_DddA6_1v.s.ATP8_DddA11_1.pdf', plot=g, width=4.6, height=4)
g

In [None]:
# %%R

# g = ggplot(data=df) +
#     geom_point(
#         mapping=aes(
#             x=ATP8_DddA6_1,
#             y=ATP8_DddA11_1,
#             # alpha=0.99,
#             # size=1,
#             color=color,
#             fill=color,
#             # shape=color,
#             # group=color,
#         )
#     ) +
#     scale_color_manual(values=c("#ff9f1c", "#447B9D", "#e36414")) +
#     scale_x_log10(
#         name="log10_Detect-seq_signals of ATP8_DddA6_1",
#         limits=c(0.01, 1.5),
#         # breaks=c(0.0001, 0.001, 0.01, 0.1, 1),
#     ) +
#     scale_y_log10(
#         name="log10_Detect-seq_signals of ATP8_DddA11_1",
#         limits=c(0.01, 1.5),
#         # breaks=c(0.01, 0.1, 1, 10, 100, 1000),
#     ) +
#     geom_abline(intercept=0, slope=1, linetype="dashed", color="#333333") +
#     annotate("text", x=40, y=9, hjust=0, parse=F, colour="black", label=sprintf("N = %i", dim(df)[1])) +
#     ggtitle("log10_ATP8_DddA6_1 v.s. ATP8_DddA11_1") +
#     theme_classic() +
#     theme(plot.title=element_text(hjust=0.5))

# ggsave('2022-10-30_Detect-seq_scatter-plot_log10_ATP8_DddA6_1v.s.ATP8_DddA11_1.pdf', plot=g, width=4.6, height=4)
# g

### plot ATP8_DddA11_1 vs JAK2_DddA11_1

In [None]:
# load data in R env
df2r = df_wide_score[['mpmat_index', 'ATP8_DddA11_1', 'JAK2_DddA11_1']].copy()
df2r['color'] = df2r.apply(map_color, axis=1)
df2r = df2r.query('color.notnull()')
df2r

In [None]:
%%R -i df2r
df = as.data.frame(df2r)
# head(df, 10)

# Scatter plot of the per-region Detect-seq scores of the two samples,
# coloured by the `color` class column of df2r; the dashed line is y = x.
g = ggplot(data=df) +
    geom_point(
        mapping=aes(
            x=ATP8_DddA11_1,
            y=JAK2_DddA11_1,
            # alpha=0.99,
            # size=1,
            color=color,
            fill=color,
            # shape=color,
            # group=color,
        )
    ) +
    scale_color_manual(values=c("#ff9f1c", "#447B9D", "#e36414")) +
    scale_x_continuous(
        name="Detect-seq_signals of ATP8_DddA11_1",
        limits=c(0, 0.8),
    ) +
    scale_y_continuous(
        name="Detect-seq_signals of JAK2_DddA11_1",
        limits=c(0, 0.8),
    ) +
    geom_abline(intercept=0, slope=1, linetype="dashed", color="#333333") +
    # Anchor the N label to the top-left panel corner: the former position
    # (x=40, y=9) lies outside the 0-0.8 axis limits, so the label was dropped.
    annotate("text", x=-Inf, y=Inf, hjust=-0.1, vjust=1.5, parse=F, colour="black", label=sprintf("N = %i", dim(df)[1])) +
    ggtitle("ATP8_DddA11_1 v.s. JAK2_DddA11_1") +
    theme_classic() +
    theme(plot.title=element_text(hjust=0.5))

ggsave('2022-10-30_Detect-seq_scatter-plot_ATP8_DddA11_1v.s.JAK2_DddA11_1.pdf', plot=g, width=4.6, height=4)
g

In [None]:
# %%R

# g = ggplot(data=df) +
#     geom_point(
#         mapping=aes(
#             x=ATP8_DddA11_1,
#             y=JAK2_DddA11_1,
#             # alpha=0.99,
#             # size=1,
#             color=color,
#             fill=color,
#             # shape=color,
#             # group=color,
#         )
#     ) +
#     scale_color_manual(values=c("#ff9f1c", "#447B9D", "#e36414")) +
#     scale_x_log10(
#         name="log10_Detect-seq_signals of ATP8_DddA11_1",
#         limits=c(0.01, 1.5),
#         # breaks=c(0.0001, 0.001, 0.01, 0.1, 1),
#     ) +
#     scale_y_log10(
#         name="log10_Detect-seq_signals of JAK2_DddA11_1",
#         limits=c(0.01, 1.5),
#         # breaks=c(0.01, 0.1, 1, 10, 100, 1000),
#     ) +
#     geom_abline(intercept=0, slope=1, linetype="dashed", color="#333333") +
#     annotate("text", x=40, y=9, hjust=0, parse=F, colour="black", label=sprintf("N = %i", dim(df)[1])) +
#     ggtitle("log10_ATP8_DddA11_1 v.s. JAK2_DddA11_1") +
#     theme_classic() +
#     theme(plot.title=element_text(hjust=0.5))

# ggsave('2022-10-30_Detect-seq_scatter-plot_log10_ATP8_DddA11_1v.s.JAK2_DddA11_1.pdf', plot=g, width=4.6, height=4)
# g

### plot ATP8_DddA11_1 vs SIRT6_DddA11_1

In [None]:
# load data in R env
df2r = df_wide_score[['mpmat_index', 'ATP8_DddA11_1', 'SIRT6_DddA11_1']].copy()
df2r['color'] = df2r.apply(map_color, axis=1)
df2r = df2r.query('color.notnull()')
df2r

In [None]:
%%R -i df2r
df = as.data.frame(df2r)
# head(df, 10)

# Scatter plot of the per-region Detect-seq scores of the two samples,
# coloured by the `color` class column of df2r; the dashed line is y = x.
g = ggplot(data=df) +
    geom_point(
        mapping=aes(
            x=ATP8_DddA11_1,
            y=SIRT6_DddA11_1,
            # alpha=0.99,
            # size=1,
            color=color,
            fill=color,
            # shape=color,
            # group=color,
        )
    ) +
    scale_color_manual(values=c("#ff9f1c", "#447B9D", "#e36414")) +
    scale_x_continuous(
        name="Detect-seq_signals of ATP8_DddA11_1",
        limits=c(0, 0.8),
    ) +
    scale_y_continuous(
        name="Detect-seq_signals of SIRT6_DddA11_1",
        limits=c(0, 0.8),
    ) +
    geom_abline(intercept=0, slope=1, linetype="dashed", color="#333333") +
    # Anchor the N label to the top-left panel corner: the former position
    # (x=40, y=9) lies outside the 0-0.8 axis limits, so the label was dropped.
    annotate("text", x=-Inf, y=Inf, hjust=-0.1, vjust=1.5, parse=F, colour="black", label=sprintf("N = %i", dim(df)[1])) +
    ggtitle("ATP8_DddA11_1 v.s. SIRT6_DddA11_1") +
    theme_classic() +
    theme(plot.title=element_text(hjust=0.5))

ggsave('2022-10-30_Detect-seq_scatter-plot_ATP8_DddA11_1v.s.SIRT6_DddA11_1.pdf', plot=g, width=4.6, height=4)
g

In [None]:
# %%R

# g = ggplot(data=df) +
#     geom_point(
#         mapping=aes(
#             x=ATP8_DddA11_1,
#             y=SIRT6_DddA11_1,
#             # alpha=0.99,
#             # size=1,
#             color=color,
#             fill=color,
#             # shape=color,
#             # group=color,
#         )
#     ) +
#     scale_color_manual(values=c("#ff9f1c", "#447B9D", "#e36414")) +
#     scale_x_log10(
#         name="log10_Detect-seq_signals of ATP8_DddA11_1",
#         limits=c(0.01, 1.5),
#         # breaks=c(0.0001, 0.001, 0.01, 0.1, 1),
#     ) +
#     scale_y_log10(
#         name="log10_Detect-seq_signals of SIRT6_DddA11_1",
#         limits=c(0.01, 1.5),
#         # breaks=c(0.01, 0.1, 1, 10, 100, 1000),
#     ) +
#     geom_abline(intercept=0, slope=1, linetype="dashed", color="#333333") +
#     annotate("text", x=40, y=9, hjust=0, parse=F, colour="black", label=sprintf("N = %i", dim(df)[1])) +
#     ggtitle("log10_ATP8_DddA11_1 v.s. SIRT6_DddA11_1") +
#     theme_classic() +
#     theme(plot.title=element_text(hjust=0.5))

# ggsave('2022-10-30_Detect-seq_scatter-plot_log10_ATP8_DddA11_1v.s.SIRT6_DddA11_1.pdf', plot=g, width=4.6, height=4)
# g

### plot JAK2_DddA11_1 vs SIRT6_DddA11_1

In [None]:
# load data in R env
df2r = df_wide_score[['mpmat_index', 'JAK2_DddA11_1', 'SIRT6_DddA11_1']].copy()
df2r['color'] = df2r.apply(map_color, axis=1)
df2r = df2r.query('color.notnull()')
df2r

In [None]:
%%R -i df2r
df = as.data.frame(df2r)
# head(df, 10)

# Scatter plot of the per-region Detect-seq scores of the two samples,
# coloured by the `color` class column of df2r; the dashed line is y = x.
g = ggplot(data=df) +
    geom_point(
        mapping=aes(
            x=JAK2_DddA11_1,
            y=SIRT6_DddA11_1,
            # alpha=0.99,
            # size=1,
            color=color,
            fill=color,
            # shape=color,
            # group=color,
        )
    ) +
    scale_color_manual(values=c("#ff9f1c", "#447B9D", "#e36414")) +
    scale_x_continuous(
        name="Detect-seq_signals of JAK2_DddA11_1",
        limits=c(0, 0.9),
    ) +
    scale_y_continuous(
        name="Detect-seq_signals of SIRT6_DddA11_1",
        limits=c(0, 0.9),
    ) +
    geom_abline(intercept=0, slope=1, linetype="dashed", color="#333333") +
    # Anchor the N label to the top-left panel corner: the former position
    # (x=40, y=9) lies outside the 0-0.9 axis limits, so the label was dropped.
    annotate("text", x=-Inf, y=Inf, hjust=-0.1, vjust=1.5, parse=F, colour="black", label=sprintf("N = %i", dim(df)[1])) +
    ggtitle("JAK2_DddA11_1 v.s. SIRT6_DddA11_1") +
    theme_classic() +
    theme(plot.title=element_text(hjust=0.5))

ggsave('2022-10-30_Detect-seq_scatter-plot_JAK2_DddA11_1v.s.SIRT6_DddA11_1.pdf', plot=g, width=4.6, height=4)
g

In [None]:
# %%R

# g = ggplot(data=df) +
#     geom_point(
#         mapping=aes(
#             x=JAK2_DddA11_1,
#             y=SIRT6_DddA11_1,
#             # alpha=0.99,
#             # size=1,
#             color=color,
#             fill=color,
#             # shape=color,
#             # group=color,
#         )
#     ) +
#     scale_color_manual(values=c("#ff9f1c", "#447B9D", "#e36414")) +
#     scale_x_log10(
#         name="log10_Detect-seq_signals of JAK2_DddA11_1",
#         limits=c(0.01, 1.5),
#         # breaks=c(0.0001, 0.001, 0.01, 0.1, 1),
#     ) +
#     scale_y_log10(
#         name="log10_Detect-seq_signals of SIRT6_DddA11_1",
#         limits=c(0.01, 1.5),
#         # breaks=c(0.01, 0.1, 1, 10, 100, 1000),
#     ) +
#     geom_abline(intercept=0, slope=1, linetype="dashed", color="#333333") +
#     annotate("text", x=40, y=9, hjust=0, parse=F, colour="black", label=sprintf("N = %i", dim(df)[1])) +
#     ggtitle("log10_JAK2_DddA11_1 v.s. SIRT6_DddA11_1") +
#     theme_classic() +
#     theme(plot.title=element_text(hjust=0.5))

# ggsave('2022-10-30_Detect-seq_scatter-plot_log10_JAK2_DddA11_1v.s.SIRT6_DddA11_1.pdf', plot=g, width=4.6, height=4)
# g

### new share list

In [None]:
df_share_old

In [None]:
df_if_share = df_upset.T
df_share_new_id = pd.DataFrame(df_if_share[df_if_share.sum(axis=1).map(lambda x: x >= 2)].index)
df_share_new_id

In [None]:
ls_share_old_id = list(set(df_share_old.mpmat_index.values.tolist()))
ls_share_new_id = list(set(df_share_new_id.mpmat_index.values.tolist()))
print(len(ls_share_new_id))
print(len(ls_share_old_id))
print(len(set(ls_share_old_id + ls_share_new_id)))

print(len(set(ls_share_old_id + ls_share_new_id) - set(ls_share_old_id)))
print(len(set(ls_share_old_id + ls_share_new_id) - set(ls_share_new_id)))

In [None]:
# venn3(
#     subsets,
#     set_labels=('A', 'B', 'C'),
#     set_colors=('r', 'g', 'b'),
#     alpha=0.4,
#     normalize_to=1.0,
#     ax=None,
#     subset_label_formatter=None,
# )

# TODO ?
v = venn2([set(ls_share_new_id), set(ls_share_old_id)], set_labels=("share", "share-reported"))
plt.show()

In [None]:
df_new_share_bed = pd.DataFrame([i.split('_') for i in ls_share_new_id], columns=['chrom', 'start', 'stop'])
df_new_share_bed['start'] = df_new_share_bed['start'].astype(int)
df_new_share_bed['stop'] = df_new_share_bed['stop'].astype(int)
# df_new_share_bed.info()
df_new_share_bed

In [None]:
df_ctcf = pd.read_csv('../bed/ENCFF285QVL_CTCF_binding_sites.bed', sep='\t', header=None)
df_ctcf

In [None]:
bed_ctcf_like = BedTool.from_dataframe(df_new_share_bed)
bed_ctcf_true = BedTool.from_dataframe(df_ctcf.iloc[:, 0:3])

df_bed_to_fix = bed_ctcf_like.intersect(bed_ctcf_true, loj=True).to_dataframe()
df_bed_to_fix.columns = ['chrom', 'start', 'end', 'chrom2', 'start2', 'end2']

df_bed_to_fix

In [None]:
bed_ctcf_true_in_new = df_bed_to_fix.query('end2!=-1')
bed_ctcf_true_in_new

In [None]:
ls_share_new_ctcf_true = (bed_ctcf_true_in_new['chrom'] + '_' + bed_ctcf_true_in_new['start'].astype(str) + '_' + bed_ctcf_true_in_new['end'].astype(str)).tolist()

In [None]:
# Keep only the scores of regions listed in ls_share_new_ctcf_true.
# Series.isin does a hashed membership test instead of scanning the whole
# Python list once per row.
df_boxplot = df_detect_seq_score[
    df_detect_seq_score.mpmat_index.isin(ls_share_new_ctcf_true)
].copy()
df_boxplot.groupby('sample').describe()

In [None]:
df_boxplot['sample'] = df_boxplot['sample'].str.replace('DetectSeq_', '')
df_boxplot = df_boxplot.query('detect_seq_score>=0.001')
df_boxplot

In [None]:
order = [
    'ATP8-DddAwt_REP-1',
    'ATP8-DddA6_REP-1',
    'ATP8-DddA11_REP-1',
    'SIRT6-DddA11_REP-1',
    'JAK2-DddA11_REP-1']

ax = sns.boxplot(data=df_boxplot, x="sample", y="detect_seq_score", order=order)
ax.set_xticklabels(ax.get_xticklabels(), rotation=30)
plt.ylim(0, 0.4)


plt.savefig("2022-11-02_Detect-seq_signal_strength.pdf", dpi=200, bbox_inches='tight')
ax.get_xticklabels(),

## 2022-10-21 co-localization between all CTCF peaks and Detect-seq signals from different DdCBE treatments

# IGV 截图脚本
> https://www.jianshu.com/p/d77f2d34b7fd

In [None]:
df_igv = df_pois[[
    'chr_name', 'region_start', 'region_end', 'bed_name', 'log2_FC_mut'
]]
df_igv.head()

In [None]:
# 理论上这里得到的是所有 samples 的 candidate list
# 经过先 merge mpmat 的处理，再 call 点，应该不存在能 overlap 到一起的 region，只能是样本间 share
# 相同位置或者 not share，不存在 overlap 又不相同的情况了

In [None]:
df_igv.head()

In [None]:
# Randomly sample 300 of the called regions.
# A fixed random_state makes the draw reproducible. The earlier
# `np.random.seed = 2022` was an assignment, not a call: it seeded nothing
# and replaced the np.random.seed function with an int.
df_igv = df_igv.sample(n=300, random_state=2022)
df_igv

In [None]:
# 整理df格式为bed文件格式
print(df_igv)
#        0          1          2
# 0   chr5   69093805   69093830
# 1   chr8   37153384   37153424
# 2  chr15   57559994   57560017
# 3  chr15   68651256   68651277
# 4  chr10  119445511  119445546
# 5   chr8   20184990   20185028
# 6  chr19   45187694   45187712
# 7  chr15   81265992   81266016
# 8   chr2  201232409  201232430
# 9   chr9   98034893   98034930

In [None]:
# Run-specific settings for the IGV batch script

path_out = '/Volumes/Data-a/Bio/3.project/2022_DdCBE-3D-Genome_topic/2022-09-30_Detect-seq_batch-1/igv'
# path_out = '/Volumes/zhaohn_HD/3.project/2022_DdCBE-3D-Genome_topic/2022-09-30_Detect-seq_batch-1/igv'
date = 20221125
format_ = "png"
height = 1500

# Script header: panel height and snapshot output directory
text = f"maxPanelHeight {height}\nsnapshotDirectory {path_out}/off-targets_{date}\n\n"
# print(text)


df_snapshot = df_igv.iloc[:, 0:5]

for index, row_info in df_snapshot.iterrows():
    chrom, start, stop, bed_name, score = row_info

    path_out_png = f'{score}_{bed_name}.snapshot.{format_}'
    middle = int((start + stop) / 2)

    # One stanza per region: a 200 bp window centred on the region midpoint.
    # `collapse` and `snapshot` are two separate IGV batch commands and must
    # sit on their own lines (they were previously fused as "collapsesnapshot").
    text += f"goto {chrom}:{middle - 100}-{middle + 100}\nsort position\ncollapse\nsnapshot {path_out_png}\n\n"
print(text[:1000])


# Writing the script to disk is currently disabled:
# with open(f'{path_out}/off-targets_{date}_snapshot.igv_shot_script', 'wt') as f:
#     f.write(text)

In [None]:
# DetectSeq_JAK2
# DetectSeq_SIRT6
# IND share?

# off-target analysis

## off-target list

### circos plot

#### learn circos plot
- [tutorial1](https://colab.research.google.com/drive/1xmAnv7AHWUTA2HWfjqV1lFWkFMSLJHG0?usp=sharing)
- [tutorial2](https://colab.research.google.com/drive/1RYSo4aXpDIZlSQ9EhO2kPCeF8FOwyvXv?usp=sharing)
- [tutorial3](https://colab.research.google.com/drive/1EPxCQCgOouVxtXcGyxu2ZqQvfucVnOJ-?usp=sharing)
- [tutorial4(Drawing pylogenetic tree)](https://colab.research.google.com/drive/140m2jpQpgSZwSlP-3u3Oj8IcJUbP2NGD?usp=sharing)

In [None]:
# pip install python-circos

In [None]:
# !mkdir -p../ temp_file
# % cd../ temp_file
# !mkdir -p pycircos
# % cd pycircos
# !mkdir -p sample_data
# % cd sample_data

In [None]:
# #The following example data was downloaded from https://venyao.xyz/shinyCircos/.
# !wget https: // github.com / ponnhide / pyCircos-examples / raw / main / example_notebooks / sample_data / example_data_barplot.csv
# !wget https: // github.com / ponnhide / pyCircos-examples / raw / main / example_notebooks / sample_data / example_data_chromosome_cytoband.csv
# !wget https: // github.com / ponnhide / pyCircos-examples / raw / main / example_notebooks / sample_data / example_data_chromosome_general.csv
# !wget https: // github.com / ponnhide / pyCircos-examples / raw / main / example_notebooks / sample_data / example_data_links.csv
# !wget https: // github.com / ponnhide / pyCircos-examples / raw / main / example_notebooks / sample_data / example_data_point.csv
# !wget https: // github.com / ponnhide / pyCircos-examples / raw / main / example_notebooks / sample_data / example_data_rect_gradual.csv

In [None]:
# %cd /Volumes/zhaohn_HD/3.project/2022_DdCBE-3D-Genome_topic/2022-09-30_Detect-seq_batch-1/snakepipes_detect-seq

#### circos plot

##### 准备 data

In [None]:
# Genome .fai index: only the first two columns (sequence name, length) are read.
# The index differs between hg19 and hg38; this path points at the hg38 file.
path_genome_length = '/Users/zhaohuanan/Bio/1.database/db_genomes/genome_fa/genome_ucsc_hg38/genome_ucsc_hg38.fa.fai'
df_genome_length = pd.read_csv(
    path_genome_length, sep='\t', header=None, names=['chrom', 'end'], usecols=[0, 1]
)
# Give every sequence a start coordinate of 1, placed between `chrom` and `end`.
df_genome_length.insert(loc=1, column='start', value=1)

# Keep everything except chrY and chrM.
df_genome_length = df_genome_length.loc[
    ~df_genome_length['chrom'].isin(['chrY', 'chrM'])
]

print(df_genome_length.head(2))

In [None]:
# Cytoband table downloaded from UCSC; it differs between hg19 and hg38 (hg38 here).
# Only the first five columns are read.
df_genome_cytoband = pd.read_csv(
    "http://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/cytoBand.txt.gz",
    sep='\t',
    header=None,
    names=['chrom', 'start', 'end', 'value1', 'value2'],
    usecols=range(5),
)

# Keep only the standard chromosomes: drop fix/alt/random/chrUn contigs,
# and also chrY and chrM.
_chrom_names = df_genome_cytoband['chrom']
_is_dropped = (
    _chrom_names.str.endswith('fix')
    | _chrom_names.str.endswith('alt')
    | _chrom_names.str.endswith('random')
    | _chrom_names.str.startswith('chrUn')
    | _chrom_names.isin(['chrY', 'chrM'])
)
df_genome_cytoband = df_genome_cytoband.loc[~_is_dropped]

print(sorted(df_genome_cytoband.chrom.unique()))
print()
print(df_genome_cytoband.head(2))

In [None]:
# Select the five columns needed for the circos scatter track, give them
# short names, and renumber the rows from 0.
df_circos_point = (
    df.loc[:, ['<sample>', 'chr_name', 'region_start', 'region_end', 'log2_FC_mut']]
    .rename(
        columns={
            '<sample>': 'sample',
            'chr_name': 'chrom',
            'region_start': 'start',
            'region_end': 'end',
            'log2_FC_mut': 'score',
        }
    )
    .reset_index(drop=True)
)
df_circos_point

##### 设置比例尺

In [None]:
# Multiplier applied to every ring radius (raxis_range) computed in the cells below.
scale = 1.1

In [None]:
# Ring index of the chromosome arcs: number of distinct samples plus one,
# so the per-sample rings (indices 1..n) sit inside it.
arc_range_i = len(df_circos_point['sample'].unique()) + 1

# (inner radius, outer radius) of the chromosome ring, 20 units thick before scaling.
_arc_inner_radius = (arc_range_i * 100.0) * scale
_arc_outer_radius = (arc_range_i * 100.0 + 20) * scale
arc_raixs_range = (_arc_inner_radius, _arc_outer_radius)
arc_raixs_range

##### 设置染色体信息

In [None]:
# Set up the chromosomes: one arc per row of df_genome_length.
circle = Gcircle()

for chrom, start, end in df_genome_length.itertuples(index=False, name=None):
    length = end - start + 1
    circle.add_garc(  # register this chromosome on the circle
        Garc(
            arc_id=chrom,  # chromosome name
            size=length,  # chromosome length
            interspace=1,  # gap between neighbouring arcs
            raxis_range=arc_raixs_range,  # inner and outer radius
            labelposition=60,  # smaller = further in, larger = further out
            label_visible=True,  # show the label, i.e. the chromosome name
        )
    )

# Finalise the layout of all arcs added above.
circle.set_garcs()

##### 整理 cytoband 信息到arcdata\_dict

In [None]:
# Colour for each cytoband stain type (the `value2` column of df_genome_cytoband).
color_dict = {
    "gneg": "#FFFFFF00",
    "gpos25": "#EEEEEE",
    "gpos50": "#BBBBBB",
    "gpos75": "#777777",
    "gpos100": "#000000",
    "gvar": "#FFFFFF00",
    "stalk": "#C01E27",
    "acen": "#D82322"
}

# About defaultdict: https://zhuanlan.zhihu.com/p/46476348
# Looking up a missing key returns a default value instead of raising KeyError;
# here the default is an empty dict.
arcdata_dict = collections.defaultdict(dict)  # dict is the factory for missing keys
# or
# arcdata_dict = collections.defaultdict(lambda: {})
# arcdata_dict

# Group band start positions, widths and colours by chromosome.
for idx, row in df_genome_cytoband.iterrows():
    chrom, start, end, value1, value2 = row
    # NOTE(review): if the UCSC start column is 0-based, `end - start` would be
    # the exact width; the original `+ 1` is kept unchanged here -- confirm.
    width = end - start + 1
    if chrom not in arcdata_dict:
        # First band seen for this chromosome: create its three lists.
        arcdata_dict[chrom]['positions'] = []
        arcdata_dict[chrom]['widths'] = []
        arcdata_dict[chrom]['colors'] = []
    # Record every band. Previously these appends sat in an `else` branch,
    # which silently dropped the first band of each chromosome.
    arcdata_dict[chrom]['positions'].append(start)
    arcdata_dict[chrom]['widths'].append(width)
    arcdata_dict[chrom]['colors'].append(color_dict[value2])

print(str(arcdata_dict)[:1000])

##### 将 cytoband 信息加到 circle 对象中去

In [None]:
# Draw each chromosome's cytobands as bars of height 1 on the chromosome ring.
for chrom, bands in arcdata_dict.items():
    band_starts = bands['positions']
    circle.barplot(
        chrom,
        data=[1] * len(band_starts),
        positions=band_starts,
        width=bands['widths'],
        raxis_range=arc_raixs_range,
        facecolor=bands['colors'],
    )

##### 查看绘制的circos plot骨架

In [None]:
# Evaluate the figure as the cell's last expression to preview the plot so far.
# circle.figure?
circle.figure

##### 添加每个样本中的 off-target sites 信息

In [None]:
# TODO
# sample label
# background color
# point color
# edge color of point

# Scatter plot: one ring per sample, numbered from the innermost ring outwards.
counter_circle = 0
# color

for sample, sample_df in df_circos_point.groupby('sample'):
    print(sample, sample_df.shape[0])
    counter_circle += 1

    # All scores of this sample, used for a shared radial axis across chromosomes.
    values_all = []
    # chrom -> {"positions": [...], "values": [...]}
    arcdata_dict = collections.defaultdict(dict)

    for idx, row in sample_df.iterrows():
        _, chrom, start, end, score = row
        middle = (start + end) / 2  # plot each region at its midpoint
        values_all.append(score)

        if chrom not in arcdata_dict:
            # First site seen on this chromosome: create its lists.
            arcdata_dict[chrom]["positions"] = []
            arcdata_dict[chrom]["values"] = []
        # Record every site. Previously these appends sat in an `else` branch,
        # which silently dropped the first site of each chromosome.
        arcdata_dict[chrom]["positions"].append(middle)
        arcdata_dict[chrom]["values"].append(score)

    vmin, vmax = min(values_all), max(values_all)

    # Ring for this sample: 80 units thick before scaling.
    arc_raixs_range_sample = ((counter_circle * 100.0) * scale,
                              (counter_circle * 100.0 + 80) * scale)

    for chrom in arcdata_dict:
        circle.scatterplot(
            chrom,
            data=arcdata_dict[chrom]["values"],
            positions=arcdata_dict[chrom]["positions"],
            # Pad the value range by 5% at both ends.
            rlim=[vmin - 0.05 * abs(vmin), vmax + 0.05 * abs(vmax)],
            markershape='o',
            markersize=1,
            raxis_range=arc_raixs_range_sample,
            facecolor="#468FBE",
            edgecolor="#000000",
            linewidth=0.03,
            spine=True)

circle.figure

In [None]:
# IPython introspection: the trailing `?` shows help for circle.figure (not plain Python).
circle.figure?


### upset plot

## signal strength

### scatter plot

## alignment
### art plot

## editing window
### indel comparison

## ctcf analysis
### shared off-target motif
### DddAwt,6,11 co-localization with ctcf