# Packages

In [1]:
import os
from glob import glob

import pandas as pd
from bioat.lib.libpandas import set_option
from bioat.lib.libpath import HOME
from pybedtools import BedTool

set_option()

INFO  @ 2023-08-28 21:32:08 bioat.lib.libpandas.set_option: set pandas: max_colwidth=40
INFO  @ 2023-08-28 21:32:08 bioat.lib.libpandas.set_option: set pandas: display.width=120
INFO  @ 2023-08-28 21:32:08 bioat.lib.libpandas.set_option: set pandas: display.max_columns=None
INFO  @ 2023-08-28 21:32:08 bioat.lib.libpandas.set_option: set pandas: display.max_rows=50


# classification for TAS-dependent

## get art files

```bash
# 使用TALE的左右两边的序列来align
# 拿到align后的art file
# plot art生成pdf
/gpfs/user/zhaohuanan/3.project/2022_DdCBE-3D-Genome_topic/2022-09-30_Detect-seq_batch-1_ATP8_JAK2_SIRT6/art/plot_art.sh
```

```bash
# for backup
       │ File: plot_art.sh
───────┼─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   1   │ # bipython==1.78!!!
   2   │ 
   3   │ 
   4   │ GENOME=$HOME/1.database/db_genomes/genome_fa/genome_ucsc_hg38/genome_ucsc_hg38.fa
   5   │ K_PARAM=align_total_mismatch,align_degen_total_mismatch,align_total_gap
   6   │ K_R=True,True,True
   7   │ A_PARAM=align_coordinate,align_strand,align_total_mismatch,align_degen_total_mismatch,region_index
   8   │ 
   9   │ 
  10   │ # ATP8
  11   │ # L_TALE=ATTAAACACAAACTAC
  12   │ # R_TALE=ATGGGCTTTGGT
  13   │ # JAK2
  14   │ # L_TALE=CTGAAAAAGACTCTGCA
  15   │ # R_TALE=CCATTTCTGTCATCGTA
  16   │ # SIRT6
  17   │ L_TALE=TACGCGGCGGGGCTGTC
  18   │ R_TALE=CCGGGAGGCCGCACTTG
  19   │ 
  20   │ 
  21   │ 
  22   │ 
  23   │ # for SAMPLE in   DetectSeq_ATP8-DddAwt_REP-1 \
  24   │ #               DetectSeq_ATP8-DddA6_REP-1 \
  25   │ #               DetectSeq_ATP8-DddA11_REP-1;
  26   │ # for SAMPLE in   DetectSeq_JAK2-DddA11_REP-1 \
  27   │ #               DetectSeq_JAK2-DddA11_REP-2;
  28   │ for SAMPLE in   DetectSeq_SIRT6-DddA11_REP-1 \
  29   │                 DetectSeq_SIRT6-DddA11_REP-2;
  30   │     do
  31   │     MPMAT=../final_list_after_igv_check/2023-04-20_${SAMPLE}_final_list.mpmat
  32   │     L_ART=${SAMPLE}_TALE_align.Left.art
  33   │     R_ART=${SAMPLE}_TALE_align.Right.art
  34   │     L_ARTPLOT=Plot_Fig_${SAMPLE}_TALE_align.Left.pdf
  35   │     R_ARTPLOT=Plot_Fig_${SAMPLE}_TALE_align.Right.pdf
  36   │ 
  37   │     python mpmat-to-art-TALE.py -r ${GENOME} -i ${MPMAT} -q ${L_TALE} -m region -e 50 --input_filetype mpmat --input_header False -o ${L_ART}
  38   │     python mpmat-to-art-TALE.py -r ${GENOME} -i ${MPMAT} -q ${R_TALE} -m region -e 50 --input_filetype mpmat --input_header False -o ${R_ART} 
  39   │     python plot-art.py -i ${L_ART} -o ${L_ARTPLOT} --align_seq ${L_TALE} -k ${K_PARAM}  -r ${K_R} -a ${A_PARAM}
  40   │     python plot-art.py -i ${R_ART} -o ${R_ARTPLOT} --align_seq ${R_TALE} -k ${K_PARAM}  -r ${K_R} -a ${A_PARAM}
  41   │ done;
```

## get df_arts

In [2]:
# %% classification for TAS-dependent and TAS-independent off-target
# use information of art files
arts = sorted(glob('../art/DetectSeq_*.art'))

In [3]:
ls_arts = []

for art in arts:
    # print(art)
    df = pd.read_csv(art, sep='\t')
    df['file_info'] = art
    ls_arts.append(df)

df_arts = pd.concat(ls_arts)
df_arts.shape

(30794, 21)

In [4]:
df_arts[['<sample>', 'TAS-Dep_stat']] = (
    df_arts['file_info']
    .str.split('/')
    .str[-1]
    .str.replace('.art', '')
    .str.split('_TALE_align.', expand=True)
)
df_arts

Unnamed: 0,chrom,start,end,region_index,align_chr_name,align_chr_start,align_chr_end,align_strand,align_dist_to_signal,align_N0_base,align_total_match,align_total_mismatch,align_degen_total_match,align_degen_total_mismatch,align_degen_num,align_total_gap,align_score,align_target_seq,align_info_state,align_query_seq,file_info,<sample>,TAS-Dep_stat
0,chr11,28107073,28107246,chr11_28107123_28107196,chr11,28107158,28107173,+,-50,T,9,7,9,7,0,0,27.0,TTTATAAAAACCTTAC,X|||X|X|X|XXX|||,ATTAAACACAAACTAC,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Left
1,chr4,127523422,127523578,chr4_127523472_127523528,chr4,127523489,127523504,+,-32,T,8,8,8,8,0,0,18.0,TTGAAATTTGATCTCC,X|X|||XXXX|X||X|,ATTAAACACAAACTAC,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Left
2,chr10,79996509,79996654,chr10_79996559_79996604,chr10,79996621,79996636,+,-77,C,10,6,10,6,0,0,26.0,CCTTTACAAAAACTCC,XX|XX|||X|||||X|,ATTAAACACAAACTAC,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Left
3,chr20,36808662,36808791,chr20_36808712_36808741,chr20,36808693,36808708,+,4,C,10,6,10,6,0,0,26.0,CTGCAGCTCAAACTCC,X|XX|X|X||||||X|,ATTAAACACAAACTAC,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Left
4,chr9,115952192,115952353,chr9_115952242_115952303,chr9,115952317,115952332,-,14,T,9,7,9,7,0,0,27.0,TTTAAGCCCCACATGC,X||||X|X|X|XX|X|,ATTAAACACAAACTAC,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Left
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9388,chr1,148725196,148725316,chr1_148725246_148725266,chr1,148725219,148725235,+,11,T,4,13,11,6,7,0,27.0,TTAAAAAAACAGAGGTA,XX...|..X|.X|XX|.,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Right
9389,chr8,81158675,81158808,chr8_81158725_81158758,chr8,81158721,81158737,+,21,C,9,8,11,6,2,0,27.0,CCCTGAAAGGGCACTAC,||XX||..XX|||||XX,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Right
9390,chr8,100191766,100191880,chr8_100191816_100191830,chr8,100191813,100191829,-,-17,C,8,9,11,6,3,0,25.0,CACAGAGTTCACCCCTA,|XX.|||XX|.|X|X|.,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Right
9391,chr17,32831928,32832041,chr17_32831978_32831991,chr17,32831942,32831958,+,20,A,9,8,12,5,3,0,34.0,AGGGGAAACTAACCTTG,XX||||..|X.XX||||,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Right


In [5]:
df_arts = df_arts.sort_values(
    by='align_score',
    ascending=False
).drop_duplicates(  # 去除multialignment
    subset=[
        'region_index',  # 信号位点相同
        'align_chr_name',  # TALE align位置相同
        'align_chr_start',
        'align_chr_end',
    ],
    keep='first'
).copy()
df_arts

Unnamed: 0,chrom,start,end,region_index,align_chr_name,align_chr_start,align_chr_end,align_strand,align_dist_to_signal,align_N0_base,align_total_match,align_total_mismatch,align_degen_total_match,align_degen_total_mismatch,align_degen_num,align_total_gap,align_score,align_target_seq,align_info_state,align_query_seq,file_info,<sample>,TAS-Dep_stat
6994,chr19,4182444,4182569,chr19_4182494_4182519,chr19,4182511,4182527,-,17,T,17,0,17,0,0,0,95.0,TACGCGGCGGGGCTGTC,|||||||||||||||||,TACGCGGCGGGGCTGTC,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Left
886,chr9,129203049,129203171,chr9_129203099_129203121,chr9,129203129,129203145,-,8,T,13,4,17,0,4,0,87.0,TACGCGACAGAACTGTC,||||||.|.|..|||||,TACGCGGCGGGGCTGTC,../art/DetectSeq_SIRT6-DddA11_REP-1_...,DetectSeq_SIRT6-DddA11_REP-1,Left
45,chr9,5021948,5022146,chr9_5021998_5022096,chr9,5022005,5022021,-,7,C,17,0,17,0,0,0,85.0,CCATTTCTGTCATCGTA,|||||||||||||||||,CCATTTCTGTCATCGTA,../art/DetectSeq_JAK2-DddA11_REP-1_T...,DetectSeq_JAK2-DddA11_REP-1,Right
559,chr9,5021875,5022047,chr9_5021925_5021997,chr9,5022005,5022021,-,8,C,17,0,17,0,0,0,85.0,CCATTTCTGTCATCGTA,|||||||||||||||||,CCATTTCTGTCATCGTA,../art/DetectSeq_JAK2-DddA11_REP-2_T...,DetectSeq_JAK2-DddA11_REP-2,Right
6994,chr19,4182444,4182569,chr19_4182494_4182519,chr19,4182476,4182492,+,2,C,17,0,17,0,0,0,85.0,CCGGGAGGCCGCACTTG,|||||||||||||||||,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Right
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,chr16,89268826,89268947,chr16_89268876_89268897,chr16,89268919,89268935,-,43,C,8,9,8,9,0,0,4.0,CAGAGCAGGGCAGGGCC,|X||XX|X|X|XXX||X,CTGAAAAAGACTCTGCA,../art/DetectSeq_JAK2-DddA11_REP-2_T...,DetectSeq_JAK2-DddA11_REP-2,Left
534,chr3,47475992,47476103,chr3_47476042_47476053,chr3,47476065,47476081,-,23,C,8,9,8,9,0,0,4.0,CCATACTTCCCTCCGGT,||||XXX|XX|XX||XX,CCATTTCTGTCATCGTA,../art/DetectSeq_JAK2-DddA11_REP-1_T...,DetectSeq_JAK2-DddA11_REP-1,Right
208,chr6,30556597,30556713,chr6_30556647_30556663,chr6,30556692,30556708,-,29,C,8,9,8,9,0,0,4.0,CCACTAAGGTCAAGCCC,|||X|XXX||||XXXXX,CCATTTCTGTCATCGTA,../art/DetectSeq_JAK2-DddA11_REP-1_T...,DetectSeq_JAK2-DddA11_REP-1,Right
75,chr15,68651206,68651327,chr15_68651256_68651277,chr15,68651272,68651287,-,16,G,7,9,7,9,0,0,-1.0,GCCCAGCCCTGACCAC,XXXX|X|X|XX||X||,ATTAAACACAAACTAC,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Left


In [6]:
df_arts.loc[df_arts['region_index'] == "chr19_4182494_4182519", 'TAS-Dep_stat'] = 'on-target: SIRT6'
df_arts.loc[df_arts['region_index'] == "chr9_5021998_5022096", 'TAS-Dep_stat'] = 'on-target: JAK2'
# ATP8 backup if using in future
# bowtie -x $HOME/1.database/db_genomes/genome_fa/genome_ucsc_hg38/genome_ucsc_hg38.fa.bowtie1_index -f -1 test_R1.fa -2 test_R2.fa
# 1/1     +       chrM    8448    ATTAAACACAAACTAC        IIIIIIIIIIIIIIII        3
# 1/2     -       chrM    8478    ACCAAAGCCCAT    IIIIIIIIIIII    3

In [7]:
df_arts.query('region_index=="chr19_4182494_4182519"')

Unnamed: 0,chrom,start,end,region_index,align_chr_name,align_chr_start,align_chr_end,align_strand,align_dist_to_signal,align_N0_base,align_total_match,align_total_mismatch,align_degen_total_match,align_degen_total_mismatch,align_degen_num,align_total_gap,align_score,align_target_seq,align_info_state,align_query_seq,file_info,<sample>,TAS-Dep_stat
6994,chr19,4182444,4182569,chr19_4182494_4182519,chr19,4182511,4182527,-,17,T,17,0,17,0,0,0,95.0,TACGCGGCGGGGCTGTC,|||||||||||||||||,TACGCGGCGGGGCTGTC,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,on-target: SIRT6
6994,chr19,4182444,4182569,chr19_4182494_4182519,chr19,4182476,4182492,+,2,C,17,0,17,0,0,0,85.0,CCGGGAGGCCGCACTTG,|||||||||||||||||,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,on-target: SIRT6


In [8]:
df_arts.query('region_index=="chr9_5021998_5022096"')

Unnamed: 0,chrom,start,end,region_index,align_chr_name,align_chr_start,align_chr_end,align_strand,align_dist_to_signal,align_N0_base,align_total_match,align_total_mismatch,align_degen_total_match,align_degen_total_mismatch,align_degen_num,align_total_gap,align_score,align_target_seq,align_info_state,align_query_seq,file_info,<sample>,TAS-Dep_stat
45,chr9,5021948,5022146,chr9_5021998_5022096,chr9,5022005,5022021,-,7,C,17,0,17,0,0,0,85.0,CCATTTCTGTCATCGTA,|||||||||||||||||,CCATTTCTGTCATCGTA,../art/DetectSeq_JAK2-DddA11_REP-1_T...,DetectSeq_JAK2-DddA11_REP-1,on-target: JAK2
45,chr9,5021948,5022146,chr9_5021998_5022096,chr9,5021972,5021988,+,10,C,17,0,17,0,0,0,85.0,CTGAAAAAGACTCTGCA,|||||||||||||||||,CTGAAAAAGACTCTGCA,../art/DetectSeq_JAK2-DddA11_REP-1_T...,DetectSeq_JAK2-DddA11_REP-1,on-target: JAK2


In [9]:
# 没有chrM的off？
# 以后再看看是不是系统性的丢了？
df_arts.sort_values(by='align_score', ascending=False).query('align_chr_name=="chrM"')

Unnamed: 0,chrom,start,end,region_index,align_chr_name,align_chr_start,align_chr_end,align_strand,align_dist_to_signal,align_N0_base,align_total_match,align_total_mismatch,align_degen_total_match,align_degen_total_mismatch,align_degen_num,align_total_gap,align_score,align_target_seq,align_info_state,align_query_seq,file_info,<sample>,TAS-Dep_stat


In [10]:
df_arts['id'] = range(df_arts.shape[0])
df_arts

Unnamed: 0,chrom,start,end,region_index,align_chr_name,align_chr_start,align_chr_end,align_strand,align_dist_to_signal,align_N0_base,align_total_match,align_total_mismatch,align_degen_total_match,align_degen_total_mismatch,align_degen_num,align_total_gap,align_score,align_target_seq,align_info_state,align_query_seq,file_info,<sample>,TAS-Dep_stat,id
6994,chr19,4182444,4182569,chr19_4182494_4182519,chr19,4182511,4182527,-,17,T,17,0,17,0,0,0,95.0,TACGCGGCGGGGCTGTC,|||||||||||||||||,TACGCGGCGGGGCTGTC,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,on-target: SIRT6,0
886,chr9,129203049,129203171,chr9_129203099_129203121,chr9,129203129,129203145,-,8,T,13,4,17,0,4,0,87.0,TACGCGACAGAACTGTC,||||||.|.|..|||||,TACGCGGCGGGGCTGTC,../art/DetectSeq_SIRT6-DddA11_REP-1_...,DetectSeq_SIRT6-DddA11_REP-1,Left,1
45,chr9,5021948,5022146,chr9_5021998_5022096,chr9,5022005,5022021,-,7,C,17,0,17,0,0,0,85.0,CCATTTCTGTCATCGTA,|||||||||||||||||,CCATTTCTGTCATCGTA,../art/DetectSeq_JAK2-DddA11_REP-1_T...,DetectSeq_JAK2-DddA11_REP-1,on-target: JAK2,2
559,chr9,5021875,5022047,chr9_5021925_5021997,chr9,5022005,5022021,-,8,C,17,0,17,0,0,0,85.0,CCATTTCTGTCATCGTA,|||||||||||||||||,CCATTTCTGTCATCGTA,../art/DetectSeq_JAK2-DddA11_REP-2_T...,DetectSeq_JAK2-DddA11_REP-2,Right,3
6994,chr19,4182444,4182569,chr19_4182494_4182519,chr19,4182476,4182492,+,2,C,17,0,17,0,0,0,85.0,CCGGGAGGCCGCACTTG,|||||||||||||||||,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,on-target: SIRT6,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,chr16,89268826,89268947,chr16_89268876_89268897,chr16,89268919,89268935,-,43,C,8,9,8,9,0,0,4.0,CAGAGCAGGGCAGGGCC,|X||XX|X|X|XXX||X,CTGAAAAAGACTCTGCA,../art/DetectSeq_JAK2-DddA11_REP-2_T...,DetectSeq_JAK2-DddA11_REP-2,Left,27132
534,chr3,47475992,47476103,chr3_47476042_47476053,chr3,47476065,47476081,-,23,C,8,9,8,9,0,0,4.0,CCATACTTCCCTCCGGT,||||XXX|XX|XX||XX,CCATTTCTGTCATCGTA,../art/DetectSeq_JAK2-DddA11_REP-1_T...,DetectSeq_JAK2-DddA11_REP-1,Right,27133
208,chr6,30556597,30556713,chr6_30556647_30556663,chr6,30556692,30556708,-,29,C,8,9,8,9,0,0,4.0,CCACTAAGGTCAAGCCC,|||X|XXX||||XXXXX,CCATTTCTGTCATCGTA,../art/DetectSeq_JAK2-DddA11_REP-1_T...,DetectSeq_JAK2-DddA11_REP-1,Right,27134
75,chr15,68651206,68651327,chr15_68651256_68651277,chr15,68651272,68651287,-,16,G,7,9,7,9,0,0,-1.0,GCCCAGCCCTGACCAC,XXXX|X|X|XX||X||,ATTAAACACAAACTAC,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Left,27135


### save df_arts

In [11]:
df_arts.to_csv('../art_seqinfo/df_arts.csv', index=False)

In [12]:
df_ctcf = pd.read_csv(f'{HOME}/1.database/public_data/CTCF_ATAC_data/ENCFF285QVL_CTCF_binding_sites_fix_range.bed',
                      sep='\t')
df_ctcf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38035 entries, 0 to 38034
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   chrom   38035 non-null  object 
 1   start   38035 non-null  int64  
 2   end     38035 non-null  int64  
 3   name    38035 non-null  object 
 4   RPKM    38035 non-null  float64
 5   strand  38035 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 1.7+ MB


## TAS-dependent

<!-- ### 检查是否有和CTCF overlap的（潜在的IND）[先不做] -->

In [13]:
# df_coor_all_region = df_arts['region_index'].str.split('_', expand=True)
# df_coor_all_region[[1, 2]] = df_coor_all_region[[1, 2]].astype(int)
# df_coor_all_region.info()

In [14]:
# df_coor_all_region.columns = ['chrom', 'start', 'end']
# df_coor_all_region

In [15]:
# coor_all_region_bed = BedTool.from_dataframe(df_coor_all_region)
# coor_all_region_bed

In [16]:
# coor_ctcf_bed = BedTool.from_dataframe(df_ctcf.iloc[:, :3])
# coor_ctcf_bed.to_dataframe()

In [17]:
# df_coor_intersect = coor_all_region_bed.intersect(coor_ctcf_bed, loj=True).to_dataframe()
# df_coor_intersect.columns = ['chrom1', 'start1', 'end1', 'chrom2', 'start2', 'end2']
# df_coor_intersect

In [18]:
# # 发现有不少和CTCF binding site有overlap的点，归为IND？暂时归为IND，进一步确认需要实验验证了
# df_coor_intersect_indep = df_coor_intersect.query('start2!=-1').drop_duplicates(subset=['chrom1', 'start1', 'end1'])
# df_coor_intersect_indep

In [19]:
# index_indep = df_coor_intersect_indep.apply(lambda x: '_'.join(x[:3].astype(str)), axis=1).tolist()
# index_indep[:10]

In [20]:
# # 把定义为IND的Dep去除
# df_arts_rm_ind = df_arts.query('region_index not in @index_indep').copy()
# df_arts_rm_ind

In [21]:
# df_arts_rm_ind

In [22]:
# # test align length?
# tmpdf = df_arts_rm_ind.query('~`TAS-Dep_stat`.str.contains("on-target")').copy()
# tmpdf['align_query_seq'] = tmpdf['align_query_seq'].str.replace('-', '')
# print(tmpdf.groupby(['<sample>', 'TAS-Dep_stat'])['align_query_seq'].unique().str[0].reset_index().to_markdown())

### 硬cutoff卡, 初步确认TAS dep

In [23]:
# df_arts_tas_dep = (
#     df_arts_rm_ind.sort_values(by=['align_total_mismatch', 'align_degen_total_mismatch', 'align_total_gap'])
#     .query('align_total_mismatch<=4')
#     .query('align_degen_total_mismatch<=4')
#     .query('align_total_gap<=2')
# ).copy()
# df_arts_tas_dep

df_arts_tas_dep = (
    df_arts.sort_values(by=['align_total_mismatch', 'align_degen_total_mismatch', 'align_total_gap'])
    .query('align_total_mismatch<=4')
    .query('align_degen_total_mismatch<=4')
    .query('align_total_gap<=2')
).copy()
df_arts_tas_dep

Unnamed: 0,chrom,start,end,region_index,align_chr_name,align_chr_start,align_chr_end,align_strand,align_dist_to_signal,align_N0_base,align_total_match,align_total_mismatch,align_degen_total_match,align_degen_total_mismatch,align_degen_num,align_total_gap,align_score,align_target_seq,align_info_state,align_query_seq,file_info,<sample>,TAS-Dep_stat,id
6994,chr19,4182444,4182569,chr19_4182494_4182519,chr19,4182511,4182527,-,17,T,17,0,17,0,0,0,95.0,TACGCGGCGGGGCTGTC,|||||||||||||||||,TACGCGGCGGGGCTGTC,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,on-target: SIRT6,0
45,chr9,5021948,5022146,chr9_5021998_5022096,chr9,5022005,5022021,-,7,C,17,0,17,0,0,0,85.0,CCATTTCTGTCATCGTA,|||||||||||||||||,CCATTTCTGTCATCGTA,../art/DetectSeq_JAK2-DddA11_REP-1_T...,DetectSeq_JAK2-DddA11_REP-1,on-target: JAK2,2
559,chr9,5021875,5022047,chr9_5021925_5021997,chr9,5022005,5022021,-,8,C,17,0,17,0,0,0,85.0,CCATTTCTGTCATCGTA,|||||||||||||||||,CCATTTCTGTCATCGTA,../art/DetectSeq_JAK2-DddA11_REP-2_T...,DetectSeq_JAK2-DddA11_REP-2,Right,3
6994,chr19,4182444,4182569,chr19_4182494_4182519,chr19,4182476,4182492,+,2,C,17,0,17,0,0,0,85.0,CCGGGAGGCCGCACTTG,|||||||||||||||||,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,on-target: SIRT6,4
559,chr9,5021875,5022047,chr9_5021925_5021997,chr9,5021972,5021988,+,9,C,17,0,17,0,0,0,85.0,CTGAAAAAGACTCTGCA,|||||||||||||||||,CTGAAAAAGACTCTGCA,../art/DetectSeq_JAK2-DddA11_REP-2_T...,DetectSeq_JAK2-DddA11_REP-2,Left,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,chr9,125262233,125262356,chr9_125262283_125262306,chr9,125262308,125262323,-,2,C,12,4,12,4,0,1,20.0,CCATTTC-GTTCTCGCC,|||||||-||XX|||XX,CCATTTCTGTCATCGTA,../art/DetectSeq_JAK2-DddA11_REP-2_T...,DetectSeq_JAK2-DddA11_REP-2,Right,26386
688,chr2,110053150,110053342,chr2_110053200_110053292,chr2,110053260,110053276,-,-32,A,12,4,12,4,0,1,20.0,ATAATAACAGACACTTC,||X|-||||X|X|||X|,ATTA-AACACAAACTAC,../art/DetectSeq_ATP8-DddA6_REP-1_TA...,DetectSeq_ATP8-DddA6_REP-1,Left,26388
139,chr2,117786821,117786966,chr2_117786871_117786916,chr2,117786875,117786891,+,25,A,12,4,12,4,0,1,20.0,ATTACAATTCAGATTAC,||||-||XX||X|X|||,ATTA-AACACAAACTAC,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Left,26430
606,chr14,49865505,49865617,chr14_49865555_49865567,chr14,49865563,49865578,-,-4,C,12,4,12,4,0,1,20.0,CTGCAGTGGA-TCTGCA,|||X|XXX||-||||||,CTGAAAAAGACTCTGCA,../art/DetectSeq_JAK2-DddA11_REP-1_T...,DetectSeq_JAK2-DddA11_REP-1,Left,26454


In [24]:
# Fix TAS-Dep两边的！
ls_df_arts_tas_dep_fix = []

for sample, _df in df_arts_tas_dep.groupby('<sample>'):
    print(sample)
    print(_df.region_index.duplicated().any())
    # True代表既是左边依赖又是右边依赖 ！！！！！！！ # 在不做实验的情况下
    # DetectSeq_ATP8-DddA11_REP-1
    # True
    # DetectSeq_ATP8-DddA6_REP-1
    # True
    # DetectSeq_ATP8-DddAwt_REP-1
    # False
    # DetectSeq_JAK2-DddA11_REP-1
    # True
    # DetectSeq_JAK2-DddA11_REP-2
    # True
    # DetectSeq_SIRT6-DddA11_REP-1
    # False
    # DetectSeq_SIRT6-DddA11_REP-2
    # True
    ls_dup_index = _df[_df.region_index.duplicated()].query(
        '~`TAS-Dep_stat`.str.contains("on-target")').region_index.tolist()
    _df.loc[_df.region_index.map(lambda x: x in ls_dup_index), 'TAS-Dep_stat'] = 'Both'
    ls_df_arts_tas_dep_fix.append(_df)
df_arts_tas_dep_fix = pd.concat(ls_df_arts_tas_dep_fix)
df_arts_tas_dep_fix

DetectSeq_ATP8-DddA11_REP-1
True
DetectSeq_ATP8-DddA6_REP-1
True
DetectSeq_ATP8-DddAwt_REP-1
False
DetectSeq_JAK2-DddA11_REP-1
True
DetectSeq_JAK2-DddA11_REP-2
True
DetectSeq_SIRT6-DddA11_REP-1
False
DetectSeq_SIRT6-DddA11_REP-2
True


Unnamed: 0,chrom,start,end,region_index,align_chr_name,align_chr_start,align_chr_end,align_strand,align_dist_to_signal,align_N0_base,align_total_match,align_total_mismatch,align_degen_total_match,align_degen_total_mismatch,align_degen_num,align_total_gap,align_score,align_target_seq,align_info_state,align_query_seq,file_info,<sample>,TAS-Dep_stat,id
209,chr10,123171455,123171602,chr10_123171505_123171552,chr10,123171536,123171547,+,5,A,12,0,12,0,0,0,60.0,ATGGGCTTTGGT,||||||||||||,ATGGGCTTTGGT,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Right,1159
149,chr9,109215639,109215812,chr9_109215689_109215762,chr9,109215772,109215783,-,10,A,11,1,12,0,1,0,58.0,ATGGGCTTTGAT,||||||||||.|,ATGGGCTTTGGT,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Right,1375
108,chr3,37937199,37937325,chr3_37937249_37937275,chr3,37937253,37937264,-,4,A,11,1,12,0,1,0,58.0,ATGAGCTTTGGT,|||.||||||||,ATGGGCTTTGGT,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Right,1456
157,chr1,170722730,170722881,chr1_170722780_170722831,chr1,170722814,170722825,+,6,A,11,1,12,0,1,0,58.0,ATGGGCTTTGAT,||||||||||.|,ATGGGCTTTGGT,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Right,1466
30,chr11,15381350,15381505,chr11_15381400_15381455,chr11,15381405,15381416,-,5,A,11,1,11,1,0,0,51.0,ATGGGCTTTGGA,|||||||||||X,ATGGGCTTTGGT,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Right,3526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1765,chr6,6004554,6004714,chr6_6004604_6004664,chr6,6004663,6004679,+,-75,C,13,4,13,4,0,0,49.0,CCGGGAGGCTGCGCGCG,|||||||||X||X|XX|,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Right,4852
6789,chr17,82875381,82875521,chr17_82875431_82875471,chr17,82875434,82875451,+,-20,T,13,4,13,4,0,1,35.0,TACGCTGCTGCTGCTGCC,|||||X||-|XX||||X|,TACGCGGC-GGGGCTGTC,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Left,15133
6929,chr1,202189067,202189207,chr1_202189117_202189157,chr1,202189080,202189097,-,-77,T,13,4,13,4,0,1,35.0,TACCCGGGTGGTGCTGCC,|||X|-||X||X||||X|,TACGC-GGCGGGGCTGTC,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Left,15201
2173,chr17,62656947,62657070,chr17_62656997_62657020,chr17,62656948,62656965,-,-49,T,13,4,13,4,0,1,35.0,TGCTAGGCTGGGGCTGCC,|X|XX|||-|||||||X|,TACGCGGC-GGGGCTGTC,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Left,15936


In [25]:
# TAS-dep
for sample, df in df_arts_tas_dep_fix.query('~`TAS-Dep_stat`.str.contains("on-target")').groupby(
        ['<sample>', 'TAS-Dep_stat']):
    sample, tas_dep_stat = sample
    print('查看是否有duplication', 'tas_dep_stat =', tas_dep_stat, df.duplicated(subset='region_index').sum())
# 查看是否有duplication tas_dep_stat = Both 5
# 查看是否有duplication tas_dep_stat = Left 0
# 查看是否有duplication tas_dep_stat = Right 0
# 查看是否有duplication tas_dep_stat = Both 19
# 查看是否有duplication tas_dep_stat = Left 0
# 查看是否有duplication tas_dep_stat = Right 0
# 查看是否有duplication tas_dep_stat = Left 0
# 查看是否有duplication tas_dep_stat = Right 0
# 查看是否有duplication tas_dep_stat = Both 1
# 查看是否有duplication tas_dep_stat = Left 0
# 查看是否有duplication tas_dep_stat = Right 0
# 查看是否有duplication tas_dep_stat = Both 9
# 查看是否有duplication tas_dep_stat = Left 0
# 查看是否有duplication tas_dep_stat = Right 0
# 查看是否有duplication tas_dep_stat = Left 0
# 查看是否有duplication tas_dep_stat = Right 0
# 查看是否有duplication tas_dep_stat = Both 4
# 查看是否有duplication tas_dep_stat = Left 0
# 查看是否有duplication tas_dep_stat = Right 0

查看是否有duplication tas_dep_stat = Both 6
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = Both 20
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = Both 1
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = Both 9
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = Both 5
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Right 0


In [26]:
# TAS-dep
ls_df_arts_tas_dep_fix = []

for sample, df in df_arts_tas_dep_fix.groupby(['<sample>', 'TAS-Dep_stat']):
    sample, tas_dep_stat = sample
    # print('查看是否有duplication', 'tas_dep_stat =', tas_dep_stat, df.duplicated(subset='region_index').sum())
    # if tas_dep_stat == 'Both' or 'on-target' in tas_dep_stat:
    # df = df.sort_values(by=['region_index', 'align_score'], ascending=[True, False]).drop_duplicates(subset=['region_index', '<sample>'], keep='first')
    df = df.sort_values(by=['region_index', 'align_score'], ascending=[True, False]).drop_duplicates(
        subset=['region_index', '<sample>'], keep='first')
    ls_df_arts_tas_dep_fix.append(df)
df_arts_tas_dep_fix = pd.concat(ls_df_arts_tas_dep_fix)

for sample, df in df_arts_tas_dep_fix.groupby(['<sample>', 'TAS-Dep_stat']):
    sample, tas_dep_stat = sample
    print('查看是否有duplication', 'tas_dep_stat =', tas_dep_stat, df.duplicated(subset='region_index').sum())
# 查看是否有duplication tas_dep_stat = Both 0
# 查看是否有duplication tas_dep_stat = Left 0
# 查看是否有duplication tas_dep_stat = Right 0
# 查看是否有duplication tas_dep_stat = Both 0
# 查看是否有duplication tas_dep_stat = Left 0
# 查看是否有duplication tas_dep_stat = Right 0
# 查看是否有duplication tas_dep_stat = Left 0
# 查看是否有duplication tas_dep_stat = Right 0
# 查看是否有duplication tas_dep_stat = Both 0
# 查看是否有duplication tas_dep_stat = Left 0
# 查看是否有duplication tas_dep_stat = Right 0
# 查看是否有duplication tas_dep_stat = Both 0
# 查看是否有duplication tas_dep_stat = Left 0
# 查看是否有duplication tas_dep_stat = Right 0
# 查看是否有duplication tas_dep_stat = Left 0
# 查看是否有duplication tas_dep_stat = Right 0
# 查看是否有duplication tas_dep_stat = Both 0
# 查看是否有duplication tas_dep_stat = Left 0
# 查看是否有duplication tas_dep_stat = Right 0
df_arts_tas_dep_fix

查看是否有duplication tas_dep_stat = Both 0
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = Both 0
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = Both 0
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = on-target: JAK2 0
查看是否有duplication tas_dep_stat = Both 0
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = Both 0
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = on-target: SIRT6 0


Unnamed: 0,chrom,start,end,region_index,align_chr_name,align_chr_start,align_chr_end,align_strand,align_dist_to_signal,align_N0_base,align_total_match,align_total_mismatch,align_degen_total_match,align_degen_total_mismatch,align_degen_num,align_total_gap,align_score,align_target_seq,align_info_state,align_query_seq,file_info,<sample>,TAS-Dep_stat,id
53,chr11,113102877,113103036,chr11_113102927_113102986,chr11,113102935,113102950,+,36,A,13,3,13,3,0,0,53.0,ATTAGACACAAAATAA,||||X|||||||X||X,ATTAAACACAAACTAC,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Both,2645
211,chr12,51595410,51595573,chr12_51595460_51595523,chr12,51595507,51595518,+,5,A,11,1,11,1,0,0,51.0,ATGGGCTTTGGA,|||||||||||X,ATGGGCTTTGGT,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Both,3698
33,chr18,44726511,44726657,chr18_44726561_44726607,chr18,44726566,44726577,-,5,A,8,4,11,1,3,0,45.0,ATGAACTTTGAA,|||..|||||.X,ATGGGCTTTGGT,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Both,6384
61,chr19,45742345,45742506,chr19_45742395_45742456,chr19,45742480,45742495,+,-39,T,12,4,12,4,0,0,54.0,TTTAAAAACAAACAAG,X|||||X||||||X|X,ATTAAACACAAACTAC,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Both,2380
139,chr2,117786821,117786966,chr2_117786871_117786916,chr2,117786900,117786911,+,5,A,10,2,11,1,1,0,49.0,ATGAGCTTTGGG,|||.|||||||X,ATGGGCTTTGGT,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Both,4411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7521,chr9,83274918,83275045,chr9_83274968_83274995,chr9,83274955,83274971,-,-13,C,13,4,15,2,2,0,63.0,CCTGGAAGCCACCCTTG,||X|||.|||.|X||||,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Right,605
2734,chr9,91161331,91161450,chr9_91161381_91161400,chr9,91161422,91161438,-,41,C,13,4,14,3,1,0,56.0,CCAGGAGGACGCACAGG,||.|||||X|||||XX|,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Right,1814
5852,chrX,138128161,138128287,chrX_138128211_138128237,chrX,138128167,138128183,+,54,C,13,4,13,4,0,0,49.0,CAGGGAGGGTGTACTTG,|X||||||XX|X|||||,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Right,4555
8878,chrX,40103635,40103766,chrX_40103685_40103716,chrX,40103660,40103676,+,9,C,13,4,14,3,1,0,56.0,CCGGGAGCCCACATTTT,|||||||X||.||X||X,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Right,1863


In [27]:
print('JAK2-----')
print(df_arts_tas_dep_fix.query('`<sample>`=="DetectSeq_JAK2-DddA11_REP-1"').query(
    '`TAS-Dep_stat`=="Left"').region_index.duplicated().any())
print(df_arts_tas_dep_fix.query('`<sample>`=="DetectSeq_JAK2-DddA11_REP-2"').query(
    '`TAS-Dep_stat`=="Left"').region_index.duplicated().any())
print(df_arts_tas_dep_fix.query('`<sample>`=="DetectSeq_JAK2-DddA11_REP-1"').query(
    '`TAS-Dep_stat`=="Right"').region_index.duplicated().any())
print(df_arts_tas_dep_fix.query('`<sample>`=="DetectSeq_JAK2-DddA11_REP-2"').query(
    '`TAS-Dep_stat`=="Right"').region_index.duplicated().any())

print(df_arts_tas_dep_fix.query('`<sample>`=="DetectSeq_JAK2-DddA11_REP-1"').query('`TAS-Dep_stat`!="Both"').query(
    '~`TAS-Dep_stat`.str.contains("on-target")').region_index.duplicated().any())
print(df_arts_tas_dep_fix.query('`<sample>`=="DetectSeq_JAK2-DddA11_REP-2"').query('`TAS-Dep_stat`!="Both"').query(
    '~`TAS-Dep_stat`.str.contains("on-target")').region_index.duplicated().any())

print(df_arts_tas_dep_fix.query('`<sample>`=="DetectSeq_JAK2-DddA11_REP-1"').region_index.duplicated().any())
print(df_arts_tas_dep_fix.query('`<sample>`=="DetectSeq_JAK2-DddA11_REP-2"').region_index.duplicated().any())
print('SIRT6-----')
print(df_arts_tas_dep_fix.query('`<sample>`=="DetectSeq_SIRT6-DddA11_REP-1"').query(
    '`TAS-Dep_stat`=="Left"').region_index.duplicated().any())
print(df_arts_tas_dep_fix.query('`<sample>`=="DetectSeq_SIRT6-DddA11_REP-2"').query(
    '`TAS-Dep_stat`=="Left"').region_index.duplicated().any())
print(df_arts_tas_dep_fix.query('`<sample>`=="DetectSeq_SIRT6-DddA11_REP-1"').query(
    '`TAS-Dep_stat`=="Right"').region_index.duplicated().any())
print(df_arts_tas_dep_fix.query('`<sample>`=="DetectSeq_SIRT6-DddA11_REP-2"').query(
    '`TAS-Dep_stat`=="Right"').region_index.duplicated().any())

print(df_arts_tas_dep_fix.query('`<sample>`=="DetectSeq_SIRT6-DddA11_REP-1"').query('`TAS-Dep_stat`!="Both"').query(
    '~`TAS-Dep_stat`.str.contains("on-target")').region_index.duplicated().any())
print(df_arts_tas_dep_fix.query('`<sample>`=="DetectSeq_SIRT6-DddA11_REP-2"').query('`TAS-Dep_stat`!="Both"').query(
    '~`TAS-Dep_stat`.str.contains("on-target")').region_index.duplicated().any())

print(df_arts_tas_dep_fix.query('`<sample>`=="DetectSeq_SIRT6-DddA11_REP-1"').region_index.duplicated().any())
print(df_arts_tas_dep_fix.query('`<sample>`=="DetectSeq_SIRT6-DddA11_REP-2"').region_index.duplicated().any())
# JAK2-----
# False
# False
# False
# False
# False
# False
# False
# False
# SIRT6-----
# False
# False
# False
# False
# False
# False
# False
# False
# 符合预期，dup都是Both和on-target造成的

JAK2-----
False
False
False
False
False
False
False
False
SIRT6-----
False
False
False
False
False
False
False
False


In [28]:
df_arts_tas_dep_fix.groupby(['<sample>', 'TAS-Dep_stat'])['TAS-Dep_stat'].count()

<sample>                      TAS-Dep_stat    
DetectSeq_ATP8-DddA11_REP-1   Both                  6
                              Left                  1
                              Right                99
DetectSeq_ATP8-DddA6_REP-1    Both                 20
                              Left                 14
                              Right               454
DetectSeq_ATP8-DddAwt_REP-1   Left                  1
                              Right                 7
DetectSeq_JAK2-DddA11_REP-1   Both                  1
                              Left                 22
                              Right                11
                              on-target: JAK2       1
DetectSeq_JAK2-DddA11_REP-2   Both                  9
                              Left                 19
                              Right               162
DetectSeq_SIRT6-DddA11_REP-1  Left                 44
                              Right                28
DetectSeq_SIRT6-DddA11_REP-2  Both 

In [29]:
tmp = df_arts_tas_dep_fix.query('~`TAS-Dep_stat`.str.contains("on-target")').groupby(['<sample>', 'TAS-Dep_stat'])[
    'chrom'].count().to_frame().reset_index()
tmp_on = df_arts_tas_dep_fix.query('`TAS-Dep_stat`.str.contains("on-target")').groupby(['<sample>', 'TAS-Dep_stat'])[
    'chrom'].count().to_frame().reset_index()
tmp['<sample>'] = tmp['<sample>'].str.replace('DetectSeq_', '')
tmp.columns = ['<sample>', 'TAS-Dep_stat', 'count']
tmp_on.columns = ['<sample>', 'TAS-Dep_stat', 'count']
print(df_arts_tas_dep_fix.shape[0], 'before_rm_on-target')
print(tmp['count'].sum(), 'after_rm_on-target')
print(tmp.sort_values(by=['<sample>', 'TAS-Dep_stat']).reset_index(drop=True).to_markdown())
print(tmp_on.to_markdown())

1152 before_rm_on-target
1150 after_rm_on-target
|    | <sample>           | TAS-Dep_stat   |   count |
|---:|:-------------------|:---------------|--------:|
|  0 | ATP8-DddA11_REP-1  | Both           |       6 |
|  1 | ATP8-DddA11_REP-1  | Left           |       1 |
|  2 | ATP8-DddA11_REP-1  | Right          |      99 |
|  3 | ATP8-DddA6_REP-1   | Both           |      20 |
|  4 | ATP8-DddA6_REP-1   | Left           |      14 |
|  5 | ATP8-DddA6_REP-1   | Right          |     454 |
|  6 | ATP8-DddAwt_REP-1  | Left           |       1 |
|  7 | ATP8-DddAwt_REP-1  | Right          |       7 |
|  8 | JAK2-DddA11_REP-1  | Both           |       1 |
|  9 | JAK2-DddA11_REP-1  | Left           |      22 |
| 10 | JAK2-DddA11_REP-1  | Right          |      11 |
| 11 | JAK2-DddA11_REP-2  | Both           |       9 |
| 12 | JAK2-DddA11_REP-2  | Left           |      19 |
| 13 | JAK2-DddA11_REP-2  | Right          |     162 |
| 14 | SIRT6-DddA11_REP-1 | Left           |      44 |
| 15 | SIRT6-Ddd

### Final TAS-dependent off-target

In [30]:
# 最终TAS Dep list
df_arts_tas_dep = df_arts_tas_dep_fix.query('~`TAS-Dep_stat`.str.contains("on-target")').copy()
df_arts_tas_dep

Unnamed: 0,chrom,start,end,region_index,align_chr_name,align_chr_start,align_chr_end,align_strand,align_dist_to_signal,align_N0_base,align_total_match,align_total_mismatch,align_degen_total_match,align_degen_total_mismatch,align_degen_num,align_total_gap,align_score,align_target_seq,align_info_state,align_query_seq,file_info,<sample>,TAS-Dep_stat,id
53,chr11,113102877,113103036,chr11_113102927_113102986,chr11,113102935,113102950,+,36,A,13,3,13,3,0,0,53.0,ATTAGACACAAAATAA,||||X|||||||X||X,ATTAAACACAAACTAC,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Both,2645
211,chr12,51595410,51595573,chr12_51595460_51595523,chr12,51595507,51595518,+,5,A,11,1,11,1,0,0,51.0,ATGGGCTTTGGA,|||||||||||X,ATGGGCTTTGGT,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Both,3698
33,chr18,44726511,44726657,chr18_44726561_44726607,chr18,44726566,44726577,-,5,A,8,4,11,1,3,0,45.0,ATGAACTTTGAA,|||..|||||.X,ATGGGCTTTGGT,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Both,6384
61,chr19,45742345,45742506,chr19_45742395_45742456,chr19,45742480,45742495,+,-39,T,12,4,12,4,0,0,54.0,TTTAAAAACAAACAAG,X|||||X||||||X|X,ATTAAACACAAACTAC,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Both,2380
139,chr2,117786821,117786966,chr2_117786871_117786916,chr2,117786900,117786911,+,5,A,10,2,11,1,1,0,49.0,ATGAGCTTTGGG,|||.|||||||X,ATGGGCTTTGGT,../art/DetectSeq_ATP8-DddA11_REP-1_T...,DetectSeq_ATP8-DddA11_REP-1,Both,4411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,chr9,122264587,122264697,chr9_122264637_122264647,chr9,122264641,122264657,-,4,A,13,4,14,3,1,0,56.0,ACAGGAGGCCGCCCTCG,X|.|||||||||X||X|,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Right,1978
7521,chr9,83274918,83275045,chr9_83274968_83274995,chr9,83274955,83274971,-,-13,C,13,4,15,2,2,0,63.0,CCTGGAAGCCACCCTTG,||X|||.|||.|X||||,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Right,605
2734,chr9,91161331,91161450,chr9_91161381_91161400,chr9,91161422,91161438,-,41,C,13,4,14,3,1,0,56.0,CCAGGAGGACGCACAGG,||.|||||X|||||XX|,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Right,1814
5852,chrX,138128161,138128287,chrX_138128211_138128237,chrX,138128167,138128183,+,54,C,13,4,13,4,0,0,49.0,CAGGGAGGGTGTACTTG,|X||||||XX|X|||||,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Right,4555


### save df_arts_tas_dep

In [31]:
df_arts_tas_dep.to_csv('../art_seqinfo/df_arts_tas_dep.csv', index=False)

# if you are from step 12, start from here!

In [32]:
# df_arts_tas_dep = pd.read_csv('../art_seqinfo/df_arts_tas_dep.csv')
# df_arts_tas_dep['TAS-Dep_stat'].value_counts()
# TAS-Dep_stat
# Right    844
# Left     265
# Both      41
# Name: count, dtype: int64

df_arts_tas_dep = pd.read_csv('../art_seqinfo/df_arts_tas_dep_v2.csv')
df_arts_tas_dep['TAS-Dep_stat'].value_counts()
# TAS-Dep_stat
# Left_additional     1328
# Right                830
# Right_additional     302
# Left                 260
# Both_additional       67
# Both                  41
# Name: count, dtype: int64

TAS-Dep_stat
Left_additional     1328
Right                830
Right_additional     302
Left                 260
Both_additional       67
Both                  41
Name: count, dtype: int64

### get Seqinfo for TAS-dependent off-target

用来生成weblogo（使用sublime完成trim，修剪到长度一致）

In [33]:
os.makedirs('../art_seqinfo/tas-dependent', exist_ok=True)

In [34]:
# TAS-dep
for sample, df in df_arts_tas_dep.query('~`TAS-Dep_stat`.str.contains("on-target")').groupby(
        ['<sample>', 'TAS-Dep_stat']):
    sample, tas_dep_stat = sample

    # if tas_dep_stat not in ['Left', 'Right']:  # first run
    if tas_dep_stat not in ['Left', 'Right', 'Left_additional', 'Right_additional']:  # step12
        continue

    print('查看是否有duplication', 'tas_dep_stat =', tas_dep_stat, df.duplicated(subset='region_index').sum())
    # fix之前
    # 查看是否有duplication tas_dep_stat = Both 0
    # 查看是否有duplication tas_dep_stat = Left 0
    # 查看是否有duplication tas_dep_stat = Right 0
    # 查看是否有duplication tas_dep_stat = Both 0
    # 查看是否有duplication tas_dep_stat = Left 0
    # 查看是否有duplication tas_dep_stat = Right 0
    # 查看是否有duplication tas_dep_stat = Left 0
    # 查看是否有duplication tas_dep_stat = Right 0
    # 查看是否有duplication tas_dep_stat = Both 0
    # 查看是否有duplication tas_dep_stat = Left 0
    # 查看是否有duplication tas_dep_stat = Right 0
    # 查看是否有duplication tas_dep_stat = Both 0
    # 查看是否有duplication tas_dep_stat = Left 0
    # 查看是否有duplication tas_dep_stat = Right 0
    # 查看是否有duplication tas_dep_stat = Left 0
    # 查看是否有duplication tas_dep_stat = Right 0
    # 查看是否有duplication tas_dep_stat = Both 0
    # 查看是否有duplication tas_dep_stat = Left 0
    # 查看是否有duplication tas_dep_stat = Right 0
    # fix之后
    # 查看是否有duplication tas_dep_stat = Left 0
    # 查看是否有duplication tas_dep_stat = Right 0
    # 查看是否有duplication tas_dep_stat = Left 0
    # 查看是否有duplication tas_dep_stat = Right 0
    # 查看是否有duplication tas_dep_stat = Left 0
    # 查看是否有duplication tas_dep_stat = Right 0
    # 查看是否有duplication tas_dep_stat = Left 0
    # 查看是否有duplication tas_dep_stat = Left_additional 0
    # 查看是否有duplication tas_dep_stat = Right 0
    # 查看是否有duplication tas_dep_stat = Right_additional 0
    # 查看是否有duplication tas_dep_stat = Left 0
    # 查看是否有duplication tas_dep_stat = Left_additional 0
    # 查看是否有duplication tas_dep_stat = Right 0
    # 查看是否有duplication tas_dep_stat = Right_additional 0
    # 查看是否有duplication tas_dep_stat = Left 0
    # 查看是否有duplication tas_dep_stat = Left_additional 0
    # 查看是否有duplication tas_dep_stat = Right 0
    # 查看是否有duplication tas_dep_stat = Right_additional 0
    # 查看是否有duplication tas_dep_stat = Left 0
    # 查看是否有duplication tas_dep_stat = Left_additional 0
    # 查看是否有duplication tas_dep_stat = Right 0
    # 查看是否有duplication tas_dep_stat = Right_additional 0
    with open(f'../art_seqinfo/tas-dependent/seqinfo_{sample}_{tas_dep_stat}.seq', 'wt') as f:
        f.write(
            ''.join(
                df[['region_index', 'align_target_seq']].apply(
                    # lambda x: f'>{x["region_index"]}\n{x["align_target_seq"].replace("-", "")}\n', axis=1
                    lambda x: f'{x["align_target_seq"]}\n', axis=1  # 不需要fa的header，只需要sequence即可
                ).tolist()
            )
        )

查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Left_additional 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = Right_additional 0
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Left_additional 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = Right_additional 0
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Left_additional 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = Right_additional 0
查看是否有duplication tas_dep_stat = Left 0
查看是否有duplication tas_dep_stat = Left_additional 0
查看是否有duplication tas_dep_stat = Right 0
查看是否有duplication tas_dep_stat = Right_additional 0


### Sequence conservative analysis (SeqLogo) ATP8 Left

```bash
# cat seqinfo_DetectSeq_ATP8-DddA11_REP-1_Left.seq | cut -c 1-16  # 因为ATP8-Left长度为16

for i in `ls *ATP8*Left.seq`
cat $i | cut -c 1-16 > $i.fix.seq
```

ATTAAACACAAACTAC

seqinfo_DetectSeq_ATP8-DddA6_REP-1_Left.seq.fix.seq n=14

![image.png](../art_seqinfo/tas-dependent/seqinfo_DetectSeq_ATP8-DddA6_REP-1_Left.seq.fix.jpg)

seqinfo_DetectSeq_ATP8-DddA11_REP-1_Left.seq.fix.seq n=1

![image.png](../art_seqinfo/tas-dependent/seqinfo_DetectSeq_ATP8-DddA11_REP-1_Left.seq.fix.jpg)


seqinfo_DetectSeq_ATP8-DddAwt_REP-1_Left.seq.fix.seq n=1

![image.png](../art_seqinfo/tas-dependent/seqinfo_DetectSeq_ATP8-DddAwt_REP-1_Left.seq.fix.jpg)

### Sequence conservative analysis (SeqLogo) ATP8 Right

```bash
for i in `ls *ATP8*Right.seq`
cat $i | cut -c 1-12 > $i.fix.seq
```
ATGGGCTTTGGT

seqinfo_DetectSeq_ATP8-DddA6_REP-1_Right.seq.fix.eps n=454

![](../art_seqinfo/tas-dependent/seqinfo_DetectSeq_ATP8-DddA6_REP-1_Right.seq.fix.jpg)

seqinfo_DetectSeq_ATP8-DddA11_REP-1_Right.seq.fix.eps n=99

![](../art_seqinfo/tas-dependent/seqinfo_DetectSeq_ATP8-DddA11_REP-1_Right.seq.fix.jpg)

seqinfo_DetectSeq_ATP8-DddAwt_REP-1_Right.seq.fix.eps n=7

![](../art_seqinfo/tas-dependent/seqinfo_DetectSeq_ATP8-DddAwt_REP-1_Right.seq.fix.jpg)

### Sequence conservative analysis (SeqLogo) JAK2 Left
```bash
for i in `ls *JAK2*Left.seq`
cat $i | cut -c 1-17 > $i.fix.seq
```

CTGAAAAAGACTCTGCA 17

rep1 n=33

![](../art_seqinfo/tas-dependent/seqinfo_DetectSeq_JAK2-DddA11_REP-1_Left.seq.fix.jpg)

rep2 n=26

![](../art_seqinfo/tas-dependent/seqinfo_DetectSeq_JAK2-DddA11_REP-2_Left.seq.fix.jpg)

### Sequence conservative analysis (SeqLogo) JAK2 Right
```bash
for i in `ls *JAK2*Right.seq`
cat $i | cut -c 1-17 > $i.fix.seq
```

CCATTTCTGTCATCGTA 17

rep1 n=15

![](../art_seqinfo/tas-dependent/seqinfo_DetectSeq_JAK2-DddA11_REP-1_Right.seq.fix.jpg)

rep2 n=212

![](../art_seqinfo/tas-dependent/seqinfo_DetectSeq_JAK2-DddA11_REP-2_Right.seq.fix.jpg)

### Sequence conservative analysis (SeqLogo) SIRT6 Left
```bash
for i in `ls *SIRT6*Left.seq`
cat $i | cut -c 1-17 > $i.fix.seq
```

TACGCGGCGGGGCTGTC 17

rep1 n=356

![](../art_seqinfo/tas-dependent/seqinfo_DetectSeq_SIRT6-DddA11_REP-1_Left.seq.fix.jpg)

rep2 n=1158

![](../art_seqinfo/tas-dependent/seqinfo_DetectSeq_SIRT6-DddA11_REP-2_Left.seq.fix.jpg)

### Sequence conservative analysis (SeqLogo) SIRT6 Right
```bash
for i in `ls *SIRT6*Right.seq`
cat $i | cut -c 1-17 > $i.fix.seq
```

CCGGGAGGCCGCACTTG 17

rep1 n=69

![](../art_seqinfo/tas-dependent/seqinfo_DetectSeq_SIRT6-DddA11_REP-1_Right.seq.fix.jpg)

rep2 n=275

![](../art_seqinfo/tas-dependent/seqinfo_DetectSeq_SIRT6-DddA11_REP-2_Right.seq.fix.jpg)

### Biological repeat correlation (JAK2 & SIRT6)

In [35]:
df_arts_tas_dep.query('`<sample>`=="DetectSeq_JAK2-DddA11_REP-1"').groupby('TAS-Dep_stat')['chrom'].count()

TAS-Dep_stat
Both                 1
Both_additional      1
Left                22
Left_additional     10
Right               11
Right_additional     5
Name: chrom, dtype: int64

In [36]:
df_arts_tas_dep.query('`<sample>`.str.contains("ATP8")')[df_arts_tas_dep.query('`<sample>`.str.contains("ATP8")').duplicated(subset=['region_index', 'TAS-Dep_stat'])]
# 没有overlap

Unnamed: 0,chrom,start,end,region_index,align_chr_name,align_chr_start,align_chr_end,align_strand,align_dist_to_signal,align_N0_base,align_total_match,align_total_mismatch,align_degen_total_match,align_degen_total_mismatch,align_degen_num,align_total_gap,align_score,align_target_seq,align_info_state,align_query_seq,file_info,<sample>,TAS-Dep_stat,id


In [37]:
df_arts_tas_dep.query('`<sample>`.str.contains("JAK2")')[df_arts_tas_dep.query('`<sample>`.str.contains("JAK2")').duplicated(subset=['region_index', 'TAS-Dep_stat'])]

Unnamed: 0,chrom,start,end,region_index,align_chr_name,align_chr_start,align_chr_end,align_strand,align_dist_to_signal,align_N0_base,align_total_match,align_total_mismatch,align_degen_total_match,align_degen_total_mismatch,align_degen_num,align_total_gap,align_score,align_target_seq,align_info_state,align_query_seq,file_info,<sample>,TAS-Dep_stat,id


In [38]:
df_arts_tas_dep.query('region_index=="chrX_21926985_21927028"')
# # 没有overlap

Unnamed: 0,chrom,start,end,region_index,align_chr_name,align_chr_start,align_chr_end,align_strand,align_dist_to_signal,align_N0_base,align_total_match,align_total_mismatch,align_degen_total_match,align_degen_total_mismatch,align_degen_num,align_total_gap,align_score,align_target_seq,align_info_state,align_query_seq,file_info,<sample>,TAS-Dep_stat,id
605,chrX,21926935,21927078,chrX_21926985_21927028,chrX,21927047,21927063,+,-35,T,9,8,10,7,1,0,30.0,TAACATTTTTCAGCATA,XX|XX|X|X|||X|.||,CCATTTCTGTCATCGTA,../art/DetectSeq_JAK2-DddA11_REP-1_T...,DetectSeq_JAK2-DddA11_REP-1,Left_additional,20163
661,chrX,21926935,21927078,chrX_21926985_21927028,chrX,21927007,21927023,+,5,C,14,3,14,3,0,0,58.0,CTGAAACAGACTCTGTG,||||||X||||||||XX,CTGAAAAAGACTCTGCA,../art/DetectSeq_JAK2-DddA11_REP-2_T...,DetectSeq_JAK2-DddA11_REP-2,Left,1436


In [39]:
df_arts_tas_dep.query('`<sample>`.str.contains("SIRT6")')[df_arts_tas_dep.query('`<sample>`.str.contains("SIRT6")').duplicated(subset=['region_index', 'TAS-Dep_stat'])]

# 没有overlap

Unnamed: 0,chrom,start,end,region_index,align_chr_name,align_chr_start,align_chr_end,align_strand,align_dist_to_signal,align_N0_base,align_total_match,align_total_mismatch,align_degen_total_match,align_degen_total_mismatch,align_degen_num,align_total_gap,align_score,align_target_seq,align_info_state,align_query_seq,file_info,<sample>,TAS-Dep_stat,id
1332,chr8,26671122,26671270,chr8_26671172_26671220,chr8,26671196,26671212,+,8,T,12,5,15,2,3,0,71.0,TACACAGGAGGGCTGTG,|||.|.|X.|||||||X,TACGCGGCGGGGCTGTC,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Left_additional,151
1333,chr11,122354769,122354919,chr11_122354819_122354869,chr11,122354795,122354811,+,8,T,12,5,15,2,3,0,71.0,TACACAGCAGGGCTGCT,|||.|.||.||||||XX,TACGCGGCGGGGCTGTC,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Left_additional,147
1376,chr11,35605156,35605300,chr11_35605206_35605250,chr11,35605228,35605244,+,6,T,10,7,15,2,5,0,67.0,TACACAAAAGAGCTGAC,|||.|..X.|.||||X|,TACGCGGCGGGGCTGTC,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Left_additional,318
1377,chr5,83517002,83517192,chr5_83517052_83517142,chr5,83517120,83517136,+,6,T,10,7,15,2,5,0,67.0,TACACAGCAGGACTAGA,|||.|.||.||.||.XX,TACGCGGCGGGGCTGTC,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Left_additional,279
1508,chr12,19794202,19794353,chr12_19794252_19794303,chr12,19794224,19794240,+,12,T,11,6,14,3,3,0,62.0,TACACAAAGGGGCAGTT,|||.|..X|||||X||X,TACGCGGCGGGGCTGTC,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Left_additional,671
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2820,chr19,38073852,38073993,chr19_38073902_38073943,chr19,38073876,38073892,-,-67,C,8,9,13,4,5,0,39.0,CCACAGAAGCACAGTTG,||.X.X..X|.||X|||,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Both_additional,11195
2821,chr20,44616571,44616710,chr20_44616621_44616660,chr20,44616690,44616706,-,30,C,10,7,12,5,2,0,36.0,CCTGGACGCCACCGTGA,||X|||X|||.|XX|X.,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Both_additional,13572
2823,chr10,129450964,129451150,chr10_129451014_129451100,chr10,129451009,129451025,-,-91,T,7,10,11,6,4,0,33.0,TGGAGAAACTGCCATAA,XX|.||..|X||XX|X.,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Both_additional,17274
2825,chr15,55128413,55128550,chr15_55128463_55128500,chr15,55128518,55128534,+,-34,C,10,7,11,6,1,0,29.0,CAAGGAGGCTGGAGCTC,|X.||||||X|X|XX|X,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Both_additional,21241


In [40]:
index_df_arts_tas_dep_SIRT6_bio_rep = df_arts_tas_dep.query('`<sample>`.str.contains("SIRT6")')[df_arts_tas_dep.query('`<sample>`.str.contains("SIRT6")').duplicated(subset=['region_index', 'TAS-Dep_stat'])]['region_index'].unique().tolist()
df_arts_tas_dep.query('`<sample>`.str.contains("SIRT6")').query('region_index in @index_df_arts_tas_dep_SIRT6_bio_rep').sort_values('region_index')

Unnamed: 0,chrom,start,end,region_index,align_chr_name,align_chr_start,align_chr_end,align_strand,align_dist_to_signal,align_N0_base,align_total_match,align_total_mismatch,align_degen_total_match,align_degen_total_mismatch,align_degen_num,align_total_gap,align_score,align_target_seq,align_info_state,align_query_seq,file_info,<sample>,TAS-Dep_stat,id
2823,chr10,129450964,129451150,chr10_129451014_129451100,chr10,129451009,129451025,-,-91,T,7,10,11,6,4,0,33.0,TGGAGAAACTGCCATAA,XX|.||..|X||XX|X.,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Both_additional,17274
2775,chr10,129450964,129451150,chr10_129451014_129451100,chr10,129451078,129451094,+,6,T,8,9,14,3,6,0,56.0,TACACAAAAGAACTGAA,|||.|..X.|..|||XX,TACGCGGCGGGGCTGTC,../art/DetectSeq_SIRT6-DddA11_REP-1_...,DetectSeq_SIRT6-DddA11_REP-1,Both_additional,1871
2368,chr10,22210553,22210722,chr10_22210603_22210672,chr10,22210577,22210593,+,10,C,10,7,13,4,3,0,43.0,CACACAGAAGGGCTGCT,X||.|.|X.||||||XX,TACGCGGCGGGGCTGTC,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Left_additional,7938
1277,chr10,22210553,22210722,chr10_22210603_22210672,chr10,22210583,22210599,-,-20,T,8,9,10,7,2,0,28.0,TAAGTCAGCAGCCCTTC,XX.|XX.||X||X|||X,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-1_...,DetectSeq_SIRT6-DddA11_REP-1,Left_additional,22003
2710,chr10,25555311,25555490,chr10_25555361_25555440,chr10,25555335,25555351,-,-105,G,8,9,11,6,3,0,25.0,GCTAAAAGCTGTACTCT,X|X..|.||X|X|||XX,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Left_additional,24748
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997,chrX,130690147,130690291,chrX_130690197_130690241,chrX,130690221,130690237,+,4,T,11,6,13,4,2,0,55.0,TACAAGGAGGAGCTGCT,|||.X||X||.||||XX,TACGCGGCGGGGCTGTC,../art/DetectSeq_SIRT6-DddA11_REP-1_...,DetectSeq_SIRT6-DddA11_REP-1,Left_additional,2243
2827,chrX,154457743,154457855,chrX_154457793_154457805,chrX,154457784,154457800,-,-9,C,8,9,11,6,3,0,25.0,CGGGACAGCCGTCCAAA,|X||.X.||||XX|XX.,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Both_additional,24507
2781,chrX,154457743,154457855,chrX_154457793_154457805,chrX,154457758,154457774,-,-35,T,10,7,11,6,1,0,39.0,TCCCCGGCGACCCTGAA,|X|X|||||.XX|||XX,TACGCGGCGGGGCTGTC,../art/DetectSeq_SIRT6-DddA11_REP-1_...,DetectSeq_SIRT6-DddA11_REP-1,Both_additional,10982
2647,chrX,27164938,27165086,chrX_27164988_27165036,chrX,27164991,27165007,+,29,T,7,10,11,6,4,0,33.0,TCAGGGAACAAAATTTT,X|.||X..|X.X|X||X,CCGGGAGGCCGCACTTG,../art/DetectSeq_SIRT6-DddA11_REP-2_...,DetectSeq_SIRT6-DddA11_REP-2,Left_additional,17447
