In [1]:
import numpy as np
import os
import sys
import datetime

import scipy as sp
import scipy.stats

from bisect import bisect
from statsmodels.stats.multitest import multipletests

import math

In [2]:
def parse_cfgfile(cfg_file):
    '''Parse a ``key=value`` configuration file.

    Blank lines, lines starting with ``#``, lines without an ``=`` sign and
    lines with an unrecognised key are ignored. Only the first ``=`` on a line
    separates key from value, so values may themselves contain ``=``.
    Whitespace around keys, values and comma-separated file names is dropped.
    When a key appears more than once, the last occurrence wins.

    Args:
        cfg_file: path of the configuration file to read.

    Returns:
        An 11-tuple, in this order:
        group 1 file list (``Group1_Tophat_aligned_Wig``, split on commas),
        group 2 file list (``Group2_Tophat_aligned_Wig``, split on commas),
        output directory (``Output_directory``, always ending in ``/``),
        ``Annotated_3UTR``, ``Output_result_file``, ``Num_least_in_group1``,
        ``Num_least_in_group2``, ``Coverage_cutoff``, ``FDR_cutoff``,
        ``Fold_change_cutoff`` and ``PDUI_cutoff``.
        The last six are returned as raw strings and are ``''`` when the key
        is absent from the file.

    Raises:
        SystemExit: with status 1, after printing a message to stderr, when
            either sample list, the output directory, the annotation file or
            the result file name is missing or empty.
    '''
    group1_key = 'Group1_Tophat_aligned_Wig'
    group2_key = 'Group2_Tophat_aligned_Wig'
    sample_files = {group1_key: '', group2_key: ''}
    output_directory = ''
    plain_settings = dict.fromkeys(
        (
            'Annotated_3UTR',
            'Output_result_file',
            'Num_least_in_group1',
            'Num_least_in_group2',
            'Coverage_cutoff',
            'FDR_cutoff',
            'Fold_change_cutoff',
            'PDUI_cutoff',
        ),
        '',
    )

    # The context manager guarantees the handle is closed even on errors.
    with open(cfg_file, 'r') as cfg_handle:
        for raw_line in cfg_handle:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            # partition() splits on the first '=' only and never raises, so a
            # line without '=' is skipped instead of failing with IndexError.
            key, separator, value = line.partition('=')
            if not separator:
                continue
            key = key.strip()
            value = value.strip()

            if key in sample_files:
                # Drop empty entries so "key=" is reported as missing below
                # rather than yielding [''].
                sample_files[key] = [
                    name.strip() for name in value.split(',') if name.strip()
                ]
            elif key == 'Output_directory':
                output_directory = value
                # Guard against an empty value before normalising the suffix.
                if output_directory and not output_directory.endswith('/'):
                    output_directory += '/'
            elif key in plain_settings:
                plain_settings[key] = value

    required_settings = (
        (sample_files[group1_key], "No Tophat aligned BAM file for group 1!"),
        (sample_files[group2_key], "No Tophat aligned BAM file for group 2!"),
        (output_directory, "No output directory!"),
        (plain_settings['Annotated_3UTR'], "No annotated 3' UTR file!"),
        (plain_settings['Output_result_file'], "No result file name!"),
    )
    for setting_value, error_message in required_settings:
        if not setting_value:
            print(error_message, file=sys.stderr)
            # sys.exit always raises SystemExit; the bare `exit` helper is
            # injected by `site` (and replaced by IPython), so it is avoided.
            sys.exit(1)

    return (
        sample_files[group1_key],
        sample_files[group2_key],
        output_directory,
        plain_settings['Annotated_3UTR'],
        plain_settings['Output_result_file'],
        plain_settings['Num_least_in_group1'],
        plain_settings['Num_least_in_group2'],
        plain_settings['Coverage_cutoff'],
        plain_settings['FDR_cutoff'],
        plain_settings['Fold_change_cutoff'],
        plain_settings['PDUI_cutoff'],
    )


In [4]:
# Read every setting from the DaPars test configuration file.
# NOTE(review): the path is absolute and machine-specific.
(
    Group1_Tophat_aligned_file,
    Group2_Tophat_aligned_file,
    output_directory,
    Annotated_3UTR_file,
    Output_result_file,
    Num_least_in_group1_local,
    Num_least_in_group2_local,
    Coverage_cutoff_local,
    FDR_cutoff_local,
    Fold_change_cutoff_local,
    PDUI_cutoff_local,
) = parse_cfgfile(
    '/home/li/桌面/PROJECT6/apasite_predict2/DAPARS/DATA/DaPars_test_data_configure.txt'
)

In [5]:
# Count the group 1 samples, then build one combined list:
# group 1 files first, followed by group 2 files.
num_group_1 = len(Group1_Tophat_aligned_file)
All_Sample_files = [*Group1_Tophat_aligned_file, *Group2_Tophat_aligned_file]

In [6]:
def Load_Target_Wig_files(All_Wig_files, UTR_Annotation_file):
    '''Load 3' UTR regions and extract each sample's coverage over them.

    Args:
        All_Wig_files: iterable of paths to tab-separated coverage files.
            Only lines that start with ``chr`` and contain no ``#`` are used;
            they are read as ``chrom, start, end, ..., coverage`` with the
            coverage taken from the last column.
        UTR_Annotation_file: path to a tab-separated annotation file read as
            ``chrom, start, end, event id, ..., strand`` (strand is the last
            column). Lines with fewer than four columns are skipped.

    Returns:
        A 3-tuple ``(coverage_by_event, total_depths, UTR_events_dict)``:

        * ``coverage_by_event`` maps an event id to a list with one
          ``[coverage_values, boundaries]`` entry per sample whose file
          contains the event's chromosome. ``boundaries`` starts with the
          trimmed region start and ends with the trimmed region end, and
          ``coverage_values`` holds one value per interval between
          consecutive boundaries.
        * ``total_depths`` is a numpy array with one entry per file: the sum
          of ``coverage * (end - start)`` over all lines used.
        * ``UTR_events_dict`` maps an event id to
          ``[chrom, trimmed_start, trimmed_end, strand, "chrom:start-end"]``.
    '''
    UTR_events_dict = {}
    All_Samples_Total_depth = []

    with open(UTR_Annotation_file, 'r') as annotation_handle:
        for line in annotation_handle:
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 4:
                # Blank or truncated line: no event id to key on.
                continue
            curr_chr = fields[0]
            region_start = int(float(fields[1]))
            region_end = int(float(fields[2]))
            curr_strand = fields[-1]
            UTR_pos = "%s:%s-%s" % (curr_chr, region_start, region_end)
            # Trim 20% of the region length from the end on '+' and from the
            # start otherwise, then shrink by one more base on each side.
            end_shift = int(round(abs(region_start - region_end) * 0.2))
            if curr_strand == '+':
                region_end -= end_shift
            else:
                region_start += end_shift
            region_start += 1
            region_end -= 1
            # Keep only regions still longer than 50 bases after trimming.
            if region_start + 50 < region_end:
                UTR_events_dict[fields[3]] = [
                    curr_chr, region_start, region_end, curr_strand, UTR_pos,
                ]

    ##Load coverage for all samples
    All_samples_extracted_3UTR_coverage_dict = {}
    for curr_wig_file in All_Wig_files:
        # chrom -> [boundaries, values]. values[i] is stored together with
        # boundaries[i], so it belongs to the interval ending at that
        # boundary; gaps between covered intervals get an explicit 0.
        curr_sample_All_chroms_coverage_dict = {}
        cur_sample_total_depth = 0
        with open(curr_wig_file, 'r') as wig_handle:
            for line in wig_handle:
                if '#' in line or line[0:3] != 'chr':
                    continue
                fields = line.rstrip('\r\n').split('\t')
                chrom_name = fields[0]
                region_start = int(float(fields[1]))
                region_end = int(float(fields[2]))
                region_depth = int(float(fields[-1]))
                cur_sample_total_depth += region_depth * (region_end - region_start)
                if chrom_name not in curr_sample_All_chroms_coverage_dict:
                    curr_sample_All_chroms_coverage_dict[chrom_name] = [[0], [0]]
                boundaries, values = curr_sample_All_chroms_coverage_dict[chrom_name]
                if region_start > boundaries[-1]:
                    # Uncovered gap before this interval.
                    boundaries.append(region_start)
                    values.append(0)
                boundaries.append(region_end)
                values.append(region_depth)

        # Give EVERY chromosome a trailing 0 for positions past its last
        # covered interval. Appending it only to the last chromosome read left
        # the others one value short for regions reaching past their final
        # boundary, and failed when a file contained no usable lines.
        for _boundaries, values in curr_sample_All_chroms_coverage_dict.values():
            values.append(0)
        All_Samples_Total_depth.append(cur_sample_total_depth)

        for curr_3UTR_event_id in UTR_events_dict:
            curr_3UTR_structure = UTR_events_dict[curr_3UTR_event_id]
            curr_chr = curr_3UTR_structure[0]
            if curr_chr in curr_sample_All_chroms_coverage_dict:
                curr_chr_coverage = curr_sample_All_chroms_coverage_dict[curr_chr]
                region_start = curr_3UTR_structure[1]
                region_end = curr_3UTR_structure[2]
                left_region_index = bisect(curr_chr_coverage[0], region_start)
                right_region_index = bisect(curr_chr_coverage[0], region_end)

                # One more value than inner boundaries: region start and end
                # are added as outer boundaries just below.
                extracted_coverage = curr_chr_coverage[1][left_region_index:right_region_index + 1]
                extracted_3UTR_region = curr_chr_coverage[0][left_region_index:right_region_index]
                extracted_3UTR_region.insert(0, region_start)
                extracted_3UTR_region.append(region_end)
                if curr_3UTR_event_id not in All_samples_extracted_3UTR_coverage_dict:
                    All_samples_extracted_3UTR_coverage_dict[curr_3UTR_event_id] = []
                All_samples_extracted_3UTR_coverage_dict[curr_3UTR_event_id].append(
                    [extracted_coverage, extracted_3UTR_region]
                )
    return All_samples_extracted_3UTR_coverage_dict, np.array(All_Samples_Total_depth), UTR_events_dict


In [None]:
# Run Load_Target_Wig_files on all sample files (group 1 first, then group 2)
# to get per-event coverage, per-sample total depths and the trimmed 3' UTR regions.
All_samples_Target_3UTR_coverages, All_samples_sequencing_depths, UTR_events_dict = Load_Target_Wig_files(All_Sample_files, Annotated_3UTR_file)

In [None]:
# Number of event ids for which coverage was extracted from at least one sample.
len(All_samples_Target_3UTR_coverages)

1043

In [14]:
# Check the container type of the extracted coverages (a dict keyed by event id).
type(All_samples_Target_3UTR_coverages)

dict

In [15]:
# Print every event id (dictionary key) that has extracted coverage, one per line.
# NOTE(review): this prints one line per key, so the output is long.
for i in All_samples_Target_3UTR_coverages:
    print(i)

NM_001468|GAGE1|chrX|+
NM_001012968|SPIN4|chrX|-
NM_003828|MTMR1|chrX|+
NM_032967|PCDH11X|chrX|+
NM_003173|SUV39H1|chrX|+
NM_003179|SYP|chrX|-
NM_001081550|THOC2|chrX|-
NM_014008|CCDC22|chrX|+
NM_014009|FOXP3|chrX|-
NR_029379|LOC100132163|chrX|+
NM_001256188|SNX12|chrX|-
NM_004312|ARR3|chrX|+
NR_024062|UBE2DNL|chrX|+
NM_198881|TBC1D8B|chrX|+
NM_001105243|PCDH19|chrX|-
NM_021049|MAGEA5|chrX|-
NM_005193|CDX4|chrX|+
NM_033380|COL4A5|chrX|+
NM_001168361|PCDH11X|chrX|+
NM_016303|WBP5|chrX|+
NR_029513|MIR98|chrX|-
NM_001170570|CXorf56|chrX|-
NM_001145951|TIMM8A|chrX|-
NM_020922|WNK3|chrX|-
NR_038988|LOC100287765|chrX|+
NM_173493|PASD1|chrX|+
NM_173495|PTCHD1|chrX|+
NM_173494|CXorf41|chrX|+
NM_001017438|CT45A6|chrX|-
NM_000425|L1CAM|chrX|-
NM_006743|RBM3|chrX|+
NR_029708|MIR188|chrX|+
NR_030233|MIR506|chrX|-
NR_030230|MIR505|chrX|-
NR_030236|MIR509-1|chrX|-
NR_030237|MIR510|chrX|-
NR_030234|MIR507|chrX|-
NR_030235|MIR508|chrX|-
NR_030238|MIR514A1|chrX|-
NR_030239|MIR514A2|chrX|-
NM_006520|DYN

In [None]:
# Per-sample total depth (sum of coverage * interval length), in All_Sample_files order.
All_samples_sequencing_depths

array([475249457, 453111546])

In [None]:
# Number of annotated 3' UTR events kept after trimming (keyed by event id).
len(UTR_events_dict)

24983

In [None]:
# Check the container type of the 3' UTR event table (a dict keyed by event id).
type(UTR_events_dict)

dict

In [20]:
# Print every event id (dictionary key) that has extracted coverage, one per line.
# NOTE(review): same loop as the cell at In [15]; one of the two is likely redundant.
for i in All_samples_Target_3UTR_coverages:
    print(i)

NM_001468|GAGE1|chrX|+
NM_001012968|SPIN4|chrX|-
NM_003828|MTMR1|chrX|+
NM_032967|PCDH11X|chrX|+
NM_003173|SUV39H1|chrX|+
NM_003179|SYP|chrX|-
NM_001081550|THOC2|chrX|-
NM_014008|CCDC22|chrX|+
NM_014009|FOXP3|chrX|-
NR_029379|LOC100132163|chrX|+
NM_001256188|SNX12|chrX|-
NM_004312|ARR3|chrX|+
NR_024062|UBE2DNL|chrX|+
NM_198881|TBC1D8B|chrX|+
NM_001105243|PCDH19|chrX|-
NM_021049|MAGEA5|chrX|-
NM_005193|CDX4|chrX|+
NM_033380|COL4A5|chrX|+
NM_001168361|PCDH11X|chrX|+
NM_016303|WBP5|chrX|+
NR_029513|MIR98|chrX|-
NM_001170570|CXorf56|chrX|-
NM_001145951|TIMM8A|chrX|-
NM_020922|WNK3|chrX|-
NR_038988|LOC100287765|chrX|+
NM_173493|PASD1|chrX|+
NM_173495|PTCHD1|chrX|+
NM_173494|CXorf41|chrX|+
NM_001017438|CT45A6|chrX|-
NM_000425|L1CAM|chrX|-
NM_006743|RBM3|chrX|+
NR_029708|MIR188|chrX|+
NR_030233|MIR506|chrX|-
NR_030230|MIR505|chrX|-
NR_030236|MIR509-1|chrX|-
NR_030237|MIR510|chrX|-
NR_030234|MIR507|chrX|-
NR_030235|MIR508|chrX|-
NR_030238|MIR514A1|chrX|-
NR_030239|MIR514A2|chrX|-
NM_006520|DYN

In [21]:
# Scale each sample's total depth by the mean depth across samples,
# so the resulting weights average to 1.
All_sample_coverage_weights = (
    All_samples_sequencing_depths / All_samples_sequencing_depths.mean()
)
All_sample_coverage_weights

array([1.02384623, 0.97615377])