In [1]:
import numpy as np
import os
import sys
import datetime

import scipy as sp
import scipy.stats

from bisect import bisect
from statsmodels.stats.multitest import multipletests

import math

In [2]:
def time_now():
    """Return the current local date and time formatted with ``%c``.

    The result is a string in the locale's date/time representation and is
    used as the timestamp prefix of progress messages.
    """
    return datetime.datetime.now().strftime("%c")

In [None]:
#Convert_wig_into_bp_coverage

def Convert_wig_into_bp_coverage(extracted_coverage,extracted_3UTR_region,strand_info):
    """Expand segment-wise coverage values into a per-position coverage array.

    Parameters
    ----------
    extracted_coverage : sequence
        One coverage value per segment.
    extracted_3UTR_region : sequence of int
        Segment boundaries; segment ``i`` spans
        ``extracted_3UTR_region[i]`` to ``extracted_3UTR_region[i + 1]``
        (end exclusive), so one more boundary than coverage values is read.
    strand_info : str
        When equal to ``'-'`` the resulting array is reversed.

    Returns
    -------
    numpy.ndarray
        Array of length ``extracted_3UTR_region[-1] - extracted_3UTR_region[0]``;
        positions not covered by any segment stay at zero.
    """
    origin = extracted_3UTR_region[0]
    per_base = np.zeros(extracted_3UTR_region[-1] - origin)

    # Index-based access is kept on purpose: a coverage list longer than the
    # boundary list must still raise IndexError.
    for seg_idx, seg_depth in enumerate(extracted_coverage):
        lo = extracted_3UTR_region[seg_idx] - origin
        hi = extracted_3UTR_region[seg_idx + 1] - origin
        per_base[lo:hi] = seg_depth

    return per_base[::-1] if strand_info == '-' else per_base


In [5]:
def _dapars_result_header(num_group_1, num_group_2):
    """Return the column names of the temporary result table.

    Four fixed columns, then three columns (long expression, short
    expression, PDUI) per sample -- group-1 samples prefixed ``A_``,
    group-2 samples prefixed ``B_``, both numbered from 1 -- and finally
    ``PDUI_Group_diff``.
    """
    header = ['Gene','fit_value','Predicted_Proximal_APA','Loci']
    for prefix, count in (('A', num_group_1), ('B', num_group_2)):
        for sample_no in range(1, count + 1):
            header.extend([
                '%s_%d_long_exp' % (prefix, sample_no),
                '%s_%d_short_exp' % (prefix, sample_no),
                '%s_%d_PDUI' % (prefix, sample_no),
            ])
    header.append('PDUI_Group_diff')
    return header


def De_Novo_3UTR_Identification_Loading_Target_Wig_for_TCGA_Multiple_Samples_Main(argv=None):
    '''Run the 3'UTR analysis described by a configuration file.

    Steps, in order:

    1. Read the configuration file path from ``argv[1]`` and parse it with
       ``parse_cfgfile``.
    2. Override the module-level cutoffs (``Num_least_in_group1``,
       ``Num_least_in_group2``, ``Coverage_cutoff``, ``FDR_cutoff``,
       ``Fold_change_cutoff``, ``PDUI_cutoff``) with every non-empty value
       from the configuration.
    3. Load the coverage of all samples with ``Load_Target_Wig_files`` and
       derive per-sample weights as sequencing depth divided by mean depth.
    4. For every 3'UTR that has coverage, call
       ``De_Novo_3UTR_Coverage_estimation_Genome_for_TCGA_multiple_samples``
       and, when an estimate exists and every sample has non-zero
       long + short abundance, write one tab-separated row to a temporary
       result file.
    5. Pass the temporary file to ``DaPars_Filtering`` to produce
       ``<output_directory><Output_result_file>_All_Prediction_Results.txt``,
       then delete the temporary file and the ``tmp`` directory
       (best effort).

    Parameters
    ----------
    argv : list of str, optional
        Argument vector laid out like ``sys.argv`` (program name first, then
        the configuration file). Defaults to ``sys.argv``. Pass it explicitly
        when calling from a notebook, where ``sys.argv`` holds the kernel's
        own launch arguments.

    Raises
    ------
    SystemExit
        With exit status 1 when no configuration file is supplied.
    '''
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("Please provide the configure file ...")
        # sys.exit instead of the interactive ``exit`` helper, which is
        # installed by ``site`` and is not meant for use in programs.
        sys.exit(1)
    cfg_file = argv[1]
    print("[%s] Start Analysis ..." % time_now(), file=sys.stderr)
    (Group1_Tophat_aligned_file,
     Group2_Tophat_aligned_file,
     output_directory,
     Annotated_3UTR_file,
     Output_result_file,
     Num_least_in_group1_local,
     Num_least_in_group2_local,
     Coverage_cutoff_local,
     FDR_cutoff_local,
     Fold_change_cutoff_local,
     PDUI_cutoff_local) = parse_cfgfile(cfg_file)

    num_group_1 = len(Group1_Tophat_aligned_file)
    # Copy first so the list returned by parse_cfgfile is not mutated.
    All_Sample_files = Group1_Tophat_aligned_file[:]
    All_Sample_files.extend(Group2_Tophat_aligned_file)
    num_samples = len(All_Sample_files)

    global Num_least_in_group1
    global Num_least_in_group2
    global Coverage_cutoff
    global FDR_cutoff
    global Fold_change_cutoff
    global PDUI_cutoff

    # An empty string means "not set in the configuration": the existing
    # module-level value is left untouched.
    if Num_least_in_group1_local != '':
        Num_least_in_group1 = float(Num_least_in_group1_local)
    if Num_least_in_group2_local != '':
        Num_least_in_group2 = float(Num_least_in_group2_local)
    if Coverage_cutoff_local != '':
        Coverage_cutoff = float(Coverage_cutoff_local)
    if FDR_cutoff_local != '':
        FDR_cutoff = float(FDR_cutoff_local)
    if Fold_change_cutoff_local != '':
        Fold_change_cutoff = float(Fold_change_cutoff_local)
    if PDUI_cutoff_local != '':
        PDUI_cutoff = float(PDUI_cutoff_local)

    ##Prepare output directory
    # output_directory is used as a plain string prefix for the output files,
    # so it may lack a directory component; fall back to the current
    # directory instead of calling os.makedirs('') (which raises).
    d = os.path.dirname(output_directory) or os.curdir
    os.makedirs(d, exist_ok=True)
    temp_dir = d+'/tmp/'
    os.makedirs(temp_dir, exist_ok=True)
    Output_all_prediction_file = output_directory+Output_result_file+'_result_temp.txt'

    # The file is opened before the (potentially slow) loading step so an
    # unwritable output location fails early; the context manager guarantees
    # the handle is closed even if loading or estimation raises.
    with open(Output_all_prediction_file, 'w') as Output_result:
        print("[%s] Loading coverage ..." % time_now(), file=sys.stderr)
        All_samples_Target_3UTR_coverages, All_samples_sequencing_depths, UTR_events_dict = Load_Target_Wig_files(All_Sample_files, Annotated_3UTR_file)
        All_sample_coverage_weights = All_samples_sequencing_depths/np.mean(All_samples_sequencing_depths)
        print("[%s] Loading coverage finished ..." % time_now(), file=sys.stderr)

        ##Write the first line
        first_line = _dapars_result_header(num_group_1, num_samples - num_group_1)
        Output_result.write('\t'.join(first_line) + '\n')

        for curr_3UTR_id in UTR_events_dict:
            if curr_3UTR_id not in All_samples_Target_3UTR_coverages:
                continue

            curr_3UTR_structure = UTR_events_dict[curr_3UTR_id]
            region_start = curr_3UTR_structure[1]
            region_end   = curr_3UTR_structure[2]
            curr_strand  = curr_3UTR_structure[-2]
            UTR_pos = curr_3UTR_structure[-1]

            # Each per-sample entry is indexed as (coverage values, region
            # boundaries), the two sequence arguments of
            # Convert_wig_into_bp_coverage.
            curr_3UTR_all_samples_bp_coverage = [
                Convert_wig_into_bp_coverage(sample_wig[0], sample_wig[1], curr_strand)
                for sample_wig in All_samples_Target_3UTR_coverages[curr_3UTR_id]
            ]

            select_mean_squared_error,selected_break_point,UTR_abundances = De_Novo_3UTR_Coverage_estimation_Genome_for_TCGA_multiple_samples(curr_3UTR_all_samples_bp_coverage, region_start, region_end,curr_strand,All_sample_coverage_weights)

            # "Na" is the estimator's marker for "no estimate available".
            if str(select_mean_squared_error) == "Na":
                continue

            Long_3UTR_exp_all = np.array(UTR_abundances[0])
            Short_3UTR_exp_all = np.array(UTR_abundances[1])
            # Only report events expressed in every sample; this also keeps
            # the PDUI denominator below strictly positive.
            num_non_zero = sum((Long_3UTR_exp_all + Short_3UTR_exp_all)>0)
            if num_non_zero != num_samples:
                continue

            All_Long_inclusion_ratios = []
            line_write = [curr_3UTR_id, "%.1f" % select_mean_squared_error, str(selected_break_point), UTR_pos]
            for i in range(num_samples):
                long_exp = UTR_abundances[0][i]
                short_exp = UTR_abundances[1][i]
                ##long 3'UTR percentage
                curr_sample_ratio = float(long_exp)/(float(long_exp) + float(short_exp))
                All_Long_inclusion_ratios.append(curr_sample_ratio)
                line_write.append("%.2f" % long_exp)
                line_write.append("%.2f" % short_exp)
                line_write.append("%.2f" % curr_sample_ratio)

            Group1_IR = All_Long_inclusion_ratios[:num_group_1]
            Group2_IR = All_Long_inclusion_ratios[num_group_1:]
            inclusion_ratio_Group_diff = np.mean(np.array(Group1_IR)) - np.mean(np.array(Group2_IR))

            line_write.append("%.2f" % inclusion_ratio_Group_diff)

            Output_result.write('\t'.join(line_write)+'\n')

    print("[%s] Filtering the result ..." % time_now(), file=sys.stderr)

    Output_Motif_filtered_result_file = output_directory+Output_result_file+'_All_Prediction_Results.txt'

    DaPars_Filtering(Output_all_prediction_file, num_samples,num_group_1 ,Output_Motif_filtered_result_file)

    # Cleanup is best effort: a leftover temporary file or a non-empty tmp
    # directory must not turn a finished analysis into a failure.
    try:
        os.remove(Output_all_prediction_file)
    except OSError:
        pass

    try:
        os.rmdir(temp_dir)
    except OSError:
        pass

    print("[%s] Finished!" % time_now(), file=sys.stderr)

    