In [1]:
import numpy as np
from ipynb.fs.full.utility_functions import gen_data_dict, get_data, get_cumsum, error_count, write_to_csv, opart

In [2]:
# Load the per-sequence signals and their labels via the project helper.
seqs   = gen_data_dict('sequence_label_data/genome/signals.gz')
labels = gen_data_dict('sequence_label_data/genome/labels.gz')

# Column layout of the output CSV; every `row` built below follows this order.
header = [
    'sequenceID', "log_lambda",
    'fold_1_total_labels', 'fold_2_total_labels',
    'fold_1_fp_errs', 'fold_1_fn_errs', 'fold_1_tp', 'fold_1_tn',
    'fold_2_fp_errs', 'fold_2_fn_errs', 'fold_2_tp', 'fold_2_tn',
]

# Candidate log10(lambda) values: -5.0, -4.5, ..., 5.0.
# Loop-invariant, so it is built once instead of once per sequence.
log_lambda_grid = [k / 2 for k in range(-10, 11)]

for i in range(len(seqs)):
    # generate data: the sequence plus negative/positive label intervals for fold 1 and fold 2
    (sequence,
     neg_start_1, neg_end_1, pos_start_1, pos_end_1,
     neg_start_2, neg_end_2, pos_start_2, pos_end_2) = get_data(i, seqs=seqs, labels=labels)
    # NOTE(review): the "- 1" presumably discounts an extra element in `sequence` — confirm against get_data.
    sequence_length = len(sequence) - 1

    # calculate lambda: take log(sequence_length) as the raw penalty and snap its
    # log10 to the nearest grid point (ties resolve to the smaller grid value,
    # because min() returns the first minimal element).
    # NOTE(review): for sequence_length <= 1 the logs are -inf/nan and NumPy emits
    # a RuntimeWarning; min() then falls back to the first grid value (-5.0).
    target_log_lambda = np.log10(np.log(sequence_length))
    log_lambda = min(log_lambda_grid, key=lambda x: abs(x - target_log_lambda))
    lda = 10 ** log_lambda

    # get total labels
    fold1_total_labels = len(neg_start_1) + len(pos_start_1)
    fold2_total_labels = len(neg_start_2) + len(pos_start_2)

    # Record the grid value itself rather than np.log10(lda): the 10**x -> log10
    # round trip is not guaranteed to return x exactly, which would make the
    # log_lambda column unreliable as a key.
    row = [seqs[i][0], log_lambda, fold1_total_labels, fold2_total_labels]

    # run the selected lambda and score the result against each fold's labels
    chpnt = opart(lda, sequence)
    err_1 = error_count(chpnt, neg_start_1, neg_end_1, pos_start_1, pos_end_1)
    err_2 = error_count(chpnt, neg_start_2, neg_end_2, pos_start_2, pos_end_2)

    # Entries 0..3 of each result fill the fp_errs, fn_errs, tp, tn columns of `header`.
    row.extend(err_1[k] for k in range(4))
    row.extend(err_2[k] for k in range(4))

    write_to_csv('1.genome_learning_output/BIC_paper.csv', header, row)