In [7]:
# # get functions from OPART.ipynb
# %run OPART.ipynb

In [8]:
import numpy as np
import pandas as pd
from ipynb.fs.full.utility_functions import gen_data_dict, get_data, get_cumsum, error_count, write_to_csv, opart

In [9]:
# Load the signal sequences and their labels from the genome data set.
seqs   = gen_data_dict('../sequence_label_data/genome/signals.gz')
labels = gen_data_dict('../sequence_label_data/genome/labels.gz')

# Column layout of BIC_1.csv; every row built below follows this exact order.
header = ['sequenceID', "log_lambda", 'fold_1_total_labels', 'fold_2_total_labels', 'fold_1_fp_errs', 'fold_1_fn_errs', 'fold_1_tp', 'fold_1_tn', 'fold_2_fp_errs', 'fold_2_fn_errs', 'fold_2_tp', 'fold_2_tn',]

# NOTE(review): write_to_csv is called once per sequence; if it appends to an
# existing file, re-running this cell would duplicate rows in BIC_1.csv --
# confirm against utility_functions before re-running.
for i in range(len(seqs)):
    # Sequence values plus the negative/positive label regions of both folds.
    sequence, neg_start_1, neg_end_1, pos_start_1, pos_end_1, neg_start_2, neg_end_2, pos_start_2, pos_end_2 = get_data(i, seqs=seqs, labels=labels)

    # lambda = log(n), with n taken as len(sequence) - 1.
    # NOTE(review): the "- 1" presumably discounts a leading padding element
    # returned by get_data -- confirm.
    sequence_length = len(sequence) - 1
    lda = np.log(sequence_length)

    # Number of labels per fold (negative + positive regions).
    fold1_total_labels = len(neg_start_1) + len(pos_start_1)
    fold2_total_labels = len(neg_start_2) + len(pos_start_2)

    # Row prefix: sequence id, log10 of lambda, label counts of both folds.
    row = [seqs[i][0], np.log10(lda), fold1_total_labels, fold2_total_labels]

    # Change points found with this lambda, scored against each fold's labels.
    chpnt = opart(lda, sequence)
    err_1 = error_count(chpnt, neg_start_1, neg_end_1, pos_start_1, pos_end_1)
    err_2 = error_count(chpnt, neg_start_2, neg_end_2, pos_start_2, pos_end_2)

    # The first four entries of each result fill the fp, fn, tp, tn columns
    # of the matching fold (fold 1 first, then fold 2), as laid out in header.
    for err in (err_1, err_2):
        row.extend(err[k] for k in range(4))

    write_to_csv('BIC_1.csv', header, row)

In [10]:
# Per-sequence error counts written by the BIC run.
BIC_df    = pd.read_csv('BIC_1.csv')
rate1_list = []
rate2_list = []

# Each result table is paired with its display name explicitly, so adding
# another method is a one-line change and the label can never be left unbound
# or carried over from a previous iteration.
for method, df in [("BIC", BIC_df)]:
    # Total errors per fold = false positives + false negatives over all sequences.
    fold1_total_errs = df['fold_1_fp_errs'].sum() + df['fold_1_fn_errs'].sum()
    fold2_total_errs = df['fold_2_fp_errs'].sum() + df['fold_2_fn_errs'].sum()

    fold1_total_labels = df['fold_1_total_labels'].sum()
    fold2_total_labels = df['fold_2_total_labels'].sum()

    # Fraction of labels predicted without error (an accuracy, not an error rate).
    rate1 = (fold1_total_labels - fold1_total_errs)/fold1_total_labels
    rate2 = (fold2_total_labels - fold2_total_errs)/fold2_total_labels

    # Keep the rounded percentages for later use.
    rate1_list.append(round(100*rate1, 2))
    rate2_list.append(round(100*rate2, 2))

    print("method: %4s \t fold1.test: %5.2f \t fold2.test: %5.2f \t total_label_fold1: %3d \t total_label_fold2: %3d \t total_test_error_fold1: %3d \t total_test_error_fold2: %3d"
          % (method, 100*rate1, 100*rate2, fold1_total_labels, fold2_total_labels, fold1_total_errs, fold2_total_errs))

method:  BIC 	 fold1.test: 70.35 	 fold2.test: 50.00 	 total_label_fold1: 752 	 total_label_fold2: 520 	 total_test_error_fold1: 223 	 total_test_error_fold2: 260
