In [1]:
# IMPORT
import numpy as np
import pandas as pd
import gzip
import csv
import os
from joblib import Parallel, delayed

### DATA

In [2]:
def gen_data_dict(file_path):
    """Read a gzip-compressed CSV and split it by its ``sequenceID`` column.

    Returns a tuple of ``(sequenceID, sub-DataFrame)`` pairs, one per distinct
    ``sequenceID``, in the order ``DataFrame.groupby`` yields them.  Note that,
    despite the function name, the result is a tuple indexed by position.
    """
    with gzip.open(file_path, 'rt') as handle:
        frame = pd.read_csv(handle)

    return tuple((seq_id, chunk) for seq_id, chunk in frame.groupby('sequenceID'))

In [3]:
def get_data(i, seqs, labels):
    """Return the signal and the per-fold label intervals of the i-th sequence.

    Parameters
    ----------
    i : int
        Position of the sequence inside ``seqs`` and ``labels``.
    seqs, labels : indexable of ``(sequenceID, DataFrame)`` pairs
        ``seqs[i][1]`` must have a ``logratio`` column; ``labels[i][1]`` must
        have ``start``, ``end``, ``changes`` and ``fold`` columns.

    Returns
    -------
    tuple of 9 numpy arrays
        ``(sequence,
           neg_start_1, neg_end_1, pos_start_1, pos_end_1,
           neg_start_2, neg_end_2, pos_start_2, pos_end_2)``
        where ``sequence`` is the ``logratio`` column with a single ``0``
        prepended, "neg" rows are those with ``changes == 0``, "pos" rows are
        those with ``changes == 1``, and the suffix is the value of ``fold``.

    Raises
    ------
    ValueError
        If ``seqs[i]`` and ``labels[i]`` carry different sequence IDs.  The two
        containers are paired purely by position, so a mismatch would otherwise
        silently attach the labels of another sequence.
    """
    seq_id, seq_df = seqs[i][0], seqs[i][1]
    lab_id, lab_df = labels[i][0], labels[i][1]
    if seq_id != lab_id:
        raise ValueError(
            f"sequence/label mismatch at position {i}: {seq_id!r} != {lab_id!r}"
        )

    # sequence, with a leading 0 so that data points are addressed from index 1
    sequence = np.append([0], seq_df['logratio'].to_numpy())

    def _fold_intervals(fold):
        """Return (neg_start, neg_end, pos_start, pos_end) for one fold."""
        fold_df = lab_df[lab_df['fold'] == fold]
        neg_df = fold_df[fold_df['changes'] == 0]
        pos_df = fold_df[fold_df['changes'] == 1]
        return (
            neg_df['start'].to_numpy(),
            neg_df['end'].to_numpy(),
            pos_df['start'].to_numpy(),
            pos_df['end'].to_numpy(),
        )

    return (sequence, *_fold_intervals(1), *_fold_intervals(2))

# # test
# seqs   = gen_data_dict('data/signals.gz')
# labels = gen_data_dict('data/labels.gz')
# get_data(95, seqs, labels)

### OPART

In [4]:
# Get cumulative sum vectors
def get_cumsum(sequence):
    """Return zero-prefixed running sums of ``sequence`` and of its squares.

    Both returned vectors have one more element than ``sequence``: element 0 is
    0 and element ``k`` is the sum of the first ``k`` entries (respectively
    squared entries) of ``sequence``.
    """
    zero = [0]
    running_sum = np.cumsum(sequence)
    running_sq = np.cumsum(np.square(sequence))
    return np.concatenate((zero, running_sum)), np.concatenate((zero, running_sq))

# # test
# seqs   = gen_data_dict('data/signals.gz')
# labels = gen_data_dict('data/labels.gz')
# sequence, _, _, _, _, _, _, _, _ = get_data(0, seqs=seqs, labels=labels)
# y, z = get_cumsum(sequence)
# print(y)
# print(z)

In [5]:
# function to create loss value from 'start' to 'end' given cumulative sum vector y (data) and z (square)
def L(start, end, y, z):
    """Segment loss over positions ``start..end`` (inclusive) from prefix sums.

    Computes ``S2 - S1**2 / n`` with ``S1 = y[end+1] - y[start]``,
    ``S2 = z[end+1] - z[start]`` and ``n = end - start + 1``.  With ``y`` and
    ``z`` built by ``get_cumsum`` this is the squared error of the segment
    around its own mean.  ``start`` may be an integer array, in which case one
    loss per start position is returned.
    """
    stop = end + 1
    n_points = stop - start
    seg_sum = y[stop] - y[start]
    seg_sq = z[stop] - z[start]
    return seg_sq - np.square(seg_sum) / n_points

# # test
# seqs   = gen_data_dict('data/signals.gz')
# labels = gen_data_dict('data/labels.gz')
# sequence, _, _, _, _, _, _, _, _ = get_data(0, seqs=seqs, labels=labels)
# y, z = get_cumsum(sequence)
# print(L(1, 2, y, z))
# print(L(2, 2, y, z))

# print(L(np.array([1,2]), 2, y, z))

In [6]:
# function to get the list of changepoint from vector tau_star
def trace_back(tau_star):
    """Recover the changepoint positions encoded in ``tau_star``.

    ``tau_star[t-1]`` holds the last changepoint chosen before position ``t``
    (0 meaning "none").  Starting from the final entry, the chain of
    predecessors is followed until 0 is reached.

    Returns an integer numpy array ``[0, c_1, ..., c_k, len(tau_star)]`` with
    the changepoints in increasing order, always bracketed by 0 and the
    sequence length.

    The chain is collected in a Python list and converted once; prepending
    with ``np.append`` inside the loop would reallocate the whole array on
    every step.
    """
    tau = tau_star[-1]

    # walk backwards: most recent changepoint first
    backwards = []
    while tau > 0:
        backwards.append(tau)
        tau = tau_star[tau - 1]

    backwards.reverse()
    return np.array([0, *backwards, len(tau_star)], dtype=int)

# # test
# print(trace_back(np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
# print(trace_back(np.array([0, 0, 0, 0, 0, 5, 5, 5, 5, 5])))

In [7]:
# opart dynamic algorithm return set of changepoints given lambda, sequence, and cumsum vectors
def opart(lda, sequence, y, z):
    """Optimal partitioning by dynamic programming with penalty ``lda``.

    ``sequence`` is only used for its length; its first element is not counted
    as a data point (``get_data`` prepends a 0 there).  ``y`` and ``z`` are the
    cumulative-sum vectors passed on to ``L``.

    Returns the changepoint array produced by ``trace_back``.
    """
    n_points = len(sequence) - 1

    # best_cost[t]: optimal penalised cost of the first t points.
    # Starting at -lda cancels the penalty added for the very first segment.
    best_cost = np.zeros(n_points + 1)
    best_cost[0] = -lda

    # prev_change[t]: position of the last changepoint before t in that optimum
    prev_change = np.zeros(n_points + 1, dtype=int)

    for end in range(1, n_points + 1):
        # cost of every candidate last changepoint 0 .. end-1
        candidates = best_cost[:end] + lda + L(np.arange(1, end + 1), end, y, z)

        winner = np.argmin(candidates)
        best_cost[end] = candidates[winner]
        prev_change[end] = winner

    return trace_back(prev_change[1:])

# # test
# seqs   = gen_data_dict('sequence_label_data/signals.gz')
# labels = gen_data_dict('sequence_label_data/labels.gz')
# sequence, _, _, _, _, _, _, _, _ = get_data(0, seqs=seqs, labels=labels)
# y, z = get_cumsum(sequence)

# lda = 0
# print(opart(lda, sequence, y, z))

# lda = 1000
# print(opart(lda, sequence, y, z))

In [8]:
# counting errors
def count_items_between(lst, a, b):
    """Count the elements ``x`` of ``lst`` that satisfy ``a <= x < b``."""
    hits = 0
    for value in lst:
        if a <= value < b:
            hits += 1
    return hits

def error_count(chpnt, neg_start, neg_end, pos_start, pos_end):
    """Count label errors of the changepoint set ``chpnt``.

    Each label is the half-open interval ``[start, end)`` as evaluated by
    ``count_items_between``.

    * negative label containing at least one changepoint -> false positive
    * positive label containing more than one changepoint -> false positive
    * positive label containing no changepoint            -> false negative

    Returns ``(false_positives, false_negatives)``.
    """
    false_pos = 0   # initialize false positive counter
    false_neg = 0   # initialize false negative counter

    for lo, hi in zip(neg_start, neg_end):
        if count_items_between(chpnt, lo, hi):
            false_pos += 1

    for lo, hi in zip(pos_start, pos_end):
        n_inside = count_items_between(chpnt, lo, hi)
        if n_inside > 1:
            false_pos += 1
        elif n_inside == 0:
            false_neg += 1

    return false_pos, false_neg


# # test
# neg_start = [2, 7]      # there is no changepoint at point 2, 7, and 8
# neg_end   = [3, 9]      #
# pos_start = [4]         # there can be exactly one changepoint at 4 or 5
# pos_end   = [6]         #

# chpnt = [0, 3, 5, 6, 9, 10]
# print(error_count(chpnt, neg_start, neg_end, pos_start, pos_end))

# chpnt = [0, 1, 2, 6, 8, 9, 10]
# print(error_count(chpnt, neg_start, neg_end, pos_start, pos_end))

In [9]:
def write_to_csv(filename, header, row):
    """Append ``row`` to the CSV file ``filename``, adding ``header`` when needed.

    Parameters
    ----------
    filename : str
        Path of the CSV file.  Missing parent directories are created.
    header : sequence
        Column names; written only when the file does not exist yet or is
        empty, so repeated calls yield exactly one header line.
    row : sequence
        Values of the data row to append.
    """
    # Make sure the target directory exists; open(..., 'a') creates the file
    # but not its parent directories.
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # A header is needed for a brand-new file and for an existing empty one.
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0

    # Open in append mode so earlier rows are preserved
    with open(filename, 'a', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)

        if needs_header:
            csv_writer.writerow(header)

        csv_writer.writerow(row)

In [10]:
def try_lambda(i):
    """Run ``opart`` with the i-th penalty of ``ldas`` and score both folds.

    Reads the module-level names ``ldas``, ``sequence``, ``y``, ``z`` and the
    eight ``neg_*`` / ``pos_*`` label arrays, which must be assigned before
    this is called.

    Returns ``(errors_fold_1, errors_fold_2)``, each being the
    ``(false_positives, false_negatives)`` pair from ``error_count``.
    """
    changepoints = opart(ldas[i], sequence, y, z)
    fold_1_errors = error_count(changepoints, neg_start_1, neg_end_1, pos_start_1, pos_end_1)
    fold_2_errors = error_count(changepoints, neg_start_2, neg_end_2, pos_start_2, pos_end_2)
    return fold_1_errors, fold_2_errors
    
if __name__ == "__main__":
    # NOTE: try_lambda reads ldas, sequence, y, z and the eight label arrays as
    # module-level globals, so the names assigned below must stay as they are.

    # get sequences and labels
    seqs   = gen_data_dict('sequence_label_data/signals.gz')
    labels = gen_data_dict('sequence_label_data/labels.gz')

    # lambda candidates: 10**e for e = -5, -4.5, ..., 4.5, 5
    exponents = [k*0.5 for k in range(-10, 11)]
    ldas = [10**e for e in exponents]

    # One CSV column per candidate, named by its exponent ('-5', '-4.5', ..., '5').
    # Derived from the grid so header and candidates cannot drift apart, and
    # built once instead of on every iteration.
    header = ['sequenceID'] + [format(e, 'g') for e in exponents]

    for i in range(len(seqs)):
        # generate data
        sequence, neg_start_1, neg_end_1, pos_start_1, pos_end_1, neg_start_2, neg_end_2, pos_start_2, pos_end_2 = get_data(i, seqs=seqs, labels=labels)

        # vectors of cumulative sums
        y, z = get_cumsum(sequence)

        # Run every lambda candidate in parallel.  The task index gets its own
        # name (not ``i``) and its range follows len(ldas) rather than a literal.
        errs = Parallel(n_jobs=4)(delayed(try_lambda)(k) for k in range(len(ldas)))

        # record one row per sequence: its position followed by one
        # (fold-1 errors, fold-2 errors) entry per candidate
        row = [i]
        row.extend(errs)
        write_to_csv('learning_data/errors.csv', header, row)