In [1]:
import numpy as np
import pandas as pd
import gzip

In [2]:
def gen_data_dict(file_path):
    """Load an xz-compressed CSV and split it into per-sequence tables.

    Parameters
    ----------
    file_path : str or path-like
        Location of the xz-compressed CSV. It must contain a
        'sequenceID' column.

    Returns
    -------
    tuple
        One ``(sequenceID, DataFrame)`` pair per distinct sequenceID, as
        produced by iterating over ``groupby('sequenceID')``. Despite the
        function's name this is a tuple, not a dict. Every sub-frame carries
        an extra 'index' column numbering its rows 1, 2, 3, ... in file
        order within that sequence.
    """
    raw = pd.read_csv(file_path, compression='xz')

    # 1-based running row number inside each sequence.
    running_number = raw.groupby('sequenceID').cumcount() + 1
    numbered = raw.assign(index=running_number)

    return tuple(numbered.groupby('sequenceID'))

# # test
# seqs   = gen_data_dict('0_sequences_labels/profiles.csv.xz')
# labels = gen_data_dict('0_sequences_labels/labels.csv.xz')

In [3]:
def find_closest_index(df, arr_pos):
    """Map each query position to the 'index' value of the nearest row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric 'position' column and an 'index' column.
    arr_pos : iterable of numbers
        Query positions to look up.

    Returns
    -------
    list
        One entry per query position: the 'index' value of the row whose
        'position' is nearest to the query. When several rows are equally
        near, the one that comes first after sorting by 'position' wins
        (i.e. the lower position). The entry is ``None`` when no row lies at
        a finite distance (``df`` is empty, or the distance is NaN/inf).
    """
    # Sort by 'position' so that ties resolve towards the lower position.
    df_sorted = df.sort_values(by='position')
    positions = df_sorted['position'].to_numpy()

    # Read the 'index' column on its own: row-wise iteration (iterrows)
    # up-casts every row to one common dtype, which can turn integer indices
    # into floats. tolist() returns native Python scalars of the column dtype.
    index_values = df_sorted['index'].tolist()

    if len(index_values) == 0:
        return [None for _ in arr_pos]

    closest_indices = []
    for position in arr_pos:
        distances = np.abs(positions - position)
        # A NaN distance can never be "the closest"; push it to infinity so
        # that argmin ignores it unless nothing else is available.
        distances = np.where(np.isnan(distances), np.inf, distances)
        # argmin returns the first minimum, i.e. the earliest row in sorted
        # order among equally distant rows.
        best = int(np.argmin(distances))
        if distances[best] < np.inf:
            closest_indices.append(index_values[best])
        else:
            closest_indices.append(None)

    return closest_indices

# # test
# pos_start = labels[719][1][labels[719][1]['max.changes'] == 1.0]['labelStart'].to_numpy()
# find_closest_index(seqs[719][1], pos_start)

In [4]:
def get_data(i, seqs, labels):
    """Collect the signal and the label boundaries of the i-th sequence.

    Parameters
    ----------
    i : int
        Position of the sequence inside ``seqs`` and ``labels``.
    seqs, labels : sequences of ``(key, DataFrame)`` pairs
        Element ``[i][1]`` of ``seqs`` must provide a 'signal' column plus
        whatever ``find_closest_index`` needs; element ``[i][1]`` of
        ``labels`` must provide 'max.changes', 'labelStart' and 'labelEnd'.
        NOTE(review): both are indexed with the same ``i``, so they are
        presumably aligned on the same sequence order; confirm with caller.

    Returns
    -------
    tuple
        ``(sequence, neg_start, neg_end, pos_start, pos_end)`` where
        ``sequence`` is the signal with a single 0 prepended, and the other
        four are the results of ``find_closest_index`` for the start/end
        columns of the labels with ``max.changes == 0.0`` ("neg") and
        ``max.changes == 1.0`` ("pos").
    """
    profile = seqs[i][1]
    lab_df = labels[i][1]

    # Signal with a leading 0 so that the data occupy positions 1..n.
    sequence = np.append([0], profile['signal'].to_numpy())

    def boundaries(n_changes, column):
        # Pick the labels with the requested 'max.changes' value and convert
        # the chosen boundary column into indices of the profile.
        selected = lab_df[lab_df['max.changes'] == n_changes]
        return find_closest_index(profile, selected[column].to_numpy())

    neg_start = boundaries(0.0, 'labelStart')
    neg_end = boundaries(0.0, 'labelEnd')
    pos_start = boundaries(1.0, 'labelStart')
    pos_end = boundaries(1.0, 'labelEnd')

    return sequence, neg_start, neg_end, pos_start, pos_end

# # test
# get_data(719, seqs, labels)

In [5]:
def get_cumsum(sequence):
    """Return zero-prefixed cumulative sums of the values and their squares.

    Parameters
    ----------
    sequence : array-like of numbers

    Returns
    -------
    (y, z) : tuple of numpy.ndarray
        ``y[k]`` is the sum of the first ``k`` elements of ``sequence`` and
        ``z[k]`` the sum of their squares, so ``y[0] == z[0] == 0`` and both
        arrays are one element longer than ``sequence``.
    """
    running_sum = np.cumsum(sequence)
    running_sum_sq = np.cumsum(np.square(sequence))

    # The leading 0 lets a range sum be written as a plain difference.
    y = np.concatenate(([0], running_sum))
    z = np.concatenate(([0], running_sum_sq))
    return y, z

In [6]:
def L(start, end, y, z):
    """Squared-error cost of the segment ``start..end`` (both inclusive).

    ``y`` and ``z`` are zero-prefixed cumulative sums of the data and of the
    squared data (see ``get_cumsum``). The value returned is
    ``sum(x**2) - sum(x)**2 / n`` over the segment, i.e. the sum of squared
    deviations from the segment mean. ``start`` may be an integer array, in
    which case one cost per start is returned.
    """
    n_points = end - start + 1
    segment_sum = y[end + 1] - y[start]
    segment_sum_sq = z[end + 1] - z[start]
    mean_term = np.square(segment_sum) / n_points
    return segment_sum_sq - mean_term

In [7]:
def trace_back(tau_star):
    """Rebuild the list of changepoints from the last-changepoint table.

    Parameters
    ----------
    tau_star : integer array
        ``tau_star[t - 1]`` holds the changepoint that precedes position
        ``t`` (1-based); 0 means "no earlier changepoint".

    Returns
    -------
    numpy.ndarray of int
        Ascending positions starting with 0 and ending with
        ``len(tau_star)``, with the recovered changepoints in between.
    """
    # Follow the chain backwards from the end of the sequence, collecting
    # the positions in descending order.
    descending = [len(tau_star)]
    tau = tau_star[-1]
    while tau > 0:
        descending.append(tau)
        tau = tau_star[tau - 1]
    descending.append(0)

    return np.array(descending[::-1], dtype=int)

In [8]:
def error_count(chpnt, neg_start, neg_end, pos_start, pos_end):
    """Count label errors made by a set of changepoints.

    Parameters
    ----------
    chpnt : sequence of numbers
        Changepoints; the first element is skipped before counting.
    neg_start, neg_end : sequences of numbers
        Paired bounds of the regions in which no changepoint is allowed.
    pos_start, pos_end : sequences of numbers
        Paired bounds of the regions in which exactly one changepoint is
        expected.

    Returns
    -------
    (fp, fn) : tuple of int
        ``fp`` counts "neg" regions holding at least one changepoint plus
        "pos" regions holding two or more; ``fn`` counts "pos" regions
        holding none. A changepoint ``cp`` is inside a region when
        ``start <= cp < end``.
    """
    detected = chpnt[1:]

    def hits_in(lo, hi):
        # Number of changepoints inside the half-open interval [lo, hi).
        return sum(1 for cp in detected if lo <= cp < hi)

    # Any changepoint inside a "neg" region is a false positive.
    fp = sum(1 for lo, hi in zip(neg_start, neg_end) if hits_in(lo, hi) >= 1)
    fn = 0

    for lo, hi in zip(pos_start, pos_end):
        hits = hits_in(lo, hi)
        if hits == 0:
            fn += 1
        elif hits >= 2:
            fp += 1

    return fp, fn

In [9]:
def opart(lda, sequence):
    """Segment ``sequence`` by penalised least squares via dynamic programming.

    Parameters
    ----------
    lda : float
        Penalty added for every segment.
    sequence : array-like of numbers
        The data with one leading padding element: the number of data points
        is taken to be ``len(sequence) - 1`` (``get_data`` prepends such a 0).

    Returns
    -------
    numpy.ndarray of int
        Output of ``trace_back``: 0, the selected changepoints, and the
        number of data points as the final entry.
    """
    y, z = get_cumsum(sequence)
    n_points = len(sequence) - 1

    # best_cost[t]: minimal penalised cost of the first t points. Starting at
    # -lda cancels the penalty charged for the very first segment.
    best_cost = np.zeros(n_points + 1)
    best_cost[0] = -lda

    # last_change[t]: position of the changepoint preceding t in the optimum.
    last_change = np.zeros(n_points + 1, dtype=int)

    for t in range(1, n_points + 1):
        # Candidate k means: the final segment covers k+1 .. t.
        segment_starts = np.arange(1, t + 1)
        candidates = best_cost[:t] + lda + L(segment_starts, t, y, z)

        winner = np.argmin(candidates)
        best_cost[t] = candidates[winner]
        last_change[t] = winner

    return trace_back(last_change[1:])

In [13]:
# # Test length
# seqs   = gen_data_dict('0_sequences_labels/profiles.csv.xz')
# lengths = []
# for i in range(len(seqs)):
#     lengths.append(len(seqs[i][1]))

# print(len(lengths))
# print(sorted(lengths))

3730
[25, 31, 32, 37, 37, 39, 39, 40, 40, 42, 43, 44, 44, 44, 44, 45, 45, 46, 49, 49, 51, 51, 51, 52, 53, 53, 53, 54, 55, 57, 57, 58, 58, 58, 58, 59, 59, 59, 59, 59, 60, 60, 60, 60, 61, 62, 63, 64, 65, 66, 67, 67, 67, 67, 68, 68, 69, 69, 69, 69, 69, 69, 69, 71, 72, 72, 72, 73, 73, 74, 75, 76, 76, 77, 77, 77, 78, 78, 78, 78, 79, 79, 80, 81, 81, 81, 81, 82, 82, 82, 82, 83, 83, 83, 83, 84, 84, 84, 84, 84, 85, 85, 85, 86, 86, 86, 86, 86, 86, 86, 87, 87, 87, 87, 87, 88, 89, 89, 89, 89, 89, 90, 90, 90, 91, 91, 91, 91, 92, 92, 93, 93, 93, 93, 94, 94, 94, 94, 94, 94, 95, 95, 95, 95, 95, 96, 96, 96, 97, 97, 97, 98, 98, 99, 99, 99, 99, 99, 99, 100, 100, 100, 100, 101, 101, 102, 102, 102, 102, 102, 102, 102, 103, 103, 103, 104, 104, 104, 105, 105, 105, 105, 105, 105, 106, 106, 106, 106, 106, 106, 107, 107, 107, 107, 108, 108, 108, 109, 109, 110, 110, 110, 110, 110, 110, 110, 111, 111, 111, 111, 112, 112, 113, 113, 113, 113, 114, 114, 114, 114, 114, 114, 114, 115, 115, 115, 115, 115, 115, 115, 115

In [10]:
# # Test
# i = 719
# seqs   = gen_data_dict('0_sequences_labels/profiles.csv.xz')
# labels = gen_data_dict('0_sequences_labels/labels.csv.xz')
# sequence, neg_start, neg_end, pos_start, pos_end = get_data(i, seqs, labels)
# lda = 1.5
# chpnt = opart(lda, sequence)
# errors = error_count(chpnt, neg_start, neg_end, pos_start, pos_end)

# print("neg_start:\t", neg_start)
# print("neg_end:\t",   neg_end)
# print("pos_start:\t", pos_start)
# print("pos_end:\t",   pos_end)
# print("changepoint:\t", chpnt)
# print("errors (fp, fn):\t", errors)

neg_start:	 [1]
neg_end:	 [335]
pos_start:	 [418, 453]
pos_end:	 [448, 469]
changepoint:	 [  0 437 460 474]
errors (fp, fn):	 (0, 0)
