In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gzip
from ipynb.fs.full.utility_functions import get_data, get_cumsum, opart

In [2]:
def op(data, **kwargs):
    """ OP (Optimal Partitioning) algorithm for changes in mean.

    Dynamic programme over every prefix ``data[:t]``: the cost of a segment is
    its sum of squared deviations from the segment mean, and ``penalty`` is
    charged for every changepoint. Returns the "last segment start" vector.

    Args:
        data: array-like, 1 dimension (list, tuple, ndarray, Series, ...).
            It is converted to a float array and flattened.
        **kwargs: ``penalty`` (float) is the penalty B to apply. When it is
            omitted, ``2 * log(len(data))`` is used. Other keywords are ignored.

    Returns:
        list: ``CP`` of length ``len(data) + 1``. ``CP[0]`` is the sentinel
        ``-1``; for ``t >= 1``, ``CP[t]`` is the 0-based index where the last
        segment of the cheapest segmentation of ``data[:t]`` starts. An entry
        is ``-1`` when no candidate had a comparable cost (e.g. NaN in the
        data). Empty input returns ``[-1]``.

    Note:
        Segment costs are computed from prefix sums, so results can differ in
        the last floating-point digits from a slice-based mean.
    """

    values = np.asarray(data, dtype=float).ravel()
    n = values.size
    if n == 0:
        # Nothing to segment; also avoids log(0) in the default penalty.
        return [-1]

    B = kwargs['penalty'] if 'penalty' in kwargs else 2 * np.log(n)

    # Prefix sums with a leading 0 so that the sum over data[i:t] is simply
    # prefix[t] - prefix[i], including for i == 0.
    prefix_sum = np.concatenate(([0.0], np.cumsum(values)))
    prefix_sq = np.concatenate(([0.0], np.cumsum(np.square(values))))

    F = [-B]   # F[t]: best cost of data[:t]; -B cancels the first segment's penalty
    CP = [-1]  # CP[t]: start of the last segment in the best split of data[:t]

    # Parse the data
    for pos in range(n):
        total_sum = prefix_sum[pos + 1]
        total_sq = prefix_sq[pos + 1]

        # Reset for every prefix so a previous iteration's winner never leaks.
        best_cost = float("inf")
        best_start = -1

        # Try every possible start i of the last segment data[i:pos+1]
        for i in range(pos + 1):
            length = pos + 1 - i
            seg_sum = total_sum - prefix_sum[i]
            seg_sq = total_sq - prefix_sq[i]

            # Square sum minus N times the square mean (= sum**2 / N)
            C = seg_sq - seg_sum * seg_sum / length

            # Cost test; strict comparison keeps the earliest start on ties
            candidate = F[i] + C + B
            if candidate < best_cost:
                best_cost = candidate
                best_start = i

        # Push the smallest cost and the position that achieved it
        F.append(best_cost)
        CP.append(best_start)

    return CP

def backtracking(CP):
    """ Walk an OP "last segment start" vector backwards and list the segment ends.

    Starting from the last entry of CP, each visited value ``c > 0`` contributes
    ``c - 1`` and the walk continues at ``CP[c]``; it stops at the first value
    that is not greater than 0.

    Args:
     CP: array-like 1 dimension, as returned by op().

    Returns:
        list: the collected ``c - 1`` values, in the reverse of visiting order
        (increasing order for a CP vector produced by op()).
    """

    # The final entry describes the segmentation of the whole data set.
    cursor = CP[len(CP) - 1]

    visited = []
    while True:
        if not cursor > 0:
            # Reached the segment that starts at the first point.
            break
        visited.append(cursor - 1)
        cursor = CP[cursor]

    # Collected from the end towards the beginning, so flip before returning.
    return visited[::-1]

def plot_segments(data, segments, ylim=False):
    """ Plot segments generated by the OP & backtracking algorithms.

    Draws one horizontal line per segment at the segment's mean value, on a
    new 15x5 figure, then shows the figure.

    Args:
        data: 1-dimensional array-like, the data used to fit the model.
        segments: the segments returned by backtracking(); each value is used
            as the inclusive end index of a segment, the last segment runs to
            the end of the data.
        ylim: y-axis limits passed to the axes (e.g. ``(low, high)``, any
            array-like). ``False`` (default) or ``None`` keeps automatic limits.
    """

    # Accept lists as well as arrays: list slices have no .mean()/.sum().
    values = np.asarray(data)

    fig, ax = plt.subplots(figsize=(15, 5))

    # Appending the last index lets the final segment share the loop body
    # instead of duplicating the drawing code after the loop.
    start = 0
    for end in [*segments, len(values) - 1]:
        mean = values[start:end + 1].mean()
        ax.plot((start, end), (mean, mean))
        start = end + 1

    # Identity checks instead of `ylim != False`: comparing an array-like
    # to False is element-wise and cannot be used as a condition.
    if ylim is not False and ylim is not None:
        ax.set_ylim(ylim)
    plt.show()

In [3]:
def gen_data_dict(file_path):
    """ Read a gzip-compressed CSV and split it by its 'sequenceID' column.

    Args:
        file_path: path of the gzip-compressed CSV file to read.

    Returns:
        tuple: the items yielded by iterating the 'sequenceID' groupby, i.e.
        one (sequenceID, DataFrame) pair per group. Despite the function name
        this is a tuple, not a dict.
    """
    # Open in text mode so pandas receives decoded lines.
    with gzip.open(file_path, mode='rt') as handle:
        table = pd.read_csv(handle)

    return tuple(table.groupby('sequenceID'))

# Load the signal and label files; each becomes a tuple of
# (sequenceID, DataFrame) pairs. Both names are reused by later cells.
seqs   = gen_data_dict('sequence_label_data/signals.gz')
labels = gen_data_dict('sequence_label_data/labels.gz')

# Run OP followed by backtracking on the 'logratio' column of the first 10
# sequences, with a fixed penalty of 0.1, and print the resulting segment ends.
for i in range(10):
    # seqs[i] is a (sequenceID, DataFrame) pair; keep only the signal values.
    sequence = seqs[i][1]['logratio'].to_numpy()
    tau_star = op(data=sequence, penalty=0.1)
    segments = backtracking(tau_star)
    print(segments)

[23, 44, 55, 186, 400, 414, 436, 459]
[17, 63, 85, 99, 100, 112, 114, 153]
[8, 9, 32]
[63, 70, 90, 91, 149]
[19, 20, 27, 56, 109, 110]
[46, 71, 87, 96, 99, 100, 104, 116, 117, 154, 189]
[126, 146, 148, 154, 157, 160, 164, 167, 169, 174, 180, 184]
[1, 22, 24, 60, 161, 162]
[138]
[13]


In [4]:
# Run `opart` (imported from utility_functions) on the same first 10 sequences
# with the same 0.1 value as first argument, to compare against the OP cell.
# NOTE(review): in the recorded outputs these vectors look like the OP results
# shifted by one, with a leading 0 and extra trailing entries - presumably a
# different indexing convention and/or label handling; confirm in utility_functions.
for i in range(10):
    # generate data
    # NOTE(review): the eight neg_*/pos_* values are unpacked but not used in
    # this cell; their meaning is defined by get_data - verify there.
    sequence, neg_start_1, neg_end_1, pos_start_1, pos_end_1, neg_start_2, neg_end_2, pos_start_2, pos_end_2 = get_data(i, seqs=seqs, labels=labels)

    # vectors of cumulative sums
    y, z = get_cumsum(sequence)

    chpnt = opart(0.1, sequence, y, z)

    print(chpnt)

[  0  24  45  56 187 401 415 437 460 474]
[  0  18  64  86 100 101 113 115 154 155]
[ 0  9 10 33 79]
[  0  64  71  91  92 150 163]
[  0  20  21  28  57 110 111 118]
[  0  47  72  88  97 100 101 105 117 118 155 190 199]
[  0 127 147 149 155 158 161 165 168 170 175 181 185 188]
[  0   2  23  25  61 162 163 256]
[  0 139 223]
[  0  14 170]
