In [1]:
import pandas as pd
import numpy as np
import lzma

In [2]:
# Dataset name; used as the sub-directory component of both the input
# profiles path and the output features path built in later cells.
dataset = 'detailed'

In [3]:
# Locate the xz-compressed profiles table for the selected dataset and parse
# it through a text-mode decompressing stream.
file_path = ''.join(('../sequence_data/', dataset, '/profiles.csv.xz'))
with lzma.open(file_path, mode='rt') as file:
    signal_df = pd.read_csv(file)

# Materialise the grouping as an indexable tuple of (sequenceID, sub-frame)
# pairs, one entry per distinct sequenceID.
seqs = tuple(
    (seq_key, seq_frame)
    for seq_key, seq_frame in signal_df.groupby('sequenceID')
)

In [4]:
def get_feature(sequence):
    """Summarise one signal sequence as a fixed-length feature vector.

    Parameters
    ----------
    sequence : pandas.DataFrame
        Rows belonging to a single sequence. Only the ``'signal'`` column is
        read. It must contain at least one row (the min/max statistics are
        undefined for an empty signal and numpy raises ``ValueError``).

    Returns
    -------
    numpy.ndarray
        A 1-D array of 20 values, in this order:
        std_deviation, mean, median, variance, range_value, iqr,
        min_value, max_value, abs_skewness, kurtosis, count, unique_count,
        sum_diff, mean_diff, max_diff, min_diff,
        percentile_25, percentile_50, percentile_75, autocorr.

    Notes
    -----
    For a one-point sequence there are no consecutive differences and no
    lag-1 pairs: ``sum_diff`` is 0.0 and ``mean_diff``, ``max_diff``,
    ``min_diff`` and ``autocorr`` are NaN. Skewness and kurtosis come from
    pandas and are NaN when the sequence is too short for them.
    """
    logratio = sequence['signal'].to_numpy()
    count = len(logratio)

    # Absolute differences between consecutive points
    diff = np.abs(logratio[1:] - logratio[:-1])
    sum_diff = np.sum(diff)  # 0.0 when there are no consecutive pairs
    if diff.size > 0:
        mean_diff = np.mean(diff)
        max_diff = np.max(diff)
        min_diff = np.min(diff)
    else:
        # np.max / np.min raise on an empty array and np.mean warns, so a
        # single-point sequence is reported as NaN, like the autocorrelation.
        mean_diff = max_diff = min_diff = np.nan

    # Descriptive statistics (numpy defaults: population std / variance)
    std_deviation = np.std(logratio)
    mean = np.mean(logratio)
    median = np.median(logratio)
    variance = np.var(logratio)

    # Extremes and spread
    min_value = np.min(logratio)
    max_value = np.max(logratio)
    range_value = max_value - min_value

    # Percentiles, computed in one pass; the 50th equals the median
    percentile_25, percentile_50, percentile_75 = np.percentile(
        logratio, [25, 50, 75]
    )
    iqr = percentile_75 - percentile_25  # Interquartile range

    # Skewness and kurtosis via pandas
    signal_series = pd.Series(logratio)
    abs_skewness = abs(signal_series.skew())
    kurtosis = signal_series.kurt()

    # Number of distinct signal values
    unique_count = len(np.unique(logratio))

    # Autocorrelation (lag-1); needs at least one pair of neighbours
    if count > 1:
        autocorr = np.corrcoef(logratio[:-1], logratio[1:])[0, 1]
    else:
        autocorr = np.nan

    return np.array([
        std_deviation, mean, median, variance, range_value, iqr,
        min_value, max_value, abs_skewness, kurtosis, count, unique_count,
        sum_diff, mean_diff, max_diff, min_diff,
        percentile_25, percentile_50, percentile_75, autocorr
    ])

In [5]:
# Build one row per sequence: the sequence ID followed by its feature values.
# Rows are plain Python lists rather than np.append(...) results, because a
# numpy array has a single dtype: appending features to a string ID would
# turn every feature into a string, and a numeric ID would be cast to float.
array_rows = []
for seq_id, sequence in seqs:
    array_rows.append([seq_id] + get_feature(sequence).tolist())

# Column names, in the same order as the values returned by get_feature,
# preceded by the sequence identifier
column_names = [
    "seqID", "std_deviation", "mean", "median", "variance", "range_value", "iqr",
    "min_value", "max_value", "abs_skewness", "kurtosis", "count", "unique_count",
    "sum_diff", "mean_diff", "max_diff", "min_diff",
    "percentile_25", "percentile_50", "percentile_75", "autocorr"
]

# Create the DataFrame; feature columns are numeric, seqID keeps its own type
df = pd.DataFrame(array_rows, columns=column_names)

In [6]:
# Write the feature table as CSV under the selected dataset's directory,
# leaving out the DataFrame's row index.
df.to_csv(
    path_or_buf='feature_target_data/' + dataset + '/features.csv',
    index=False,
)