In [1]:
import gzip
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Load the gzip-compressed signals CSV: the archive is opened in text mode
# and the resulting stream is parsed by pandas.
file_path = 'sequence_label_data/signals.gz'
with gzip.open(file_path, mode='rt') as gz_handle:
    signal_df = pd.read_csv(gz_handle)

# Materialise the groupby as a tuple of (sequenceID, sub-frame) pairs so the
# per-sequence frames can be addressed by position later on.
seqs = tuple(signal_df.groupby(by='sequenceID'))

In [3]:
def get_feature(sequence, n=10, step=0.1):
    """Summarise the ``logratio`` column of one sequence as a flat feature vector.

    Parameters
    ----------
    sequence : pandas.DataFrame
        Any object whose ``['logratio']`` item is a numeric pandas Series.
    n : int, default 10
        Number of jump-size thresholds to count against.
    step : float, default 0.1
        Spacing of the thresholds; threshold ``k`` (1-based) is ``k * step``.

    Returns
    -------
    numpy.ndarray
        Float array of length ``7 + n`` laid out as
        ``[std, mean, range, |skewness|, kurtosis, count, sum_abs_diff,
        n_jumps > 1*step, ..., n_jumps > n*step]``.
        The statistics are whatever ``Series.std``/``mean``/``skew``/``kurt``
        return, so they are NaN when the sequence is too short for them.
    """
    logratio = sequence['logratio']

    # Absolute jump between consecutive points; empty for sequences of length <= 1.
    abs_diff = np.abs(np.diff(logratio.to_numpy()))
    sum_diff = np.sum(abs_diff)

    # For each threshold k*step (k = 1..n), count the jumps strictly above it.
    # Broadcasting a (len, 1) column against the (n,) thresholds replaces the
    # explicit per-threshold loop and yields zeros when there are no jumps.
    thresholds = np.arange(1, n + 1) * step
    feature_diff = (abs_diff[:, np.newaxis] > thresholds).sum(axis=0)

    # Descriptive statistics
    std_deviation = logratio.std()
    mean = logratio.mean()
    range_value = logratio.max() - logratio.min()

    # Shape of the distribution; only the magnitude of the skewness is kept.
    abs_skewness = abs(logratio.skew())
    kurtosis = logratio.kurt()

    # Number of non-missing observations in the sequence.
    count = logratio.count()

    summary = np.array([std_deviation, mean, range_value, abs_skewness, kurtosis, count, sum_diff])
    return np.concatenate([summary, feature_diff])

In [4]:
# Build one feature row per sequence: the positional index in `seqs`
# followed by the vector returned by get_feature.
# NOTE(review): seqID is the position in `seqs`, not the `sequenceID` group
# key (seqs[i][0]) — confirm downstream consumers expect the positional id.
array_rows = []
for i, (_, sequence) in enumerate(seqs):
    array_rows.append(np.append(i, get_feature(sequence)))

column_names = ["seqID", "std_deviation", "mean", "range_value", "abs_skewness", "kurtosis", "count"]
column_diff_name = ["sum_diff", "0.1", "0.2", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9", "1.0"]

# np.append promotes the integer id to float64 along with the features, so
# restore an integer dtype for seqID; otherwise it is exported as 0.0, 1.0, ...
df = pd.DataFrame(array_rows, columns=column_names+column_diff_name).astype({"seqID": "int64"})

In [5]:
# Write the per-sequence feature table as CSV, without the DataFrame index.
# The output directory is created first because to_csv does not create
# missing parent directories and would raise OSError on a fresh checkout.
output_path = Path('learning_data/seq_features.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)