In [1]:
import gzip
import pandas as pd
import numpy as np

In [2]:
# Location of the gzip-compressed CSV holding the raw signal rows.
file_path = 'sequence_label_data/signals.gz'

# Decompress as text and let pandas parse the CSV from the open stream.
with gzip.open(file_path, 'rt') as signal_stream:
    signal_df = pd.read_csv(signal_stream)

# Materialise the groupby into a tuple of (sequenceID, sub-frame) pairs so the
# groups can be indexed by position in later cells.
groups_by_sequence = signal_df.groupby('sequenceID')
seqs = tuple(groups_by_sequence)

In [3]:
def get_feature(sequence):
    """Summarise the ``logratio`` column of one sequence as a feature vector.

    Parameters
    ----------
    sequence
        Table-like object with a ``'logratio'`` column; in this notebook it is
        one group sub-frame taken from ``signal_df.groupby('sequenceID')``.

    Returns
    -------
    numpy.ndarray
        Five values, in this order: standard deviation, range (max - min),
        absolute skewness, kurtosis, and the count of ``logratio`` values.
    """
    # Look the column up once and reuse it for every statistic.
    logratio = sequence['logratio']

    # Spread of the signal.
    spread = logratio.std()
    span = logratio.max() - logratio.min()

    # Shape of the distribution; only the magnitude of the skew is kept.
    asymmetry = abs(logratio.skew())
    tailedness = logratio.kurt()

    # Number of observations (pandas' count skips missing values).
    n_observations = logratio.count()

    return np.array([spread, span, asymmetry, tailedness, n_observations])


In [4]:
# One row per sequence: its position in `seqs` followed by the get_feature values.
# NOTE(review): the leading value is the positional index, not the group's
# sequenceID key, and np.append merges it into the float feature array, so the
# seqID and count columns end up stored as floats -- confirm this is intended.
array_rows = [
    np.append(int(position), get_feature(group_frame))
    for position, (_key, group_frame) in enumerate(seqs)
]

column_names = [
    "seqID",
    "std_deviation",
    "range_value",
    "abs_skewness",
    "kurtosis",
    "count",
]
df = pd.DataFrame(array_rows, columns=column_names)

In [5]:
# Persist the per-sequence feature table; the DataFrame index is not written.
features_csv_path = 'learning_data/seq_features.csv'
df.to_csv(features_csv_path, index=False)