In [1]:
import gzip
import pandas as pd
import numpy as np

In [2]:
# Read the gzip-compressed CSV (opened in text mode) into a single DataFrame.
file_path = 'data/signals.gz'
with gzip.open(file_path, mode='rt') as file:
    signal_df = pd.read_csv(file)

# Materialise the groupby as a tuple of (sequenceID, sub-frame) pairs so that
# individual sequences can be addressed by position later on.
seqs = tuple(pair for pair in signal_df.groupby('sequenceID'))

In [3]:
def get_feature(sequence, column='logratio', count_scale=1000):
    """Summarise one column of a sequence as a fixed-length feature vector.

    Parameters
    ----------
    sequence : pandas.DataFrame
        Rows belonging to one sequence; must contain ``column``.
    column : str, default 'logratio'
        Name of the numeric column to summarise.
    count_scale : int or float, default 1000
        Divisor applied to the non-null observation count before it is
        placed in the feature vector.

    Returns
    -------
    numpy.ndarray
        Nine values, in this order: mean, median, standard deviation,
        25th percentile, 75th percentile, range (max - min), skewness,
        kurtosis, and ``count / count_scale``.

    Notes
    -----
    The statistics are computed by pandas and returned unchanged, so
    values pandas cannot compute come back as NaN (e.g. the sample standard
    deviation of a single observation, or skewness/kurtosis of very short
    sequences). Callers that cannot tolerate NaN must handle it themselves.
    """
    # Look the column up once instead of once per statistic.
    values = sequence[column]

    # Central tendency and dispersion
    mean_value = values.mean()
    median_value = values.median()
    std_deviation = values.std()

    # Quartiles
    percentile_25 = values.quantile(0.25)
    percentile_75 = values.quantile(0.75)

    # Distance between the extremes
    range_value = values.max() - values.min()

    # Shape of the distribution
    skewness = values.skew()
    kurtosis = values.kurt()

    # Number of non-null observations, rescaled by count_scale
    scaled_count = values.count() / count_scale

    return np.array([
        mean_value,
        median_value,
        std_deviation,
        percentile_25,
        percentile_75,
        range_value,
        skewness,
        kurtosis,
        scaled_count,
    ])


In [4]:
# Build one feature row per sequence: the positional index followed by the
# nine statistics returned by get_feature.
# NOTE(review): "seqID" is the position of the group within `seqs`, not the
# original sequenceID label — confirm that downstream consumers expect this.
array_rows = []
for i, (_seq_key, sequence) in enumerate(seqs):
    array_rows.append(np.append(int(i), get_feature(sequence)))

column_names = ["seqID", "mean_value", "median_value", "std_deviation", "percentile_25", "percentile_75", "range_value", "skewness", "kurtosis", "count"]

# np.append flattens everything into one float array, so the integer index is
# stored as a float (0.0, 1.0, ...). Cast it back so the identifier column is
# an integer in the frame and in any file written from it.
df = pd.DataFrame(array_rows, columns=column_names).astype({"seqID": int})

In [6]:
# Write the per-sequence feature table to disk, omitting the DataFrame index.
df.to_csv(path_or_buf='data/seq_stat.csv', index=False)