In [1]:
import gzip
import pandas as pd
import numpy as np

In [2]:
# Open the gzip-compressed signal table in text mode and parse it as CSV.
file_path = 'data/signals.gz'
with gzip.open(file_path, mode='rt') as handle:
    signal_df = pd.read_csv(handle)

# Materialise the groupby iterator as a tuple of (sequenceID, sub-frame) pairs
# so that individual sequences can later be addressed by position.
seqs = tuple(signal_df.groupby('sequenceID'))

In [3]:
def get_feature(sequence, column='logratio'):
    """Summarise one signal sequence as a fixed-length feature vector.

    Parameters
    ----------
    sequence : pandas.DataFrame
        Rows belonging to a single sequence; must contain ``column``.
    column : str, default 'logratio'
        Name of the numeric column to summarise.

    Returns
    -------
    numpy.ndarray
        Nine values, in this order: mean, median, standard deviation,
        25th percentile, 75th percentile, range (max - min), skewness,
        kurtosis, count. All statistics use the pandas defaults of the
        corresponding ``Series`` methods.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``sequence``.
    """
    # Look the column up once instead of once per statistic.
    values = sequence[column]

    # Central tendency and spread
    mean_value = values.mean()
    median_value = values.median()
    std_deviation = values.std()

    # Quartiles
    percentile_25 = values.quantile(0.25)
    percentile_75 = values.quantile(0.75)

    # Only the range is reported; min and max themselves are not returned.
    range_value = values.max() - values.min()

    # Shape of the distribution
    skewness = values.skew()
    kurtosis = values.kurt()

    # Number of observations as reported by Series.count()
    count = values.count()

    return np.array([
        mean_value,
        median_value,
        std_deviation,
        percentile_25,
        percentile_75,
        range_value,
        skewness,
        kurtosis,
        count,
    ])


In [4]:
# Column layout: the sequence's position in `seqs`, followed by the features
# in the order returned by get_feature.
column_names = ["seqID", "mean_value", "median_value", "std_deviation", "percentile_25", "percentile_75", "range_value", "skewness", "kurtosis", "count"]

# NOTE(review): "seqID" is the positional index within `seqs`, not the
# 'sequenceID' group key, and np.append yields a float array, so it is stored
# as a float -- confirm that downstream consumers expect this.
array_rows = [
    np.append(int(position), get_feature(group_frame))
    for position, (group_key, group_frame) in enumerate(seqs)
]
df = pd.DataFrame(array_rows, columns=column_names)

In [5]:
# Display the per-sequence feature table; the notebook renders a truncated preview.
df

Unnamed: 0,seqID,mean_value,median_value,std_deviation,percentile_25,percentile_75,range_value,skewness,kurtosis,count
0,0.0,0.313451,0.336855,0.183431,0.259423,0.412239,1.354229,-2.207026,7.508871,474.0
1,1.0,0.092563,0.162210,0.263228,-0.174622,0.319617,0.931739,-0.148131,-1.487185,155.0
2,2.0,-0.119023,-0.139236,0.186725,-0.254118,-0.039489,1.178472,1.731410,5.540867,79.0
3,3.0,-0.080282,-0.106249,0.212940,-0.259291,0.094906,0.966269,0.287526,-0.943570,163.0
4,4.0,-0.120553,-0.053645,0.229760,-0.213007,0.018278,1.163706,-0.607609,-0.106406,118.0
...,...,...,...,...,...,...,...,...,...,...
408,408.0,0.181327,0.016000,0.304551,-0.025000,0.354250,1.455000,1.217585,0.330306,188.0
409,409.0,-0.116354,-0.012000,0.206474,-0.284000,0.032000,0.885000,-0.926986,-0.499512,257.0
410,410.0,-0.111544,-0.032000,0.172608,-0.214125,0.008000,0.781000,-1.210235,0.406716,238.0
411,411.0,-0.075912,-0.074760,0.064818,-0.115597,-0.032463,0.434886,-0.456224,0.915028,234.0


In [6]:
# Persist the feature table as CSV, omitting the DataFrame index column.
df.to_csv('data/seq_stat.csv', index=False)