In [6]:
import pandas as pd
import numpy as np
import lzma
import gzip

In [7]:
# Name of the dataset to process; it is used to build the input path
# sequence_data/<dataset>/profiles.csv.gz when the profiles are loaded.
dataset = 'cancer'

In [8]:
# Location of the gzip-compressed profile table for the selected dataset.
file_path = f'sequence_data/{dataset}/profiles.csv.gz'

# Open the archive in text mode and let pandas parse the CSV content.
with gzip.open(file_path, mode='rt') as file:
    signal_df = pd.read_csv(file)

# Materialise the groupby as a tuple of (sequenceID, sub-frame) pairs so it
# can be iterated over by later cells.
seqs = tuple(signal_df.groupby(by='sequenceID'))

In [9]:
def compute_stats(arr):
    """Compute descriptive statistics for a given array.

    Parameters
    ----------
    arr : 1-D array-like of numbers
        The values to summarise. May be empty.

    Returns
    -------
    numpy.ndarray
        16 values, in this order: std_deviation, mean, median, variance,
        range_value, iqr, min_value, max_value, abs_skewness, kurtosis,
        count, unique_count, percentile_25, percentile_50, percentile_75,
        autocorr (lag-1).

        For an empty input every statistic is NaN except ``count`` and
        ``unique_count``, which are 0.
    """
    count = len(arr)
    if count == 0:
        # np.min / np.max raise ValueError on a zero-size array, so an empty
        # input is answered explicitly instead of crashing the caller.
        empty_stats = np.full(16, np.nan)
        empty_stats[10] = 0  # count
        empty_stats[11] = 0  # unique_count
        return empty_stats

    std_deviation = np.std(arr)
    mean = np.mean(arr)
    median = np.median(arr)
    variance = np.var(arr)
    min_value = np.min(arr)
    max_value = np.max(arr)
    range_value = max_value - min_value

    # Compute the three percentiles in one call; the IQR reuses them.
    percentile_25, percentile_50, percentile_75 = np.percentile(arr, [25, 50, 75])
    iqr = percentile_75 - percentile_25

    series = pd.Series(arr)
    abs_skewness = abs(series.skew())
    kurtosis = series.kurt()
    unique_count = len(np.unique(arr))

    # Autocorrelation (lag-1). It correlates arr[:-1] with arr[1:], so at
    # least two pairs (three values) are needed; with fewer the coefficient
    # is undefined and np.corrcoef would only produce NaN plus warnings.
    autocorr = np.corrcoef(arr[:-1], arr[1:])[0, 1] if count > 2 else np.nan

    return np.array([
        std_deviation, mean, median, variance, range_value, iqr,
        min_value, max_value, abs_skewness, kurtosis, count, unique_count,
        percentile_25, percentile_50, percentile_75, autocorr
    ])

def apply_transformations(arr):
    """Return abs, log, log-of-log, square and sqrt of ``arr``, concatenated.

    The result is five times as long as ``arr``, with the transformed copies
    laid out one after another in the order listed above.

    Entries that are not strictly positive are replaced by 1e-10 before each
    logarithm, and entries that are not >= 0 are replaced by 0 before the
    square root.
    """
    floor = 1e-10

    # First logarithm: non-positive entries fall back to the floor value.
    logged = np.log(np.where(arr > 0, arr, floor))

    pieces = (
        np.abs(arr),
        logged,
        # Second logarithm, applied to the first one with the same floor rule.
        np.log(np.where(logged > 0, logged, floor)),
        np.square(arr),
        # Square root of the entries that satisfy arr >= 0; the rest become 0.
        np.sqrt(np.where(arr >= 0, arr, 0)),
    )
    return np.concatenate(pieces)

def get_feature(sequence):
    """Build the feature vector for one sequence from its ``logratio`` column.

    Three series are summarised with ``compute_stats``: the raw logratio
    values, the absolute differences between consecutive values, and the
    values minus their mean. The three statistic vectors are concatenated,
    passed through ``apply_transformations`` (with NaNs in the transformed
    part replaced via ``np.nan_to_num``), and the raw statistics followed by
    the transformed ones are returned as a single array.
    """
    signal = sequence['logratio'].to_numpy()

    # Derived series: absolute step between neighbours, and mean-centred values.
    step_sizes = np.abs(signal[1:] - signal[:-1])
    centered = signal - np.mean(signal)

    # Statistics for each series, in the order logratio, diff, residual.
    raw_stats = np.concatenate(
        [compute_stats(series) for series in (signal, step_sizes, centered)]
    )

    # Only the transformed block is cleaned; the raw block is kept as computed.
    derived_stats = np.nan_to_num(apply_transformations(raw_stats))

    return np.concatenate([raw_stats, derived_stats])

def generate_feature_dataframe(seqs, dataset):
    """Create a DataFrame with features for each sequence and save it as CSV.

    Parameters
    ----------
    seqs : iterable of (seq_id, sequence) pairs
        Each ``sequence`` is passed to ``get_feature``; ``seq_id`` becomes the
        value of the ``seqID`` column for that row.
    dataset : str
        Sub-directory name; the table is written to
        ``feature_target_data/<dataset>/features.csv``.

    Returns
    -------
    None
    """
    # Define the base feature names (the order compute_stats returns them in)
    base_features = [
        "std_deviation", "mean", "median", "variance", "range_value", "iqr",
        "min_value", "max_value", "abs_skewness", "kurtosis", "count", "unique_count",
        "percentile_25", "percentile_50", "percentile_75", "autocorr"
    ]

    # Define categories and transformations
    categories = ["logratio", "diff", "residual"]
    transformations = ["abs", "log", "loglog", "square", "sqrt"]

    # Untransformed statistics come first: logratio, then diff, then residual.
    feature_names = []
    for category in categories:
        feature_names += [f"{category}_{feature}" for feature in base_features]

    # The transformed block is laid out transform-major: apply_transformations
    # concatenates abs(all stats), log(all stats), ... where "all stats" is
    # itself ordered logratio, diff, residual. The transformation must
    # therefore be the OUTER loop, otherwise the names do not line up with
    # the values.
    for transform in transformations:
        for category in categories:
            feature_names += [
                f"{category}_{transform}_{feature}" for feature in base_features
            ]

    # Collect IDs and feature vectors separately. Packing them into a single
    # NumPy array (np.append) would force one common dtype on the ID and the
    # features, e.g. turning every feature into a string when the ID is a string.
    seq_ids = []
    feature_rows = []
    for seq_id, sequence in seqs:
        seq_ids.append(seq_id)
        feature_rows.append(get_feature(sequence))

    # Create the DataFrame with the sequence ID as the first column
    df = pd.DataFrame(feature_rows, columns=feature_names)
    df.insert(0, "seqID", seq_ids)

    # Save to CSV
    df.to_csv(f'feature_target_data/{dataset}/features.csv', index=False)

In [10]:
# Write the features under the same dataset name the profiles were loaded
# from, so input and output directories cannot drift apart.
generate_feature_dataframe(seqs, dataset)