In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Define input and output directory paths.
# Both are relative paths, so they resolve against the kernel's current working directory.
# The traversal cell below walks input_dir as
#   <input_dir>/<dataset>/<compress_type>/<compress_size>/profiles.csv.xz
# and writes each processed file to the same relative sub-path under output_dir.
input_dir = "compress_epigenomic_sequences"
output_dir = "processed_chipseq_sequences"

In [3]:
# Function to process a CSV file as per requirements
def process_csv(file_path: str, output_path: str) -> None:
    """Log-transform and standardise the ``signal`` column of one xz-compressed CSV.

    The input is read with xz decompression, ``signal`` is mapped through
    ``log(x + 1)`` and then z-scored (mean 0, sample standard deviation 1).
    Only ``sequenceID`` and the transformed values (written back under the
    column name ``signal``) are saved, again as an xz-compressed CSV.

    Parameters
    ----------
    file_path : str
        Path of the xz-compressed CSV to read. It must contain the columns
        ``sequenceID`` and ``signal``.
    output_path : str
        Path of the xz-compressed CSV to write. Missing parent directories
        are created; a bare file name (no directory part) is also accepted.

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist.
    KeyError
        If ``sequenceID`` or ``signal`` is missing from the input.
    """
    # Read the compressed CSV file
    df = pd.read_csv(file_path, compression='xz')

    # log(x + 1); log1p keeps precision for signals close to zero
    log_signal = np.log1p(df['signal'])

    # Normalize to mean 0, standard deviation 1 (pandas' default sample std, ddof=1)
    center = log_signal.mean()
    spread = log_signal.std()
    if pd.notna(spread) and spread > 0:
        normalized_signal = (log_signal - center) / spread
    else:
        # Constant signal or a single row: the scale is zero/undefined, so dividing
        # would turn every value into NaN. Only centre the values in that case.
        normalized_signal = log_signal - center

    # Keep only 'sequenceID' and the normalized values, exposed as 'signal'
    processed_df = pd.DataFrame({
        'sequenceID': df['sequenceID'],
        'signal': normalized_signal,
    })

    # Create the output directory structure; dirname is '' for a bare file name,
    # and os.makedirs('') would raise, so only create it when there is one.
    output_parent = os.path.dirname(output_path)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)
    processed_df.to_csv(output_path, index=False, compression='xz')

In [4]:
# Traverse the directory structure and process each file.
# Expected layout: <input_dir>/<dataset>/<compress_type>/<compress_size>/profiles.csv.xz
# Each processed file is written to the same relative sub-path under output_dir.
PROFILE_FILENAME = 'profiles.csv.xz'

# os.listdir() returns entries in arbitrary order; sorted() makes runs reproducible.
for dataset in sorted(os.listdir(input_dir)):
    dataset_path = os.path.join(input_dir, dataset)
    if not os.path.isdir(dataset_path):
        continue

    for compress_type in sorted(os.listdir(dataset_path)):
        compress_type_path = os.path.join(dataset_path, compress_type)
        if not os.path.isdir(compress_type_path):
            continue

        for compress_size in sorted(os.listdir(compress_type_path)):
            compress_size_path = os.path.join(compress_type_path, compress_size)
            if not os.path.isdir(compress_size_path):
                continue

            file_path = os.path.join(compress_size_path, PROFILE_FILENAME)
            if not os.path.isfile(file_path):
                # A directory without a profiles file (e.g. a stray
                # .ipynb_checkpoints folder) should not abort the whole batch.
                print(f"Skipping {compress_size_path}: no {PROFILE_FILENAME} found")
                continue

            # Define the corresponding output path
            output_path = os.path.join(
                output_dir, dataset, compress_type, compress_size, PROFILE_FILENAME
            )

            # Process the file
            process_csv(file_path, output_path)
