In [1]:
import os
import csv
import gzip
import pandas as pd
import numpy as np

In [2]:
def list_files(folder_path):
    """Return the names of the entries directly inside ``folder_path``, sorted.

    ``os.listdir`` yields entries in arbitrary, filesystem-dependent order.
    Sorting makes every loop driven by this helper (and therefore the row
    order of any file built from it) reproducible across runs and machines.

    Parameters
    ----------
    folder_path : str or path-like
        Directory to list.

    Returns
    -------
    list of str
        Entry names (files and sub-directories, not full paths) in
        ascending lexicographic order.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` (e.g. the path does not exist or is
        not a directory).
    """
    return sorted(os.listdir(folder_path))

In [3]:
# Root folder holding one sub-folder per dataset. The default is the original
# local path; set the COVERAGE_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
# os.path.join(..., '') guarantees a trailing separator, which the later cells
# rely on because they build paths by plain string concatenation.
data_dir = os.path.join(
    os.environ.get('COVERAGE_DATA_DIR', 'C:/Users/Tung/Downloads/0.DATA/data/'),
    '',
)

In [4]:
# --- Configuration -----------------------------------------------------------
compress_window = 100  # Number of consecutive rows collapsed into one output row (must be >= 1)
compress_type = "median"  # pandas aggregation name applied to each window, e.g. "median" or "mean"

# A window of 0 would make the `index // compress_window` grouping meaningless,
# so fail early with a clear message instead of once per sequence.
if compress_window < 1:
    raise ValueError(f"compress_window must be >= 1, got {compress_window}")

# --- Compression loop ---------------------------------------------------------
# Expected layout, as walked below:
#   <data_dir><data>/samples/<sample>/<problem>/problems/<chrom>/coverage.bedGraph.gz
# For every dataset, all sequences are written (compressed or as-is) into
#   <data>/<compress_type>/<compress_window>/profiles.csv   (relative to the CWD)
for data in list_files(data_dir):
    original_sequence_count = 0  # Sequences discovered for this dataset
    output_sequence_count = 0  # Sequences successfully written for this dataset

    output_file = f'{data}/{compress_type}/{compress_window}/profiles.csv'
    # False until the first successful write of THIS run. The first write
    # truncates the file and emits the header, so re-running the cell does not
    # append duplicates to output left over from a previous run.
    output_started = False

    samples_root = data_dir + data + "/samples"
    for sample in list_files(samples_root):
        for problem in list_files(samples_root + "/" + sample):
            problems_root = samples_root + "/" + sample + "/" + problem + "/problems"
            for chrom in list_files(problems_root):
                sequenceID = data + "/samples/" + sample + "/" + problem + "/problems/" + chrom.replace('_', ':')

                # Count the sequence BEFORE attempting to read it, so a failed
                # read shows up as a mismatch in the final check instead of
                # being silently dropped from both counters.
                original_sequence_count += 1

                try:
                    sequence_path = problems_root + "/" + chrom + "/coverage.bedGraph.gz"

                    # pandas decompresses gzip on the fly, so the archive is
                    # never expanded fully into memory or left as an extracted
                    # copy next to the source file.
                    table = pd.read_csv(sequence_path, sep='\t', header=None, compression='gzip')

                    # Keep only the last column as the signal. This matches the
                    # previous names=['value'] read, where pandas implicitly
                    # used any leading columns as the index.
                    # NOTE(review): assumes the coverage value is the last
                    # column of the bedGraph - confirm against the data.
                    df = table.iloc[:, [-1]].reset_index(drop=True)
                    df.columns = ['value']

                    if len(df) <= compress_window:
                        print(f"Sequence {sequenceID} is smaller than compress_window. Keeping original sequence.")
                        compact_df = df
                    else:
                        # Aggregate every `compress_window` consecutive rows
                        # with the configured statistic, so the values always
                        # match the `compress_type` used in the output path.
                        compact_df = (
                            df.groupby(df.index // compress_window)
                            .agg(compress_type)
                            .reset_index(drop=True)
                        )

                        # If compression does not reduce the size (e.g. a
                        # window of 1), keep the original sequence.
                        if len(compact_df) >= len(df):
                            print(f"Compression failed for {sequenceID}. Keeping original sequence.")
                            compact_df = df

                    # rename() returns a new frame, so `df` is never mutated.
                    compact_df = compact_df.rename(columns={'value': 'signal'})
                    compact_df.insert(0, 'sequenceID', sequenceID)

                    if not output_started:
                        os.makedirs(os.path.dirname(output_file), exist_ok=True)

                    # First write of the run: overwrite + header. Later writes: append.
                    compact_df.to_csv(
                        output_file,
                        mode='a' if output_started else 'w',
                        header=not output_started,
                        index=False,
                    )
                    output_started = True

                    output_sequence_count += 1

                except Exception as e:
                    # Best-effort: report and move on to the next sequence. The
                    # failure is still visible in the count check below.
                    print(f"An error occurred while processing {sequenceID}: {e}")

    # After processing all sequences for this data, check if the counts match
    if original_sequence_count == output_sequence_count:
        print(f"Success: For data '{data}', the number of original sequences ({original_sequence_count}) matches the number of output sequences ({output_sequence_count}).")
    else:
        print(f"Warning: For data '{data}', mismatch between original sequences ({original_sequence_count}) and output sequences ({output_sequence_count}).")

Success: For data 'ATAC_JV_adipose', the number of original sequences (465) matches the number of output sequences (465).
Success: For data 'CTCF_TDH_ENCODE', the number of original sequences (182) matches the number of output sequences (182).
Success: For data 'H3K27ac-H3K4me3_TDHAM_BP', the number of original sequences (2008) matches the number of output sequences (2008).
Success: For data 'H3K27ac_TDH_some', the number of original sequences (95) matches the number of output sequences (95).
Success: For data 'H3K27me3_RL_cancer', the number of original sequences (171) matches the number of output sequences (171).
Success: For data 'H3K27me3_TDH_some', the number of original sequences (43) matches the number of output sequences (43).
Success: For data 'H3K36me3_AM_immune', the number of original sequences (420) matches the number of output sequences (420).
Success: For data 'H3K36me3_TDH_ENCODE', the number of original sequences (78) matches the number of output sequences (78).
An err