In [1]:
import pandas as pd
import lzma
import math
import os

In [2]:
# Each sub-directory of training_data/ is treated as one dataset name.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the report order identical from run to run and across filesystems.
datasets = sorted(
    name
    for name in os.listdir('training_data')
    if os.path.isdir(os.path.join('training_data', name))
)

In [3]:
# Compression settings to check. Both values are used as path components of
# sequence_data/<dataset>/<compress_type>/<compress_size>/profiles.csv.xz.
COMPRESS_TYPES = ['mean', 'median']
COMPRESS_SIZES = [100, 1000, 2000]


def expected_profile_length(count, compress_size):
    """Return the number of profile rows expected for one sequence.

    Args:
        count: Value of the `count` column in features.csv for the sequence.
        compress_size: Compression size the profile file was stored under.

    Returns:
        `count` itself when it is smaller than `compress_size`, otherwise
        `ceil(count / compress_size)`.
    """
    if count < compress_size:
        return count
    return math.ceil(count / compress_size)


for dataset in datasets:
    # features.csv depends only on the dataset, so it is read once here rather
    # than once per (compress_type, compress_size) combination.
    length_df = pd.read_csv(f"training_data/{dataset}/features.csv")[['sequenceID', 'count']]

    for compress_type in COMPRESS_TYPES:
        for compress_size in COMPRESS_SIZES:
            print(f'{dataset:25} {compress_type} {compress_size}', end='\t')

            # Load the xz-compressed profile table for this combination.
            file_path = f'sequence_data/{dataset}/{compress_type}/{compress_size}/profiles.csv.xz'
            with lzma.open(file_path, 'rt') as file:
                signal_df = pd.read_csv(file)

            # Only the number of rows per sequenceID is needed, so count them
            # directly instead of keeping one sub-DataFrame per sequence.
            seq_lengths = signal_df.groupby('sequenceID').size().to_dict()

            match_count = 0
            mismatch_count = 0

            # Compare the expected length of every sequence in features.csv
            # with the number of rows actually present in the profile table.
            for sequenceID, count in zip(length_df['sequenceID'], length_df['count']):
                seq_length = seq_lengths.get(sequenceID)
                if seq_length is None:
                    # Sequences without any profile rows are skipped: they are
                    # counted neither as a match nor as a mismatch.
                    continue

                if expected_profile_length(count, compress_size) == seq_length:
                    match_count += 1
                else:
                    mismatch_count += 1
                    # Uncomment the following line for debugging
                    # print(f"No match for sequenceID {sequenceID}: {count} vs {seq_length}")

            print(f"Number of matches: {match_count:5} \t Number of mismatches: {mismatch_count}")

ATAC_JV_adipose           mean 100	Number of matches:   465 	 Number of mismatches: 0
ATAC_JV_adipose           mean 1000	Number of matches:   465 	 Number of mismatches: 0
ATAC_JV_adipose           mean 2000	Number of matches:   465 	 Number of mismatches: 0
ATAC_JV_adipose           median 100	Number of matches:   465 	 Number of mismatches: 0
ATAC_JV_adipose           median 1000	Number of matches:   465 	 Number of mismatches: 0
ATAC_JV_adipose           median 2000	Number of matches:   465 	 Number of mismatches: 0
CTCF_TDH_ENCODE           mean 100	Number of matches:   182 	 Number of mismatches: 0
CTCF_TDH_ENCODE           mean 1000	Number of matches:   182 	 Number of mismatches: 0
CTCF_TDH_ENCODE           mean 2000	Number of matches:   182 	 Number of mismatches: 0
CTCF_TDH_ENCODE           median 100	Number of matches:   182 	 Number of mismatches: 0
CTCF_TDH_ENCODE           median 1000	Number of matches:   182 	 Number of mismatches: 0
CTCF_TDH_ENCODE           median 2000