In [1]:
import os
import lzma
import pandas as pd

In [2]:
# Compression settings: both values are interpolated into the profiles path
# (sequence_data/<dataset>/<compress_type>/<compress_size>/profiles.csv.xz).
compress_type, compress_size = 'mean', 100

In [3]:
# Discover dataset directories. os.listdir returns names in arbitrary order, so
# sort them to make the processing order (and the printed report) reproducible.
datasets = sorted(
    name for name in os.listdir('training_data')
    if os.path.isdir(os.path.join('training_data', name))
)

for dataset in datasets:
    # Load the xz-compressed profile table selected by compress_type / compress_size.
    file_path = f'sequence_data/{dataset}/{compress_type}/{compress_size}/profiles.csv.xz'
    with lzma.open(file_path, 'rt') as file:
        signal_df = pd.read_csv(file)

    # One (sequenceID, sub-frame) pair per sequence. groupby sorts its keys by
    # default; that order is the reference order for the other tables.
    seqs = tuple(signal_df.groupby('sequenceID'))
    sequence_ids = [seq_id for seq_id, _ in seqs]
    expected_ids = set(sequence_ids)

    raw_tables = {
        'target_df': pd.read_csv(f'training_data/{dataset}/target.csv'),
        'folds_df': pd.read_csv(f'training_data/{dataset}/folds.csv'),
    }

    # Validate the raw tables BEFORE reordering them. Reordering with
    # .loc[sequence_ids] raises KeyError on a missing ID, silently drops IDs that
    # are not in the profiles, and repeats rows for duplicated IDs, so checking
    # only after the reorder can never report a real discrepancy.
    problems = []
    for table_name, table in raw_tables.items():
        table_ids = table['sequenceID']
        checks = (
            ('missing IDs', expected_ids - set(table_ids)),
            ('unexpected IDs', set(table_ids) - expected_ids),
            ('duplicated IDs', set(table_ids[table_ids.duplicated()])),
        )
        for description, bad_ids in checks:
            if bad_ids:
                # key=str keeps the sort safe if IDs are of mixed types (e.g. NaN).
                problems.append(
                    f"Discrepancies found in {table_name} for dataset {dataset} "
                    f"({description}): {sorted(bad_ids, key=str)}"
                )

    if problems:
        print(*problems, sep='\n')
        # Skip the reorder: it would raise or produce misaligned rows.
        continue

    # Every ID is present exactly once in both tables, so the reorder is a pure
    # permutation and the three ID sequences are guaranteed to line up.
    folds_df = raw_tables['folds_df'].set_index('sequenceID').loc[sequence_ids].reset_index()
    target_df = raw_tables['target_df'].set_index('sequenceID').loc[sequence_ids].reset_index()
    print(f"All sequenceID arrays match for dataset: {dataset}")

All sequenceID arrays match for dataset: ATAC_JV_adipose
All sequenceID arrays match for dataset: CTCF_TDH_ENCODE
All sequenceID arrays match for dataset: H3K27ac-H3K4me3_TDHAM_BP
All sequenceID arrays match for dataset: H3K27ac_TDH_some
All sequenceID arrays match for dataset: H3K27me3_RL_cancer
All sequenceID arrays match for dataset: H3K27me3_TDH_some
All sequenceID arrays match for dataset: H3K36me3_AM_immune
All sequenceID arrays match for dataset: H3K36me3_TDH_ENCODE
All sequenceID arrays match for dataset: H3K36me3_TDH_immune
All sequenceID arrays match for dataset: H3K36me3_TDH_other
All sequenceID arrays match for dataset: H3K4me1_TDH_BP
All sequenceID arrays match for dataset: H3K4me3_PGP_immune
All sequenceID arrays match for dataset: H3K4me3_TDH_ENCODE
All sequenceID arrays match for dataset: H3K4me3_TDH_immune
All sequenceID arrays match for dataset: H3K4me3_TDH_other
All sequenceID arrays match for dataset: H3K4me3_XJ_immune
All sequenceID arrays match for dataset: H3K9me