## Features
- BigWig & BigBed Statistics computed by the `bwq.py` script
- Binary k-mer occurrences computed by the `kmer.py` script
- CPATv3 scores computed by the `cpat.py` script
- ViennaRNA package RNA secondary structure Minimum Free Energy values computed by the `mfe.py` script

In [1]:
import os
import re
import logging
# Cells get re-executed in Jupyter; only configure the root logger while it
# still has no handler attached, so log lines are not emitted twice.
if not logging.getLogger().hasHandlers():
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(message)s',
        level=logging.INFO,
    )
# Module-level logger shared by the helper functions defined below.
logger = logging.getLogger(__name__)

from scipy.sparse import load_npz
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from collections import defaultdict
from scipy.sparse import hstack

In [2]:
def dim_redux(sparse_matrix, row_names, col_names, n_components=2, verbose=True):
    """
    Project a matrix onto its leading TruncatedSVD components.

    Args:
        sparse_matrix (scipy.sparse.csr_matrix): Matrix to project.
        row_names (list): Row names/IDs; handed back untouched.
        col_names (list): Column names/k-mers; handed back untouched.
        n_components (int): How many SVD components to keep.
        verbose (bool): When True, progress and error messages are logged.
    Returns:
        tuple: (reduced_matrix, row_names, col_names) on success,
               (None, None, None) if any exception is raised while reducing.
    """
    if verbose:
        logger.info(f"Performing dimensionality reduction with {n_components} components.")

    try:
        embedding = TruncatedSVD(n_components=n_components).fit_transform(sparse_matrix)
        if verbose:
            logger.info(f"Reduced matrix shape: {embedding.shape}")
        outcome = (embedding, row_names, col_names)
    except Exception as exc:
        # Failures are reported through the return value, not re-raised.
        if verbose:
            logger.error(f"Error during dimensionality reduction: {exc}")
        outcome = (None, None, None)
    return outcome
    
def dim_redux_by_kmer_length(sparse_matrix, col_names, n_components_per_k=1, verbose=True, random_state=None):
    """
    Perform dimensionality reduction separately for each k-mer length.

    Columns are grouped by the length of their name, every group is reduced
    with its own TruncatedSVD, and the per-group results are concatenated
    horizontally in ascending order of k-mer length.

    Args:
        sparse_matrix: scipy.sparse matrix or 2-D array (samples x k-mers).
        col_names: list of column names (k-mers), one per matrix column.
        n_components_per_k: int or dict, number of components per k-mer length.
            When a dict is given, lengths missing from it fall back to 1
            component (the default of this parameter).
        verbose: bool, log progress.
        random_state: forwarded to TruncatedSVD; pass an int to make the
            output reproducible. Defaults to None (unseeded, as before).
    Returns:
        reduced_matrix: concatenated reduced matrix (numpy.ndarray)
        reduced_col_names: new column names (with k-mer length info), named
            "<k>mer_SVD<i>"; a group that has a single column cannot be
            reduced and is passed through unchanged as "<k>mer_raw1".
    Raises:
        ValueError: if col_names is empty or does not have exactly one entry
            per matrix column.
    """
    n_cols = sparse_matrix.shape[1]
    if len(col_names) != n_cols:
        raise ValueError(
            f"Got {len(col_names)} column names for a matrix with {n_cols} columns."
        )
    if n_cols == 0:
        raise ValueError("Cannot reduce a matrix that has no columns.")

    # COO matrices cannot be indexed; CSC also makes the column slicing below cheap.
    if hasattr(sparse_matrix, "tocsc"):
        sparse_matrix = sparse_matrix.tocsc()

    # Group columns by k-mer length
    kmer_groups = defaultdict(list)
    for idx, name in enumerate(col_names):
        kmer_groups[len(name)].append(idx)

    per_length = isinstance(n_components_per_k, dict)
    reduced_matrices = []
    reduced_col_names = []
    for k, idxs in sorted(kmer_groups.items()):
        if verbose:
            logger.info(f"Reducing {len(idxs)} columns of {k}-mers")
        submatrix = sparse_matrix[:, idxs]
        # A dict that lacks this length must not leak through as the "count".
        n_comp = n_components_per_k.get(k, 1) if per_length else n_components_per_k
        if len(idxs) == 1:
            # The component count is capped at n_features - 1, which is 0 here,
            # so there is nothing to reduce: keep the column as it is.
            reduced = submatrix.toarray() if hasattr(submatrix, "toarray") else np.asarray(submatrix)
            reduced_col_names.append(f"{k}mer_raw1")
        else:
            svd = TruncatedSVD(n_components=min(n_comp, len(idxs) - 1), random_state=random_state)
            reduced = svd.fit_transform(submatrix)
            reduced_col_names.extend([f"{k}mer_SVD{i+1}" for i in range(reduced.shape[1])])
        reduced_matrices.append(reduced)

    # Concatenate all reduced matrices horizontally
    reduced_matrix = np.hstack(reduced_matrices)
    return reduced_matrix, reduced_col_names
    
def load_kmer_results(base_path, redux_n_components, redux=True, group_redux_kmer_len=True, tfidf=True, verbose=True):
    """
    Loads k-mer result files (sparse matrix, row names, column names) from disk, with optional dimensionality reduction and TF-IDF transformation.

    The files "<base_path>_sparse.npz", "<base_path>_rows.txt" and
    "<base_path>_cols.txt" are used when all three exist; otherwise the
    "<base_path>_binary_*" variants are tried.

    Args:
        base_path (str): Base path to the k-mer result files (without suffix).
        redux_n_components (int or dict): Number of components for dimensionality reduction. If a dict, keys are k-mer lengths.
        redux (bool, optional): Whether to perform dimensionality reduction. Defaults to True.
        group_redux_kmer_len (bool, optional): If True, performs dimensionality reduction separately for each k-mer length. Defaults to True.
        tfidf (bool, optional): Whether to apply TF-IDF transformation to the matrix. Defaults to True.
        verbose (bool, optional): If True, logs progress and info messages. Defaults to True.

    Returns:
        tuple: (sparse_matrix, row_names, col_names)
            - sparse_matrix: The loaded (and optionally reduced/transformed) matrix (scipy.sparse or numpy.ndarray).
            - row_names: List of row/sample names.
            - col_names: List of column names matching the returned matrix: the
              k-mers when no reduction is applied, the names produced by
              dim_redux_by_kmer_length for grouped reduction, or
              "SVD1".."SVDn" for reduction over the whole matrix.
        Returns (None, None, None) if the files are missing or the reduction fails.
    """
    logger = logging.getLogger(__name__)
    failure = (None, None, None)

    # Prefer the plain "<base>_*" files and fall back to "<base>_binary_*".
    suffixes = ("sparse.npz", "rows.txt", "cols.txt")
    plain_files = [f"{base_path}_{suffix}" for suffix in suffixes]
    binary_files = [f"{base_path}_binary_{suffix}" for suffix in suffixes]

    if all(os.path.exists(path) for path in plain_files):
        sparse_matrix_file, rows_file, cols_file = plain_files
        if verbose:
            logger.info(f"Loading k-mer results from: {base_path}*")
    elif all(os.path.exists(path) for path in binary_files):
        sparse_matrix_file, rows_file, cols_file = binary_files
        if verbose:
            logger.info(f"Loading binary k-mer results from: {base_path}_binary*")
    else:
        if verbose:
            logger.error(f"One or more result files not found for base path '{base_path}' (tried with and without '_binary' suffix).")
        return failure

    try:
        sparse_matrix = load_npz(sparse_matrix_file)
        with open(rows_file, 'r') as f:
            row_names = [line.strip() for line in f]
        with open(cols_file, 'r') as f:
            col_names = [line.strip() for line in f]
        if verbose:
            logger.info(f"Loaded sparse matrix ({sparse_matrix.shape}), {len(row_names)} row names, {len(col_names)} column names.")
    except FileNotFoundError as e:
        # A file can still vanish between the existence check and the read.
        if verbose:
            logger.error(f"File not found: {e}")
        return failure

    if redux:
        try:
            if group_redux_kmer_len:
                # Perform dimensionality reduction by k-mer length
                reduced_matrix, col_names = dim_redux_by_kmer_length(sparse_matrix, col_names, n_components_per_k=redux_n_components, verbose=verbose)
                if verbose:
                    logger.info(f"Dimensionality reduction by k-mer length completed. Reduced matrix shape: {reduced_matrix.shape}")
                    logger.info(f"Number of rows: {len(row_names)}, Number of columns: {len(col_names)}")
            else:
                # Perform dimensionality reduction on the entire matrix.
                # dim_redux signals failure by returning None instead of raising,
                # and hands the names back unchanged, so only the matrix is used.
                reduced_matrix, _, _ = dim_redux(sparse_matrix, row_names, col_names, n_components=redux_n_components, verbose=verbose)
                if reduced_matrix is None:
                    if verbose:
                        logger.error("Dimensionality reduction failed.")
                    return failure
                # The k-mer names no longer describe the columns; name the components.
                col_names = [f"SVD{i+1}" for i in range(reduced_matrix.shape[1])]
                if verbose:
                    logger.info(f"Dimensionality reduction completed. Reduced matrix shape: {reduced_matrix.shape}")
                    logger.info(f"Number of rows: {len(row_names)}, Number of columns: {len(col_names)}")
        except Exception as e:
            if verbose:
                logger.error(f"Dimensionality reduction failed: {e}")
            return failure

        sparse_matrix = reduced_matrix

    if tfidf:
        # Apply TF-IDF transformation if requested.
        # NOTE(review): with redux=True this runs on the SVD output rather than
        # on the raw k-mer matrix -- confirm that this order is intended.
        if verbose:
            logger.info("Applying TF-IDF transformation.")
        transformer = TfidfTransformer()
        sparse_matrix = transformer.fit_transform(sparse_matrix)
        if verbose:
            logger.info(f"TF-IDF transformed matrix shape: {sparse_matrix.shape}")

    return sparse_matrix, row_names, col_names

In [None]:
def prep_training_data(
    target,
    base_parquet_path,
    bwq_parquet_path,
    cpat_parquet_path,
    mfe_parquet_path,
    kmer_base_path,
    sparse=True,
    use_dim_redux=True,
    redux_n_components=1,
    use_tfidf=True
):
    """
    Assemble one labelled training table by joining all per-transcript features.

    The base table is inner-joined on 'transcript_id' with the bwq, CPAT and
    MFE tables and finally with the k-mer matrix loaded by load_kmer_results.
    Because every join is an inner join, transcripts missing from any of the
    inputs are dropped from the result.

    Args:
        target (str): 'ncr' (rows are labelled y=True) or 'pcg' (y=False).
        base_parquet_path (str): Parquet file with the base columns
            transcript_id, Chromosome, Start, End, length, gene_name, Sequence.
        bwq_parquet_path (str): Parquet file with the bwq features; its
            chromosome/start/end columns are dropped before joining.
        cpat_parquet_path (str): Parquet file with the CPAT scores.
        mfe_parquet_path (str): Parquet file with the 'mfe' and 'structure' columns.
        kmer_base_path (str): Base path handed to load_kmer_results.
        sparse (bool): Keep the k-mer columns as pandas sparse columns when
            True, densify them when False.
        use_dim_redux (bool): Forwarded to load_kmer_results as `redux`.
        redux_n_components (int or dict): Forwarded to load_kmer_results.
        use_tfidf (bool): Forwarded to load_kmer_results as `tfidf`.

    Returns:
        pandas.DataFrame: indexed by transcript_id, with all feature columns
        and the boolean label column 'y'.

    Raises:
        ValueError: if target is not 'ncr' or 'pcg', or if densifying the
            k-mer columns fails.
        FileNotFoundError: if load_kmer_results could not provide the k-mer data.
        AssertionError: if the k-mer row names contain duplicates.
    """
    # Validate the label first so a typo fails before any file is read.
    if target not in ('ncr', 'pcg'):
        raise ValueError(f"Invalid target: {target}. Must be 'ncr' or 'pcg'.")

    base_cols = ['transcript_id', 'Chromosome', 'Start', 'End', 'length', 'gene_name', 'Sequence']
    df = pd.read_parquet(base_parquet_path)[base_cols].rename(columns=str.lower)

    # bwq features; the coordinate columns are dropped before joining
    bwq = pd.read_parquet(bwq_parquet_path).drop(columns=['chromosome', 'start', 'end'])
    df = df.merge(bwq, how='inner', on='transcript_id')
    del bwq

    # CPAT scores, prefixed so their origin stays visible after the join.
    # Chaining rename() avoids an in-place rename on a column slice.
    cpat = pd.read_parquet(cpat_parquet_path)[
        ['transcript_id', 'coding_prob', 'fickett_score', 'hexamer_score', 'orf_len']
    ].rename(columns={
        'coding_prob': 'cpat_cod_prob',
        'fickett_score': 'cpat_fickett_score',
        'hexamer_score': 'cpat_hexamer_score',
        'orf_len': 'cpat_orf_len'
    })
    df = df.merge(cpat, how='inner', on='transcript_id')
    del cpat

    # Secondary-structure minimum free energy features
    mfe = pd.read_parquet(mfe_parquet_path)[
        ['transcript_id', 'mfe', 'structure']
    ].rename(columns={
        'mfe': 'ss_mfe',
        'structure': 'ss_structure'
    })
    df = df.merge(mfe, how='inner', on='transcript_id')
    del mfe

    npz_mtx, row_names, col_names = load_kmer_results(kmer_base_path,
                                                      redux_n_components=redux_n_components,
                                                      redux=use_dim_redux,
                                                      group_redux_kmer_len=True,
                                                      tfidf=use_tfidf,
                                                      verbose=True)
    if npz_mtx is None or row_names is None or col_names is None:
        raise FileNotFoundError(f"Failed to load k-mer results from {kmer_base_path}. Ensure the files exist and are accessible.")
    # Duplicate row names would multiply rows in the index join below.
    assert len(row_names) == len(set(row_names)), f"Duplicate row names found: {len(row_names) - len(set(row_names))} duplicates"

    if isinstance(npz_mtx, np.ndarray):
        # Reduction without TF-IDF yields a dense array, which
        # DataFrame.sparse.from_spmatrix cannot ingest.
        kmer = pd.DataFrame(npz_mtx, columns=col_names, index=row_names)
        if sparse:
            kmer = kmer.astype(pd.SparseDtype(npz_mtx.dtype, 0))
    else:
        kmer = pd.DataFrame.sparse.from_spmatrix(npz_mtx, columns=col_names, index=row_names)
        if not sparse:
            try:
                # Convert sparse matrix to dense
                kmer = kmer.sparse.to_dense()
            except Exception as e:
                raise ValueError(f"Error converting sparse matrix to dense: {e}") from e

    # The k-mer frame is indexed by row name, so join on the transcript_id index.
    df = df.set_index('transcript_id').merge(kmer, how='inner', left_index=True, right_index=True)
    del kmer

    df['y'] = target == 'ncr'
    return df

In [4]:
# 'ncr' table with dimensionality reduction (1 component) and TF-IDF
# enabled; k-mer columns are densified (sparse=False).
ncr_redux = prep_training_data(
    target="ncr",
    sparse=False,
    use_tfidf=True,
    use_dim_redux=True,
    redux_n_components=1,
    kmer_base_path="/home/chlab/flync/new-tests/kmer-feature/ncr_binary_sparse/ncr",
    mfe_parquet_path="/home/chlab/flync/new-tests/mfe-feature/ncr_mfe_linear.parquet",
    cpat_parquet_path="/home/chlab/flync/new-tests/cpat-feature/ncr_cpat.parquet",
    bwq_parquet_path="/home/chlab/flync/new-tests/bw-feature/ncr_bwq.parquet",
    base_parquet_path="/home/chlab/flync/new-tests/ncr_base.parquet",
)

# Persist the assembled table.
ncr_redux.to_parquet("/home/chlab/flync/src/data/ncr_training_redux.parquet")

2025-07-30 23:42:18,951 - INFO - Loading binary k-mer results from: /home/chlab/flync/new-tests/kmer-feature/ncr_binary_sparse/ncr_binary*
2025-07-30 23:42:20,501 - INFO - Loaded sparse matrix ((2981, 8184)), 2981 row names, 8184 column names.
2025-07-30 23:42:20,503 - INFO - Reducing 8 columns of 3-mers
2025-07-30 23:42:21,180 - INFO - Reducing 16 columns of 4-mers
2025-07-30 23:42:21,679 - INFO - Reducing 32 columns of 5-mers
2025-07-30 23:42:22,032 - INFO - Reducing 64 columns of 6-mers
2025-07-30 23:42:22,418 - INFO - Reducing 128 columns of 7-mers
2025-07-30 23:42:22,948 - INFO - Reducing 256 columns of 8-mers
2025-07-30 23:42:23,332 - INFO - Reducing 512 columns of 9-mers
2025-07-30 23:42:24,577 - INFO - Reducing 1024 columns of 10-mers
2025-07-30 23:42:27,277 - INFO - Reducing 2048 columns of 11-mers
2025-07-30 23:42:31,015 - INFO - Reducing 4096 columns of 12-mers
2025-07-30 23:42:35,337 - INFO - Dimensionality reduction by k-mer length completed. Reduced matrix shape: (2981, 1

In [5]:
# 'pcg' table with dimensionality reduction (1 component) and TF-IDF
# enabled; k-mer columns are densified (sparse=False).
pcg_redux = prep_training_data(
    target="pcg",
    sparse=False,
    use_tfidf=True,
    use_dim_redux=True,
    redux_n_components=1,
    kmer_base_path="/home/chlab/flync/new-tests/kmer-feature/pcg_binary_sparse/pcg",
    mfe_parquet_path="/home/chlab/flync/new-tests/mfe-feature/pcg_mfe_linear.parquet",
    cpat_parquet_path="/home/chlab/flync/new-tests/cpat-feature/pcg_cpat.parquet",
    bwq_parquet_path="/home/chlab/flync/new-tests/bw-feature/pcg_bwq.parquet",
    base_parquet_path="/home/chlab/flync/new-tests/pcg_base.parquet",
)

# Persist the assembled table.
pcg_redux.to_parquet("/home/chlab/flync/src/data/pcg_training_redux.parquet")

2025-07-30 23:42:41,521 - INFO - Loading binary k-mer results from: /home/chlab/flync/new-tests/kmer-feature/pcg_binary_sparse/pcg_binary*
2025-07-30 23:43:10,305 - INFO - Loaded sparse matrix ((30662, 8184)), 30662 row names, 8184 column names.
2025-07-30 23:43:10,308 - INFO - Reducing 8 columns of 3-mers
2025-07-30 23:43:12,030 - INFO - Reducing 16 columns of 4-mers
2025-07-30 23:43:13,151 - INFO - Reducing 32 columns of 5-mers
2025-07-30 23:43:14,972 - INFO - Reducing 64 columns of 6-mers
2025-07-30 23:43:17,982 - INFO - Reducing 128 columns of 7-mers
2025-07-30 23:43:20,911 - INFO - Reducing 256 columns of 8-mers
2025-07-30 23:43:26,570 - INFO - Reducing 512 columns of 9-mers
2025-07-30 23:43:38,071 - INFO - Reducing 1024 columns of 10-mers
2025-07-30 23:43:56,619 - INFO - Reducing 2048 columns of 11-mers
2025-07-30 23:44:26,947 - INFO - Reducing 4096 columns of 12-mers
2025-07-30 23:45:09,406 - INFO - Dimensionality reduction by k-mer length completed. Reduced matrix shape: (30662

In [6]:
# 'ncr' table without dimensionality reduction: all k-mer columns are kept,
# TF-IDF is still applied, and the k-mer columns are densified (sparse=False).
ncr_full = prep_training_data(
    target="ncr",
    sparse=False,
    use_tfidf=True,
    use_dim_redux=False,
    redux_n_components=None,
    kmer_base_path="/home/chlab/flync/new-tests/kmer-feature/ncr_binary_sparse/ncr",
    mfe_parquet_path="/home/chlab/flync/new-tests/mfe-feature/ncr_mfe_linear.parquet",
    cpat_parquet_path="/home/chlab/flync/new-tests/cpat-feature/ncr_cpat.parquet",
    bwq_parquet_path="/home/chlab/flync/new-tests/bw-feature/ncr_bwq.parquet",
    base_parquet_path="/home/chlab/flync/new-tests/ncr_base.parquet",
)

# Persist the assembled table.
ncr_full.to_parquet("/home/chlab/flync/src/data/ncr_training_full.parquet")

2025-07-30 23:45:14,041 - INFO - Loading binary k-mer results from: /home/chlab/flync/new-tests/kmer-feature/ncr_binary_sparse/ncr_binary*
2025-07-30 23:45:16,221 - INFO - Loaded sparse matrix ((2981, 8184)), 2981 row names, 8184 column names.
2025-07-30 23:45:16,224 - INFO - Applying TF-IDF transformation.
2025-07-30 23:45:18,568 - INFO - TF-IDF transformed matrix shape: (2981, 8184)


In [7]:
# 'pcg' table without dimensionality reduction: all k-mer columns are kept,
# TF-IDF is still applied, and the k-mer columns are densified (sparse=False).
pcg_full = prep_training_data(
    target="pcg",
    sparse=False,
    use_tfidf=True,
    use_dim_redux=False,
    redux_n_components=None,
    kmer_base_path="/home/chlab/flync/new-tests/kmer-feature/pcg_binary_sparse/pcg",
    mfe_parquet_path="/home/chlab/flync/new-tests/mfe-feature/pcg_mfe_linear.parquet",
    cpat_parquet_path="/home/chlab/flync/new-tests/cpat-feature/pcg_cpat.parquet",
    bwq_parquet_path="/home/chlab/flync/new-tests/bw-feature/pcg_bwq.parquet",
    base_parquet_path="/home/chlab/flync/new-tests/pcg_base.parquet",
)

# Persist the assembled table.
pcg_full.to_parquet("/home/chlab/flync/src/data/pcg_training_full.parquet")

2025-07-30 23:45:42,322 - INFO - Loading binary k-mer results from: /home/chlab/flync/new-tests/kmer-feature/pcg_binary_sparse/pcg_binary*
2025-07-30 23:46:10,301 - INFO - Loaded sparse matrix ((30662, 8184)), 30662 row names, 8184 column names.
2025-07-30 23:46:10,304 - INFO - Applying TF-IDF transformation.
2025-07-30 23:46:57,974 - INFO - TF-IDF transformed matrix shape: (30662, 8184)
