## Features
- BigWig & BigBed Statistics computed by the `bwq.py` script
- Binary K-mer occurrences computed by the `kmer.py` script
- CPATv3 scores computed by the `cpat.py` script
- ViennaRNA package RNA secondary structure Minimum Free Energy values computed by the `mfe.py` script

In [50]:
import os
import re
import logging
# Only configure root logging when no handler is installed yet
# (e.g. when this cell is re-run inside the same Jupyter kernel).
if not logging.getLogger().hasHandlers():
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(message)s',
        level=logging.INFO,
    )
# Module-level logger shared by the helper functions below.
logger = logging.getLogger(__name__)

from scipy.sparse import load_npz, issparse
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from collections import defaultdict
from scipy.sparse import hstack

In [51]:
def dim_redux(sparse_matrix, row_names, col_names, n_components=2, verbose=True):
    """
    Project a matrix onto its leading TruncatedSVD components.
    Args:
        sparse_matrix (scipy.sparse.csr_matrix): Matrix passed to ``fit_transform``.
        row_names (list): Row names/IDs; handed back unchanged.
        col_names (list): Column names/k-mers; handed back unchanged.
        n_components (int): Number of SVD components to compute.
        verbose (bool): If True, progress and failures are logged.
    Returns:
        tuple: (reduced_matrix, row_names, col_names) on success.
               (None, None, None) when any exception is raised while reducing.
    """
    if verbose:
        logger.info(f"Performing dimensionality reduction with {n_components} components.")

    try:
        projected = TruncatedSVD(n_components=n_components).fit_transform(sparse_matrix)
        if verbose:
            logger.info(f"Reduced matrix shape: {projected.shape}")
    except Exception as e:
        # Best-effort contract: failures are reported through the return value.
        if verbose:
            logger.error(f"Error during dimensionality reduction: {e}")
        return None, None, None
    else:
        return projected, row_names, col_names
    

def dim_redux_by_kmer_length(sparse_matrix, col_names, n_components_per_k=1, verbose=True, random_state=None):
    """
    Perform dimensionality reduction separately for each k-mer length.

    Columns are grouped by the length of their name; each group is reduced
    with its own TruncatedSVD and the dense results are concatenated in
    ascending k order.

    Args:
        sparse_matrix: scipy.sparse matrix or ndarray (samples x k-mers).
        col_names: column names (k-mers), one per matrix column.
        n_components_per_k: int, or dict mapping k-mer length -> number of
            components. Lengths missing from the dict fall back to 1
            component (the same value as this parameter's default).
        verbose: bool, log progress.
        random_state: forwarded to TruncatedSVD; pass an int to make the
            decomposition reproducible between runs.
    Returns:
        reduced_matrix: dense ndarray with the per-length reductions stacked
            column-wise.
        reduced_col_names: new column names of the form "{k}mer_SVD{i}".
    Raises:
        ValueError: if the number of column names does not match the number
            of matrix columns, or if there are no columns to reduce.
    """
    col_names = list(col_names)
    n_cols = sparse_matrix.shape[1]
    if len(col_names) != n_cols:
        # Without this check a short name list silently drops columns and a
        # long one fails later with an opaque indexing error.
        raise ValueError(
            f"Got {len(col_names)} column names for a matrix with {n_cols} columns."
        )
    if n_cols == 0:
        raise ValueError("Cannot reduce a matrix without k-mer columns.")

    kmer_groups = defaultdict(list)
    for idx, name in enumerate(col_names):
        kmer_groups[len(name)].append(idx)

    per_length_spec = isinstance(n_components_per_k, dict)
    reduced_matrices = []
    reduced_col_names = []
    for k, idxs in sorted(kmer_groups.items()):
        if verbose:
            logger.info(f"Reducing {len(idxs)} columns of {k}-mers")
        submatrix = sparse_matrix[:, idxs]

        if submatrix.shape[1] <= 1:
            # A single column cannot be reduced further: pass it through densely.
            reduced = submatrix.toarray() if hasattr(submatrix, 'toarray') else submatrix
            reduced_matrices.append(reduced)
            reduced_col_names.append(f"{k}mer_SVD1")
            continue

        # A dict that lacks this k used to leak the dict itself into min();
        # fall back to a single component instead.
        n_comp = n_components_per_k.get(k, 1) if per_length_spec else n_components_per_k
        svd = TruncatedSVD(
            n_components=min(n_comp, submatrix.shape[1] - 1),
            random_state=random_state,
        )
        reduced = svd.fit_transform(submatrix)
        reduced_matrices.append(reduced)
        reduced_col_names.extend(f"{k}mer_SVD{i+1}" for i in range(reduced.shape[1]))

    reduced_matrix = np.hstack(reduced_matrices)
    return reduced_matrix, reduced_col_names
    

def load_kmer_results(base_path, redux_n_components, redux=True, group_redux_kmer_len=True, tfidf=True, verbose=True):
    """
    Loads k-mer result files (sparse matrix, row names, column names) from disk.
    Correct transformation order: (1) optional TF-IDF weighting on raw counts, then (2) optional SVD (grouped or global).

    Files are looked up as ``{base_path}_sparse.npz`` / ``_rows.txt`` /
    ``_cols.txt`` first, then with the ``_binary`` infix
    (``{base_path}_binary_sparse.npz`` ...).

    Args:
        base_path (str): Base path to the k-mer result files (without suffix).
        redux_n_components (int or dict): Components for dimensionality reduction. If dict, keys are k-mer lengths.
        redux (bool): Whether to perform dimensionality reduction.
        group_redux_kmer_len (bool): If True, perform SVD separately per k-mer length.
        tfidf (bool): Apply TF-IDF weighting BEFORE SVD.
        verbose (bool): Log progress and errors if True.

    Returns:
        tuple: (matrix, row_names, col_names) where matrix is TF-IDF weighted and/or SVD reduced.
               (None, None, None) if the files are missing, the row/column name
               counts do not match the matrix shape, or the reduction fails.
    """
    logger = logging.getLogger(__name__)

    def _fail(message):
        # Single exit for every error path: log (when verbose) and signal
        # failure through the return value, as callers expect.
        if verbose:
            logger.error(message)
        return None, None, None

    # Candidate file sets in lookup order: plain first, then '_binary'.
    candidates = (
        ("", f"Loading k-mer results from: {base_path}*"),
        ("_binary", f"Loading binary k-mer results from: {base_path}_binary*"),
    )
    for infix, loading_message in candidates:
        sparse_matrix_file = f"{base_path}{infix}_sparse.npz"
        rows_file = f"{base_path}{infix}_rows.txt"
        cols_file = f"{base_path}{infix}_cols.txt"
        if all(os.path.exists(p) for p in (sparse_matrix_file, rows_file, cols_file)):
            break
    else:
        return _fail(f"One or more result files not found for base path '{base_path}' (tried with and without '_binary' suffix).")

    if verbose:
        logger.info(loading_message)

    try:
        sparse_matrix = load_npz(sparse_matrix_file)
        with open(rows_file, 'r') as f:
            row_names = [line.strip() for line in f]
        with open(cols_file, 'r') as f:
            col_names = [line.strip() for line in f]
    except FileNotFoundError as e:
        # A file can still vanish between the existence check and the read.
        return _fail(f"File not found: {e}")

    if verbose:
        logger.info(f"Loaded sparse matrix ({sparse_matrix.shape}), {len(row_names)} row names, {len(col_names)} column names.")

    # Labels that do not line up with the matrix would silently mislabel
    # (or drop) features downstream, so treat a mismatch as a load failure.
    n_rows, n_cols = sparse_matrix.shape
    if len(row_names) != n_rows or len(col_names) != n_cols:
        return _fail(
            f"Shape mismatch for base path '{base_path}': matrix is {sparse_matrix.shape} "
            f"but found {len(row_names)} row names and {len(col_names)} column names."
        )

    # 1. TF-IDF weighting first (if requested)
    if tfidf:
        if verbose: logger.info("Applying TF-IDF transformation BEFORE SVD.")
        transformer = TfidfTransformer()
        sparse_matrix = transformer.fit_transform(sparse_matrix)
        if verbose: logger.info(f"TF-IDF weighted matrix shape: {sparse_matrix.shape}")

    # 2. Dimensionality reduction
    if redux:
        try:
            if group_redux_kmer_len:
                sparse_matrix, col_names = dim_redux_by_kmer_length(
                    sparse_matrix, col_names, n_components_per_k=redux_n_components, verbose=verbose
                )
                if verbose: logger.info(f"Grouped SVD completed. Shape: {sparse_matrix.shape}")
            else:
                reduced_matrix, _, _ = dim_redux(
                    sparse_matrix, row_names, col_names, n_components=redux_n_components, verbose=verbose
                )
                if reduced_matrix is None:
                    # dim_redux reports failure via None instead of raising.
                    return _fail("Dimensionality reduction failed: global SVD returned no result.")
                sparse_matrix = reduced_matrix
                # col_names replaced with generic component names
                col_names = [f"SVD{i+1}" for i in range(sparse_matrix.shape[1])]
                if verbose: logger.info(f"Global SVD completed. Shape: {sparse_matrix.shape}")
        except Exception as e:
            return _fail(f"Dimensionality reduction failed: {e}")

    return sparse_matrix, row_names, col_names

In [52]:
def _read_feature_table(parquet_path, keep_columns, rename_map):
    """Read a feature parquet, keep the listed columns that exist, and rename them."""
    table = pd.read_parquet(parquet_path)
    present = [c for c in keep_columns if c in table.columns]
    # Non-inplace rename: avoids mutating a frame sliced from another frame.
    return table[present].rename(columns=rename_map)


def prep_training_data(
    target,
    base_parquet_path,
    bwq_parquet_path,
    cpat_parquet_path,
    mfe_parquet_path,
    kmer_base_path,
    sparse=True,
    use_dim_redux=True,
    redux_n_components=1,
    use_tfidf=True
):
    """
    Assemble one labelled training table from the per-feature result files.

    The base table is inner-joined on ``transcript_id`` with the BigWig/BigBed
    statistics, the CPAT scores, the MFE values and the k-mer features.

    Args:
        target (str): 'ncr' (rows get y=True) or 'pcg' (rows get y=False).
        base_parquet_path (str): Parquet with transcript_id, Chromosome, Start,
            End, length, gene_name and Sequence columns (lower-cased on load).
        bwq_parquet_path (str): Parquet with the bwq statistics; its
            chromosome/start/end columns are dropped before merging.
        cpat_parquet_path (str): Parquet with the CPAT scores; kept columns
            are prefixed with ``cpat_``.
        mfe_parquet_path (str): Parquet with mfe/structure columns; renamed to
            ``ss_mfe`` / ``ss_structure``.
        kmer_base_path (str): Base path handed to ``load_kmer_results``.
        sparse (bool): Keep k-mer columns as pandas sparse columns. Only
            honoured when the k-mer matrix is still a scipy sparse matrix.
        use_dim_redux (bool): Reduce the k-mer matrix with per-length SVD.
        redux_n_components (int or dict): Components for the reduction.
        use_tfidf (bool): Apply TF-IDF weighting to the k-mer matrix.

    Returns:
        pandas.DataFrame: indexed by transcript_id, with a boolean ``y`` column.

    Raises:
        ValueError: if ``target`` is not 'ncr' or 'pcg', or the k-mer row
            names contain duplicates.
        FileNotFoundError: if the k-mer results could not be loaded.
    """
    # Validate up front so a typo does not cost a full load-and-merge pass.
    if target not in ('ncr', 'pcg'):
        raise ValueError(f"Invalid target: {target}. Must be 'ncr' or 'pcg'.")

    base = pd.read_parquet(base_parquet_path)
    df = base[
        ['transcript_id', 'Chromosome', 'Start', 'End', 'length', 'gene_name', 'Sequence']
    ].rename(columns=str.lower)
    del base

    bwq = pd.read_parquet(bwq_parquet_path)
    cols_to_drop = ['chromosome', 'start', 'end']
    bwq = bwq.drop(columns=[c for c in cols_to_drop if c in bwq.columns])
    df = df.merge(bwq, how='inner', on='transcript_id')
    del bwq

    cpat = _read_feature_table(
        cpat_parquet_path,
        ['transcript_id', 'coding_prob', 'fickett_score', 'hexamer_score', 'orf_len'],
        {
            'coding_prob': 'cpat_cod_prob',
            'fickett_score': 'cpat_fickett_score',
            'hexamer_score': 'cpat_hexamer_score',
            'orf_len': 'cpat_orf_len',
        },
    )
    df = df.merge(cpat, how='inner', on='transcript_id')
    del cpat

    mfe = _read_feature_table(
        mfe_parquet_path,
        ['transcript_id', 'mfe', 'structure'],
        {'mfe': 'ss_mfe', 'structure': 'ss_structure'},
    )
    df = df.merge(mfe, how='inner', on='transcript_id')
    del mfe

    npz_mtx, row_names, col_names = load_kmer_results(
        kmer_base_path,
        redux_n_components=redux_n_components,
        redux=use_dim_redux,
        group_redux_kmer_len=True,
        tfidf=use_tfidf,
        verbose=True
    )
    if npz_mtx is None or row_names is None or col_names is None:
        raise FileNotFoundError(f"Failed to load k-mer results from {kmer_base_path}. Ensure the files exist and are accessible.")

    # Row names become the join index, so they must be unique.
    dupes = len(row_names) - len(set(row_names))
    if dupes:
        raise ValueError(f"Duplicate row names found: {dupes} duplicates")

    # Build k-mer DataFrame based on matrix type
    if issparse(npz_mtx):
        kmer_df = pd.DataFrame.sparse.from_spmatrix(npz_mtx, index=row_names, columns=col_names)
        if not sparse:
            kmer_df = kmer_df.sparse.to_dense()
    else:
        # numpy ndarray (SVD output). If sparse requested, this is inconsistent; proceed dense.
        if sparse:
            logger.warning("SVD produced a dense ndarray; overriding sparse=True to use dense DataFrame.")
        kmer_df = pd.DataFrame(npz_mtx, index=row_names, columns=col_names)

    df = df.set_index('transcript_id').merge(kmer_df, how='inner', left_index=True, right_index=True)
    del kmer_df

    df['y'] = target == 'ncr'
    return df

In [53]:
# Build the SVD-reduced training table for the 'ncr' target (rows labelled
# y=True by prep_training_data) and persist it as parquet.
ncr_redux = prep_training_data(
    target='ncr',
    base_parquet_path='/home/chlab/flync/new-tests/ncr_base.parquet',
    bwq_parquet_path='/home/chlab/flync/new-tests/bw-feature/ncr_bwq.parquet',
    cpat_parquet_path='/home/chlab/flync/new-tests/cpat-feature/ncr_cpat.parquet',
    mfe_parquet_path='/home/chlab/flync/new-tests/mfe-feature/ncr_mfe_linear.parquet',
    kmer_base_path='/home/chlab/flync/new-tests/kmer-feature/ncr_binary_sparse/ncr',
    # Options, listed in the order of the function signature.
    sparse=False,
    use_dim_redux=True,
    redux_n_components=1,
    use_tfidf=True,
)

ncr_redux.to_parquet('/home/chlab/flync/src/data/ncr_training_redux.parquet')

2025-09-17 01:33:27,839 - INFO - Loading binary k-mer results from: /home/chlab/flync/new-tests/kmer-feature/ncr_binary_sparse/ncr_binary*
2025-09-17 01:33:28,076 - INFO - Loaded sparse matrix ((2981, 8184)), 2981 row names, 8184 column names.
2025-09-17 01:33:28,077 - INFO - Applying TF-IDF transformation BEFORE SVD.
2025-09-17 01:33:28,076 - INFO - Loaded sparse matrix ((2981, 8184)), 2981 row names, 8184 column names.
2025-09-17 01:33:28,077 - INFO - Applying TF-IDF transformation BEFORE SVD.
2025-09-17 01:33:28,423 - INFO - TF-IDF weighted matrix shape: (2981, 8184)
2025-09-17 01:33:28,424 - INFO - Reducing 8 columns of 3-mers
2025-09-17 01:33:28,423 - INFO - TF-IDF weighted matrix shape: (2981, 8184)
2025-09-17 01:33:28,424 - INFO - Reducing 8 columns of 3-mers
2025-09-17 01:33:28,436 - INFO - Reducing 16 columns of 4-mers
2025-09-17 01:33:28,436 - INFO - Reducing 16 columns of 4-mers
2025-09-17 01:33:28,451 - INFO - Reducing 32 columns of 5-mers
2025-09-17 01:33:28,451 - INFO - R

In [54]:
# Build the SVD-reduced training table for the 'pcg' target (rows labelled
# y=False by prep_training_data) and persist it as parquet.
pcg_redux = prep_training_data(
    target='pcg',
    base_parquet_path='/home/chlab/flync/new-tests/pcg_base.parquet',
    bwq_parquet_path='/home/chlab/flync/new-tests/bw-feature/pcg_bwq.parquet',
    cpat_parquet_path='/home/chlab/flync/new-tests/cpat-feature/pcg_cpat.parquet',
    mfe_parquet_path='/home/chlab/flync/new-tests/mfe-feature/pcg_mfe_linear.parquet',
    kmer_base_path='/home/chlab/flync/new-tests/kmer-feature/pcg_binary_sparse/pcg',
    # Options, listed in the order of the function signature.
    sparse=False,
    use_dim_redux=True,
    redux_n_components=1,
    use_tfidf=True,
)

pcg_redux.to_parquet('/home/chlab/flync/src/data/pcg_training_redux.parquet')

2025-09-17 01:33:29,826 - INFO - Loading binary k-mer results from: /home/chlab/flync/new-tests/kmer-feature/pcg_binary_sparse/pcg_binary*
2025-09-17 01:33:33,460 - INFO - Loaded sparse matrix ((30662, 8184)), 30662 row names, 8184 column names.
2025-09-17 01:33:33,460 - INFO - Applying TF-IDF transformation BEFORE SVD.
2025-09-17 01:33:33,460 - INFO - Loaded sparse matrix ((30662, 8184)), 30662 row names, 8184 column names.
2025-09-17 01:33:33,460 - INFO - Applying TF-IDF transformation BEFORE SVD.
2025-09-17 01:33:39,653 - INFO - TF-IDF weighted matrix shape: (30662, 8184)
2025-09-17 01:33:39,654 - INFO - Reducing 8 columns of 3-mers
2025-09-17 01:33:39,653 - INFO - TF-IDF weighted matrix shape: (30662, 8184)
2025-09-17 01:33:39,654 - INFO - Reducing 8 columns of 3-mers
2025-09-17 01:33:39,817 - INFO - Reducing 16 columns of 4-mers
2025-09-17 01:33:39,817 - INFO - Reducing 16 columns of 4-mers
2025-09-17 01:33:40,040 - INFO - Reducing 32 columns of 5-mers
2025-09-17 01:33:40,040 - IN

In [55]:
# ncr_full = prep_training_data(
#     target='ncr',
#     base_parquet_path='/home/chlab/flync/new-tests/ncr_base.parquet',
#     bwq_parquet_path='/home/chlab/flync/new-tests/bw-feature/ncr_bwq.parquet',
#     cpat_parquet_path='/home/chlab/flync/new-tests/cpat-feature/ncr_cpat.parquet',
#     mfe_parquet_path='/home/chlab/flync/new-tests/mfe-feature/ncr_mfe_linear.parquet',
#     kmer_base_path='/home/chlab/flync/new-tests/kmer-feature/ncr_binary_sparse/ncr',
#     redux_n_components=None,
#     use_dim_redux=False,
#     use_tfidf=True,
#     sparse=False
# )

# ncr_full.to_parquet('/home/chlab/flync/src/data/ncr_training_full.parquet')

In [56]:
# pcg_full = prep_training_data(
#     target='pcg',
#     base_parquet_path='/home/chlab/flync/new-tests/pcg_base.parquet',
#     bwq_parquet_path='/home/chlab/flync/new-tests/bw-feature/pcg_bwq.parquet',
#     cpat_parquet_path='/home/chlab/flync/new-tests/cpat-feature/pcg_cpat.parquet',
#     mfe_parquet_path='/home/chlab/flync/new-tests/mfe-feature/pcg_mfe_linear.parquet',
#     kmer_base_path='/home/chlab/flync/new-tests/kmer-feature/pcg_binary_sparse/pcg',
#     redux_n_components=None,
#     use_dim_redux=False,
#     use_tfidf=True,
#     sparse=False
# )

# pcg_full.to_parquet('/home/chlab/flync/src/data/pcg_training_full.parquet')