In [58]:
import pickle
import re
from pathlib import Path

import pandas as pd
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [60]:
# Directory holding the serialized models, relative to the notebook location.
models_path = Path("../src/models")
model_file = models_path / "CORR_flync_ebm_model_redux.pkl"

# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# load model files that come from a trusted source.
with model_file.open("rb") as model_fh:
    model = pickle.load(model_fh)

In [20]:
# Parquet file with the feature table to run inference on.
inf_data_path = Path("../vu_gse199164/vu_gse199164.parquet")
df = pd.read_parquet(inf_data_path)

# Collect every row whose (chromosome, start, end) triple repeats an earlier
# row; an empty frame means each genomic interval appears exactly once.
locus_key = ["chromosome", "start", "end"]
is_repeated_locus = df.duplicated(subset=locus_key)
duplicates = df.loc[is_repeated_locus]
duplicates.shape

(0, 54)

In [21]:
# Columns that are not fed to the model: genomic coordinates, the raw
# secondary-structure strings, and every column whose name starts with 'cpat'.
cols_to_drop = ['chromosome', 'start', 'end', 'ss_sequence', 'ss_structure']
cols_to_drop += [name for name in df.columns if name.startswith('cpat')]

# Use the "index" column as row labels, then remove the non-feature columns.
df = df.set_index("index").drop(columns=cols_to_drop)

In [23]:
# Data cleaning, applied in four passes over column families, followed by a
# one-off normalisation of the `entries_epdnew` column.

# 1. UCSC (BigWig or BigBed) statistics features and CPAT scores (no ORFs found):
#    a missing value is filled with 0.
logic_op1 = (df.columns.str.startswith(tuple(['min_', 'max_', 'mean_', 'std_', 'sum_', 'cov_'])) | df.columns.str.startswith('cpat_'))
for col in df.columns[logic_op1]:
    if df[col].isna().any():
        print(f"Filling NAs in column {col} with 0")
        df[col] = df[col].fillna(0)

# 2. ss_* features: drop rows with NAs, since no structure was calculated for
#    the sequence.
logic_op2 = df.columns.str.startswith('ss_')
for col in df.columns[logic_op2]:
    if df[col].isna().any():
        print(f"Dropping rows with NAs in column {col}")
        df = df.dropna(subset=[col])
        print(f"Rows with NAs in column {col} dropped. Shape: {df.shape}")
    # Keep only rows where `ss_mfe` is strictly negative.
    if col == 'ss_mfe':
        # .copy() detaches the filtered rows from the parent frame so the
        # column assignments further down do not write into a slice.
        df = df[df[col] < 0].copy()
        print(f"Rows with ss_mfe >= 0 dropped. Shape: {df.shape}")

# 3. k-mer features (names starting with '0' or '1', or containing 'mer_SVD'):
#    drop rows with NAs, as counts are not available for all required k-mers.
logic_op3 = df.columns.str.startswith(('0', '1')) | df.columns.str.contains('mer_SVD')
for col in df.columns[logic_op3]:
    if df[col].isna().any():
        print(f"Dropping rows with NAs in column {col}")
        df = df.dropna(subset=[col])
        print(f"Rows with NAs in column {col} dropped. Shape: {df.shape}")

# 4. Categorical (object dtype) columns: fill NAs with ''.
logic_op4 = df.dtypes == 'object'
for col in df.columns[logic_op4]:
    if df[col].isna().any():
        print(f"Filling NAs in column {col} with empty string")
        df[col] = df[col].fillna('')

# Strip the trailing '_<digits>' suffix from every comma-separated entry of
# `entries_epdnew`. These are promoter rankings which are not needed.
# This runs once, after the loops above (it used to sit inside the step-4 loop
# and was re-executed for every column).
if 'entries_epdnew' in df.columns:
    epdnew_tokens = df['entries_epdnew'].str.split(',')
    # Rows without a token list (NaN) are left out of the apply and therefore
    # come back as NaN when the result is aligned on assignment.
    df['entries_epdnew'] = epdnew_tokens[epdnew_tokens.notna()].apply(
        lambda tokens: ','.join([re.sub(r'_[0-9]+$', '', token) for token in tokens])
    )

Filling NAs in column cov_epdnew with 0
Filling NAs in column cov_h3k4me3 with 0
Filling NAs in column cov_s2_pol2 with 0
Filling NAs in column cov_tss_minus with 0
Filling NAs in column cov_tss_plus with 0
Filling NAs in column max_s2_pol2 with 0
Filling NAs in column max_tss_plus with 0
Filling NAs in column mean_h3k4me3 with 0
Filling NAs in column mean_pcons27 with 0
Filling NAs in column mean_phylocons124 with 0
Filling NAs in column min_tss_minus with 0
Filling NAs in column std__phylocons124 with 0
Filling NAs in column std_pcons27 with 0
Filling NAs in column sum__phylocons124 with 0
Filling NAs in column sum_h3k4me3 with 0
Filling NAs in column sum_pcons27 with 0
Filling NAs in column sum_s2_pol2 with 0
Filling NAs in column sum_tss_minus with 0
Filling NAs in column sum_tss_plus with 0
Rows with ss_mfe >= 0 dropped. Shape: (18956, 38)
Filling NAs in column entries_remap with empty string


In [26]:
# Multi-hot encoding
# Each categorical (object dtype) column holds comma-separated entries; every
# distinct entry becomes its own boolean column.
df_categorical = df.loc[:, df.dtypes == 'object']

# When True, encoded columns are named "<last '_' part of source column>_<entry>".
prefix_cols = True

# Encoded frames are collected here and concatenated once after the loop,
# instead of re-copying a growing DataFrame on every iteration.
encoded_frames = []

for col in df_categorical.columns:
    prefix = col.split('_')[-1]

    # Only fit on non-empty values to avoid creating a column for ''
    non_empty = df_categorical[col][df_categorical[col] != '']
    if non_empty.empty:
        # Nothing to encode for this column.
        continue

    vectorizer = CountVectorizer(tokenizer=lambda x: x.split(','), binary=True, token_pattern=None)
    vectorizer.fit(non_empty)

    # Transform all values (including empty) so the row count matches df
    encoded = vectorizer.transform(df_categorical[col])
    encoded_df = pd.DataFrame(
        encoded.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=df.index,
    ).astype(bool)

    if prefix_cols:
        encoded_df.columns = [f"{prefix}_{c}" for c in encoded_df.columns]
    encoded_frames.append(encoded_df)

# The leading empty frame pins the row index and keeps the concat valid even
# when no categorical column produced any encoded features.
mhe_df = pd.concat([pd.DataFrame(index=df.index), *encoded_frames], axis=1)

In [27]:
# (rows, encoded columns) of the multi-hot encoded frame built above.
mhe_df.shape

(18956, 695)

In [28]:
# Pass 1: columns that share the exact same name are collapsed into one by
# taking the row-wise logical OR of the group.
if mhe_df.columns.duplicated().any():
    print("Duplicated columns found in the multi-hot encoded DataFrame. Removing duplicates.")
    mhe_df = mhe_df.T.groupby(level=0).any().T.astype(bool)

# Pass 2: the same entry may come from different BigBed files, i.e. columns
# named <prefix>_<col_name> with a matching <col_name>. Such a group is replaced
# by a single merged_<col_name> column, but only when every column in the group
# holds identical values on every row; groups that disagree are left untouched.
if prefix_cols:
    # Map each <col_name> to the list of full column names that carry it.
    merged_columns = {}
    for col in mhe_df.columns:
        prefix, col_name = col.split('_', 1)
        merged_columns.setdefault(col_name, []).append(col)

    for col_name, cols in merged_columns.items():
        if len(cols) <= 1:
            continue

        # One distinct value per row across the group means the columns agree.
        all_equal = mhe_df[cols].nunique(axis=1).eq(1).all()
        if not all_equal:
            continue

        # All columns agree, so the first one represents the whole group.
        mhe_df[f'merged_{col_name}'] = mhe_df[cols].iloc[:, 0]
        mhe_df = mhe_df.drop(columns=cols)

In [29]:
# Shape after the duplicate/merge pass; an unchanged column count compared with
# the shape shown before that pass means no columns were merged.
mhe_df.shape

(18956, 695)

In [30]:
# Swap the raw categorical columns for their multi-hot encoded counterparts.
# Both frames carry the same row index, so the column-wise concat lines up.
non_categorical = df.drop(columns=df_categorical.columns)
df = pd.concat([non_categorical, mhe_df], axis=1)

# Normalise column names: any character outside [a-zA-Z0-9_] becomes '_',
# runs of underscores collapse to one, and leading/trailing '_' are removed.
df.columns = (
    df.columns
    .str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)
    .str.replace(r'__+', '_', regex=True)
    .str.strip('_')
)

In [54]:
# Order the columns by their current (case-sensitive) names.
# NOTE(review): the sort runs before lowercasing, so the final order can differ
# from a plain alphabetical sort of the lowercase names. Presumably this mirrors
# the preprocessing used at training time; confirm before changing it.
df = df.reindex(columns=sorted(df.columns))

# Lowercase every column name.
df.columns = df.columns.str.lower()

df.columns

Index(['10mer_svd1', '11mer_svd1', '12mer_svd1', '3mer_svd1', '4mer_svd1',
       '5mer_svd1', '6mer_svd1', '7mer_svd1', '8mer_svd1', '9mer_svd1',
       ...
       'tfbs_usp', 'tfbs_vfl', 'tfbs_vis', 'tfbs_vnd', 'tfbs_vsx1',
       'tfbs_vsx2', 'tfbs_vvl', 'tfbs_z', 'tfbs_zen', 'tfbs_zen2'],
      dtype='object', length=731)

In [62]:
# Run the loaded model on the prepared feature frame.
# y_pred holds the predicted label per row; y_probab keeps column 1 of the
# predict_proba output (presumably the positive class; confirm against
# model.classes_).
y_pred = model.predict(df)
y_probab = model.predict_proba(df)[:, 1]

In [64]:
# Append the model outputs to the feature frame as two new columns.
df = df.assign(prediction=y_pred, probability=y_probab)

In [68]:
# Keep only the model outputs; every feature column is discarded.
output_cols = ['prediction', 'probability']
res = df.loc[:, df.columns.isin(output_cols)]

In [71]:
# Re-read the input file and keep only the identifier and genomic coordinates,
# indexed by "index" so the rows can be matched to the predictions.
locus_cols = ['index', 'chromosome', 'start', 'end']
df = pd.read_parquet(inf_data_path)[locus_cols].set_index('index')

In [73]:
# Index-on-index inner join: only rows that received a prediction are kept,
# with the coordinates on the left and the model outputs on the right.
res = pd.merge(df, res, how='inner', left_index=True, right_index=True)

In [76]:
# Write coordinates + predictions to an Excel workbook in the current working
# directory; index=True keeps the row index as the first column.
res.to_excel("flync_vu_gse199164_inference_results.xlsx", index=True)