# Enet using deconvo genes from Bayes Prism (mirroring feature engineering as closely as possible to original paper)

In [1]:
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
import pickle

## Data Filtering

In [2]:
# import data
ann_tcga = pd.read_csv('../data/toga.breast.brca.status.txt', sep='\t', index_col=0)
tcga_brca = pd.read_csv('../data/tcga.brca.rnaseq.unstranded.fpkm.counts.matrix.txt', sep='\t', index_col=0)
deconvo = pd.read_csv('../data/Deconvo2.csv',  index_col=0)
hrd_scores = pd.read_excel('../data/tcga.hrdscore.xlsx')

In [3]:
# ann_tcga[ann_tcga['event.PALB2'].ne('0')]
# ann_tcga[ann_tcga['event.RAD51C'].ne('0')]
ann_tcga = ann_tcga[~ann_tcga['event.RAD51C'].ne('0')]
ann_tcga = ann_tcga[~ann_tcga['event.PALB2'].ne('0')]
ann_tcga = ann_tcga[ann_tcga['event.BRCA1'] != '1']

In [None]:
ann_tcga.index = ann_tcga.index.str.replace('.', '-', regex=False)
tcga_brca = tcga_brca.set_index('Case ID')
tcga_brca = tcga_brca[tcga_brca['Sample Type'] == 'Primary Tumor']
print(f"annotated shape{ann_tcga.shape}")
print(f"rna-seq shape{tcga_brca.shape}")

annotated shape(962, 33)
rna-seq shape(1231, 60662)


In [5]:
common_indices = ann_tcga.index.intersection(tcga_brca.index)

# Filter DataFrames to keep only rows with common indices
ann_tcga = ann_tcga.loc[common_indices]
tcga_brca = tcga_brca.loc[common_indices]
print(f"rna-seq shape{tcga_brca.shape}")

rna-seq shape(1070, 60662)


In [7]:
deconvo.index = deconvo.index.map(lambda x: x[:12])

In [8]:
common_indices1 = deconvo.index.intersection(tcga_brca.index)
tcga_brca.loc[common_indices1].shape

(835, 60662)

In [9]:
# tcga_brca.loc[tcga_brca.index.difference(deconvo.index)]
# deconvo.loc[deconvo.index.difference(tcga_brca.index)]

In [10]:
hrd_scores = hrd_scores.set_index('sample')

hrd_scores = hrd_scores.loc[hrd_scores.index.intersection(deconvo.index)]
labels_df = hrd_scores['HRD-sum'].apply(lambda x: 'HRD' if x >= 79 else 'HR-Proficient')
labels_df = labels_df.sort_index()
deconvo = deconvo.sort_index()
deconvo = np.log2(deconvo + 1)

In [16]:
labels = labels_df.squeeze()
features_df = deconvo

## Regression: Takes 250 hrs 6-cpu

In [None]:
# Initialize a list to store coefficients from each iteration
coefficients_list = []

# Split the data into training and test sets (1/3 test split)
X_train, X_test, y_train, y_test = train_test_split(
    features_df, labels, test_size=1/3, stratify=labels, random_state=123
)

# Run 1000 iterations
for iteration in range(1000):
    print(f"Iteration {iteration + 1} of 1000")
    
    # Define the LogisticRegressionCV model
    logreg_cv = LogisticRegressionCV(
        Cs=10,  # Number of Cs to try; you can specify an array of Cs if desired
        cv=10,  # Tenfold cross-validation
        penalty='elasticnet',
        solver='saga',  # Solver that supports elastic net penalty
        # multi_class='multinomial',
        l1_ratios=[0.25],  # Alpha = 0.25
        max_iter=1000,
        n_jobs=-1,  # Utilize all processors
        random_state=123*iteration
    )
    
    # Fit the model
    logreg_cv.fit(X_train, y_train)
    
    # Get coefficients at the best C (lambda.min)
    coef = logreg_cv.coef_  # Shape: (n_classes, n_features)
    
    # Append the coefficients to the list
    coefficients_list.append(coef)


with open('../data/output/reg_coefs_list.pkl', 'wb') as f:
    pickle.dump(coefficients_list, f)


Iteration 1 of 1000




In [None]:
# If not running reg:
with open('../data/output/reg_coefs_list.pkl', 'rb') as f:
    coefficients_list = pickle.load(f)
coefficients_list

In [None]:
# Step 1: Identify genes with non-zero coefficients across all iterations
coefficients_array = np.array(coefficients_list)  # Shape: (1000, n_classes, n_features)
non_zero_coefficients = np.any(coefficients_array != 0, axis=1)  # Shape: (1000, n_features)
non_zero_counts = np.sum(non_zero_coefficients, axis=0)  # Shape: (n_features,)
genes_selected_mask = non_zero_counts == len(coefficients_list)
selected_gene_indices = np.where(genes_selected_mask)[0]
feature_names = features_df.columns
selected_genes = feature_names[selected_gene_indices]

print(f"Number of selected genes: {len(selected_genes)}") 


Number of selected genes: 2376


In [21]:
non_zero_counts

array([2, 0, 0, ..., 0, 2, 2])

In [24]:
# Step 2: Compute centroids for each class
features_selected = features_df[selected_genes]
labels_aligned = labels.loc[features_selected.index]
centroids = features_selected.groupby(labels_aligned).mean()

In [25]:
# Step 3: Define function to calculate scores for new samples
from scipy.stats import pearsonr

def calculate_scores(new_sample):
    new_sample_selected = new_sample[selected_genes]
    scores = {}
    for class_label in centroids.index:
        centroid = centroids.loc[class_label]
        corr_coef, _ = pearsonr(new_sample_selected, centroid)
        scores[class_label] = corr_coef
    return scores

# Example usage with a new sample
# new_sample = pd.Series(..., index=selected_genes)

# Calculate scores
# scores = calculate_scores(new_sample)
# print(scores)


In [28]:
first_sample = deconvo.iloc[0]
new_sample = pd.Series(first_sample.values, index=deconvo.columns)
scores = calculate_scores(new_sample)
print(scores)

{'HR-Proficient': 0.7873614958645063, 'HRD': 0.5378528251342551}


HRD              4
Telomeric AI     3
LST              6
HRD-sum         13
Name: TCGA-3C-AAAU, dtype: int64