In [44]:
# Import required libraries
import pandas as pd
import numpy as np
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
import matplotlib.pyplot as plt
import seaborn as sns
import decoupler as dc


In [45]:
# Load the RNA-seq data
rnaseq_df = pd.read_csv('tcga.brca.rnaseq.unstranded.fpkm.counts.matrix (1).txt', sep='\t', index_col=0)

# Load the HRD score data
hrd_df = pd.read_excel('tcga.hrdscore.xlsx')

# Load the BRCA status data
brca_df = pd.read_csv('toga.breast.brca.status.txt', sep='\t', index_col=0)
brca_df.index = brca_df.index.str.replace('.', '-')


### Feature Selection

In [46]:
#drop non-protein coding genes

# Step 1: Filter for protein coding genes (from your original code)
columns_to_keep = ['Case ID', 'Sample ID', 'Sample Type']
gene_columns = [col for col in rnaseq_df.columns if col not in columns_to_keep]
protein_coding_cols = [col for col in gene_columns if 'protein_coding' in col]
final_columns = columns_to_keep + protein_coding_cols
rnaseq_df_filtered = rnaseq_df[final_columns]


In [47]:

# Step 2: Calculate mean expression for each gene
# First, separate metadata columns from gene expression columns
gene_expression = rnaseq_df_filtered[protein_coding_cols]
metadata = rnaseq_df_filtered[columns_to_keep]


In [48]:

# Calculate mean expression for each gene
gene_means = gene_expression.mean(axis=0)

# Calculate number of genes to keep (top 2%)
n_genes_to_keep = 400

# Get the column names (genes) with highest mean expression
top_genes = gene_means.nlargest(n_genes_to_keep).index

# Create final dataframe with metadata and top genes
final_df = pd.concat([metadata, rnaseq_df_filtered[top_genes]], axis=1)


In [49]:

# Print information about the filtering
print("Original number of protein coding genes:", len(protein_coding_cols))
print("Number of top genes kept (2%):", n_genes_to_keep)
print("\nDataframe shapes:")
print("Original:", rnaseq_df.shape)
print("After protein coding filter:", rnaseq_df_filtered.shape)
print("Final (with top 2% genes):", final_df.shape)


Original number of protein coding genes: 19962
Number of top genes kept (2%): 400

Dataframe shapes:
Original: (1231, 60663)
After protein coding filter: (1231, 19965)
Final (with top 2% genes): (1231, 403)


In [50]:

# Optional: Show summary statistics of the mean expression values
print("\nMean expression summary for top genes:")
print(gene_means[top_genes].describe())

# Save results to new dataframe
rnaseq_df_top_genes = final_df

# Optional: verify that our key columns are still present
print("\nChecking if key columns are present:")
for col in columns_to_keep:
    print(f"{col} present: {col in rnaseq_df_top_genes.columns}")


Mean expression summary for top genes:
count     400.000000
mean      339.773484
std       799.847929
min        80.347665
25%       103.432431
50%       135.239130
75%       242.387309
max      7294.766159
dtype: float64

Checking if key columns are present:
Case ID present: True
Sample ID present: True
Sample Type present: True


In [51]:
# Drop rows where event.BRCA1 is '1'
brca_df_filtered = brca_df[brca_df['event.BRCA1'] != '1']

# Drop rows where event.BRCA2 is 'Bi-allelic-undetermined'
brca_df_filtered = brca_df_filtered[brca_df_filtered['event.BRCA2'] != 'Bi-allelic-undetermined']

# Drop rows where event.PALB2 is '2'
brca_df_filtered = brca_df_filtered[brca_df_filtered['event.PALB2'] != '2']

In [52]:
#idenfity the common samples

# Extract sample IDs from each dataframe
rnaseq_samples = set(rnaseq_df_top_genes['Case ID'])
hrd_samples = set(hrd_df['sample'])
brca_samples = set(brca_df_filtered.index)

# Find the intersection of all sample IDs
common_samples = rnaseq_samples.intersection(hrd_samples).intersection(brca_samples)

# Print the number of common samples
print("\nNumber of common samples:", len(common_samples))



Number of common samples: 857


In [53]:
# Drop samples that are not common in all three dataframes
rnaseq_df_common = rnaseq_df_top_genes[rnaseq_df_top_genes['Case ID'].isin(common_samples)]
hrd_df_common = hrd_df[hrd_df['sample'].isin(common_samples)]
brca_df_common = brca_df_filtered[brca_df_filtered.index.isin(common_samples)]

# Print the shapes of the new dataframes
print("rnaseq_df_common:", rnaseq_df_common.shape)
print("hrd_df_common:", hrd_df_common.shape)
print("brca_df_common:", brca_df_common.shape)

rnaseq_df_common: (921, 403)
hrd_df_common: (857, 5)
brca_df_common: (857, 33)


In [80]:
# duplicates = rnaseq_df_common[rnaseq_df_common['Case ID'].duplicated(keep=False)]
# print("Duplicates in 'Case ID':")
# print(duplicates)
rnaseq_df_common_unique = rnaseq_df_common.drop_duplicates(subset='Case ID', keep='first')
print(rnaseq_df_common_unique.shape)

(857, 403)


In [170]:
# Step 1: Merge rnaseq_df_common and hrd_df_common
merged_df = pd.merge(rnaseq_df_common_unique, hrd_df_common, 
                     left_on='Case ID', right_on='sample', 
                     how='inner')
print(merged_df.shape)
# Step 2: Merge the result with brca_df_common
final_merged_df = pd.merge(merged_df, brca_df_common, 
                           left_on='Case ID', right_index=True, 
                           how='inner')

# Print the shape of the final merged dataframe
print("Final merged dataframe shape:", final_merged_df.shape)


# # # List of columns in the final merged dataframe
# # print("\nColumns in the merged dataframe:")
# # print(final_merged_df.columns.tolist())

# # Check for any duplicate columns that might have been created during the merge
# duplicate_columns = final_merged_df.columns[final_merged_df.columns.duplicated()].tolist()
# if duplicate_columns:
#     print("\nWarning: The following columns are duplicated:")
#     print(duplicate_columns)
# else:
#     print("\nNo duplicate columns found.")

# # Save the merged dataframe to a CSV file (optional)
# # final_merged_df.to_csv('merged_dataframe.csv', index=False)

(857, 408)
Final merged dataframe shape: (857, 441)


In [171]:
final_merged_df['BRCA_status'] = final_merged_df.apply(lambda row: 1 if (row['event.BRCA1'] in ['Bi-allelic-inactivation', 'Epigenetic-silencing'] or 
                                                row['event.BRCA2'] in ['Bi-allelic-inactivation', 'Epigenetic-silencing']) 
                                                else 0, axis=1)

### Feature Scaling

In [172]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Perform one-hot encoding on the 'event.PAM50' column
encoded_data = encoder.fit_transform(final_merged_df[['event.PAM50']])

# Create a DataFrame with the encoded columns
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['event.PAM50']))

# Combine the original DataFrame with the new encoded DataFrame
df_encoded = pd.concat([final_merged_df, encoded_df], axis=1)

# Drop the original 'event.PAM50' column if you don't need it anymore
final_merged_df = df_encoded.drop('event.PAM50', axis=1)

In [173]:
print(final_merged_df['Sample Type'].unique())

['Primary Tumor' 'Solid Tissue Normal']


In [176]:
final_merged_df['event.TNBC'] = final_merged_df['event.TNBC'].replace({'TNBC': 1, '0': 0})
final_merged_df['Sample Type'] = final_merged_df['Sample Type'].replace({'Primary Tumor': 1, 'Solid Tissue Normal': 0})
final_merged_df = final_merged_df.rename(columns={'Sample Type': 'Tumor Status'})
# final_merged_df = final_merged_df.drop(columns=['Case ID','Sample ID'])
final_merged_df = final_merged_df.drop(columns=['Sample ID'])
final_merged_df = final_merged_df.set_index('Case ID')

KeyError: 'Sample Type'

In [179]:
final_merged_df

Unnamed: 0_level_0,Sample ID,Tumor Status,ENSG00000198938.2|MT-CO3|protein_coding,ENSG00000198712.1|MT-CO2|protein_coding,ENSG00000198804.2|MT-CO1|protein_coding,ENSG00000198886.2|MT-ND4|protein_coding,ENSG00000198899.2|MT-ATP6|protein_coding,ENSG00000198727.2|MT-CYB|protein_coding,ENSG00000198840.2|MT-ND3|protein_coding,ENSG00000198888.2|MT-ND1|protein_coding,...,event.RAD51C,event.PALB2,event.All Events,event.TNBC,BRCA_status,event.PAM50_Basal,event.PAM50_Her2,event.PAM50_LumA,event.PAM50_LumB,event.PAM50_nan
Case ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-E9-A1RH,TCGA-E9-A1RH-01A,1,3203.8251,3644.7298,4764.8059,5225.0952,2513.6768,1905.5552,2086.7278,3645.0990,...,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0
TCGA-C8-A26W,TCGA-C8-A26W-01A,1,6277.4125,4962.9082,5839.5775,5883.2235,3621.6717,3623.3232,4106.4289,4466.8036,...,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0
TCGA-AO-A0JJ,TCGA-AO-A0JJ-01A,1,5354.8326,4427.3243,5240.8011,4348.5075,2719.6039,2793.0431,2958.2169,2449.5296,...,0,0,0,0,0,0.0,0.0,1.0,0.0,0.0
TCGA-BH-A0HO,TCGA-BH-A0HO-01A,1,6075.6148,3333.3697,3143.3527,3064.6632,2682.5045,1604.8493,1772.1208,1758.3276,...,0,0,0,0,0,0.0,0.0,1.0,0.0,0.0
TCGA-BH-A18F,TCGA-BH-A18F-01A,1,5906.3923,2980.1571,5065.5663,3136.1668,2249.0876,2659.3910,2027.5980,2797.3084,...,0,0,0,0,0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-AO-A0JE,TCGA-AO-A0JE-01A,1,6902.5193,5804.6276,4819.4440,5272.4834,2361.1993,2709.8857,1879.7420,2912.4703,...,0,0,0,0,0,0.0,1.0,0.0,0.0,0.0
TCGA-A8-A09G,TCGA-A8-A09G-01A,1,5571.6277,6820.9861,7130.4760,6315.3994,3855.5024,3673.3146,1974.3086,2286.0981,...,0,0,0,0,0,0.0,1.0,0.0,0.0,0.0
TCGA-E2-A14X,TCGA-E2-A14X-01A,1,8560.3119,10778.5308,5463.6607,7035.1366,3249.8283,3940.9823,3205.9240,1851.1934,...,0,0,0,1,0,1.0,0.0,0.0,0.0,0.0
TCGA-HN-A2OB,TCGA-HN-A2OB-01A,1,7720.2416,4630.1643,6692.5203,4918.1147,3149.1231,3624.8785,4784.3566,3081.8000,...,0,0,0,0,0,0.0,0.0,1.0,0.0,0.0


In [151]:
from sklearn.model_selection import train_test_split

# Assuming your DataFrame has a column named 'HRD_BRCAstatus' for grouping
X = final_merged_df.drop(columns=['BRCA_status'])  # Features
y = final_merged_df['BRCA_status']  # Target variable

# Splitting the dataset while preserving the proportion of each class
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, stratify=y, random_state=1234
)

# Combine back into DataFrames if needed
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

print(f"Training set size: {train_data.shape[0]}, Testing set size: {test_data.shape[0]}")

Training set size: 571, Testing set size: 286


In [162]:
print(test_data['event.PAM50'].unique())

['LumB' nan 'LumA' 'Her2' 'Basal']


In [157]:
import numpy as np
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [158]:
# Create the ElasticNetCV model with 10 folds
elastic_cv = ElasticNetCV(cv=10, random_state=42)

# Fit the model on the training data
elastic_cv.fit(X_train, y_train)

ValueError: could not convert string to float: 'TCGA-E2-A1IL'

In [None]:
# Predictions on the test set
y_pred = elastic_cv.predict(X_test)

# Calculate Mean Squared Error or R-squared score
mse = mean_squared_error(y_test, y_pred)
r_squared = elastic_cv.score(X_test, y_test)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r_squared}")

In [None]:
print(f"Best alpha: {elastic_cv.alpha_}")
print(f"Best l1_ratio: {elastic_cv.l1_ratio_}")