In [137]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import datetime

### Constants & Options

In [141]:
BULK_PATH = "input/2dRNA/group1/bulk_RawCounts.tsv"
SC_DIR_PATH = "input/2dRNA/group1/"
TEST_SIZE = 0.2  # For train-test split
EPOCHS = 200  # Number of epochs for training
BATCH_SIZE = 32  # Batch size for training

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)
np.set_printoptions(linewidth=120)
np.set_printoptions(precision=4, suppress=True)

### 1. Data Loading
Load necessary files to DataFrames, see info/stats

In [73]:
bulk_df = pd.read_csv(BULK_PATH, sep="\t")

print("B Matrix (Tissue GEPs) Sample:\n")
print(bulk_df.iloc[:, :6].head(5))
print("\n----------------------------------------------")
print(f"\nB DIMENSIONS: rows (genes) = {bulk_df.shape[0]}, columns (patients) = {bulk_df.shape[1]}")

B Matrix (Tissue GEPs) Sample:

             gene_id  gene_symbol  CANUCK1057-BAL-LB3B  CANUCK1047-BAL-LB5  RESP1024-BAL-LB5  CANUCK1060-BAL-RB4
0  ENSG00000290825.1      DDX11L2                    2                   0                 0                   1
1  ENSG00000223972.6      DDX11L1                    0                   0                 0                   0
2  ENSG00000227232.6       WASH7P                   89                  81                47                 101
3  ENSG00000278267.1    MIR6859-1                   23                  12                 7                  14
4  ENSG00000243485.5  MIR1302-2HG                    0                   0                 0                   0

----------------------------------------------

B DIMENSIONS: rows (genes) = 63187, columns (patients) = 34


In [116]:
sc_path = SC_DIR_PATH + "scRNA_CT1_top200_RawCounts.tsv"
sc_df = pd.read_csv(sc_path, sep="\t")

print("S Matrix (Cell GEPs) Sample:\n")
print(sc_df.iloc[:, :12].head(5))
print("\n----------------------------------------------")
print(f"\nS DIMENSIONS: rows (patients x cells) = {sc_df.shape[0]}, columns (genes) = {sc_df.shape[1]}")

S Matrix (Cell GEPs) Sample:

                cell_id patient_id  TUBA1A  SPA17  ACTG1  TSTD1  H1-0  NQO1  ATP5IF1  DNPH1  NEDD9  ALDH1A1
0  AAACCCACAATACGAA-1_1   BAL-RB-2       0      0      1      1     0     0        0      0      0        0
1  AAACGAACACGCTATA-1_1   BAL-RB-2      81      1     64      2     0     1        6      1      0       15
2  AACAACCCAAACTCGT-1_1   BAL-RB-2       4      0    106      0     0     0        4      1      0       13
3  AACACACCAAATTGGA-1_1   BAL-RB-2       0      0      1      0     0     0        0      0      0        0
4  AACAGGGGTCGTACTA-1_1   BAL-RB-2       0      0     16      0     0     0        1      0      2        0

----------------------------------------------

S DIMENSIONS: rows (patients x cells) = 241924, columns (genes) = 1013


In [74]:
sc_metadata_path = SC_DIR_PATH + "scRNA_CT1_top200_Metadata.tsv"
sc_metadata_df = pd.read_csv(sc_metadata_path, sep="\t")

print("S Metadata Matrix Sample:\n")
print(sc_metadata_df.head(5))
print("\n----------------------------------------------")
print("S Metadata Info:\n")
sc_metadata_df.info()
print("----------------------------------------------")
print(f"\nS METADATA DIMENSIONS: rows (patients x cells) = {sc_metadata_df.shape[0]}, columns (metadata) = {sc_metadata_df.shape[1]}\n")

S Metadata Matrix Sample:

                cell_id patient_id  patient_age patient_sex  cell_type_1                cell_type_2                cell_type_3          cell_type_4                      data_source deconv_cluster
0  AAACCCACAATACGAA-1_1   BAL-RB-2           32      Female   Epithelial                        NaN                 Epithelial           Epithelial  Post-covid respiratory symptoms     Epithelial
1  AAACGAACACGCTATA-1_1   BAL-RB-2           32      Female  Macrophages  Alveolar_Macrophage_CSF1R  Alveolar_Macrophage_CSF1R  Alveolar_macrophage  Post-covid respiratory symptoms    Macrophages
2  AACAACCCAAACTCGT-1_1   BAL-RB-2           32      Female  Macrophages           Macrophage_CCL18           Macrophage_CCL18  Alveolar_macrophage  Post-covid respiratory symptoms    Macrophages
3  AACACACCAAATTGGA-1_1   BAL-RB-2           32      Female  Macrophages           Macrophage_CCL18           Macrophage_CCL18  Alveolar_macrophage  Post-covid respiratory symptoms    Macro

### 2. Data Processing
Process bulk and single-cell data to generate training samples.

In [128]:
bulk_df_vals = bulk_df.iloc[:, 2:]  # drop gene_id and gene_symbol

# Filter B and S to keep only common genes
bulk_genes = bulk_df["gene_symbol"].str.strip().str.lower()
sc_genes = sc_df.columns[2:].str.strip().str.lower()
common_genes = np.intersect1d(bulk_genes, sc_genes)
print("Common genes:", len(common_genes))
bulk_df_vals = bulk_df_vals.loc[bulk_df["gene_symbol"].isin(common_genes)]
print(bulk_df_vals.shape)

# Match patient IDs in metadata to bulk matrix column names
sc_patient_ids = sc_metadata_df['patient_id'].unique()
bulk_patient_ids = bulk_df_vals.columns
if not all(pat in bulk_patient_ids for pat in sc_patient_ids):
    raise ValueError("Patient IDs in S do not match B. Check mapping.")

# Calculate cell-type fractions for each patient
ct_labels = sc_metadata_df["cell_type_1"].dropna().unique()
C = []
for pat in bulk_patient_ids:
    cells = sc_metadata_df[sc_metadata_df["patient_id"] == pat]
    # TODO: Augmentation -- random sample cells if needed

    ct_fractions = cells["cell_type_1"].value_counts(normalize=True)
    all_ct_fractions = {ct: ct_fractions.get(ct, 0.0) for ct in ct_labels}  # Impute missing cell types
    C.append(list(all_ct_fractions.values()))

# Log-normalize bulk data and prepare output arrays
B = np.log1p(bulk_df_vals.values.T)  # Transpose to patients x genes numpy array
C = np.array(C)

print(f"B shape (samples x genes): {B.shape}")
print(f"C shape (samples x CTs): {C.shape}")

X_train, X_test, Y_train, Y_test = train_test_split(
    B, C, test_size=TEST_SIZE, random_state=42
)

Common genes: 1009
(1009, 32)
B shape (samples x genes): (32, 1009)
C shape (samples x CTs): (32, 11)


In [129]:
print("B Sample --------------------------------------------------\n")
print(B[:5, :15])
print("\nC Sample --------------------------------------------------\n")
print(C[:, :])

B Sample --------------------------------------------------

[[6.1924 3.1355 3.1781 4.4998 3.2581 6.5294 5.4467 7.6353 8.2633 5.0814 4.2195 4.2905 8.8323 6.2785 6.9508]
 [6.7754 2.4849 1.9459 4.7707 3.1355 6.5639 4.8752 7.5919 7.9352 5.8999 3.1781 4.1744 8.9866 6.3368 6.3172]
 [6.142  1.9459 3.5835 4.4308 5.4161 6.2672 4.3175 7.1562 7.5694 2.7726 2.1972 3.7842 8.2636 6.5468 6.7154]
 [7.7165 3.2581 4.654  4.4427 6.1092 5.7777 4.9698 7.4616 7.6246 4.4067 2.0794 4.0604 9.2519 5.9738 6.2804]
 [7.0166 1.7918 2.0794 5.0304 4.9972 6.3439 4.8363 7.4679 7.5761 5.6419 3.3673 4.6821 8.7358 6.0014 6.286 ]]

C Sample --------------------------------------------------

[[0.0706 0.8308 0.0156 0.0282 0.0074 0.006  0.0018 0.0075 0.0049 0.0002 0.027 ]
 [0.0004 0.9668 0.0232 0.0075 0.0002 0.0009 0.0005 0.0003 0.0001 0.     0.    ]
 [0.0012 0.8548 0.0887 0.0393 0.0004 0.0043 0.0038 0.0069 0.0004 0.0002 0.0001]
 [0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.    ]
 [0.0012 0.8478 

### 3. Model Training
Define model architecture/parameters, run training

In [138]:
def build_model(input_dim, output_dim):
    """
    Build Scaden-like neural network.

    1. Define a 4-layer feedforward network with batch normalization and dropout.
    2. Use linear activation in the output layer to predict cell-type abundance fractions.
    3. Compile the model using Adam optimizer and MSE loss.
    """
    model = tf.keras.Sequential(
        [
            tf.keras.layers.InputLayer(input_shape=(input_dim,)),
            tf.keras.layers.Dense(1000, activation="relu"),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(500, activation="relu"),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(100, activation="relu"),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dense(output_dim, activation="linear"),  # No softmax
        ]
    )
    model.compile(optimizer="adam", loss="mean_squared_error", metrics=["mae"])
    return model


def save_model_and_preds(model, X_test, y_test, history):
    """
    Save the trained model, training history, and predictions on the test set.
    """
    os.makedirs("output", exist_ok=True)

    dtnum = str(datetime.datetime.now().strftime("%Y%m%d_%H%M"))
    model_dir = os.path.join("output", "2dRNA", dtnum)
    os.makedirs(model_dir, exist_ok=True)

    model_path = os.path.join(model_dir, f"model.keras")
    model.save(model_path)
    print(f"Saved model to {model_path}")

    history_path = os.path.join(model_dir, f"history.npy")
    np.save(history_path, history.history)
    print(f"Saved training history to {history_path}")

    y_pred = model.predict(X_test)
    predictions_file = os.path.join(model_dir, f"pred_fractions.csv")
    true_fractions_file = os.path.join(model_dir, f"true_fractions.csv")
    np.savetxt(predictions_file, y_pred, delimiter=",")
    np.savetxt(true_fractions_file, y_test, delimiter=",")
    print(f"Saved predicted fractions to {predictions_file}")
    print(f"Saved true fractions to {true_fractions_file}")


In [142]:
input_dim = X_train.shape[1]
output_dim = Y_train.shape[1]
model = build_model(input_dim, output_dim)

print("Training model...")
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=TEST_SIZE,
    verbose=2,
)
print("Model training complete!\n")

save_model_and_preds(model, X_test, Y_test, history)

Training model...
Epoch 1/200
1/1 - 2s - 2s/step - loss: 1.6342 - mae: 1.0370 - val_loss: 15.9924 - val_mae: 3.2896
Epoch 2/200
1/1 - 0s - 60ms/step - loss: 1.6946 - mae: 1.0262 - val_loss: 19.0404 - val_mae: 3.4807
Epoch 3/200
1/1 - 0s - 58ms/step - loss: 1.3137 - mae: 0.9060 - val_loss: 31.5209 - val_mae: 4.7622
Epoch 4/200
1/1 - 0s - 54ms/step - loss: 1.2545 - mae: 0.8724 - val_loss: 54.3461 - val_mae: 6.6207
Epoch 5/200
1/1 - 0s - 54ms/step - loss: 1.2325 - mae: 0.8683 - val_loss: 80.8300 - val_mae: 8.0674
Epoch 6/200
1/1 - 0s - 52ms/step - loss: 0.9761 - mae: 0.7800 - val_loss: 104.3736 - val_mae: 9.1595
Epoch 7/200
1/1 - 0s - 56ms/step - loss: 1.0700 - mae: 0.8068 - val_loss: 111.6079 - val_mae: 9.4463
Epoch 8/200
1/1 - 0s - 53ms/step - loss: 0.8641 - mae: 0.7467 - val_loss: 114.9002 - val_mae: 9.5224
Epoch 9/200
1/1 - 0s - 53ms/step - loss: 0.8758 - mae: 0.7649 - val_loss: 112.4135 - val_mae: 9.3582
Epoch 10/200
1/1 - 0s - 54ms/step - loss: 0.6320 - mae: 0.6272 - val_loss: 111.6

### 4. Model Evaluation
Extract performance metrics and visualizations for accuracy, generalizability, robustness, etc.