In [34]:
import pandas as pd
import numpy as np
import scanpy as sc
from sklearn.model_selection import train_test_split

### 1. Data Loading
Load necessary files to DataFrames, see info/stats

In [3]:
BULK_PATH = "input/paired/group1/bulk_RawCounts.tsv"
SC_DIR_PATH = "input/paired/group1/"

In [31]:
bulk_df = pd.read_csv(BULK_PATH, sep="\t")

print("B Matrix Sample:\n", bulk_df.iloc[:, :4].head(3))
print("\n----------------------------------------------")
print(f"\nB DIMENSIONS: rows (genes) = {bulk_df.shape[0]}, columns (patients) = {bulk_df.shape[1]}")

B Matrix Sample:
              gene_id gene_symbol  CANUCK1057-BAL-LB3B  CANUCK1047-BAL-LB5
0  ENSG00000290825.1     DDX11L2                    2                   0
1  ENSG00000223972.6     DDX11L1                    0                   0
2  ENSG00000227232.6      WASH7P                   89                  81

----------------------------------------------

B DIMENSIONS: rows (genes) = 63187, columns (patients) = 34


In [33]:
sc_metadata_path = SC_DIR_PATH + "scRNA_CT1_top200_Metadata.tsv"
sc_metadata_df = pd.read_csv(sc_metadata_path, sep="\t")

print("S Metadata Matrix Sample:\n", sc_metadata_df.head(3))
print("\n----------------------------------------------")
print("S Metadata Info:")
sc_metadata_df.info()
print("\n----------------------------------------------")
print(f"\nS METADATA DIMENSIONS: rows (patients x cells) = {sc_metadata_df.shape[0]}, columns (metadata) = {sc_metadata_df.shape[1]}")

S Metadata Matrix Sample:
                 cell_id patient_id  patient_age patient_sex  cell_type_1  \
0  AAACCCACAATACGAA-1_1   BAL-RB-2           32      Female   Epithelial   
1  AAACGAACACGCTATA-1_1   BAL-RB-2           32      Female  Macrophages   
2  AACAACCCAAACTCGT-1_1   BAL-RB-2           32      Female  Macrophages   

                 cell_type_2                cell_type_3          cell_type_4  \
0                        NaN                 Epithelial           Epithelial   
1  Alveolar_Macrophage_CSF1R  Alveolar_Macrophage_CSF1R  Alveolar_macrophage   
2           Macrophage_CCL18           Macrophage_CCL18  Alveolar_macrophage   

                       data_source deconv_cluster  
0  Post-covid respiratory symptoms     Epithelial  
1  Post-covid respiratory symptoms    Macrophages  
2  Post-covid respiratory symptoms    Macrophages  

----------------------------------------------
S Metadata Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241924 entries, 0 to 241

In [32]:
sc_gep_path = SC_DIR_PATH + "scRNA_CT1_top200_RawCounts.tsv"
sc_gep_df = pd.read_csv(sc_gep_path, sep="\t")

print("\nSC GEP Matrix Sample:")
print(sc_gep_df.iloc[:, :7].head(5))
print("\n----------------------------------------------")
print(f"\nS DIMENSIONS: rows (patients x cells) = {sc_gep_df.shape[0]}, columns (genes) = {sc_gep_df.shape[1]}")


SC GEP Matrix Sample:
                cell_id patient_id  TUBA1A  SPA17  ACTG1  TSTD1  H1-0
0  AAACCCACAATACGAA-1_1   BAL-RB-2       0      0      1      1     0
1  AAACGAACACGCTATA-1_1   BAL-RB-2      81      1     64      2     0
2  AACAACCCAAACTCGT-1_1   BAL-RB-2       4      0    106      0     0
3  AACACACCAAATTGGA-1_1   BAL-RB-2       0      0      1      0     0
4  AACAGGGGTCGTACTA-1_1   BAL-RB-2       0      0     16      0     0

----------------------------------------------

S DIMENSIONS: rows (patients x cells) = 241924, columns (genes) = 1013


### 2. Data Processing
Data cleaning, augmentation, feature engineering, normalization, train-test split

### 3. Model Training
Define model architecture/parameters, run training

### 4. Model Evaluation
Extract performance metrics and visualizations for accuracy, generalizability, robustness, etc.