In [15]:
import cptac
import pandas as pd
import json
import numpy as np
from scipy.stats import zscore

#### Metadata

In [16]:
## look up the cancer-type codes cptac reports and show them as the cell output
cancer_info = cptac.get_cancer_info()
cancer_info

{'brca': 'Breast invasive carcinoma',
 'ccrcc': 'Clear cell renal cell carcinoma',
 'coad': 'Colon adenocarcinoma',
 'gbm': 'Glioblastoma multiforme',
 'hnscc': 'Head and Neck squamous cell carcinoma',
 'lscc': 'Lung squamous cell carcinoma',
 'luad': 'Lung adenocarcinoma',
 'ov': 'Ovarian serous cystadenocarcinoma',
 'pda': 'Pancreatic ductal adenocarcinoma',
 'pdac': 'Pancreatic ductal adenocarcinoma',
 'ucec': 'Uterine Corpus Endometrial Carcinoma'}

In [17]:
## build one cptac dataset object per cancer type
## (constructors are evaluated left to right, i.e. in the order listed)
(brca_data, ccrcc_data, coad_data, gbm_data, hnscc_data) = (
    cptac.Brca(), cptac.Ccrcc(), cptac.Coad(), cptac.Gbm(), cptac.Hnscc(),
)
(lscc_data, luad_data, ov_data, pdac_data, ucec_data) = (
    cptac.Lscc(), cptac.Luad(), cptac.Ov(), cptac.Pdac(), cptac.Ucec(),
)

#### Clinical

In [18]:
## fetch the clinical table of every cancer type, always passing the same 'mssm' argument
clinical_source = 'mssm'

brca_clinical_data = brca_data.get_clinical(clinical_source)
ccrcc_clinical_data = ccrcc_data.get_clinical(clinical_source)
coad_clinical_data = coad_data.get_clinical(clinical_source)
gbm_clinical_data = gbm_data.get_clinical(clinical_source)
hnscc_clinical_data = hnscc_data.get_clinical(clinical_source)
lscc_clinical_data = lscc_data.get_clinical(clinical_source)
luad_clinical_data = luad_data.get_clinical(clinical_source)
ov_clinical_data = ov_data.get_clinical(clinical_source)
pdac_clinical_data = pdac_data.get_clinical(clinical_source)
ucec_clinical_data = ucec_data.get_clinical(clinical_source)

In [19]:
## stack the per-cancer clinical tables row-wise into a single frame
clinical_data_list = [
    brca_clinical_data,
    ccrcc_clinical_data,
    coad_clinical_data,
    gbm_clinical_data,
    hnscc_clinical_data,
    lscc_clinical_data,
    luad_clinical_data,
    ov_clinical_data,
    pdac_clinical_data,
    ucec_clinical_data,
]
combined_clinical_data = pd.concat(clinical_data_list)

#### Proteomics

In [20]:
## fetch the proteomics table of every cancer type, always passing the same 'bcm' argument
proteomics_source = 'bcm'

brca_proteomics_data = brca_data.get_proteomics(proteomics_source)
ccrcc_proteomics_data = ccrcc_data.get_proteomics(proteomics_source)
coad_proteomics_data = coad_data.get_proteomics(proteomics_source)
gbm_proteomics_data = gbm_data.get_proteomics(proteomics_source)
hnscc_proteomics_data = hnscc_data.get_proteomics(proteomics_source)
lscc_proteomics_data = lscc_data.get_proteomics(proteomics_source)
luad_proteomics_data = luad_data.get_proteomics(proteomics_source)
ov_proteomics_data = ov_data.get_proteomics(proteomics_source)
pdac_proteomics_data = pdac_data.get_proteomics(proteomics_source)
ucec_proteomics_data = ucec_data.get_proteomics(proteomics_source)

In [21]:
## stack the per-cancer proteomics tables row-wise
proteomics_data_list = [
    brca_proteomics_data,
    ccrcc_proteomics_data,
    coad_proteomics_data,
    gbm_proteomics_data,
    hnscc_proteomics_data,
    lscc_proteomics_data,
    luad_proteomics_data,
    ov_proteomics_data,
    pdac_proteomics_data,
    ucec_proteomics_data,
]

## concatenate, then drop the 'Database_ID' ("ens") level from the column index
combined_proteomics_data = pd.concat(proteomics_data_list).droplevel('Database_ID', axis=1)

#### Transcriptomics

In [22]:
## fetch the transcriptomics table of every cancer type, always passing the same 'bcm' argument
transcriptomics_source = 'bcm'

brca_transcriptomics_data = brca_data.get_transcriptomics(transcriptomics_source)
ccrcc_transcriptomics_data = ccrcc_data.get_transcriptomics(transcriptomics_source)
coad_transcriptomics_data = coad_data.get_transcriptomics(transcriptomics_source)
gbm_transcriptomics_data = gbm_data.get_transcriptomics(transcriptomics_source)
hnscc_transcriptomics_data = hnscc_data.get_transcriptomics(transcriptomics_source)
lscc_transcriptomics_data = lscc_data.get_transcriptomics(transcriptomics_source)
luad_transcriptomics_data = luad_data.get_transcriptomics(transcriptomics_source)
ov_transcriptomics_data = ov_data.get_transcriptomics(transcriptomics_source)
pdac_transcriptomics_data = pdac_data.get_transcriptomics(transcriptomics_source)
ucec_transcriptomics_data = ucec_data.get_transcriptomics(transcriptomics_source)

In [23]:
## stack the per-cancer transcriptomics tables row-wise
transcriptomics_data_list = [
    brca_transcriptomics_data,
    ccrcc_transcriptomics_data,
    coad_transcriptomics_data,
    gbm_transcriptomics_data,
    hnscc_transcriptomics_data,
    lscc_transcriptomics_data,
    luad_transcriptomics_data,
    ov_transcriptomics_data,
    pdac_transcriptomics_data,
    ucec_transcriptomics_data,
]

## concatenate, then drop the 'Database_ID' ("ens") level from the column index
combined_transcriptomics_data = pd.concat(transcriptomics_data_list).droplevel('Database_ID', axis=1)

#### Process

In [24]:
## load gene lists: a JSON object mapping each marker name to a list of gene names
## (the encoding is pinned to UTF-8 so decoding does not depend on the platform's
## default locale encoding)
with open('../genes.json', 'r', encoding='utf-8') as gene_file:
    marker_genes = json.load(gene_file)

In [25]:
## keep only the 'tumor_code' column of the combined clinical table (all rows)
select_clinical_data = combined_clinical_data.loc[:, 'tumor_code']

In [26]:
## select relevant proteomics columns
## flatten the marker gene lists, keeping first-seen order and listing each gene once:
## a gene repeated across lists would otherwise be selected as duplicate columns and
## be counted more than once in the per-marker means computed below
marker_gene_names = list(dict.fromkeys(gene for genes in marker_genes.values() for gene in genes))

## keep only genes that exist as proteomics columns; indexing a frame with a list that
## contains an absent label raises KeyError
measured_proteomics_genes = set(combined_proteomics_data.columns)
select_gene_columns = [gene for gene in marker_gene_names if gene in measured_proteomics_genes]
missing_proteomics_genes = [gene for gene in marker_gene_names if gene not in measured_proteomics_genes]
if missing_proteomics_genes:
    ## report rather than silently skip, so gaps in the marker lists stay visible
    print(f"{len(missing_proteomics_genes)} marker gene(s) not found in proteomics data: {missing_proteomics_genes}")

select_proteomics_data = combined_proteomics_data[select_gene_columns].copy()

## z-score normalize, ignoring nan values
def zscore_with_nan_handling(series):
    """Z-score the non-NaN entries of ``series`` and leave NaN entries as NaN.

    The mean and standard deviation are computed by ``scipy.stats.zscore`` over
    the non-NaN values only. Entries are matched by position rather than by
    index label, so a repeated index label cannot misalign the write-back.

    Parameters
    ----------
    series : pandas.Series
        Numeric values, possibly containing NaN.

    Returns
    -------
    pandas.Series
        A new float Series with the same index and name as ``series``; the
        input is not modified.
    """
    values = series.to_numpy(dtype=float, copy=True)
    observed = ~np.isnan(values)
    # Skip the call entirely for an all-NaN column: there is nothing to standardize.
    if observed.any():
        values[observed] = zscore(values[observed])
    return pd.Series(values, index=series.index, name=series.name)
zscored_proteomics = select_proteomics_data[select_gene_columns].apply(zscore_with_nan_handling, axis=0)
select_proteomics_data[select_gene_columns] = zscored_proteomics

## per-marker score: row-wise mean over those of the marker's genes that exist as columns
for marker, genes in marker_genes.items():
    genes_present = [gene_name for gene_name in genes if gene_name in select_proteomics_data.columns]
    marker_mean = select_proteomics_data[genes_present].mean(axis=1)
    select_proteomics_data[marker] = marker_mean

## overall score: row-wise mean of the per-marker scores
marker_score_columns = list(marker_genes)
select_proteomics_data['overall_score'] = select_proteomics_data[marker_score_columns].mean(axis=1)

## merge proteomics and clinical data on Patient_ID (inner join: unmatched rows are dropped)
merged_proteomics_data = select_proteomics_data.merge(select_clinical_data, on='Patient_ID', how='inner')

## save proteomics data as a tab-separated file
## NOTE(review): index=False leaves the frame's index out of the file; if Patient_ID is
## the index after the merge, the saved rows carry no patient identifier - confirm intended
merged_proteomics_data.to_csv('./data/proteomics_data.tsv', sep='\t', index=False)

In [27]:
## select relevant transcriptomics columns
## flatten the marker gene lists, keeping first-seen order and listing each gene once:
## a gene repeated across lists would otherwise be selected as duplicate columns and
## be counted more than once in the per-marker means computed below
marker_gene_names = list(dict.fromkeys(gene for genes in marker_genes.values() for gene in genes))

## keep only genes that exist as transcriptomics columns; indexing a frame with a list
## that contains an absent label raises KeyError
measured_transcriptomics_genes = set(combined_transcriptomics_data.columns)
select_gene_columns = [gene for gene in marker_gene_names if gene in measured_transcriptomics_genes]
missing_transcriptomics_genes = [gene for gene in marker_gene_names if gene not in measured_transcriptomics_genes]
if missing_transcriptomics_genes:
    ## report rather than silently skip, so gaps in the marker lists stay visible
    print(f"{len(missing_transcriptomics_genes)} marker gene(s) not found in transcriptomics data: {missing_transcriptomics_genes}")

select_transcriptomics_data = combined_transcriptomics_data[select_gene_columns].copy()

## z-score normalize, ignoring nan values
def zscore_with_nan_handling(series):
    """Z-score the non-NaN entries of ``series`` and leave NaN entries as NaN.

    The mean and standard deviation are computed by ``scipy.stats.zscore`` over
    the non-NaN values only. Entries are matched by position rather than by
    index label, so a repeated index label cannot misalign the write-back.

    Parameters
    ----------
    series : pandas.Series
        Numeric values, possibly containing NaN.

    Returns
    -------
    pandas.Series
        A new float Series with the same index and name as ``series``; the
        input is not modified.
    """
    values = series.to_numpy(dtype=float, copy=True)
    observed = ~np.isnan(values)
    # Skip the call entirely for an all-NaN column: there is nothing to standardize.
    if observed.any():
        values[observed] = zscore(values[observed])
    return pd.Series(values, index=series.index, name=series.name)
zscored_transcriptomics = select_transcriptomics_data[select_gene_columns].apply(zscore_with_nan_handling, axis=0)
select_transcriptomics_data[select_gene_columns] = zscored_transcriptomics

## per-marker score: row-wise mean over those of the marker's genes that exist as columns
for marker, genes in marker_genes.items():
    genes_present = [gene_name for gene_name in genes if gene_name in select_transcriptomics_data.columns]
    marker_mean = select_transcriptomics_data[genes_present].mean(axis=1)
    select_transcriptomics_data[marker] = marker_mean

## overall score: row-wise mean of the per-marker scores
marker_score_columns = list(marker_genes)
select_transcriptomics_data['overall_score'] = select_transcriptomics_data[marker_score_columns].mean(axis=1)

## merge transcriptomics and clinical data on Patient_ID (inner join: unmatched rows are dropped)
merged_transcriptomics_data = select_transcriptomics_data.merge(select_clinical_data, on='Patient_ID', how='inner')

## save transcriptomics data as a tab-separated file
## NOTE(review): index=False leaves the frame's index out of the file; if Patient_ID is
## the index after the merge, the saved rows carry no patient identifier - confirm intended
merged_transcriptomics_data.to_csv('./data/transcriptomics_data.tsv', sep='\t', index=False)