In [1]:
# Basic imports
import pandas as pd
import numpy as np

# Demographic Data

In [2]:
# Load the master patient table and keep only the barcode plus the
# histology / age / gender fields in a single expression.
demo_data = pd.read_excel('../data/SARC_Master_Patient_Table.xlsx').loc[
    :, ['TCGA_barcode', 'short_histo', 'age_at_diagnosis', 'gender']
]

# Quick sanity check: dimensions, then the first few rows.
print(demo_data.shape)
demo_data.head()

(206, 4)


  warn(msg)


Unnamed: 0,TCGA_barcode,short_histo,age_at_diagnosis,gender
0,TCGA-3B-A9HI-01,UPS,68,MALE
1,TCGA-3B-A9HL-01,DDLPS,67,MALE
2,TCGA-3B-A9HO-01,DDLPS,75,MALE
3,TCGA-3B-A9HP-01,ULMS,57,FEMALE
4,TCGA-3B-A9HQ-01,STLMS,66,FEMALE


# Gene Expression / RNA Data

In [3]:
# Read in data (tab-separated; one row per gene, gene name in 'attrib_name')
rna_data = pd.read_csv('../data/Human__TCGA_SARC__UNC__RNAseq__HiSeq_RNA__01_28_2016__BI__Gene__Firehose_RSEM_log2.cct', sep='\t')

# Select genes of interest and pivot so each row is a sample and each column
# a gene. 'attrib_name' is moved into the index *before* transposing:
# transposing with that string column still in the frame would coerce every
# expression value to object dtype instead of keeping it numeric.
genes = ["CD274", "PDCD1", "CTLA4", "LAG3", "TIGIT", "TCF7", "HAVCR2"]
rna_data = rna_data[rna_data['attrib_name'].isin(genes)].set_index('attrib_name').T
rna_data.columns.name = None  # drop the leftover 'attrib_name' axis label
rna_data = rna_data.reset_index()  # sample IDs become the 'index' column

# Reformat the subject IDs in one vectorised pass: dots -> dashes, keep at
# most the first 15 characters, then append the "-01" suffix.
# regex=False is required so that '.' is treated literally, not as "any char".
# NOTE(review): presumably the raw IDs are 12-character patient barcodes, so
# the slice is a no-op and the result is a 15-character ID - confirm.
rna_data['index'] = rna_data['index'].str.replace('.', '-', regex=False).str[:15] + "-01"

print(rna_data.shape)
rna_data.head()

(259, 8)


Unnamed: 0,index,CD274,CTLA4,HAVCR2,LAG3,PDCD1,TCF7,TIGIT
0,TCGA-3B-A9HI-01,4.8806,5.7956,8.5865,7.5205,6.3787,7.5813,7.6166
1,TCGA-3B-A9HJ-01,5.9762,7.8571,9.8304,8.5099,10.2843,12.151,10.1038
2,TCGA-3B-A9HL-01,1.702,2.2034,8.7916,4.0682,2.4613,6.6654,4.9633
3,TCGA-3B-A9HO-01,3.6266,7.0599,10.4555,9.9738,9.3923,7.6773,8.5286
4,TCGA-3B-A9HP-01,4.3137,3.8988,8.3686,8.3423,4.4836,6.678,5.4673


# Immune Infiltration / Cibersort Data

In [4]:
# Read in data (tab-separated CIBERSORT relative fractions, one row per sample)
cibersort_data = pd.read_csv("../data/TCGA.Kallisto.fullIDs.cibersort.relative.tsv", sep="\t")

# T-cell fraction columns that are summed row-wise into "Y"
t_cell_columns = ['T.cells.CD8', 'T.cells.CD4.naive',
       'T.cells.CD4.memory.resting', 'T.cells.CD4.memory.activated',
       'T.cells.follicular.helper', 'T.cells.regulatory..Tregs.',
       'T.cells.gamma.delta']

# Build the two-column result directly instead of mutating the frame row by
# row: a Python loop of `.loc[i, ...]` writes is slow on a table this size and
# only works while the index labels happen to equal the row positions.
cibersort_data = pd.DataFrame({
    # Reformat the subject IDs: dots -> dashes (literal match, hence
    # regex=False), truncated to the first 15 characters.
    'SampleID': cibersort_data['SampleID'].str.replace('.', '-', regex=False).str[:15],
    # Sum the selected T-cell columns for each sample.
    'Y': cibersort_data[t_cell_columns].sum(axis=1),
})

print(cibersort_data.shape)
cibersort_data.head()

(11373, 2)


Unnamed: 0,SampleID,Y
0,TCGA-OR-A5JG-01,0.363818
1,TCGA-OR-A5LG-01,0.284199
2,TCGA-OR-A5JD-01,0.297711
3,TCGA-OR-A5LH-01,0.307141
4,TCGA-OR-A5KY-01,0.277398


# Copy Number Aberrations Data

In [5]:
# Read in data
cna_data = pd.read_excel('../data/CN_by_genes_pansarc.xlsx')

# Reformat the subject IDs (keep the first 15 characters).
# This must happen BEFORE the gene subset is taken: `gene_data` is an
# independent copy and is the frame merged later on, so truncating
# `cna_data['Sample']` afterwards would leave `gene_data` with the
# un-normalised IDs.
cna_data['Sample'] = cna_data['Sample'].str[:15]

# Select columns of interest (sample ID plus per-gene copy-number columns)
gene_data = cna_data[['Sample', 'JUN', 'VGLL3', 'TERT', 'MAP3K5', 'UST', 'CDKN2A', 'YAP1', 'CDKN1B', 'PTPRQ', 'RB1', 'TP53', 'MYOCD', 'NF1', 'CCNE1', 'CEBPA', 'ZNF552', 'ATRX', 'PTEN', 'DDIT3', 'CDK4', 'HMGA2', 'MDM2', 'FRS2']]

print(cna_data.shape)
cna_data.head()

(206, 64)


  warn(msg)


Unnamed: 0,Sample,short_histo,JUN,VGLL3,TERT,MAP3K5,UST,CDKN2A,YAP1,CDKN1B,...,HDLBP_hetdel,HDLBP_homdel,NF1_hetdel,NF1_homdel,PTEN_hetdel,PTEN_homdel,RB1_hetdel,RB1_homdel,TP53_hetdel,TP53_homdel
0,TCGA-3B-A9HI-01,UPS,-1,0,1,1,1,-1,1,-1,...,False,False,False,False,False,0,False,0,False,0
1,TCGA-3B-A9HL-01,DDLPS,2,1,1,2,0,-1,0,0,...,no call,no call,no call,no call,no call,no call,no call,no call,no call,no call
2,TCGA-3B-A9HO-01,DDLPS,0,0,2,0,0,0,-1,0,...,False,False,False,False,False,0,False,0,False,0
3,TCGA-3B-A9HP-01,ULMS,0,0,0,0,0,0,0,0,...,False,False,False,False,False,0,False,0,1,0
4,TCGA-3B-A9HQ-01,STLMS,0,0,1,0,0,0,0,0,...,False,False,2,False,1,0,1,0,1,0


# Mutational Load Data

In [6]:
# Read in data (tab-separated mutation-load table)
mutational_load_data = pd.read_csv('../data/mutation-load_updated.txt', sep='\t')

# Select columns of interest
mutational_load_data = mutational_load_data[['Tumor_Sample_ID', 'Silent per Mb', 'Non-silent per Mb']]

# Reformat the subject IDs: keep the first 15 characters. Done as a single
# vectorised string slice rather than a per-row `.loc` loop, which is slow on
# a table this size and relies on index labels matching row positions.
mutational_load_data = mutational_load_data.assign(
    Tumor_Sample_ID=mutational_load_data['Tumor_Sample_ID'].str[:15]
)

print(mutational_load_data.shape)
mutational_load_data.head()

(10123, 3)


Unnamed: 0,Tumor_Sample_ID,Silent per Mb,Non-silent per Mb
0,TCGA-OR-A5JR-01,0.051687,0.051687
1,TCGA-OR-A5JH-01,0.10244,0.15366
2,TCGA-OR-A5JQ-01,0.081171,0.162342
3,TCGA-OR-A5L9-01,0.053545,0.160636
4,TCGA-OR-A5LA-01,0.054564,0.190974


# Merge datasets

In [7]:

# Join every per-sample table onto the demographic table, keyed on the
# demographic table's TCGA barcode. With a left join every row of
# `demo_data` is retained even when a table has no match for it.
method = 'left'

result = demo_data
for right_frame, right_key in [
    (gene_data, 'Sample'),
    (mutational_load_data, 'Tumor_Sample_ID'),
    (rna_data, 'index'),
    (cibersort_data, 'SampleID'),
]:
    result = result.merge(right_frame, left_on='TCGA_barcode', right_on=right_key, how=method)

# The right-hand key columns duplicate TCGA_barcode once merged; drop them.
result = result.drop(columns=['Tumor_Sample_ID', 'Sample', 'SampleID', 'index'])

print(result.shape)
result.head()

(206, 37)


Unnamed: 0,TCGA_barcode,short_histo,age_at_diagnosis,gender,JUN,VGLL3,TERT,MAP3K5,UST,CDKN2A,...,Silent per Mb,Non-silent per Mb,CD274,CTLA4,HAVCR2,LAG3,PDCD1,TCF7,TIGIT,Y
0,TCGA-3B-A9HI-01,UPS,68,MALE,-1,0,1,1,1,-1,...,0.211193,0.696937,4.8806,5.7956,8.5865,7.5205,6.3787,7.5813,7.6166,0.21916
1,TCGA-3B-A9HL-01,DDLPS,67,MALE,2,1,1,2,0,-1,...,0.208978,0.88235,1.702,2.2034,8.7916,4.0682,2.4613,6.6654,4.9633,0.208924
2,TCGA-3B-A9HO-01,DDLPS,75,MALE,0,0,2,0,0,0,...,0.319235,1.021551,3.6266,7.0599,10.4555,9.9738,9.3923,7.6773,8.5286,0.381263
3,TCGA-3B-A9HP-01,ULMS,57,FEMALE,0,0,0,0,0,0,...,0.178188,0.801848,4.3137,3.8988,8.3686,8.3423,4.4836,6.678,5.4673,0.256825
4,TCGA-3B-A9HQ-01,STLMS,66,FEMALE,0,0,1,0,0,0,...,0.24388,1.108546,6.1419,6.4742,8.503,6.4067,6.3268,9.8664,7.1106,0.321267


# Export Combined Data

In [8]:
# Write the merged table to CSV, omitting the DataFrame's row index.
result.to_csv(path_or_buf='../data/combined_sarcoma_data.csv', index=False)