# Sarcoma Analysis Data Cleaning
Before running this notebook, make sure you have followed the instructions in `/data/data_sources.md`; the cells below read their input files from `./data/`.

In [None]:
# Basic imports
import pandas as pd

# Demographic Data

In [None]:
# Columns kept from the master patient table; 'TCGA_barcode' is the key
# that the merge cell joins every other table on.
demo_columns = ['TCGA_barcode', 'short_histo', 'age_at_diagnosis', 'gender']

# Load the Excel workbook and keep only those columns in one step
demo_data = pd.read_excel('./data/SARC_Master_Patient_Table.xlsx')[demo_columns]

# Quick sanity check: dimensions, then a preview of the first rows
print(demo_data.shape)
demo_data.head()

# Gene Expression / RNA Data

In [None]:
# Read in data (tab-separated; genes are rows labelled by 'attrib_name',
# every other column is one sample)
rna_raw = pd.read_csv('./data/Human__TCGA_SARC__UNC__RNAseq__HiSeq_RNA__01_28_2016__BI__Gene__Firehose_RSEM_log2.cct', sep='\t')

# Genes of interest, matched against the 'attrib_name' column
genes = ["CD274", "PDCD1", "CTLA4", "LAG3", "TIGIT", "TCF7", "HAVCR2"]

# Keep only those genes, then transpose so that each row is a sample and each
# column is a gene. Moving 'attrib_name' into the index *before* transposing
# keeps the label text out of the values, so the expression columns retain the
# dtype read_csv inferred instead of being upcast to object.
rna_data = rna_raw[rna_raw['attrib_name'].isin(genes)].set_index('attrib_name').T
rna_data.columns.name = None      # drop the leftover 'attrib_name' axis label
rna_data = rna_data.reset_index() # sample IDs become a regular 'index' column

# Reformat the subject IDs so they can be joined on 'TCGA_barcode':
# dots -> dashes, cap at 15 characters, then append the "-01" suffix.
# regex=False makes '.' a literal dot rather than the regex wildcard.
# NOTE(review): the suffix only yields a 15-character ID if the raw IDs are
# 12 characters long -- confirm against the input file.
rna_data['index'] = (
    rna_data['index'].str.replace('.', '-', regex=False).str[:15] + "-01"
)

print(rna_data.shape)
rna_data.head()

# Immune Infiltration / Cibersort Data

In [None]:
# Read in data (tab-separated)
cibersort_raw = pd.read_csv("./data/TCGA.Kallisto.fullIDs.cibersort.relative.tsv", sep="\t")

# T-cell columns whose values are summed into a single score per sample
t_cell_columns = ['T.cells.CD8', 'T.cells.CD4.naive',
       'T.cells.CD4.memory.resting', 'T.cells.CD4.memory.activated',
       'T.cells.follicular.helper', 'T.cells.regulatory..Tregs.',
       'T.cells.gamma.delta']

# Build the two-column result directly:
#   SampleID - dots -> dashes, truncated to the first 15 characters
#              (vectorised; regex=False makes '.' a literal dot)
#   Y        - row-wise sum of the T-cell columns listed above
# NOTE(review): truncating IDs can make several rows share one SampleID; a
# left merge on that key would then duplicate patient rows -- confirm whether
# de-duplication is needed.
cibersort_data = pd.DataFrame({
    'SampleID': cibersort_raw['SampleID'].str.replace('.', '-', regex=False).str[:15],
    'Y': cibersort_raw[t_cell_columns].sum(axis=1),
})

print(cibersort_data.shape)
cibersort_data.head()

# Copy Number Aberrations Data

In [None]:
# Read in data
cna_data = pd.read_excel('./data/CN_by_genes_pansarc.xlsx')

# Genes whose copy-number columns are kept
cna_genes = ['JUN', 'VGLL3', 'TERT', 'MAP3K5', 'UST', 'CDKN2A', 'YAP1', 'CDKN1B', 'PTPRQ', 'RB1', 'TP53', 'MYOCD', 'NF1', 'CCNE1', 'CEBPA', 'ZNF552', 'ATRX', 'PTEN', 'DDIT3', 'CDK4', 'HMGA2', 'MDM2', 'FRS2']

# Select columns of interest and reformat the subject IDs (first 15
# characters) on the selected frame itself. `gene_data` is the frame the merge
# cell joins on 'Sample'; selecting columns yields an independent copy, so
# truncating the IDs on `cna_data` afterwards would leave `gene_data` with the
# raw, untruncated IDs.
gene_data = cna_data[['Sample'] + cna_genes].assign(
    Sample=lambda frame: frame['Sample'].str[:15]
)

# Preview the frame that is actually used downstream
print(gene_data.shape)
gene_data.head()

# Mutational Load Data

In [None]:
# Read in data (tab-separated), keep the ID plus the two mutation-rate
# columns, and truncate the subject IDs to their first 15 characters.
# The truncation is vectorised via `.str[:15]`; `.assign` returns a new frame,
# so nothing is mutated in place.
mutational_load_data = (
    pd.read_csv('./data/mutation-load_updated.txt', sep='\t')
    [['Tumor_Sample_ID', 'Silent per Mb', 'Non-silent per Mb']]
    .assign(Tumor_Sample_ID=lambda frame: frame['Tumor_Sample_ID'].str[:15])
)

print(mutational_load_data.shape)
mutational_load_data.head()

# Merge datasets

In [None]:

# Join type used for every merge: keep all rows of the demographic table
method = 'left'

# Tables to attach, in order, each paired with the name of its ID column
joins = [
    (gene_data, 'Sample'),
    (mutational_load_data, 'Tumor_Sample_ID'),
    (rna_data, 'index'),
    (cibersort_data, 'SampleID'),
]

# Start from the demographic table and attach each table on 'TCGA_barcode'
result = demo_data
for table, id_column in joins:
    result = result.merge(table, left_on='TCGA_barcode', right_on=id_column, how=method)

# The per-table ID columns are redundant once everything is keyed on 'TCGA_barcode'
result = result.drop(columns=[id_column for _, id_column in joins])

print(result.shape)
result.head()

# Export Combined Data

In [None]:
# Write the merged table to CSV; index=False leaves the row index out of the file
combined_csv_path = './data/combined_sarcoma_data.csv'
result.to_csv(combined_csv_path, index=False)