In [1]:
import pandas as pd
import json
import biomart      

# Import utilities
import os
import sys
os.chdir("/Volumes/kueck/PublicDataAnalysis/CASCAM_style_subtype_classification/src/preprocessing")
# os.getcwd() not working as expected, so need to set wd manually (update as needed).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '../'))
sys.path.append(parent_dir)

from utilities import get_ensembl_mappings_grch38

In [3]:
ensembl_to_gene_symbol_df = get_ensembl_mappings_grch38()

Skipped 22300 entries because they were missing gene_symbol


Unnamed: 0,ensembl_gene_id,external_gene_name
0,ENSG00000210049,MT-TF
1,ENSG00000211459,MT-RNR1
2,ENSG00000210077,MT-TV
3,ENSG00000210082,MT-RNR2
4,ENSG00000209082,MT-TL1
...,...,...
48306,ENSG00000232679,LINC01705
48307,ENSG00000200033,RNU6-403P
48308,ENSG00000228437,LINC02474
48309,ENSG00000229463,LYST-AS1


In [4]:
tcga_data_folder = "../../data/public_data_sets/TCGA_OV_RNA"
log2_counts_file = f"{tcga_data_folder}/TCGA-OV.htseq_counts.tsv"
mutations_file = f"{tcga_data_folder}/TCGA-OV.muse_snv.tsv"
copy_number_file = f"{tcga_data_folder}/TCGA-OV.gistic.tsv"
metadata_file = f"{tcga_data_folder}/TCGA_cohort_metadata.json"
parsed_metadata_file = f"{tcga_data_folder}/TCGA_cohort_metadata_parsed.txt"

### Load and parse metadata file

Creates txt file from json metadata file

In [5]:
with open(metadata_file, "r") as file:
    metadata_json = json.loads(file.read())
metadata = pd.json_normalize(metadata_json)

# Format the experimental strategies more legibly by parsing list of dictionaries
metadata["summary.experimental_strategies"] = metadata[
    "summary.experimental_strategies"
].apply(lambda exp_list: [exp["experimental_strategy"] for exp in exp_list])

display(metadata)

metadata.to_csv(parsed_metadata_file, sep = "\t", index=False)

# Check that all submitter ids are unique
assert metadata["submitter_id"].nunique() == metadata.shape[0]

Unnamed: 0,primary_site,submitter_slide_ids,disease_type,case_id,submitter_id,diagnoses,summary.file_count,summary.experimental_strategies,project.project_id,project.program.name,demographic.race,demographic.gender,demographic.ethnicity,demographic.vital_status,demographic.days_to_death
0,Ovary,"[TCGA-13-0920-01A-01-BS1, TCGA-13-0920-01A-01-...","Cystic, Mucinous and Serous Neoplasms",85a85a11-7200-4e96-97af-6ba26d680d59,TCGA-13-0920,[{'age_at_diagnosis': 24064}],84,"[WXS, miRNA-Seq, RNA-Seq, WGS, Genotyping Arra...",TCGA-OV,TCGA,white,female,not hispanic or latino,Dead,1484.0
1,Ovary,"[TCGA-09-1662-01A-01-BS1, TCGA-09-1662-01A-01-...","Cystic, Mucinous and Serous Neoplasms",867f9563-16c9-45a8-b519-6df61ba1b6b7,TCGA-09-1662,[{'age_at_diagnosis': 21246}],54,"[miRNA-Seq, RNA-Seq, WGS, Genotyping Array, Me...",TCGA-OV,TCGA,white,female,not hispanic or latino,Dead,2717.0
2,Ovary,"[TCGA-24-2033-01A-01-BS1, TCGA-24-2033-01A-01-...","Cystic, Mucinous and Serous Neoplasms",872d2922-7292-4681-adb7-d3b267eccbe7,TCGA-24-2033,[{'age_at_diagnosis': 31977}],70,"[WXS, miRNA-Seq, RNA-Seq, WGS, Genotyping Arra...",TCGA-OV,TCGA,white,female,not reported,Dead,562.0
3,Ovary,"[TCGA-61-2110-11A-01-TS1, TCGA-61-2110-11A-01-...","Cystic, Mucinous and Serous Neoplasms",88180134-710f-46ea-9b06-5e5d860d6d9f,TCGA-61-2110,[{'age_at_diagnosis': 20718}],69,"[WXS, miRNA-Seq, RNA-Seq, Genotyping Array, Me...",TCGA-OV,TCGA,white,female,not hispanic or latino,Dead,1354.0
4,Ovary,"[TCGA-29-1696-01A-01-TS1, TCGA-29-1696-01A-01-...","Cystic, Mucinous and Serous Neoplasms",8a98a6e6-b763-4824-858b-fd2738e6c9a3,TCGA-29-1696,[{'age_at_diagnosis': 15818}],70,"[WXS, miRNA-Seq, RNA-Seq, WGS, Genotyping Arra...",TCGA-OV,TCGA,white,female,not reported,Dead,1032.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
417,Ovary,"[TCGA-10-0938-01A-01-BS1, TCGA-10-0938-01A-02-...","Cystic, Mucinous and Serous Neoplasms",7ca97692-0bf5-4bbd-81ce-10a051d04bd5,TCGA-10-0938,[{'age_at_diagnosis': 29558}],53,"[WXS, miRNA-Seq, RNA-Seq, WGS, Genotyping Arra...",TCGA-OV,TCGA,white,female,not reported,Dead,636.0
418,Ovary,"[TCGA-13-0804-01A-01-TS1, TCGA-13-0804-01A-01-...","Cystic, Mucinous and Serous Neoplasms",7e34d3c1-1fab-4326-9a69-4260d2bac558,TCGA-13-0804,[{'age_at_diagnosis': 27021}],84,"[WXS, miRNA-Seq, RNA-Seq, WGS, Genotyping Arra...",TCGA-OV,TCGA,white,female,not hispanic or latino,Dead,1073.0
419,Ovary,"[TCGA-25-2042-01A-01-BS1, TCGA-25-2042-01Z-00-...","Cystic, Mucinous and Serous Neoplasms",818dc159-aba4-46bc-a4ed-68ee0f8c4461,TCGA-25-2042,[{'age_at_diagnosis': 22127}],69,"[WXS, miRNA-Seq, RNA-Seq, Genotyping Array, Me...",TCGA-OV,TCGA,american indian or alaska native,female,not hispanic or latino,Dead,396.0
420,Ovary,"[TCGA-13-1512-01A-01-TS1, TCGA-13-1512-01A-01-...","Cystic, Mucinous and Serous Neoplasms",82093ed9-a3c8-4e34-931f-4ec7ae745711,TCGA-13-1512,[{'age_at_diagnosis': 18134}],70,"[WXS, miRNA-Seq, RNA-Seq, WGS, Genotyping Arra...",TCGA-OV,TCGA,white,female,not hispanic or latino,Alive,


### Load log2 counts data

In [6]:
log2_counts = pd.read_csv(log2_counts_file, sep="\t", index_col=0)

# Rename with submitter ID instead of submitter_slide_id (up to 3rd "-"), merging samples with same submitter ID
log2_counts.columns = ['-'.join(col.split("-", 3)[:3]) for col in log2_counts.columns]
log2_counts = log2_counts.T
log2_counts = log2_counts.groupby(level=0).mean()
log2_counts = log2_counts.T
assert not log2_counts.columns.duplicated().any()

# Add gene symbol column
log2_counts.insert(0,
                   "ensembl_id",
                   [ensembl_id_with_version.split(".")[0] for ensembl_id_with_version in log2_counts.index])
def get_gene_symbol(ensembl_id):
    try:
        return ensembl_to_gene_symbol_df.loc[ensembl_to_gene_symbol_df["ensembl_gene_id"] == ensembl_id, "external_gene_name"].values[0]
    except IndexError:
        return None
log2_counts.insert(0,
                   "gene_symbol",
                   log2_counts["ensembl_id"].apply(get_gene_symbol))

display(log2_counts)

Unnamed: 0_level_0,gene_symbol,ensembl_id,TCGA-04-1331,TCGA-04-1332,TCGA-04-1338,TCGA-04-1341,TCGA-04-1343,TCGA-04-1347,TCGA-04-1350,TCGA-04-1356,...,TCGA-61-2101,TCGA-61-2102,TCGA-61-2104,TCGA-61-2109,TCGA-61-2110,TCGA-61-2111,TCGA-61-2113,TCGA-OY-A56Q,TCGA-VG-A8LO,TCGA-WR-A838
Ensembl_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003.13,TSPAN6,ENSG00000000003,12.804131,11.340963,10.324181,11.982637,11.885696,12.466586,12.219169,12.427051,...,11.991876,11.413099,11.516685,11.423641,9.952741,11.976922,11.887982,11.969746,12.122828,12.583318
ENSG00000000005.5,TNMD,ENSG00000000005,3.459432,3.700440,1.000000,3.169925,7.546894,3.807355,0.000000,1.584963,...,1.584963,5.426265,1.000000,3.000000,2.584963,0.000000,1.584963,1.584963,2.321928,3.169925
ENSG00000000419.11,DPM1,ENSG00000000419,11.669771,10.344296,10.451211,11.791163,13.057146,12.323899,11.459944,11.805341,...,12.127350,10.632995,11.373409,10.764042,10.800091,11.249706,11.759472,11.455841,11.658211,11.528942
ENSG00000000457.12,SCYL3,ENSG00000000457,10.364135,8.797662,8.566054,9.301496,9.036174,9.071462,8.693487,9.741467,...,10.273796,9.957102,10.381543,9.703904,9.398744,10.135709,9.968667,10.736402,10.729621,10.222795
ENSG00000000460.15,FIRRM,ENSG00000000460,9.812177,8.471675,7.569856,8.632995,8.299208,8.092757,7.988685,8.985842,...,9.794416,10.203348,9.541097,9.503826,8.876517,9.245553,9.954196,9.477758,9.917372,9.533330
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
__no_feature,,__no_feature,23.565784,23.194738,22.149820,22.474268,22.828578,22.161203,22.279944,23.237803,...,22.598964,21.584929,22.804893,22.821447,22.747116,22.056261,23.035873,23.152011,23.547087,23.248577
__ambiguous,,__ambiguous,21.348947,20.827971,20.330253,22.516980,22.132119,21.377337,21.835249,21.728927,...,22.101431,20.672535,21.529985,21.100490,21.386072,20.646611,21.513571,21.506699,21.747044,21.519990
__too_low_aQual,,__too_low_aQual,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
__not_aligned,,__not_aligned,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


## Load and parse mutations

In [7]:
individual_mutations = pd.read_csv(mutations_file, sep="\t")

# Rename with submitter ID instead of sample_ID(up to 3rd "-")
individual_mutations.insert(0, "submitter_id", ['-'.join(col.split("-", 3)[:3]) for col in individual_mutations["Sample_ID"]])

# See what effects are present
effects_nested = list(set(individual_mutations["effect"]))
effects = list(set([item for sublist in effects_nested for item in sublist.split(";")]))
# display(pd.DataFrame({"effect": effects}))

# Point mutation categories defined by Jessi
point_mutant_effects = ["stop_gained",
                       "start_lost",
                       "missense_variant",
                       "splice_donor_variant",
                       "splice_acceptor_variant"]
assert(all([effect in effects for effect in point_mutant_effects]))

# Find TP53 mutants
individual_mutations.insert(1, "tp53_mutant", individual_mutations["effect"].apply(lambda effect: any([effect in point_mutant_effects])))
tp53_point_mutants = list(set(individual_mutations[(individual_mutations["gene"] == "TP53") & individual_mutations["tp53_mutant"]]["submitter_id"]))

# Identify KRAS mutants
individual_mutations.insert(1, "kras_mutant", individual_mutations["effect"].apply(lambda effect: any([effect in point_mutant_effects])))
kras_point_mutants = list(set(individual_mutations[(individual_mutations["gene"] == "KRAS") & individual_mutations["kras_mutant"]]["submitter_id"]))

print(f"Num samples with TP53 mutations: {len(tp53_point_mutants)}/{metadata.shape[0]}")
print(f"Num samples with KRAS mutations: {len(kras_point_mutants)}/{metadata.shape[0]}")

# Format into a data frame with submitter_id, tp53_mutant, kras_mutant
mutations = pd.DataFrame(
    {
        "submitter_id": list(set(individual_mutations["submitter_id"]))
    }
)
mutations["tp53_mutant"] = mutations["submitter_id"].apply(lambda submitter_id: submitter_id in tp53_point_mutants)
mutations["kras_mutant"] = mutations["submitter_id"].apply(lambda submitter_id: submitter_id in kras_point_mutants)

# Remove redundant rows and ensure submitter ids are unique
mutations = mutations.drop_duplicates(subset=["submitter_id", "tp53_mutant", "kras_mutant"])
assert len(set(mutations["submitter_id"])) == len(mutations["submitter_id"])
display(mutations)

Num samples with TP53 mutations: 298/422
Num samples with KRAS mutations: 5/422


Unnamed: 0,submitter_id,tp53_mutant,kras_mutant
0,TCGA-13-0717,False,False
1,TCGA-20-0996,False,False
2,TCGA-13-1409,True,False
3,TCGA-29-1766,False,False
4,TCGA-24-2261,False,False
...,...,...,...
428,TCGA-13-0903,True,False
429,TCGA-25-2398,False,False
430,TCGA-31-1959,True,False
431,TCGA-31-1950,True,False


## Load and parse copy number data

Convert to gene symbol

In [8]:
copy_number_data = pd.read_csv(copy_number_file, sep="\t")

# Need to parse gene symbol from ensembl id
copy_number_data = copy_number_data.rename(columns={"Gene Symbol": "ensembl_id_with_version"})
copy_number_data.insert(0,
                        "ensembl_id",
                        copy_number_data["ensembl_id_with_version"].apply(
                            lambda ensembl_id_with_version: ensembl_id_with_version.split(".")[0]
                        ))
copy_number_data["has_gene_symbol"] = copy_number_data["ensembl_id"].apply(
        lambda ensembl_id: ensembl_id in ensembl_to_gene_symbol_df["ensembl_gene_id"].values
    )
copy_number_data = copy_number_data[copy_number_data["has_gene_symbol"]]
copy_number_data.insert(0,
                        "gene_symbol",
                        copy_number_data["ensembl_id"].apply(
                            lambda ensembl_id: ensembl_to_gene_symbol_df[ensembl_to_gene_symbol_df["ensembl_gene_id"] == ensembl_id]["external_gene_name"].values[0]
                        ))

display(copy_number_data)

Unnamed: 0,gene_symbol,ensembl_id,ensembl_id_with_version,TCGA-04-1517-01A,TCGA-24-1556-01A,TCGA-36-1571-01A,TCGA-57-1582-01A,TCGA-25-1628-01A,TCGA-09-1661-01B,TCGA-04-1519-01A,...,TCGA-72-4240-01A.1,TCGA-72-4232-01A.1,TCGA-72-4241-01A.1,TCGA-72-4233-01A.1,TCGA-72-4234-01A.1,TCGA-72-4235-01A.1,TCGA-72-4236-01A.1,TCGA-72-4237-01A.1,TCGA-72-4238-01A.1,has_gene_symbol
0,CDK11A,ENSG00000008128,ENSG00000008128.21,-1,0,0,1,0,-1,-1,...,0,0,1,1,0,-1,1,0,1,True
1,NADK,ENSG00000008130,ENSG00000008130.14,-1,0,0,1,0,-1,-1,...,0,0,1,1,0,-1,1,0,1,True
2,PRKCZ,ENSG00000067606,ENSG00000067606.14,-1,0,0,1,0,-1,-1,...,0,0,1,1,0,-1,1,0,1,True
3,GNB1,ENSG00000078369,ENSG00000078369.16,-1,0,0,1,0,-1,-1,...,0,0,1,1,0,-1,1,0,1,True
4,SDF4,ENSG00000078808,ENSG00000078808.15,-1,0,0,1,0,-1,-1,...,0,0,1,1,0,-1,1,0,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19724,H2AB3,ENSG00000277745,ENSG00000277745.1,0,0,0,-1,0,0,1,...,1,0,0,1,1,0,0,0,0,True
19725,H2AB2,ENSG00000277858,ENSG00000277858.1,0,0,0,-1,0,0,1,...,1,0,0,1,1,0,0,0,0,True
19726,VAMP7,ENSG00000124333,ENSG00000124333.13,0,0,0,-1,0,0,1,...,0,0,0,1,1,0,0,0,0,True
19727,IL9R,ENSG00000124334,ENSG00000124334.15,0,0,0,-1,0,0,1,...,0,0,0,1,1,0,0,0,0,True


In [9]:
# Examine CN data to try and interpret
cn_only = copy_number_data.iloc[ : , 3 : ]
display(cn_only)
# Count the number of each type of copy number alteration
stacked = cn_only.stack()

# Count the occurrences of each value
value_counts = stacked.value_counts()
display(value_counts)

Unnamed: 0,TCGA-04-1517-01A,TCGA-24-1556-01A,TCGA-36-1571-01A,TCGA-57-1582-01A,TCGA-25-1628-01A,TCGA-09-1661-01B,TCGA-04-1519-01A,TCGA-24-1557-01A,TCGA-36-1574-01A,TCGA-57-1583-01A,...,TCGA-72-4240-01A.1,TCGA-72-4232-01A.1,TCGA-72-4241-01A.1,TCGA-72-4233-01A.1,TCGA-72-4234-01A.1,TCGA-72-4235-01A.1,TCGA-72-4236-01A.1,TCGA-72-4237-01A.1,TCGA-72-4238-01A.1,has_gene_symbol
0,-1,0,0,1,0,-1,-1,1,0,0,...,0,0,1,1,0,-1,1,0,1,True
1,-1,0,0,1,0,-1,-1,1,0,0,...,0,0,1,1,0,-1,1,0,1,True
2,-1,0,0,1,0,-1,-1,1,0,0,...,0,0,1,1,0,-1,1,0,1,True
3,-1,0,0,1,0,-1,-1,1,0,0,...,0,0,1,1,0,-1,1,0,1,True
4,-1,0,0,1,0,-1,-1,1,0,0,...,0,0,1,1,0,-1,1,0,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19724,0,0,0,-1,0,0,1,0,0,0,...,1,0,0,1,1,0,0,0,0,True
19725,0,0,0,-1,0,0,1,0,0,0,...,1,0,0,1,1,0,0,0,0,True
19726,0,0,0,-1,0,0,1,0,0,0,...,0,0,0,1,1,0,0,0,0,True
19727,0,0,0,-1,0,0,1,0,0,0,...,0,0,0,1,1,0,0,0,0,True


0     9007501
1     1622832
-1    1196612
Name: count, dtype: int64

### Infer deletion of genes

Infer deletion from log(counts+1). Original intention was to infer from copy number data, but it was uclear how to interpret the copy number data format.


In [10]:
# Check if any samples have 0 counts of TP53 or KRAS
deletions_data = log2_counts.loc[log2_counts["gene_symbol"].isin(["TP53", "KRAS"]), :]
deletions_data.set_index("gene_symbol", inplace=True)
deletions_data = deletions_data.loc[:, deletions_data.columns.str.contains("TCGA")]
deletions_data = deletions_data.T.rename_axis("submitter_id").reset_index().rename_axis(None, axis=1)
deletions_data["tp53_deletion"] = deletions_data["TP53"].lt(5)
deletions_data["kras_deletion"] = deletions_data["KRAS"].lt(5)

print(f"Num samples with TP53 deletions: {deletions_data['tp53_deletion'].sum()}/{deletions_data.shape[0]}")
print(f"Num samples with KRAS deletions: {deletions_data['kras_deletion'].sum()}/{deletions_data.shape[0]}")
display(deletions_data)

Num samples with TP53 deletions: 0/376
Num samples with KRAS deletions: 0/376


Unnamed: 0,submitter_id,KRAS,TP53,tp53_deletion,kras_deletion
0,TCGA-04-1331,11.348175,10.115044,False,False
1,TCGA-04-1332,10.540128,12.690871,False,False
2,TCGA-04-1338,10.926296,12.447858,False,False
3,TCGA-04-1341,11.886840,12.243174,False,False
4,TCGA-04-1343,11.016808,11.253847,False,False
...,...,...,...,...,...
371,TCGA-61-2111,12.685406,12.276124,False,False
372,TCGA-61-2113,17.447196,13.516192,False,False
373,TCGA-OY-A56Q,12.386401,10.848623,False,False
374,TCGA-VG-A8LO,11.060696,13.300067,False,False


## Inspect metadata

Only keep samples we have metadata, RNAseq, and mutations data for. (Don't need to have copy number data since we didn't end up using it to determine deletions)

In [11]:
relevant_samples = set(metadata["submitter_id"]) & set(log2_counts.columns) & set(mutations["submitter_id"]) & set(deletions_data["submitter_id"])
relevant_metadata = metadata.loc[metadata['submitter_id'].isin(relevant_samples)]
print(f"Num samples with all data: {relevant_metadata.shape[0]}/{metadata.shape[0]}")

# Check that all experimental strategies include RNA-Seq
assert all(["RNA-Seq" in exp for exp in relevant_metadata["summary.experimental_strategies"]])

# Check that all are Cystic, Mucinous and Serous Neoplasms
assert all(["Cystic, Mucinous and Serous Neoplasms" in disease_type for disease_type in relevant_metadata["disease_type"]])

display(relevant_metadata)

Num samples with all data: 271/422


Unnamed: 0,primary_site,submitter_slide_ids,disease_type,case_id,submitter_id,diagnoses,summary.file_count,summary.experimental_strategies,project.project_id,project.program.name,demographic.race,demographic.gender,demographic.ethnicity,demographic.vital_status,demographic.days_to_death
0,Ovary,"[TCGA-13-0920-01A-01-BS1, TCGA-13-0920-01A-01-...","Cystic, Mucinous and Serous Neoplasms",85a85a11-7200-4e96-97af-6ba26d680d59,TCGA-13-0920,[{'age_at_diagnosis': 24064}],84,"[WXS, miRNA-Seq, RNA-Seq, WGS, Genotyping Arra...",TCGA-OV,TCGA,white,female,not hispanic or latino,Dead,1484.0
2,Ovary,"[TCGA-24-2033-01A-01-BS1, TCGA-24-2033-01A-01-...","Cystic, Mucinous and Serous Neoplasms",872d2922-7292-4681-adb7-d3b267eccbe7,TCGA-24-2033,[{'age_at_diagnosis': 31977}],70,"[WXS, miRNA-Seq, RNA-Seq, WGS, Genotyping Arra...",TCGA-OV,TCGA,white,female,not reported,Dead,562.0
3,Ovary,"[TCGA-61-2110-11A-01-TS1, TCGA-61-2110-11A-01-...","Cystic, Mucinous and Serous Neoplasms",88180134-710f-46ea-9b06-5e5d860d6d9f,TCGA-61-2110,[{'age_at_diagnosis': 20718}],69,"[WXS, miRNA-Seq, RNA-Seq, Genotyping Array, Me...",TCGA-OV,TCGA,white,female,not hispanic or latino,Dead,1354.0
4,Ovary,"[TCGA-29-1696-01A-01-TS1, TCGA-29-1696-01A-01-...","Cystic, Mucinous and Serous Neoplasms",8a98a6e6-b763-4824-858b-fd2738e6c9a3,TCGA-29-1696,[{'age_at_diagnosis': 15818}],70,"[WXS, miRNA-Seq, RNA-Seq, WGS, Genotyping Arra...",TCGA-OV,TCGA,white,female,not reported,Dead,1032.0
6,Ovary,"[TCGA-13-0911-01A-01-TS1, TCGA-13-0911-01A-01-...","Cystic, Mucinous and Serous Neoplasms",8cad4217-5699-4735-9be3-fc0015a8d262,TCGA-13-0911,[{'age_at_diagnosis': 20276}],70,"[WXS, miRNA-Seq, RNA-Seq, WGS, Genotyping Arra...",TCGA-OV,TCGA,white,female,not hispanic or latino,Dead,1355.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416,Ovary,"[TCGA-61-2102-01A-01-TS1, TCGA-61-2102-01A-01-...","Cystic, Mucinous and Serous Neoplasms",7a08efd0-f984-430e-a8eb-1881047214b6,TCGA-61-2102,[{'age_at_diagnosis': 27256}],70,"[WXS, miRNA-Seq, RNA-Seq, Genotyping Array, Me...",TCGA-OV,TCGA,white,female,not reported,Dead,197.0
418,Ovary,"[TCGA-13-0804-01A-01-TS1, TCGA-13-0804-01A-01-...","Cystic, Mucinous and Serous Neoplasms",7e34d3c1-1fab-4326-9a69-4260d2bac558,TCGA-13-0804,[{'age_at_diagnosis': 27021}],84,"[WXS, miRNA-Seq, RNA-Seq, WGS, Genotyping Arra...",TCGA-OV,TCGA,white,female,not hispanic or latino,Dead,1073.0
419,Ovary,"[TCGA-25-2042-01A-01-BS1, TCGA-25-2042-01Z-00-...","Cystic, Mucinous and Serous Neoplasms",818dc159-aba4-46bc-a4ed-68ee0f8c4461,TCGA-25-2042,[{'age_at_diagnosis': 22127}],69,"[WXS, miRNA-Seq, RNA-Seq, Genotyping Array, Me...",TCGA-OV,TCGA,american indian or alaska native,female,not hispanic or latino,Dead,396.0
420,Ovary,"[TCGA-13-1512-01A-01-TS1, TCGA-13-1512-01A-01-...","Cystic, Mucinous and Serous Neoplasms",82093ed9-a3c8-4e34-931f-4ec7ae745711,TCGA-13-1512,[{'age_at_diagnosis': 18134}],70,"[WXS, miRNA-Seq, RNA-Seq, WGS, Genotyping Arra...",TCGA-OV,TCGA,white,female,not hispanic or latino,Alive,


## Subset data

Only keep samples that have TP53 mutations and do not have KRAS mutations in an effort to make sure that we have only HGSC celllines (KRAS indicates LGSC)

In [12]:
# Assert that there are no duplicate submitter ids in mutations
assert len(set(mutations["submitter_id"])) == len(mutations["submitter_id"])
# Assert that there are no duplicate submitter ids in log2_counts
assert len(set(log2_counts.columns)) == len(log2_counts.columns)
# Assert that there are no duplicate submitter ids in deletions_data
assert len(set(deletions_data["submitter_id"])) == len(deletions_data["submitter_id"])

In [13]:
relevant_samples = relevant_metadata[['submitter_id']].merge(
    mutations[['submitter_id', 'tp53_mutant', 'kras_mutant']], on='submitter_id', how='left'
).merge(
    deletions_data[['submitter_id', 'tp53_deletion', 'kras_deletion']], on='submitter_id', how='left'
)
relevant_samples = relevant_samples[['submitter_id', 'tp53_mutant', 'tp53_deletion', 'kras_mutant', 'kras_deletion']]

subset_samples = relevant_samples[
    # Has a TP53 deletion/mutation (indicator of HGSC)
    (relevant_samples["tp53_deletion"] | relevant_samples["tp53_mutant"]) &
    # And does not have a kras deletion/mutation (indicator of LGSC)
    ~ (relevant_samples["kras_deletion"] | relevant_samples["kras_mutant"])
]

print(f"Num samples with TP53 mutations/deletions and no KRAS mutations/deletions: {subset_samples.shape[0]}/{relevant_samples.shape[0]}")
display(subset_samples)

# Save subset samples to a file
subset_samples.to_csv(f"{tcga_data_folder}/confident_hgsc_samples.tsv", sep="\t", index=False)

Num samples with TP53 mutations/deletions and no KRAS mutations/deletions: 191/271


Unnamed: 0,submitter_id,tp53_mutant,tp53_deletion,kras_mutant,kras_deletion
0,TCGA-13-0920,True,False,False,False
1,TCGA-24-2033,True,False,False,False
2,TCGA-61-2110,True,False,False,False
6,TCGA-61-2111,True,False,False,False
10,TCGA-24-2254,True,False,False,False
...,...,...,...,...,...
262,TCGA-59-2352,True,False,False,False
266,TCGA-61-2102,True,False,False,False
267,TCGA-13-0804,True,False,False,False
269,TCGA-13-1512,True,False,False,False


## Subset RNAseq data based for high confidence HGSC

In [95]:
log2_counts_submitter_id = log2_counts.set_index("gene_symbol").rename_axis("gene_symbol")
subset_log2counts = log2_counts_submitter_id[subset_samples["submitter_id"]]
display(subset_log2counts)

subset_log2counts.to_csv(f"{tcga_data_folder}/confident_hgsc_log2counts.tsv", sep="\t")

Unnamed: 0_level_0,TCGA-13-0920,TCGA-24-2033,TCGA-61-2110,TCGA-61-2111,TCGA-24-2254,TCGA-24-1428,TCGA-24-2280,TCGA-24-1843,TCGA-23-1110,TCGA-10-0937,...,TCGA-24-1552,TCGA-25-1321,TCGA-13-0888,TCGA-04-1331,TCGA-13-1511,TCGA-59-2352,TCGA-61-2102,TCGA-13-0804,TCGA-13-1512,TCGA-29-1768
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TSPAN6,11.829326,12.405673,9.952741,11.976922,12.487338,13.346929,11.777255,12.191985,10.302639,11.919608,...,11.438792,12.175238,10.766529,12.804131,12.615170,11.976206,11.413099,11.454299,12.386132,11.564149
TNMD,3.000000,2.321928,2.584963,0.000000,4.087463,3.000000,4.459432,1.000000,2.000000,2.000000,...,1.000000,1.584963,1.000000,3.459432,1.000000,5.523562,5.426265,1.584963,0.000000,4.906891
DPM1,10.736402,11.681238,10.800091,11.249706,10.576484,12.022021,11.489346,12.463013,10.991522,10.172428,...,11.494856,11.937006,9.975848,11.669771,10.973697,10.830515,10.632995,9.652845,11.556985,10.731319
SCYL3,9.116344,9.339850,9.398744,10.135709,9.415742,10.687376,9.856426,10.115044,7.994353,7.592457,...,9.541097,10.802516,8.144658,10.364135,10.344296,9.348728,9.957102,8.098032,10.144658,8.430453
FIRRM,8.647458,9.497852,8.876517,9.245553,8.909893,10.395534,9.189825,10.182394,7.894818,6.442943,...,9.245553,9.930737,7.055282,9.812177,8.867279,8.523562,10.203348,7.247928,9.483816,7.546894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,21.648527,22.301199,22.747116,22.056261,22.922068,23.909925,23.488641,22.820357,21.135925,20.713963,...,21.657224,23.118446,22.354941,23.565784,23.082234,22.039606,21.584929,21.979689,21.552433,20.889634
,20.723864,21.165166,21.386072,20.646611,20.809075,22.842148,21.363436,22.090146,19.815744,20.599656,...,21.313734,21.682592,20.399962,21.348947,21.768537,21.353379,20.672535,20.915839,21.583226,21.185491
,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
