In [3]:
import pandas as pd

# Import utilities
import os
import sys
os.chdir("/Volumes/kueck/PublicDataAnalysis/CASCAM_style_subtype_classification/src/preprocessing")
# os.getcwd() not working as expected, so need to set wd manually (update as needed).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '../'))
sys.path.append(parent_dir)

from utilities import Subtype, get_ensembl_mappings_grch38, get_ensembl_mappings_grch37

In [2]:
# Create ensembl to gene mappings
ensembl_to_gene_symbol_df_grch38 = get_ensembl_mappings_grch38()
ensembl_to_gene_symbol_df_grch37 = get_ensembl_mappings_grch37()

No saved mappings found. Fetching from biomart...
Skipped 0 entries because they were missing gene_symbol


In [20]:
# Create metadata to hold information about samples across all datasets
metadata = pd.DataFrame(columns=["sample_id", "dataset_name", "original_sample_name", "type"])

# Load genes shared across GRCh37 and GRCh38
shared_genes_file = "../../data/reference/shared_genes.tsv"
shared_genes = []
with open(shared_genes_file, 'r') as file:
    for line in file:
        shared_genes.append(line.strip())

## Format GSE189553

In [18]:
dataset_name = "GSE189553"
data_path = f"../../data/public_data_sets/{dataset_name}_raw_count_matrix.txt"
formatted_path = f"../../data/formatted_rnaseq_data/{dataset_name}.tsv"

##### Format counts #####

data = pd.read_csv(data_path, sep="\t", index_col=0)

# Remove ensemble id column
data = data[data.columns[1:]]
# Keep only genes shared by GRCh37 and GRCh38
data = data[data.index.isin(shared_genes)]
# Rename index Gene
data.index.name = "Gene"
# If there are duplicate genes, retain the highest read count
data = data.groupby("Gene").max()

##### Add samples to metadata #####

# Types were taken from the metadata of the dataset
# "../../data/public_data_sets/GSE189553_series_matrix.txt"
def type_from_sample_name(sample_name):
    prefix = sample_name.split("_")[0]
    if prefix == "CCC":
        return Subtype.CCC
    if prefix == "SC":
        return Subtype.HGSC
    if prefix == "EC":
        return Subtype.EC
    else:
        raise Exception(f"Unknown sample type {prefix}")

metadata_add = pd.DataFrame(
    columns=["sample_id", "dataset_name", "original_sample_name", "type"]
)
metadata_add["original_sample_name"] = data.columns
metadata_add["dataset_name"] = dataset_name
# Set sample type
metadata_add["type"] = metadata_add["original_sample_name"].apply(
    lambda x: Subtype(type_from_sample_name(x)).name
)
# Set sample id
metadata_add["sample_id"] = metadata_add.apply(
    lambda row: row["dataset_name"] + "_" + str(row.name) + "_" + row["type"], axis=1
)

display(metadata_add.head())
metadata = metadata.merge(metadata_add, how="outer").reset_index(drop=True)


##### Rename samples in counts #####

# Rename columns to sample_id
data.columns = data.columns.map(
    lambda orig_name: metadata_add[
        metadata_add["original_sample_name"] == str(orig_name)]
        ["sample_id"].values[0])
display(data.head())

data.to_csv(formatted_path, sep="\t")

Unnamed: 0,sample_id,dataset_name,original_sample_name,type
0,GSE189553_0_CCC,GSE189553,CCC_1,CCC
1,GSE189553_1_CCC,GSE189553,CCC_2,CCC
2,GSE189553_2_CCC,GSE189553,CCC_3,CCC
3,GSE189553_3_CCC,GSE189553,CCC_4,CCC
4,GSE189553_4_CCC,GSE189553,CCC_5,CCC


Unnamed: 0_level_0,GSE189553_0_CCC,GSE189553_1_CCC,GSE189553_2_CCC,GSE189553_3_CCC,GSE189553_4_CCC,GSE189553_5_CCC,GSE189553_6_CCC,GSE189553_7_CCC,GSE189553_8_CCC,GSE189553_9_CCC,...,GSE189553_13_HGSC,GSE189553_14_HGSC,GSE189553_15_HGSC,GSE189553_16_HGSC,GSE189553_17_HGSC,GSE189553_18_HGSC,GSE189553_19_EC,GSE189553_20_EC,GSE189553_21_EC,GSE189553_22_EC
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,9.0,53.72,51.02,12.0,42.45,5.53,1.9,5.52,3.62,6.32,...,2.21,0.77,2.45,0.22,0.88,0.81,18.8,1.0,8.61,2.63
A1BG-AS1,38.0,69.0,47.0,18.0,71.0,3.85,2.58,2.47,1.52,2.08,...,0.62,0.33,1.04,0.19,0.16,0.51,12.0,9.0,12.0,0.84
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,0.05
A2M,1716.0,1584.98,1707.98,2572.98,1022.81,162.84,76.03,98.07,80.71,45.36,...,117.67,178.66,87.54,52.43,11.42,51.88,7068.12,3637.87,2918.32,24.9
A2M-AS1,5.0,7.0,11.0,10.0,3.0,0.65,0.24,0.93,0.48,0.13,...,0.9,0.68,0.33,0.34,0.12,0.17,45.0,60.0,7.0,0.15


## Format GSE160692

In [20]:
dataset_name = "GSE160692"
data_path = f"../../data/public_data_sets/{dataset_name}_OVA_UTE_Raw_Transcripts_GEO.tsv"
formatted_path = f"../../data/formatted_rnaseq_data/{dataset_name}.tsv"

##### Format counts #####

data = pd.read_csv(data_path, sep="\t", index_col=0)
# Remove Uterine samples and extraneous columns (gene id, etc.)
data = data[data.columns[data.columns.str.contains("OVA")]]
# Remove " (raw)" suffix from sample names
data.columns = data.columns.map(lambda x: x.split(" (raw)")[0])
# Keep only genes shared by GRCh37 and GRCh38
data = data[data.index.isin(shared_genes)]
# Rename index to Gene
data.index.name = "Gene"
# If there are duplicate genes, retain the highest read count
data = data.groupby("Gene").max()

##### Add samples to metadata #####

metadata_add = pd.DataFrame(
    columns=["sample_id", "dataset_name", "original_sample_name", "type"]
)
metadata_add["original_sample_name"] = data.columns
metadata_add["dataset_name"] = dataset_name
# Set sample type
# Types were taken from the metadata of the dataset
# "../../data/public_data_sets/GSE160692_series_matrix.txt"
metadata_add["type"] = Subtype.CCC.name
# Set sample id
metadata_add["sample_id"] = metadata_add.apply(
    lambda row: row["dataset_name"] + "_" + str(row.name) + "_" + row["type"], axis=1
)

display(metadata_add.head())
metadata = metadata.merge(metadata_add, how="outer").reset_index(drop=True)

##### Rename samples in counts #####

# Rename columns to sample_id
data.columns = data.columns.map(
    lambda orig_name: metadata_add[
        metadata_add["original_sample_name"] == str(orig_name)
    ]["sample_id"].values[0]
)
display(data.head())

data.to_csv(formatted_path, sep="\t")

Unnamed: 0,sample_id,dataset_name,original_sample_name,type
0,GSE160692_0_CCC,GSE160692,OVA1,CCC
1,GSE160692_1_CCC,GSE160692,OVA2,CCC
2,GSE160692_2_CCC,GSE160692,OVA3,CCC
3,GSE160692_3_CCC,GSE160692,OVA4,CCC
4,GSE160692_4_CCC,GSE160692,OVA5,CCC


Unnamed: 0_level_0,GSE160692_0_CCC,GSE160692_1_CCC,GSE160692_2_CCC,GSE160692_3_CCC,GSE160692_4_CCC,GSE160692_5_CCC,GSE160692_6_CCC,GSE160692_7_CCC,GSE160692_8_CCC,GSE160692_9_CCC,GSE160692_10_CCC
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
A1BG,0.0,83.564156,313.07413,17.035158,760.3948,0.0,0.0,51.0,0.0,0.0,0.0
A1BG-AS1,0.0,10.930017,307.85217,44.03519,305.50507,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,0.0,524.0009,138.0,224.0,1.0,0.0,0.0,107.50009,0.0,171.83307,14.999997
A2M,3584.0146,3233.3215,2425.9995,2361.755,11227.696,8112.397,2740.5098,703.8396,2.0,36.99999,5248.433
A2M-AS1,0.0,145.56526,0.0,1.0,97.434326,3.26816,0.0,37.160004,0.0,1.0,0.0


## Format GSE121103

In [22]:
dataset_name = "GSE121103"
data_path = f"../../data/public_data_sets/{dataset_name}__counts.txt"
formatted_path = f"../../data/formatted_rnaseq_data/{dataset_name}.tsv"

##### Format counts #####

data = pd.read_csv(data_path, sep="\t", index_col=0)
# Remove extraneous columns (chr, start, end, etc.)
data = data[data.columns[data.columns.str.startswith("KL")]]

# Change geneID to gene name
data.insert(0,
            "ensembl_id",
            [ensembl_id_with_version.split(".")[0] for ensembl_id_with_version in data.index])
def get_gene_symbol(ensembl_id):
    try:
        gene_name = ensembl_to_gene_symbol_df_grch38.loc[ensembl_to_gene_symbol_df_grch38["ensembl_gene_id"] == ensembl_id, "external_gene_name"].values[0]
        return gene_name
    except IndexError:
        return None
data.insert(0,
            "Gene",
            data["ensembl_id"].map(lambda x: get_gene_symbol(x)))
data.set_index("Gene", inplace=True)
data = data.drop("ensembl_id", axis=1)

# Keep only genes shared by GRCh37 and GRCh38
data = data[data.index.isin(shared_genes)]

# If there are duplicate genes, retain the highest read count
data = data.groupby("Gene").max()

##### Add samples to metadata #####

# Types were taken from the metadata of the dataset
# "../../data/public_data_sets/GSE121103_series_matrix.txt"
def type_from_sample_name(sample_name):
    prefix = sample_name.split("-")[0]
    prefix_num = int(prefix.split("KL")[1])
    if prefix_num in range(1, 6):
        return Subtype.CCC
    elif prefix_num in range(6, 10):
        return Subtype.EC
    elif prefix_num in range(10, 15):
        return Subtype.HGSC
    elif prefix_num in range(15, 20):
        return Subtype.MC
    else:
        raise Exception(f"Unknown sample type {prefix}")

metadata_add = pd.DataFrame(
    columns=["sample_id", "dataset_name", "original_sample_name", "type"]
)
metadata_add["original_sample_name"] = data.columns
metadata_add["dataset_name"] = dataset_name
# Set sample type
metadata_add["type"] = metadata_add["original_sample_name"].apply(
    lambda x: Subtype(type_from_sample_name(x)).name
)
# Set sample id
metadata_add["sample_id"] = metadata_add.apply(
    lambda row: row["dataset_name"] + "_" + str(row.name) + "_" + row["type"], axis=1
)

display(metadata_add.head())
metadata = metadata.merge(metadata_add, how="outer").reset_index(drop=True)

# ##### Rename samples in counts #####

# Rename columns to sample_id
data.columns = data.columns.map(
    lambda orig_name: metadata_add[
        metadata_add["original_sample_name"] == str(orig_name)
    ]["sample_id"].values[0]
)
display(data.head())

data.to_csv(formatted_path, sep="\t")

Unnamed: 0,sample_id,dataset_name,original_sample_name,type
0,GSE121103_0_CCC,GSE121103,KL01-NEBindex1_S1_R1_001,CCC
1,GSE121103_1_CCC,GSE121103,KL02-NEBindex2_S2_R1_001,CCC
2,GSE121103_2_CCC,GSE121103,KL03-NEBindex3_S3_R1_001,CCC
3,GSE121103_3_CCC,GSE121103,KL04-NEBindex4_S4_R1_001,CCC
4,GSE121103_4_CCC,GSE121103,KL05-NEBindex5_S5_R1_001,CCC


Unnamed: 0_level_0,GSE121103_0_CCC,GSE121103_1_CCC,GSE121103_2_CCC,GSE121103_3_CCC,GSE121103_4_CCC,GSE121103_5_EC,GSE121103_6_EC,GSE121103_7_EC,GSE121103_8_EC,GSE121103_9_HGSC,GSE121103_10_HGSC,GSE121103_11_HGSC,GSE121103_12_HGSC,GSE121103_13_HGSC,GSE121103_14_MC,GSE121103_15_MC,GSE121103_16_MC,GSE121103_17_MC,GSE121103_18_MC
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
A1BG,121,41,107,182,81,173,119,22,33,41,37,10,117,77,41,8,58,24,130
A1BG-AS1,92,71,85,192,65,188,174,93,42,104,26,43,76,69,55,0,33,24,179
A1CF,381,4,20,4,4,27,12,20,28,21,2,3,14,17,47,0,826,17,56
A2M,1082,420,1042,2486,1286,3559,3283,2117,65,2996,2082,428,5162,1330,1352,79,568,3052,10989
A2M-AS1,159,89,384,767,259,514,638,636,19,665,912,121,1665,384,292,15,163,410,1231


## Format GSE 101108

In [23]:
dataset_name = "GSE101108"
data_path = f"../../data/public_data_sets/{dataset_name}_OV106-391_counts.txt"
formatted_path = f"../../data/formatted_rnaseq_data/{dataset_name}.tsv"
dataset_metadata_path = f"../../data/public_data_sets/{dataset_name}-26225-1014527-1-SP.xlsx"

##### Format counts #####

data = pd.read_csv(data_path, sep="\t", index_col=0)

# Change ensembl ID to gene name
def get_gene_symbol(ensembl_id):
    try:
        gene_name = ensembl_to_gene_symbol_df_grch38.loc[ensembl_to_gene_symbol_df_grch37["ensembl_gene_id"] == ensembl_id, "external_gene_name"].values[0]
        return gene_name
    except IndexError:
        return None
data.insert(0,
            "Gene",
            data.index.map(lambda x: get_gene_symbol(x)))
data.set_index("Gene", inplace=True)

# Keep only genes shared by GRCh37 and GRCh38
data = data[data.index.isin(shared_genes)]

# Remove samples that we don't have a type for
dataset_metadata = pd.read_excel(dataset_metadata_path, skiprows=1)
data = data[data.columns[data.columns.isin(dataset_metadata["Sample"])]]

# If there are duplicate genes, retain the highest read count
data = data.groupby("Gene").max()

# ##### Add samples to metadata #####
# Types taken from the metadata of the dataset

def type_from_sample_name(sample_name):
    type_abbrev = dataset_metadata[dataset_metadata["Sample"] == sample_name]["Histotype"].values[0]
    if type_abbrev == "CCC":
        return Subtype.CCC
    elif type_abbrev == "EC":
        return Subtype.EC
    elif type_abbrev == "HGSC":
        return Subtype.HGSC
    elif type_abbrev == "LGSC":
        return Subtype.LGSC
    elif type_abbrev == "MC":
        return Subtype.MC
    else:
        raise Exception(f"Unknown sample type {type_abbrev}")

metadata_add = pd.DataFrame(
    columns=["sample_id", "dataset_name", "original_sample_name", "type"]
)
metadata_add["original_sample_name"] = data.columns
metadata_add["dataset_name"] = dataset_name
# Set sample type
metadata_add["type"] = metadata_add["original_sample_name"].apply(
    lambda x: Subtype(type_from_sample_name(x)).name
)
# Set sample id
metadata_add["sample_id"] = metadata_add.apply(
    lambda row: row["dataset_name"] + "_" + str(row.name) + "_" + row["type"], axis=1
)

display(metadata_add.head())
metadata = metadata.merge(metadata_add, how="outer").reset_index(drop=True)

##### Rename samples in counts #####

# Rename columns to sample_id
data.columns = data.columns.map(
    lambda orig_name: metadata_add[
        metadata_add["original_sample_name"] == str(orig_name)
    ]["sample_id"].values[0]
)
display(data.head())

data.to_csv(formatted_path, sep="\t")

Unnamed: 0,sample_id,dataset_name,original_sample_name,type
0,GSE101108_0_EC,GSE101108,OV155,EC
1,GSE101108_1_CCC,GSE101108,OV170,CCC
2,GSE101108_2_HGSC,GSE101108,OV172,HGSC
3,GSE101108_3_CCC,GSE101108,OV177,CCC
4,GSE101108_4_EC,GSE101108,OV185,EC


Unnamed: 0_level_0,GSE101108_0_EC,GSE101108_1_CCC,GSE101108_2_HGSC,GSE101108_3_CCC,GSE101108_4_EC,GSE101108_5_CCC,GSE101108_6_HGSC,GSE101108_7_CCC,GSE101108_8_CCC,GSE101108_9_HGSC,...,GSE101108_24_HGSC,GSE101108_25_HGSC,GSE101108_26_HGSC,GSE101108_27_HGSC,GSE101108_28_CCC,GSE101108_29_HGSC,GSE101108_30_CCC,GSE101108_31_EC,GSE101108_32_HGSC,GSE101108_33_HGSC
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A1BG-AS1,1696,2184,841,2327,6273,1943,1571,2986,1917,570,...,3157,814,2034,790,2735,2676,1693,723,794,970
A2M,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2M-AS1,1,4,0,0,0,2,0,2,0,3,...,1,1,3,1,2,0,3,1,0,1
A2ML1,1,3,0,1,0,0,2,3,0,0,...,4,0,0,1,0,12,0,0,0,1


## Format EGAD00001006441

In [24]:
dataset_name = "EGAD00001006441"
data_path = f"../../data/public_data_sets/{dataset_name}/salmon.merged.gene_counts.tsv"
formatted_path = f"../../data/formatted_rnaseq_data/{dataset_name}.tsv"
dataset_metadata_path = (
    f"../../data/public_data_sets/{dataset_name}-26225-1014527-1-SP.xlsx"
)

##### Format counts #####

data = pd.read_csv(data_path, sep="\t", index_col=0)

# Rename indices to Gene
data.rename_axis("Gene", inplace=True)
data.drop("gene_name", axis=1, inplace=True)

# Keep only genes shared by GRCh37 and GRCh38
data = data[data.index.isin(shared_genes)]

# If there are duplicate genes, retain the highest read count
data = data.groupby("Gene").max()

##### Add samples to metadata #####

metadata_add = pd.DataFrame(
    columns=["sample_id", "dataset_name", "original_sample_name", "type"]
)
metadata_add["original_sample_name"] = data.columns
metadata_add["dataset_name"] = dataset_name
# Set sample type
metadata_add["type"] = Subtype.LGSC.name # All samples are LGSC
# Set sample id
metadata_add["sample_id"] = metadata_add.apply(
    lambda row: row["dataset_name"] + "_" + str(row.name) + "_" + row["type"], axis=1
)

display(metadata_add.head())
metadata = metadata.merge(metadata_add, how="outer").reset_index(drop=True)

##### Rename samples in counts #####

# Rename columns to sample_id
data.columns = data.columns.map(
    lambda orig_name: metadata_add[
        metadata_add["original_sample_name"] == str(orig_name)
    ]["sample_id"].values[0]
)
display(data.head())

data.to_csv(formatted_path, sep="\t")

Unnamed: 0,sample_id,dataset_name,original_sample_name,type
0,EGAD00001006441_0_LGSC,EGAD00001006441,CL1,LGSC
1,EGAD00001006441_1_LGSC,EGAD00001006441,CL10,LGSC
2,EGAD00001006441_2_LGSC,EGAD00001006441,CL14,LGSC
3,EGAD00001006441_3_LGSC,EGAD00001006441,CL15,LGSC
4,EGAD00001006441_4_LGSC,EGAD00001006441,CL2,LGSC


Unnamed: 0_level_0,EGAD00001006441_0_LGSC,EGAD00001006441_1_LGSC,EGAD00001006441_2_LGSC,EGAD00001006441_3_LGSC,EGAD00001006441_4_LGSC,EGAD00001006441_5_LGSC,EGAD00001006441_6_LGSC,EGAD00001006441_7_LGSC
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A1BG,14.0,101.0,3.0,240.0,105.0,0.0,215.0,8.0
A1BG-AS1,0.0,44.0,2.0,70.0,65.0,0.0,206.0,10.0
A1CF,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0
A2M,28.0,1.0,17.0,1.0,17.0,2.0,0.0,3.0
A2M-AS1,42.0,44.0,15.0,11.0,25.0,8.0,77.0,101.0


## Format TCGA data (Ovarian cancer only)

In [43]:
dataset_name = "TCGA"
data_path = "../../data/public_data_sets/TCGA_OV_RNA/confident_hgsc_log2counts.tsv"
formatted_path = f"../../data/formatted_rnaseq_data/{dataset_name}.tsv"

##### Format counts #####

data = pd.read_csv(data_path, sep="\t", index_col=0)

# Change from log2(counts+1) to counts
data = 2 ** data - 1

# Rename row index to Gene
data.index.name = "Gene"

# Keep only genes shared by GRCh37 and GRCh38
data = data[data.index.isin(shared_genes)]

# If there are duplicate genes, retain the highest read count
data = data.groupby("Gene").max()

# ##### Add samples to metadata #####

metadata_add = pd.DataFrame(
    columns=["sample_id", "dataset_name", "original_sample_name", "type"]
)
metadata_add["original_sample_name"] = data.columns
metadata_add["dataset_name"] = dataset_name
# Set sample type -- assumed to be HGSC; see scrape_tcga_data.ipynb for details
metadata_add["type"] = Subtype.HGSC.name
# Set sample id
metadata_add["sample_id"] = metadata_add.apply(
    lambda row: row["dataset_name"] + "_" + str(row.name) + "_" + row["type"], axis=1
)

display(metadata_add.head())
metadata = metadata.merge(metadata_add, how="outer").reset_index(drop=True)

##### Rename samples in counts #####

# Rename columns to sample_id
data.columns = data.columns.map(
    lambda orig_name: metadata_add[
        metadata_add["original_sample_name"] == str(orig_name)
    ]["sample_id"].values[0]
)
display(data.head())

data.to_csv(formatted_path, sep="\t")

Unnamed: 0,sample_id,dataset_name,original_sample_name,type
0,TCGA_0_HGSC,TCGA,TCGA-13-0920,HGSC
1,TCGA_1_HGSC,TCGA,TCGA-24-2033,HGSC
2,TCGA_2_HGSC,TCGA,TCGA-61-2110,HGSC
3,TCGA_3_HGSC,TCGA,TCGA-61-2111,HGSC
4,TCGA_4_HGSC,TCGA,TCGA-24-2254,HGSC


Unnamed: 0_level_0,TCGA_0_HGSC,TCGA_1_HGSC,TCGA_2_HGSC,TCGA_3_HGSC,TCGA_4_HGSC,TCGA_5_HGSC,TCGA_6_HGSC,TCGA_7_HGSC,TCGA_8_HGSC,TCGA_9_HGSC,...,TCGA_181_HGSC,TCGA_182_HGSC,TCGA_183_HGSC,TCGA_184_HGSC,TCGA_185_HGSC,TCGA_186_HGSC,TCGA_187_HGSC,TCGA_188_HGSC,TCGA_189_HGSC,TCGA_190_HGSC
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,3.0,34.0,26.0,19.0,37.0,120.0,35.0,13.0,12.0,4.0,...,14.0,19.0,13.0,93.0,70.0,29.0,9.0,13.0,24.0,13.0
A1BG-AS1,7.0,171.0,99.0,64.0,54.0,534.0,118.0,76.0,22.0,5.0,...,100.0,64.0,31.0,264.0,229.0,122.0,36.0,7.0,25.0,11.0
A1CF,4.0,0.0,13.0,0.0,2.0,3.0,5.0,1.0,0.0,0.0,...,0.0,3.0,3.0,2.0,0.0,1.0,2.0,0.0,2.0,0.0
A2M,4033.0,8870.0,4672.0,12371.0,19628.0,50762.0,33303.0,26762.0,2040.0,8466.0,...,12755.0,17636.0,5051.0,18046.0,12279.0,15432.0,20644.0,8304.0,10730.0,33646.0
A2M-AS1,177.0,241.0,51.0,211.0,204.0,276.0,327.0,139.0,71.0,147.0,...,49.0,505.0,107.0,108.0,66.0,281.0,287.0,53.0,54.0,152.0


## Combined metadata

In [29]:
display(metadata)

# Check that there are no duplicated sample_ids
assert metadata["sample_id"].nunique() == metadata.shape[0]

combined_metadata_path = "../../data/formatted_rnaseq_data/metadata.tsv"
metadata.to_csv(combined_metadata_path, sep="\t", index=False)

Unnamed: 0,sample_id,dataset_name,original_sample_name,type
0,GSE189553_0_CCC,GSE189553,CCC_1,CCC
1,GSE189553_1_CCC,GSE189553,CCC_2,CCC
2,GSE189553_2_CCC,GSE189553,CCC_3,CCC
3,GSE189553_3_CCC,GSE189553,CCC_4,CCC
4,GSE189553_4_CCC,GSE189553,CCC_5,CCC
...,...,...,...,...
281,TCGA_186_HGSC,TCGA,TCGA-59-2352,HGSC
282,TCGA_187_HGSC,TCGA,TCGA-61-2102,HGSC
283,TCGA_188_HGSC,TCGA,TCGA-13-0804,HGSC
284,TCGA_189_HGSC,TCGA,TCGA-13-1512,HGSC


## Format TCGA data (all other cancer types)

Saving metadata that includes all other TCGA cancer types separately.

In [None]:
dataset_name = "TCGA"
data_path = "../../data/public_data_sets/TCGA_all_cancer_types/TCGA_all_cancer_types_log2_counts.tsv"
formatted_path = f"../../data/formatted_rnaseq_data/TCGA_all_cancer_types.tsv"

tcga_all_cancer_metadata_path = f"../../data/public_data_sets/TCGA_all_cancer_types/TCGA_all_cancer_types_metadata_parsed.tsv"
tcga_all_cancer_metadata = pd.read_csv(tcga_all_cancer_metadata_path, sep="\t")

combined_metadata_path = "../../data/formatted_rnaseq_data/metadata.tsv"
combined_metadata = pd.read_csv(combined_metadata_path, sep="\t")

##### Include only non-ovarian cancers (Ovarian cancers already added separately) #####
tcga_all_cancer_metadata = tcga_all_cancer_metadata[tcga_all_cancer_metadata["cancer_type"] != "OV"]


##### Format counts #####

data = pd.read_csv(data_path, sep="\t", index_col=0)

# Set "gene_symbol" column as index
data.set_index("gene_symbol", inplace=True)

# Rename row index to Gene
data.index.name = "Gene"

# Remove extraneous columns
data = data.drop(columns=["ensembl_id"])

# Keep only non-ovarian cancer samples
data = data[data.columns[data.columns.isin(tcga_all_cancer_metadata.index)]]

# Change from log2(counts+1) to counts
data = 2 ** data - 1

# Keep only genes shared by GRCh37 and GRCh38
data = data[data.index.isin(shared_genes)]

# If there are duplicate genes, retain the highest read count
data = data.groupby("Gene").max()

##### Add samples to metadata #####

metadata_add = pd.DataFrame(
    columns=["sample_id", "dataset_name", "original_sample_name", "type", "non-ovarian_cancer_type"]
)
metadata_add["original_sample_name"] = data.columns
metadata_add["dataset_name"] = dataset_name
metadata_add["type"] = "non_ovarian"
metadata_add["non-ovarian_cancer_type"] = metadata_add["original_sample_name"].apply(
    lambda x: tcga_all_cancer_metadata[tcga_all_cancer_metadata["sample_id"] == x]["cancer_type"].values[0]
)

# Set sample id
num_tcga_ovarian_samples = combined_metadata[combined_metadata["dataset_name"] == "TCGA"].shape[0]
metadata_add["sample_id"] = metadata_add.apply(
    lambda row: row["dataset_name"] + "_" + str(int(row.name) + num_tcga_ovarian_samples) + "_" + row["type"], axis=1
)

# display(metadata_add.head())
combined_metadata = combined_metadata.merge(metadata_add, how="outer")
display(combined_metadata.head())

##### Rename samples in counts #####

# Rename columns to sample_id
data.columns = data.columns.map(
    lambda orig_name: metadata_add[
        metadata_add["original_sample_name"] == str(orig_name)
    ]["sample_id"].values[0]
)
display(data.head())

data.to_csv(formatted_path, sep="\t")

##### Save metadata with all TCGA cancer types separately #####

assert combined_metadata.index.nunique() == combined_metadata.shape[0]

combined_metadata_path = "../../data/formatted_rnaseq_data/combined_metadata_with_all_tcga_cancer_types.tsv"
combined_metadata.to_csv(combined_metadata_path, sep="\t", index=False)