In [1]:
import pandas as pd
import json   

# Import utilities
import os
import sys
# os.getcwd() did not return the notebook's folder as expected, so the working
# directory is set manually here. Update this path to match your own checkout.
os.chdir("/Volumes/kueck/PublicDataAnalysis/CASCAM_style_subtype_classification/src/preprocessing")

# Make the parent directory of the working directory importable; the `utilities`
# import right below is presumably resolved from there.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '../'))
if parent_dir not in sys.path:
    # Guarded so that re-running this cell does not pile up duplicate entries.
    sys.path.append(parent_dir)

from utilities import get_ensembl_mappings_grch38

In [2]:
# Load the Ensembl-ID -> gene-symbol mapping via the project helper.
# Later code in this notebook reads its `ensembl_gene_id` and `external_gene_name`
# columns, so the result is presumably a pandas DataFrame with (at least) those columns.
ensembl_to_gene_symbol_df = get_ensembl_mappings_grch38()

In [4]:
# Root folder of the TCGA data, given relative to the current working directory
# (which is set with os.chdir in the first cell).
tcga_data_folder = "../../data/public_data_sets/TCGA_all_cancer_types"
# Sub-folder that is scanned below for gzip-compressed gene count tables (*.gz).
gene_counts_folder = f"{tcga_data_folder}/gene_counts"

### Load log2 counts data

In [None]:
combined_log2_counts = pd.DataFrame()
# One small metadata frame per count file; concatenated once after the loop
# instead of re-copying a growing frame on every iteration.
metadata_per_cancer_type = []

# os.listdir() returns entries in arbitrary order. Sorting makes the column order,
# the metadata row order and the gene index (taken from the first file, because the
# join below is a left join) reproducible between runs.
for file in sorted(os.listdir(gene_counts_folder)):
    filename = os.fsdecode(file)

    if not filename.endswith(".gz"):
        continue

    # Cancer type = second "-"-separated token of the part before the first ".",
    # e.g. a file named like "TCGA-BRCA.<anything>.gz" yields "BRCA".
    cancer_type = filename.split(".")[0].split("-")[1]

    log2_counts_cancer_type = pd.read_csv(f"{gene_counts_folder}/{filename}", compression='gzip', sep="\t", index_col=0)

    # Rename with submitter ID instead of submitter_slide_id (up to 3rd "-"), merging samples with same submitter ID
    log2_counts_cancer_type.columns = ['-'.join(col.split("-", 3)[:3]) for col in log2_counts_cancer_type.columns]
    log2_counts_cancer_type = log2_counts_cancer_type.T
    log2_counts_cancer_type = log2_counts_cancer_type.groupby(level=0).mean() # Will ignore NaN values by default
    log2_counts_cancer_type = log2_counts_cancer_type.T
    assert not log2_counts_cancer_type.columns.duplicated().any(), \
        f"Duplicated submitter IDs remain after merging in {filename}"

    # Combine with other TCGA cancer types.
    # NOTE: DataFrame.join is a left join on the index, so only genes present in the
    # first loaded file are kept, and a sample ID that appears in two files raises.
    if combined_log2_counts.empty:
        combined_log2_counts = log2_counts_cancer_type
    else:
        combined_log2_counts = combined_log2_counts.join(log2_counts_cancer_type)

    # Add metadata (one row per sample; the scalar cancer type is broadcast)
    new_metadata = pd.DataFrame({"sample_id": log2_counts_cancer_type.columns, "cancer_type": cancer_type})
    if not new_metadata.empty:
        metadata_per_cancer_type.append(new_metadata)

if metadata_per_cancer_type:
    combined_metadata = pd.concat(metadata_per_cancer_type, ignore_index=True)
else:
    # No count files found: keep an empty table with the expected columns.
    combined_metadata = pd.DataFrame(columns=["sample_id", "cancer_type"])

# Save metadata
parsed_metadata_file = f"{tcga_data_folder}/TCGA_all_cancer_types_metadata_parsed.tsv"
combined_metadata.to_csv(parsed_metadata_file, sep="\t", index=False)

# Save combined TCGA data with the (versioned) Ensembl ID as index.
# The "encode_id" spelling in the file name is left untouched in case other code reads this path.
combined_log2_counts_encode_file = f"{tcga_data_folder}/TCGA_all_cancer_types_log2_counts_encode_id.tsv"
combined_log2_counts.to_csv(combined_log2_counts_encode_file, sep="\t")

# Add Ensembl ID column without the version suffix (text before the first ".")
combined_log2_counts.insert(0,
                "ensembl_id",
                [ensembl_id_with_version.split(".")[0] for ensembl_id_with_version in combined_log2_counts.index])

# Build the Ensembl-ID -> gene-symbol lookup once. Scanning the whole mapping table
# for every gene is needlessly slow; keep="first" preserves the original behaviour of
# using the first matching row when an ID occurs more than once.
first_mapping_per_id = ensembl_to_gene_symbol_df.drop_duplicates(subset="ensembl_gene_id", keep="first")
ensembl_id_to_gene_symbol = dict(zip(first_mapping_per_id["ensembl_gene_id"],
                                     first_mapping_per_id["external_gene_name"]))

def get_gene_symbol(ensembl_id):
    """Return the gene symbol mapped to `ensembl_id`.

    Parameters
    ----------
    ensembl_id : str
        Ensembl gene ID without version suffix.

    Returns
    -------
    The `external_gene_name` of the first row in `ensembl_to_gene_symbol_df` whose
    `ensembl_gene_id` equals `ensembl_id`, or None if there is no such row.
    """
    return ensembl_id_to_gene_symbol.get(ensembl_id)

# Add gene symbol column (None where the Ensembl ID has no mapping)
combined_log2_counts.insert(0,
                "gene_symbol",
                combined_log2_counts["ensembl_id"].apply(get_gene_symbol))

# Save combined TCGA data with gene annotation: the index is still the versioned
# Ensembl ID; `gene_symbol` and `ensembl_id` are the first two columns.
combined_log2_counts_file = f"{tcga_data_folder}/TCGA_all_cancer_types_log2_counts.tsv"
combined_log2_counts.to_csv(combined_log2_counts_file, sep="\t")