In [None]:
"""
Author: Kye D Nichols
This notebook prepares multi-omics data, builds latent embeddings, and runs clustering

Usage: prep_data.py
"""
import os
import pandas as pd
import numpy as np

In [None]:
from prep_data import prep_multi_omics
from run_customics import get_customics_latent

In [None]:
from run_kmedoids import run_kmedoids_clustering
from customics import CustOMICS, get_common_samples, get_sub_omics_df
from helper_scripts import *

In [None]:
# Subtype label table; the commented line is the alternative label source.
labels_path = os.path.join(os.pardir, "extra/PANCAN_Subtype.csv")
#labels_path = os.path.join(os.pardir, "extra/Immune_subtypes.csv")

# Processed data, clustering results and figures each go to a sibling
# directory of the notebook's parent folder.
output_path, results_dir, figures_dir = (
    os.path.join(os.pardir, dir_name)
    for dir_name in ('data_proc', 'results', 'figures')
)

In [None]:
# Disabled GDC-PANCAN configuration: the whole cell is one bare string literal,
# so none of the statements inside it execute.
# NOTE(review): the dict literal inside repeats the "RNAseq" key three times; if
# this block is re-enabled as-is only the last entry (".htseq_fpkm.tsv") is kept.
'''
output_name = "GDC-PANCAN"
input_dir = os.path.join(os.pardir, 'data/GDC-PANCAN')
latent_dim = 32

label_col_name = "cancer.type"
sep_token = "\t"
label_idx=0
datatype_tag_dict = {"miRNAseq":".mirna.tsv",
                     "RNAseq":".htseq_counts.tsv",
                     "RNAseq":".htseq_fpkm-uq.tsv",
                     "RNAseq":".htseq_fpkm.tsv",
                     "methyl":".methylation450.tsv",
                     "CNV": ".gistic.tsv"
                     }
encoding = {'STAD': 0, 'LAML': 1, 'COAD': 2, 'ESCA': 3, 'BRCA':4}
(omics_df, labels, mysamples, outpaths) = prep_multi_omics(input_dir,
                                                 output_name,
                                                 output_path,
                                                 label_col_name,
                                                 label_idx,
                                                 datatype_tag_dict,
                                                 sep_token,
                                                 labels_path,
                                                 encoding)


pca_dims=200
pca_data_path = os.path.join(output_path, "%s_pca.csv" % output_name)
pca_df = pca_multi_omics(omics_df, pca_dims)
pca_df.to_csv(pca_data_path)

runtsne(pca_df, figures_dir, output_name, encoding, labels, labels_key="labels")
runumap(pca_df, figures_dir, output_name, encoding, labels, labels_key="labels")
'''

In [None]:
# Run identifier (used in output file names below) and latent width.
#output_name = "TCGA-STAD-Immune.subtype.noprot"
output_name = "TCGA-STAD-GI.subtype.noprot"
latent_dim = 32

# Location and delimiter of the per-omics input tables.
input_dir = os.path.join(os.pardir, 'data/TCGA-STAD')
sep_token = ","

# Omics layer -> file tag handed to prep_multi_omics
# (values look like file-name suffixes; confirm against prep_multi_omics).
datatype_tag_dict = {
    "RNAseq": "Counts_tpm.csv",
    "miRNAseq": "miRNA.csv",
    "methyl": "Methyl450-Beta.csv",
    # "Protein": "RPPA.csv",
}

# Label column, its index argument and the class -> integer encoding.
# NOTE(review): the run name says "GI.subtype" while the active encoding uses
# StE/ImD/ImE and the GI.* encoding is commented out - confirm they belong together.
#label_col_name = "Subtype_Selected"
#encoding = {'GI.CIN': 0, 'GI.EBV': 1, 'GI.GS': 2, 'GI.MSI': 3}
#label_idx= 1
label_col_name = "cancer.type"
encoding = {"StE": 0, "ImD": 1, "ImE": 2}
label_idx = 0

(omics_df, labels, mysamples, outpaths) = prep_multi_omics(
    input_dir,
    output_name,
    output_path,
    label_col_name,
    label_idx,
    datatype_tag_dict,
    sep_token,
    labels_path,
    encoding,
)

# File-name template (run name, cluster count) for the latent + clinical table.
comp_clinical_fname = "%s_K=%i_clinical_input.csv"

# Two clinical tables read from the same input directory and joined on their
# index columns (column 2 of the CSV, column 0 of the TSV).
clinical1 = pd.read_csv(os.path.join(input_dir, "STAD_clinical.csv"), index_col=2)
clinical2 = pd.read_csv(os.path.join(input_dir, "TCGA-STAD.GDC_phenotype.tsv"), sep="\t", index_col=0)
clinical_df = clinical1.join(clinical2)

# Clinical covariates to keep. Every name must appear exactly once: selecting
# with a repeated label would return that column twice and carry the duplicate
# into every table merged from clinical_df.
cat_cols = ["tissue_or_organ_of_origin",
            "primary_diagnosis",
            "ajcc_pathologic_t",
            "race",
            "vital_status",
            "treatments_pharmaceutical_treatment_or_therapy",
            "treatments_radiation_treatment_or_therapy",
            "gender",
            "icd_10_code",
            "ajcc_pathologic_m",
            "prior_malignancy"]

clinical_df_reduced = clinical_df[cat_cols]

# Drop every covariate that has at least one missing value.
clinical_df = clinical_df_reduced.dropna(axis="columns")

# Cell output: prepared omics file paths and the shape of the clinical table.
outpaths, clinical_df.shape

In [None]:
# Latent table returned by get_customics_latent for the prepared omics data;
# `latent_dim` and `encoding` come from the configuration cell.
latent_df = get_customics_latent(
    output_path, output_name, omics_df, mysamples, labels, latent_dim, encoding
)

In [None]:
# The R step is not launched from here: the cell only prints the command line
# (script, prepared omics files, output .rds) for the user to run by hand.
mixomics_str = "Rscript runMixOmics.r"
mixomics_output_path = os.path.join(output_path, "%s_mixomics.rds" % output_name)
mixomics_inputs = ' '.join(outpaths)
print("Run:\n%s %s %s" % (mixomics_str, mixomics_inputs, mixomics_output_path))

In [None]:
# Prefix the latent columns with "latent-". The assignment mutates latent_df in
# place, so already-prefixed names are left alone: re-running this cell must not
# produce "latent-latent-0".
latent_df.columns = [
    col if str(col).startswith("latent-") else "latent-%s" % str(col)
    for col in latent_df.columns.to_list()
]

# Clustering grid: three distance types and the normalisation parameters
# 1.01 followed by 1.1 .. 4.4 in steps of 0.1.
distance_types = ["gower", "wishart", "podani"]
norm_params = [1.01]+[i/10 for i in range(11,45,1)]
cluster_num = 4
output_name_latent = output_name+"_latent"
get_clustering_results(output_path,
                       results_dir,
                       output_name_latent,
                       cluster_num,
                       distance_types,
                       norm_params,
                       latent_df,
                       labels)

In [None]:
# Plots for the latent-only clustering run, read from results_dir.
save_all_plots(
    figures_dir, cluster_num, norm_params, output_name_latent, results_dir
)

In [None]:
# 2-D t-SNE and UMAP views of the latent table, using the `labels` with
# labels_key="labels".
runtsne(
    latent_df, figures_dir, output_name, encoding, labels,
    labels_key="labels",
)
runumap(
    latent_df, figures_dir, output_name, encoding, labels,
    labels_key="labels",
)

In [None]:
# Restrict both tables to the samples they share, then put the clinical
# covariates next to the latent features.
sel_index = get_common_samples([latent_df, clinical_df])
latent_common = latent_df.loc[sel_index]
clinical_common = clinical_df.loc[sel_index]
merged_latent = latent_common.join(clinical_common)

# Persist the mixed table under the "<run>_K=<k>_clinical_input.csv" name.
merged_latent_csv = os.path.join(output_path, comp_clinical_fname % (output_name, cluster_num))
merged_latent.to_csv(merged_latent_csv)

In [None]:
# Clustering on the latent + clinical table; the k-means run is switched off
# for this mixed input via kmeans_run=False.
output_name_latent_mixed = output_name + "_latent_mixed"
get_clustering_results(
    output_path,
    results_dir,
    output_name_latent_mixed,
    cluster_num,
    distance_types,
    norm_params,
    merged_latent,
    labels,
    kmeans_run=False,
)

In [None]:
# Plots for the latent + clinical run; kmeans=False matches the clustering call,
# which was made with kmeans_run=False.
save_all_plots(
    figures_dir,
    cluster_num,
    norm_params,
    output_name_latent_mixed,
    results_dir,
    kmeans=False,
)

In [None]:
# File-name template (run name, cluster count) for the merged component table.
comp_fname = "%s_K=%i_comp_input.csv"

# One "...variates.csv" file per prepared omics table, named by swapping the
# ".csv" suffix of each prepared file.
mixomics_paths = [path.replace(".csv", "variates.csv") for path in outpaths]
mixomics_df = merge_components(mixomics_paths)

# Re-use the row index of the first prepared omics table for the components.
sample_index = pd.read_csv(outpaths[0], index_col=0).index
mixomics_df.index = sample_index

comp_csv_path = os.path.join(output_path, comp_fname % (output_name, cluster_num))
mixomics_df.to_csv(comp_csv_path)

In [None]:
# Same clustering grid as the latent run, applied to the mixOmics components.
output_name_comp = output_name + "_comp"
get_clustering_results(
    output_path,
    results_dir,
    output_name_comp,
    cluster_num,
    distance_types,
    norm_params,
    mixomics_df,
    labels,
)

In [None]:
# Plots for the component-only clustering run, read from results_dir.
save_all_plots(
    figures_dir, cluster_num, norm_params, output_name_comp, results_dir
)

In [None]:
# Restrict components and clinical covariates to their shared samples and
# place them side by side.
sel_index = get_common_samples([mixomics_df, clinical_df])
comp_common = mixomics_df.loc[sel_index]
clinical_common = clinical_df.loc[sel_index]
merged_comp = comp_common.join(clinical_common)

In [None]:
# 2-D t-SNE and UMAP views of the mixOmics components, written under the
# "_comp" run name so they do not overwrite the figures already produced for
# the latent table.
# NOTE(review): this cell previously repeated the latent_df plots verbatim;
# plotting mixomics_df is inferred from its position in the component section.
runtsne(mixomics_df, figures_dir, output_name_comp, encoding, labels, labels_key="labels")
runumap(mixomics_df, figures_dir, output_name_comp, encoding, labels, labels_key="labels")

In [None]:
# Clustering on the components + clinical table. As with the latent + clinical
# run, k-means is switched off for this mixed input, which also matches the
# kmeans=False plotting call made for this run name.
output_name_comp_mixed = output_name+"_comp_clinical"
get_clustering_results(output_path,
                       results_dir,
                       output_name_comp_mixed,
                       cluster_num,
                       distance_types,
                       norm_params,
                       merged_comp,
                       labels,
                       kmeans_run=False)

In [None]:
# Plots for the components + clinical run, drawn with kmeans=False.
save_all_plots(
    figures_dir,
    cluster_num,
    norm_params,
    output_name_comp_mixed,
    results_dir,
    kmeans=False,
)

In [None]:
# Settings for the cluster-comparison table, passed to compare_clusters as strings.
# NOTE(review): this rebinds `cluster_num` from the integer used by the cells
# above to a string; re-running those cells afterwards would feed a str to the
# "%i" file-name templates.
cluster_num = "7"
dist_type = "gower"
norm_param = "4.0"

# The table is named after the run identifier `output_name`; the name `outname`
# is not defined anywhere in this notebook.
table_outpath = os.path.join(output_path, "%s_Table1.xlsx" % output_name)

# NOTE(review): `indf` is not assigned in any visible cell - bind it to the
# table that holds the cluster assignments and the cat_cols covariates before
# running this cell.
compare_clusters(indf, table_outpath, cluster_num, dist_type, norm_param, cat_cols)