In [1]:
"""
Author: Kye D Nichols
This script makes figures and serves as a demo
"""
import os
import pandas as pd
import numpy as np

In [2]:
from customics import get_common_samples
from helper_scripts import *
from prep_data import *

In [3]:
#!Rscript data/download_PANCAN.r

In [4]:
#!Rscript data/download_TCGA-STAD.r

In [5]:
# Project directories, all expressed relative to the parent of the current
# working directory.
proc_dir = os.path.join(os.pardir, 'data_proc')
results_dir = os.path.join(os.pardir, 'results')
figures_dir = os.path.join(os.pardir, 'figures')
input_dir = os.path.join(os.pardir, 'download')

# Create the output directories up front. input_dir is deliberately not in
# this list, so it is never created here.
mydirs = [figures_dir, results_dir, proc_dir]
for mydir in mydirs:
    # exist_ok=True makes the call a no-op when the directory already exists
    # and avoids the check-then-create race of a separate os.path.exists().
    os.makedirs(mydir, exist_ok=True)

# Clinical columns selected from the clinical table.
# Each name must appear exactly once: indexing a DataFrame with a repeated
# label returns that column twice ("vital_status" used to be listed twice).
select_cols = ["age_at_diagnosis",
               "tissue_or_organ_of_origin",
               "primary_diagnosis",
               "ajcc_pathologic_t",
               "race",
               "vital_status",
               "treatments_pharmaceutical_treatment_or_therapy",
               "treatments_radiation_treatment_or_therapy",
               "gender",
               "icd_10_code",
               "ajcc_pathologic_m",
               "prior_malignancy"]

# Location of the clinical CSV inside input_dir.
clinical_path = os.path.join(input_dir, "STAD_clinical.csv")

In [6]:
# --- Run configuration ---------------------------------------------------
# Active setup: immune-subtype labels.
# Commented alternatives for the GI-subtype setup:
#   output_name    = "TCGA-STAD-GI.subtype.noprot"
#   encoding       = {'GI.CIN': 0, 'GI.EBV': 1, 'GI.GS': 2, 'GI.MSI': 3}
#   labels_path    = os.path.join(input_dir, "PANCAN_Subtype.csv")
#   label_col_name = "Subtype_Selected"
output_name = "TCGA-STAD-Immune.subtype.noprot"
label_col_name = "Immune.Subtype"
labels_path = os.path.join(input_dir, "PANCAN_Subtype_combined.csv")

# Label name -> integer code.
encoding = dict(StE=0, ImD=1, ImE=2)

# Passed to get_customics_latent as the latent dimension.
latent_dim = 20

# Data type -> tag string handed to prep_multi_omics.
# Disabled entry:  "Protein": "RPPA.csv"
datatype_tag_dict = dict(
    RNAseq="Counts_tpm.csv",
    miRNAseq="miRNA.csv",
    methyl="Methyl450-Beta.csv",
)

# One cluster per encoded label.
cluster_num = len(encoding)
distance_types = ["gower", "wishart", "podani"]

# 1.01 followed by 1.1, 1.2, ..., 4.4.
norm_params = [1.01] + [tenths / 10 for tenths in range(11, 45)]

# Cluster count used for the runs on the merged (omics + clinical) tables.
mixedd_cluster = 7

In [7]:
# Prepare the multi-omics inputs for the configured run. The helper returns a
# 4-tuple that is unpacked into the names used by the rest of the notebook.
omics_df, labels, mysamples, outpaths = prep_multi_omics(
    input_dir,
    output_name,
    proc_dir,
    labels_path,
    label_col_name,
    datatype_tag_dict,
    encoding,
    overwrite=True,
)

Formating RNA-seq Data
(415, 22755)
Formating Methylation Data
(395, 16741)
Formating miRNA-seq Data
(436, 1556)


In [8]:
# Run Mixomics using outpaths

In [9]:
#!(runMixOmics.r TCGA-STAD-GI.subtype.noprot)

In [10]:
#pca_dims=200
#pca_data_path = os.path.join(output_path, "%s_pca.csv" % output_name)
#pca_df = pca_multi_omics(omics_df, pca_dims)
#pca_df.to_csv(pca_data_path)
        
#runtsne(pca_df, figures_dir, output_name+"_pca", encoding, labels, labels_key="labels")
#runumap(pca_df, figures_dir, output_name+"_pca", encoding, labels, labels_key="labels")

In [11]:
# Obtain the latent table from the CustOmics helper; latent_dim is the
# requested latent dimension.
latent_df = get_customics_latent(
    results_dir, output_name, omics_df, mysamples, labels, latent_dim, encoding
)

Number of Parameters:  32748796
	Epoch 1 complete. 	Average Loss Train :  0.5750712582043239 	Average Loss Val :  0.5161418318748474
	Epoch 2 complete. 	Average Loss Train :  0.5089392704623086 	Average Loss Val :  0.4863979071378708
	Epoch 3 complete. 	Average Loss Train :  0.4654242992401123 	Average Loss Val :  0.45257745683193207
	Epoch 4 complete. 	Average Loss Train :  0.4242793917655945 	Average Loss Val :  0.4206697940826416
	Epoch 5 complete. 	Average Loss Train :  0.358467127595629 	Average Loss Val :  0.4051290899515152
	Epoch 6 complete. 	Average Loss Train :  0.17541852380548204 	Average Loss Val :  0.1519867554306984
	Epoch 7 complete. 	Average Loss Train :  0.11938887302364622 	Average Loss Val :  0.13415444642305374
	Epoch 8 complete. 	Average Loss Train :  0.10461862278836113 	Average Loss Val :  0.12744084745645523
	Epoch 9 complete. 	Average Loss Train :  0.09295501347099032 	Average Loss Val :  0.12525710090994835
	Epoch 10 complete. 	Average Loss Train :  0.0809608

In [None]:
# t-SNE and UMAP helpers on the latent table; both receive the same arguments.
latent_embed_args = (latent_df, figures_dir, output_name, labels)
runtsne(*latent_embed_args, labels_key="labels", encoding=encoding)
runumap(*latent_embed_args, labels_key="labels", encoding=encoding)

In [None]:
# Prefix every latent column with "latent-".
# The rename is idempotent: columns that already carry the prefix are left
# alone, so re-running this cell does not produce "latent-latent-0" and
# silently change the column names seen by later cells.
latent_prefix = "latent-"
latent_df.columns = [
    str(col) if str(col).startswith(latent_prefix) else "latent-%s" % str(col)
    for col in latent_df.columns.to_list()
]

# Cluster the latent table for every distance type / norm parameter, then
# save the plots for that run.
output_name_latent = output_name+"_latent"
get_clustering_results(proc_dir,
                       results_dir,
                       output_name_latent,
                       cluster_num,
                       distance_types,
                       norm_params,
                       latent_df,
                       labels
                       )

save_all_plots(figures_dir,
               cluster_num,
               norm_params,
               output_name_latent,
               results_dir)

In [None]:
# File-name template for the merged latent + clinical CSV; it is filled with
# (name, K) where the table is written.
comp_clinical_fname = "%s_K=%i_clinical_input.csv"

# Read the clinical CSV, using its third column (position 2) as the row index.
clinical1 = pd.read_csv(
    clinical_path,
    index_col=2,
)
clinical1.head()

In [None]:
# Keep only the selected clinical fields, then discard every column that
# contains at least one missing value.
selected_clinical = clinical1[select_cols]
clinical_df = selected_clinical.dropna(axis=1)
clinical_df.head()

In [None]:
# Show (rows, columns) of the clinical table that remains after the column
# selection and dropna step.
clinical_df.shape

In [None]:
# Restrict both tables to the samples returned by get_common_samples and
# join the clinical columns onto the latent columns.
sel_index = get_common_samples([latent_df, clinical_df])
latent_common = latent_df.loc[sel_index]
clinical_common = clinical_df.loc[sel_index]
merged_latent = latent_common.join(clinical_common)

# Write the merged table into proc_dir.
merged_latent_fname = comp_clinical_fname % (output_name, cluster_num)
merged_latent.to_csv(os.path.join(proc_dir, merged_latent_fname))



In [None]:
# Cluster the merged latent + clinical table with mixedd_cluster clusters.
# The k-means run is switched off here (kmeans_run=False), and the plotting
# call is given kmeans=False / rand=False to match.
output_name_latent_mixed = output_name + "_latent_mixed"
get_clustering_results(
    proc_dir,
    results_dir,
    output_name_latent_mixed,
    mixedd_cluster,
    distance_types,
    norm_params,
    merged_latent,
    labels,
    kmeans_run=False,
)

save_all_plots(
    figures_dir,
    mixedd_cluster,
    norm_params,
    output_name_latent_mixed,
    results_dir,
    kmeans=False,
    rand=False,
)

In [None]:
# Load the mixOmics result for this run from proc_dir: a data table plus its
# label table.
mixomics_df, lbldf = get_mixomics_output(proc_dir, output_name)


In [None]:
# t-SNE and UMAP helpers on the mixOmics table, under a "mixomics-" name.
mixomics_plot_name = "mixomics-%s" % output_name
runtsne(mixomics_df, figures_dir, mixomics_plot_name, lbldf)
runumap(mixomics_df, figures_dir, mixomics_plot_name, lbldf)

In [None]:
# Cluster the mixOmics table with one cluster per encoded label, then save
# the plots for that run.
output_name_mixomics = output_name + "_mixomics"
get_clustering_results(
    proc_dir,
    results_dir,
    output_name_mixomics,
    cluster_num,
    distance_types,
    norm_params,
    mixomics_df,
    lbldf,
)

save_all_plots(
    figures_dir,
    cluster_num,
    norm_params,
    output_name_mixomics,
    results_dir,
)

In [None]:
# Replace the mixOmics row labels with the latent table's index. The
# assignment is positional (row i takes latent_df.index[i]).
# NOTE(review): this assumes both tables hold the same samples in the same
# order — confirm against get_mixomics_output / get_customics_latent;
# otherwise rows end up under the wrong sample label.
# NOTE(review): the in-place change happens after the clustering cell above
# has already consumed mixomics_df with its original index.
mixomics_df.index = latent_df.index

In [None]:
# Restrict both tables to the samples returned by get_common_samples and
# join the clinical columns onto the mixOmics columns.
sel_index = get_common_samples([mixomics_df, clinical_df])
mixomics_common = mixomics_df.loc[sel_index]
clinical_for_mixomics = clinical_df.loc[sel_index]
merged_mixomics = mixomics_common.join(clinical_for_mixomics)
merged_mixomics

In [None]:
# Cluster the merged mixOmics + clinical table with mixedd_cluster clusters.
# The k-means run is switched off here (kmeans_run=False), and the plotting
# call is given kmeans=False / rand=False to match.
output_name_mixomics_mixed = output_name + "_mixomics_mixed"
get_clustering_results(
    proc_dir,
    results_dir,
    output_name_mixomics_mixed,
    mixedd_cluster,
    distance_types,
    norm_params,
    merged_mixomics,
    lbldf,
    kmeans_run=False,
)

save_all_plots(
    figures_dir,
    mixedd_cluster,
    norm_params,
    output_name_mixomics_mixed,
    results_dir,
    kmeans=False,
    rand=False,
)

In [None]:
#methyldf_load_path = os.path.join(base_dir, "methyl_loadings.csv")
#mrnadf_load_path = os.path.join(base_dir, "mRNA_loadings.csv")
#mirnadf_load_path = os.path.join(base_dir, "miRNA_loadings.csv")
#methyl_loads = pd.read_csv(methyldf_load_path, index_col=0)
#mrna_loads = pd.read_csv(mrnadf_load_path, index_col=0)
#mirna_loads = pd.read_csv(mirnadf_load_path, index_col=0)