# 1. set up

## 1.1. library

In [None]:
import sys
print("print version")
print(sys.version)

import os
import time

from collections import Counter

import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

# for plotting
import matplotlib.patches as patches # for plotting figure
from matplotlib.patches import Patch
import gc # for collecting garbage
import seaborn as sns

In [None]:
from helper_plot import *

# 2. read data

In [None]:
# Directory holding the per-dataset TSV files. The loops below read
# '<dataset>.metadata.tsv', '<dataset>.df_probabilities.tsv' and
# '<dataset>.genotype_count.tsv' from it.
path_data_fran = '../data/results/'
vec_data_fran_tsv = os.listdir(path_data_fran)

# Dataset name = file name up to the first '.'.
# Hidden / extension-less entries such as '.DS_Store' give an empty prefix and
# are dropped so they cannot be mistaken for a dataset. Sorting makes the load
# (and print) order deterministic instead of depending on set iteration order.
# 'GTEx' and 'Ovarian_cancer_wgs' are excluded here because they are loaded by
# the second loop (no metadata file is read for them); 'MPN' and
# 'Cosines_all_samples' are not loaded at all.
vec_data_name = sorted(
    {i.split('.')[0] for i in vec_data_fran_tsv if i.split('.')[0]}
    - {'MPN', 'GTEx', 'Ovarian_cancer_wgs', 'Cosines_all_samples'}
)
dict_prob = {}
dict_meta = {}
dict_count = {}

# Datasets with metadata + probabilities + genotype counts
# (author's note lists: Ovarian_cancer, scATACseq, Heart_atlas).
for dataset in vec_data_name:
    print(dataset)
    dict_meta[dataset] = pd.read_csv(
        os.path.join(path_data_fran, dataset + '.metadata.tsv'), sep = '\t'
    )
    dict_prob[dataset] = pd.read_csv(
        os.path.join(path_data_fran, dataset + '.df_probabilities.tsv'), sep = '\t'
    )
    dict_count[dataset] = pd.read_csv(
        os.path.join(path_data_fran, dataset + '.genotype_count.tsv'), sep = '\t'
    )

# Datasets for which only probabilities and genotype counts are read
# (Ovarian_cancer_wgs, GTEx).
for name_dataset in ['Ovarian_cancer_wgs', 'GTEx']:
    dict_prob[name_dataset] = pd.read_csv(
        os.path.join(path_data_fran, name_dataset + '.df_probabilities.tsv'), sep = '\t'
    )
    dict_count[name_dataset] = pd.read_csv(
        os.path.join(path_data_fran, name_dataset + '.genotype_count.tsv'), sep = '\t'
    )

# 3. plot

## 3.1. heart atlas

### 3.1.1. data process

Run the following cell first: the plotting cells for this dataset use the variables it defines.

In [None]:
name_dataset = 'Heart_atlas'

# Bind the three tables of this dataset up front; the summary below reads them.
df_prob = dict_prob[name_dataset]
df_meta = dict_meta[name_dataset]
df_count = dict_count[name_dataset]

# Distinct values of every metadata column, keyed by column name.
unique_values = {col: df_meta[col].unique() for col in df_meta.columns}

print(f"shape of prob: {df_prob.shape}")
print(f"shape of meta: {df_meta.shape}")
print(f"columns: {list(df_meta.columns)}")
print(f"n_uniq per col: {[len(vals) for vals in unique_values.values()]}")

# GT.read = GT.all - GT.NA (presumably the number of non-missing genotype
# calls). Written onto the table stored in dict_count, as before.
df_count['GT.read'] = df_count['GT.all'] - df_count['GT.NA']
# The first column of the count table is renamed to 'Sample' so it can serve
# as the merge key with the metadata.
df_count = df_count.rename(columns={df_count.columns[0]: 'Sample'})
df_meta = df_meta.merge(df_count, how = 'left', on = 'Sample')

# Map each donor id to a short label H_ID1, H_ID2, ... in order of appearance.
dict_id_id_heart = {
    id_heart: f'H_ID{rank}'
    for rank, id_heart in enumerate(df_meta['Donor'].unique(), start = 1)
}

col_id_ind, col_id_sample = 'Donor', 'Sample'
list_meta_imp = ['region.organ_part', 'GT.read']

# get_data_barh is called while 'GT.read' still holds raw counts, so the
# 20000 threshold is expressed in counts.
df, n_sample_subset = get_data_barh(
    df_prob = df_prob, df_meta = df_meta,
    col_id_ind = col_id_ind, col_id_sample = col_id_sample,
    list_meta_imp = list_meta_imp, thres_GT = 20000,
)

# Keep the raw count, then overwrite 'GT.read' with the formatted percentage
# of n_mut.
df_meta['GT.read_count'] = df_meta['GT.read'].copy()
df_meta['GT.read'] = (df_meta['GT.read_count'] / n_mut * 100).apply(format_to_three_significant_digits)

### 3.1.2. horizontally stacked plots

uncomment the following code block to save figures

In [None]:
# col_id_ind = 'Donor'
# col_id_sample = 'Sample'
# list_meta_imp = ['EthnicOrigin', 'region', 'region.organ_part', 'GT.read', 'Donor']

# for index_pt in [5, 9]:
#     df_prob_plot, df_meta_plot, id_patient_plot = get_df_prob(df_prob, df_meta, 
#                                                               col_id_ind, col_id_sample, list_meta_imp, 
#                                                               index_pt = index_pt)
#     get_barhplot(df_prob_plot.T, df_meta_plot, label_tick = 'region',
#                  bool_save_plot = True, 
#                  path_plot = '../figure/figure4a',
#                  name_plot = 'figure4a_heart',
#                  name_plot_i = df_meta_plot['Donor'][0])

### 3.1.3. pie charts

uncomment the following code block to save figures

In [None]:
# for index, row in df.iterrows():
#     plot_pie_chart(index, row, 'figure4a_heart', bool_save_plot = True, path_plot = "../figure/figure4a")

## 3.2. GTEx

### 3.2.1. data process

Run the following cell first: the plotting cells for this dataset use the variables it defines.

In [None]:
thres_GT = 20000
name_dataset = 'GTEx'
df_prob = dict_prob[name_dataset]
# Reorder the rows of the probability table by vec_pop_ordered.
df_prob = df_prob.loc[vec_pop_ordered]

# GTEx has no metadata file; the genotype-count table is used as metadata.
# NOTE: df_meta is the same object as dict_count['GTEx'], so the columns and
# index set below are written onto that shared table.
df_meta = dict_count[name_dataset]
df_meta['GT.read'] = df_meta['GT.all'] - df_meta['GT.NA']
# expand = False makes str.extract return a Series (one capture group)
# instead of a one-column DataFrame.
# ID_pt: the first two '-'-separated fields at the start of the ID.
df_meta['ID_pt'] = df_meta['ID'].str.extract(r'^([^\-]+\-[^\-]+)', expand = False)
# Region: the text following a double underscore, up to the end of the ID.
df_meta['Region'] = df_meta['ID'].str.extract(r'__([^_].+)$', expand = False)
df_meta.index = df_meta['ID']

# Keep samples with at least thres_GT genotyped reads ...
vec_index_GTEX_thres20000 = df_meta[df_meta['GT.read'] >= thres_GT].index
df_meta_subset = df_meta.loc[df_meta['ID'].isin(vec_index_GTEX_thres20000)]
# ... that belong to an individual with more than one remaining sample.
# .copy() gives an independent frame so the column assignments below do not
# write to a slice of df_meta.
df_meta_subset = df_meta_subset.groupby('ID_pt').filter(lambda x: len(x) > 1).copy()
vec_index_GTEX_thres20000 = df_meta_subset.index

# Select the probability columns whose name is a retained sample ID.
# isin() compares against the ID *values*; the former `i in Series` test
# checked the Series index and only worked because the index equals ID.
df_prob_subset = df_prob.loc[:, df_prob.columns.isin(df_meta_subset['ID'])]

# Keep the raw count, then overwrite 'GT.read' with the formatted percentage
# of n_mut.
df_meta_subset['GT.read_count'] = df_meta_subset['GT.read'].copy()
df_meta_subset['GT.read'] = (df_meta_subset['GT.read_count']/n_mut) * 100
df_meta_subset['GT.read'] = df_meta_subset['GT.read'].apply(format_to_three_significant_digits)

df_prob = df_prob_subset
df_meta = df_meta_subset

col_id_ind = 'ID_pt'
col_id_sample = 'ID'

df, n_sample_subset = get_data_barh(df_prob = df_prob, df_meta = df_meta,
                                    col_id_ind = col_id_ind, col_id_sample = col_id_sample,
                                    list_meta_imp = ['GT.read', 'ID_pt'])

list_meta_imp = ['Region_rename', 'GT.read', 'ID_pt']

# Two-letter tick labels for the GTEx regions.
dict_GTEx_region = {
    'Heart_Left_Ventricle': 'LV',
    'Lung': 'LG',
    'Muscle_Skeletal': 'MS',
    'Prostate': 'PS',
    'Esophagus_Muscularis': 'EM',
}

# A region missing from the mapping raises KeyError here (deliberately not
# silenced), so an unexpected region is noticed immediately.
df_meta['Region_rename'] = [dict_GTEx_region[i] for i in df_meta['Region']]


### 3.2.2. horizontally stacked plots

uncomment the following code block to save figures

In [None]:
# for index_pt in range(len(df_meta[col_id_ind].unique())):
#     df_prob_plot, df_meta_plot, id_patient_plot = get_df_prob(df_prob, df_meta, 
#                                                               col_id_ind, col_id_sample, list_meta_imp, 
#                                                               index_pt = index_pt)
#     get_barhplot(df_prob_plot.T, df_meta_plot, label_tick = 'Region_rename',
#                  bool_save_plot = True, 
#                  name_plot = 'figure4b_GTEx',
#                  name_plot_i = df_meta_plot['ID_pt'][0],
#                  path_plot = "../figure/figure4b/")

### 3.2.3. pie charts

uncomment the following code block to save figures

In [None]:
# for index, row in df.iterrows():
#     plot_pie_chart(index, row, 'figure4b_GTEx', size_pie = 0.9, size_font = 5, bool_save_plot = True,
#                    path_plot = '../figure/figure4b')

## 3.3. ovarian

### 3.3.1. data process

Run the following cell first: the plotting cells for this dataset use the variables it defines.

In [None]:
# Ovarian_cancer (tables with metadata)
name_dataset = 'Ovarian_cancer'
df_meta = dict_meta[name_dataset]
df_prob = dict_prob[name_dataset]
thres_GT = 20000

df_count = dict_count[name_dataset]
# Distinct values of every metadata column, keyed by column name.
unique_values = {col: df_meta[col].unique() for col in df_meta.columns}

print(f"meta data shape: {df_meta.shape}")
print(f"columns: {list(df_meta.columns)}")
print(f"n_uniq per col: {[len(df_meta[col].unique()) for col in df_meta.columns]}")

print(f"\ntumor_supersite: {unique_values['tumor_supersite']}")
print(f"\ntumor_subsite: {unique_values['tumor_subsite']}")
print(f"\ntumor_type: {unique_values['tumor_type']}")

# Ovarian_cancer WGS (probabilities and genotype counts only)
name_dataset = 'Ovarian_cancer_wgs'
df_prob_wgs = dict_prob[name_dataset]
df_count_wgs = dict_count[name_dataset]

# simple data transformation
# WGS: the patient id is the part of the ID before the first '_'; a sample
# whose ID equals its patient id (no '_' suffix) is labelled Tumor = 'No'.
df_count_wgs['ID_PT'] = df_count_wgs['ID'].str.split('_').str.get(0)
df_count_wgs['Tumor'] = np.where(df_count_wgs['ID_PT'] == df_count_wgs['ID'], 'No', 'Yes')

# Split the '_'-separated ID into patient (field 0), sample (field 2) and
# site (fields 3 onwards, re-joined with '_').
df_count['ID_PT'] = df_count['ID'].str.split('_').str.get(0)
df_count['ID_Sample'] = df_count['ID'].str.split('_').str.get(2)
df_count['Site'] = df_count['ID'].str.split('_').str[3:].str.join('_')
df_plot_temp = df_count[df_count['ID_PT'] == 'SPECTRUM-OV-009']
df_count['GT.read'] = df_count['GT.all'] - df_count['GT.NA']
df_count_wgs['GT.read'] = df_count_wgs['GT.all'] - df_count_wgs['GT.NA']

# Attach 'GT.read' to the metadata, using the first metadata column
# (renamed to 'ID') as the merge key.
df_meta = df_meta.rename(columns = {df_meta.columns[0]: 'ID'})
df_meta_merge = df_meta.merge(df_count, on = 'ID', how = 'left')
df_meta = df_meta.merge(df_meta_merge[['ID', 'GT.read']], on = 'ID', how = 'left')
# .copy(): df_meta_wgs receives new columns below and must not be a slice of
# df_count_wgs.
df_meta_wgs = df_count_wgs[['ID', 'GT.read', 'Tumor', 'ID_PT']].copy()

df_meta.index = df_meta['ID']

# Keep samples with at least thres_GT genotyped reads that belong to a patient
# with more than one remaining sample.
vec_index_ovarian_thres20000 = df_meta[df_meta['GT.read'] >= thres_GT].index
df_meta_subset = df_meta.loc[df_meta['ID'].isin(vec_index_ovarian_thres20000)]
# .copy() gives an independent frame for the column assignments below.
df_meta_subset = df_meta_subset.groupby('patient_id').filter(lambda x: len(x) > 1).copy()

# Select the probability columns whose name is a retained sample ID.
# isin() compares against the ID *values*; the former `i in Series` test
# checked the Series index and only worked because the index equals ID.
df_prob_subset = df_prob.loc[:, df_prob.columns.isin(df_meta_subset['ID'])]

# Keep the raw count, then overwrite 'GT.read' with the formatted percentage
# of n_mut (same treatment for the scRNA-seq and WGS metadata).
df_meta_subset['GT.read_count'] = df_meta_subset['GT.read'].copy()
df_meta_subset['GT.read'] = (df_meta_subset['GT.read_count']/n_mut) * 100
df_meta_subset['GT.read'] = df_meta_subset['GT.read'].apply(format_to_three_significant_digits)

df_meta_wgs['GT.read_count'] = df_meta_wgs['GT.read'].copy()
df_meta_wgs['GT.read'] = (df_meta_wgs['GT.read_count']/n_mut) * 100
df_meta_wgs['GT.read'] = df_meta_wgs['GT.read'].apply(format_to_three_significant_digits)

df_prob = df_prob_subset
df_meta = df_meta_subset

# Two-letter tick labels for tumor subsites; left/right variants share a label.
dict_ovarian_tumor_subsite = {
    'Right Diaphragm': 'DP',
    'Left Diaphragm': 'DP',

    'Bowel': 'BW',

    'Infracolic Omentum': 'IO',

    'Pelvic Peritoneum': 'PP',

    'Right Paracolic Gutter': 'PG',

    'Ascites': 'AS',

    'Left Adnexa': 'AX',
    'Right Adnexa': 'AX',

    'Right Ovary': 'OV',
    'Left Ovary': 'OV',
}

# Two-letter tick labels for the WGS 'Tumor' flag.
dict_ovarian_tumor_yesno = {
    'Yes': 'YS',
    'No': 'NO',
}

### 3.3.2. pie charts

uncomment the following code block to save figures

In [None]:
# col_id_ind = 'patient_id'
# col_id_sample = 'ID'
# list_meta_imp = ['tumor_supersite', 'tumor_subsite', 'GT.read']
# df, n_sample_subset = get_data_barh(df_prob = df_prob, df_meta = df_meta,
#                                     col_id_ind = col_id_ind, col_id_sample = col_id_sample, 
#                                     list_meta_imp = list_meta_imp)

# for index, row in df.iterrows():
#     plot_pie_chart(index, row, 'figure5_ovarian', size_pie = 0.9, size_font = 5, 
#                    bool_save_plot = True, path_plot = '../figure/figure5')

### 3.3.3. horizontally stacked plots

uncomment the following code blocks to save figures

#### 3.3.3.1. everyone

In [None]:
# col_id_ind = 'patient_id'
# col_id_sample = 'ID'
# list_meta_imp = ['tumor_supersite', 'tumor_subsite', 'GT.read']
# vec_interest = df_meta['patient_id'].values

# for index_pt in range(len(df_meta[col_id_ind].unique())):
#     df_prob_plot, df_meta_plot, id_patient_plot = get_df_prob(df_prob, df_meta, 
#                                                               col_id_ind, col_id_sample, list_meta_imp, 
#                                                               index_pt = index_pt)
#     if(id_patient_plot in vec_interest):
#         # df_meta_plot['tumor_subsite_rename'] = [dict_ovarian_tumor_subsite[i] for i in df_meta_plot['tumor_subsite']]
#         get_barhplot(df_prob_plot.T, df_meta_plot, 
#                      #label_tick = 'tumor_subsite_rename',
#                      label_tick = 'tumor_subsite',
#                      bool_save_plot = True, 
#                      name_plot = 'figure5_ovarian_scRNAseq',
#                      name_plot_i = id_patient_plot,
#                      path_plot = '../figure/figure5/')
        
# col_id_ind = 'ID_PT'
# col_id_sample = 'ID'
# list_meta_imp = ['Tumor', 'GT.read']

# for index_pt in range(len(df_meta_wgs[col_id_ind].unique())):
#     df_prob_plot, df_meta_plot, id_patient_plot = get_df_prob(df_prob_wgs, df_meta_wgs, 
#                                                               col_id_ind, col_id_sample, list_meta_imp, 
#                                                               index_pt = index_pt)
#     if(id_patient_plot in vec_interest):
#         df_meta_plot['Tumor_rename'] = [dict_ovarian_tumor_yesno[i] for i in df_meta_plot['Tumor']]
#         get_barhplot(df_prob_plot.T, df_meta_plot, 
#                      label_tick = 'Tumor_rename',
#                      bool_save_plot = True, 
#                      name_plot = 'figure5_ovarian_WGS',
#                      name_plot_i = id_patient_plot,
#                      path_plot = '../figure/figure5/')

#### 3.3.3.2. four individuals for figure 5

In [None]:
# col_id_ind = 'patient_id'
# col_id_sample = 'ID'
# list_meta_imp = ['tumor_supersite', 'tumor_subsite', 'GT.read']
# vec_interest = df_meta['patient_id'].values
# vec_interest = ['SPECTRUM-OV-081', 'SPECTRUM-OV-053', 
#                 'SPECTRUM-OV-116', 'SPECTRUM-OV-118']

# for index_pt in range(len(df_meta[col_id_ind].unique())):
#     df_prob_plot, df_meta_plot, id_patient_plot = get_df_prob(df_prob, df_meta, 
#                                                               col_id_ind, col_id_sample, list_meta_imp, 
#                                                               index_pt = index_pt)
#     if(id_patient_plot in vec_interest):
#         # df_meta_plot['tumor_subsite_rename'] = [dict_ovarian_tumor_subsite[i] for i in df_meta_plot['tumor_subsite']]
#         get_barhplot(df_prob_plot.T, df_meta_plot, 
#                      #label_tick = 'tumor_subsite_rename',
#                      label_tick = 'tumor_subsite',
#                      bool_save_plot = True, 
#                      name_plot = 'figure5_ovarian_scRNAseq',
#                      name_plot_i = id_patient_plot,
#                      path_plot = '../figure/figure5/')
        
# col_id_ind = 'ID_PT'
# col_id_sample = 'ID'
# list_meta_imp = ['Tumor', 'GT.read']

# for index_pt in range(len(df_meta_wgs[col_id_ind].unique())):
#     df_prob_plot, df_meta_plot, id_patient_plot = get_df_prob(df_prob_wgs, df_meta_wgs, 
#                                                               col_id_ind, col_id_sample, list_meta_imp, 
#                                                               index_pt = index_pt)
#     if(id_patient_plot in vec_interest):
#         df_meta_plot['Tumor_rename'] = [dict_ovarian_tumor_yesno[i] for i in df_meta_plot['Tumor']]
#         get_barhplot(df_prob_plot.T, df_meta_plot, 
#                      label_tick = 'Tumor_rename',
#                      bool_save_plot = True, 
#                      name_plot = 'figure5_ovarian_WGS',
#                      name_plot_i = id_patient_plot,
#                      path_plot = '../figure/figure5/')

## 3.4. scATACseq

### 3.4.1. data process

Run the following cell first: the plotting cells for this dataset use the variables it defines.

In [None]:
name_dataset = 'scATACseq'

df_prob = dict_prob[name_dataset]
df_meta = dict_meta[name_dataset]
df_count = dict_count[name_dataset]

# Distinct values of every metadata column, keyed by column name.
unique_values = {col: dict_meta[name_dataset][col].unique() for col in dict_meta[name_dataset].columns}

print(f"shape of prob: {dict_prob[name_dataset].shape}")
print(f"shape of meta: {dict_meta[name_dataset].shape}")
print(f"columns: {list(dict_meta[name_dataset].columns)}")
print(f"n_uniq per col: {[len(dict_meta[name_dataset][col].unique()) for col in dict_meta[name_dataset].columns]}")
print(f"types ethnicity: {unique_values['Ethnicity']}")
print(f"types tissue type: {unique_values['Tissue_type']}")
# [274, 41, 1, 7, 26, 2, 1, 1]  (presumably the n_uniq output from an earlier run)

# GT.read = GT.all - GT.NA, written onto the table stored in dict_count.
df_count['GT.read'] = df_count['GT.all'] - df_count['GT.NA']
# The first column of the count table is renamed to 'Sample' so it can serve
# as the merge key with the metadata.
df_count = df_count.rename(columns={df_count.columns[0]: 'Sample'})
df_meta = df_meta.merge(df_count, on = 'Sample', how = 'left')

df_meta.index = df_meta['Sample']
thres_GT = 20000
# Keep samples with at least thres_GT genotyped reads. .copy() gives an
# independent frame so the column assignments below do not write to a slice.
vec_index_scATAC_thres20000 = df_meta[df_meta['GT.read'] >= thres_GT].index
df_meta_subset = df_meta.loc[df_meta['Sample'].isin(vec_index_scATAC_thres20000)].copy()
# df_meta_subset = df_meta_subset.groupby('patient_id').filter(lambda x: len(x) > 1)

# Select the probability columns whose name is a retained sample.
# isin() compares against the Sample *values*; the former `i in Series` test
# checked the Series index and only worked because the index equals Sample.
df_prob_subset = df_prob.loc[:, df_prob.columns.isin(df_meta_subset['Sample'])]

# Keep the raw count, then overwrite 'GT.read' with the formatted percentage
# of n_mut.
df_meta_subset['GT.read_count'] = df_meta_subset['GT.read'].copy()
df_meta_subset['GT.read'] = (df_meta_subset['GT.read_count']/n_mut) * 100
df_meta_subset['GT.read'] = df_meta_subset['GT.read'].apply(format_to_three_significant_digits)

df_prob = df_prob_subset
df_meta = df_meta_subset

### 3.4.2. pie charts

uncomment the following code block to save figures

In [None]:
# col_id_ind = 'Donor'
# col_id_sample = 'Sample'
# list_meta_imp = ['Tissue_type', 'GT.read']
# df, n_sample_subset = get_data_barh(df_prob = df_prob, df_meta = df_meta,
#                                     col_id_ind = col_id_ind, col_id_sample = col_id_sample, 
#                                     list_meta_imp = list_meta_imp)

# for index, row in df.iterrows():
#     plot_pie_chart(index, row, 
#                    'figure6_scATAC', size_pie = 0.9, size_font = 5, bool_save_plot = True, 
#                    path_plot = '../figure/figure6/')

### 3.4.3. horizontally stacked bars

uncomment the following code blocks to save figures

In [None]:
# Settings read as globals by get_barhplot_scATACseq:
# individual / sample id columns, metadata columns to carry along, and the
# minimum number of genotyped reads.
col_id_ind, col_id_sample = 'Donor', 'Sample'
thres_GT = 20000
list_meta_imp = ['Tissue_type', 'Age', 'Sex', 'Ethnicity', 'GT.read']

def get_barhplot_scATACseq(index_pt, name_plot, size_plot_y = size_onecolumn/2):
    """Draw and save the horizontally stacked bar plot for one scATACseq donor.

    Reads the module-level `df_prob`, `df_meta`, `col_id_ind`,
    `col_id_sample`, `list_meta_imp`, `thres_GT` and `n_mut`.

    Parameters
    ----------
    index_pt
        Forwarded to `get_df_prob` as `index_pt` (presumably the position of
        the donor among the unique `col_id_ind` values; confirm in helper_plot).
    name_plot
        Forwarded to `get_barhplot` as `name_plot_i`.
    size_plot_y
        Forwarded to `get_barhplot` as `size_plot_y`; defaults to half of
        `size_onecolumn`.

    Returns
    -------
    None. The figure is produced by `get_barhplot`, which is called with
    `bool_save_plot = True` and `path_plot = '../figure/figure6/'`.
    """
    df_prob_plot, df_meta_plot, id_patient_plot = get_df_prob(df_prob, df_meta,
                                                            col_id_ind, col_id_sample, list_meta_imp,
                                                            index_pt = index_pt)

    # 'GT.read' is compared against thres_GT expressed as a percentage of
    # n_mut; to_numeric converts the stored values first (they may be
    # formatted strings; confirm against format_to_three_significant_digits).
    mask_pass = pd.to_numeric(df_meta_plot['GT.read']) > (thres_GT / n_mut * 100)
    list_pass_thres = list(df_meta_plot[mask_pass].index)

    # Vectorised label membership replaces per-element list lookups.
    # .copy() so the 'Tissue_type' assignment below does not write to a slice.
    df_meta_plot = df_meta_plot.loc[df_meta_plot.index.isin(list_pass_thres), :].copy()
    # Columns of the probability table are the sample labels; no transpose is
    # needed to read them.
    df_prob_plot = df_prob_plot.loc[:, df_prob_plot.columns.isin(list_pass_thres)]
    df_meta_plot['Tissue_type'] = [title_case(i) for i in df_meta_plot['Tissue_type']]

    get_barhplot(df_prob_plot.T, df_meta_plot,
                label_tick = 'Tissue_type',
                min_percent_visible_pop = 0.09,
                bool_sort = True,
                type_GTread = "percent",
                bool_save_plot = True,
                size_plot_x = size_onecolumn,
                size_plot_y = size_plot_y,
                name_plot = 'figure6_scATACseq',
                name_plot_i = name_plot,
                path_plot = '../figure/figure6/'
                )

In [None]:
# get_barhplot_scATACseq(0, 'donor1_white', size_plot_y = size_onecolumn/2)
# get_barhplot_scATACseq(1, 'donor2_white', size_plot_y = size_onecolumn/1.2)
# get_barhplot_scATACseq(2, 'donor3_asian', size_plot_y = size_onecolumn)
# get_barhplot_scATACseq(3, 'donor4_asian', size_plot_y = size_onecolumn/0.6)