## 1. Read Files From Raw Files (Clinical data, Transcriptomic data, Methylomics data, Metabolomics data)

### 1.1 Clinical Data

In [None]:
import pandas as pd 
# Load the 'Phenodata' sheet of the clinical workbook and order the rows by subject
phenodata_df = pd.read_excel(
    './data/pheno_data/LLFS_phenos_21JUN2022.xlsx',
    sheet_name='Phenodata',
)
phenodata_df = phenodata_df.sort_values(by='subject')
# Subject identifiers are kept as strings from here on
phenodata_df['subject'] = phenodata_df['subject'].astype(str)
display(phenodata_df)

In [None]:
### read t2ds label data
import numpy as np
# Read the label table; every '.' entry is replaced by 0 before the integer cast below
t2ds_label_df = (
    pd.read_table('./data/label_data/t2dpret2d.txt')
    .replace('.', 0)
)
t2ds_label_df['pret2ds'] = t2ds_label_df['pret2ds'].astype(np.int64)
# Order by subject, then store the subject identifiers as strings
t2ds_label_df = t2ds_label_df.sort_values(by='subject')
t2ds_label_df['subject'] = t2ds_label_df['subject'].astype(str)
print(t2ds_label_df.dtypes)
display(t2ds_label_df)

### 1.2 Transcriptomic Data

In [None]:
# Load the visit-1 RNA-seq residuals, ordered by subject
tran_v1_df = pd.read_csv('./data/omics_data/residuals/RNA_seq_residuals_v1_allsubjects.csv').sort_values(by='subject')
display(tran_v1_df)
# Transpose so that the original columns become rows
tran_v1_df_transposed = tran_v1_df.T
# Use the first transposed row as the header (assumes 'subject' is the first CSV column -- TODO confirm).
# The values are stringified and only a *trailing* '.0' is stripped: the anchored regex avoids
# mangling a label that merely contains '.0' somewhere in the middle.
tran_v1_df_transposed.columns = tran_v1_df_transposed.iloc[0].astype(str).str.replace(r'\.0$', '', regex=True)
# Drop the first row as it is now the header
tran_v1_df_transposed = tran_v1_df_transposed.drop(tran_v1_df_transposed.index[0])
# Move the row labels into a regular column
tran_v1_df = tran_v1_df_transposed.reset_index()
# Name that column 'gene_id'
tran_v1_df = tran_v1_df.rename(columns={'index': 'gene_id'})
# Strip the version suffix from each gene ID (keep only the text before the first '.')
ensembl_gene_ids = tran_v1_df['gene_id'].apply(lambda x: x.split('.')[0]).tolist()
tran_v1_df['gene_id'] = ensembl_gene_ids
display(tran_v1_df)

In [None]:
# Load the Ensembl export and normalise it: rename the ID column, drop incomplete and duplicated rows
ensembl_data = pd.read_csv('./data/kg_data/ensembl/mart_export_genename.txt')
ensembl_data = ensembl_data.rename(columns={'Gene stable ID': 'gene_id'})
ensembl_data = ensembl_data.dropna().drop_duplicates().reset_index(drop=True)
display(ensembl_data)
# Rename 'Gene name' to 'gene_name'
ensembl_data = ensembl_data.rename(columns={'Gene name': 'gene_name'})
# Attach gene names to the expression rows; only gene IDs present in both tables survive
merged_tran_v1_df = tran_v1_df.merge(ensembl_data, on='gene_id', how='inner')
# The gene ID is no longer needed once the name is attached
merged_tran_v1_df = merged_tran_v1_df.drop(columns=['gene_id'])
# Put the gene name in the first column
remaining_columns = [column for column in merged_tran_v1_df.columns if column != 'gene_name']
merged_tran_v1_df = merged_tran_v1_df[['gene_name'] + remaining_columns]
# Collapse rows that share a gene name by averaging them
merged_tran_v1_df = merged_tran_v1_df.groupby(['gene_name']).mean().reset_index()
display(merged_tran_v1_df)

### 1.3 Methylation Data

In [None]:
def _read_methylation_region(csv_path):
    """Read one methylation region table and return it plus a column-sorted view.

    The table is sorted by 'gene_name' with a fresh index. The second returned
    frame has 'gene_name' first, followed by every column after the first one,
    ordered by integer value (assumes 'gene_name' is the first column of the
    CSV and all other column labels parse as integers -- TODO confirm).
    """
    region_df = pd.read_csv(csv_path).sort_values(by='gene_name').reset_index(drop=True)
    numeric_columns = sorted(region_df.columns[1:], key=int)
    return region_df, region_df[['gene_name'] + numeric_columns]


core_promoter_df, core_promoter_sorted_df = _read_methylation_region(
    './data/omics_data/epigenomics/Core_Promoter_final.csv')
distal_promoter_df, distal_promoter_sorted_df = _read_methylation_region(
    './data/omics_data/epigenomics/Distal_Promoter_final.csv')
downstream_df, downstream_sorted_df = _read_methylation_region(
    './data/omics_data/epigenomics/Downstream_final.csv')
proximal_promoter_df, proximal_promoter_sorted_df = _read_methylation_region(
    './data/omics_data/epigenomics/Proximal_Promoter_final.csv')
upstream_df, upstream_sorted_df = _read_methylation_region(
    './data/omics_data/epigenomics/Upstream_final.csv')
display(upstream_sorted_df)

In [None]:
def _mean_by_ensembl_gene(region_sorted_df):
    """Keep the rows whose gene name appears in ensembl_data and average rows sharing a gene name.

    The region table is inner-joined with ensembl_data on 'gene_name', the
    'gene_id' column is dropped, and the remaining columns are averaged per gene name.
    """
    named_df = region_sorted_df.merge(ensembl_data, on='gene_name', how='inner')
    named_df = named_df.drop(columns=['gene_id'])
    return named_df.groupby(['gene_name']).mean().reset_index()


merged_core_promoter_df = _mean_by_ensembl_gene(core_promoter_sorted_df)
merged_distal_promoter_df = _mean_by_ensembl_gene(distal_promoter_sorted_df)
merged_downstream_df = _mean_by_ensembl_gene(downstream_sorted_df)
merged_proximal_promoter_df = _mean_by_ensembl_gene(proximal_promoter_sorted_df)
merged_upstream_df = _mean_by_ensembl_gene(upstream_sorted_df)
display(merged_upstream_df)

## 2. Gene and Sample Intersection

### 2.1 Gene Intersection Over Omics Data

In [None]:
# Gene names present in both the transcriptomic table and the core-promoter table
merged_tran_v1_gene = set(merged_tran_v1_df['gene_name'])
merged_core_promoter_gene = set(merged_core_promoter_df['gene_name'])
intersected_gene = sorted(merged_tran_v1_gene.intersection(merged_core_promoter_gene))
print(intersected_gene)
print('intersected_gene:', len(intersected_gene))

In [None]:
# Restrict every omics table to the intersected gene names and renumber the rows
(
    merged_tran_v1_df,
    merged_core_promoter_df,
    merged_distal_promoter_df,
    merged_downstream_df,
    merged_proximal_promoter_df,
    merged_upstream_df,
) = [
    omics_df[omics_df['gene_name'].isin(intersected_gene)].reset_index(drop=True)
    for omics_df in (
        merged_tran_v1_df,
        merged_core_promoter_df,
        merged_distal_promoter_df,
        merged_downstream_df,
        merged_proximal_promoter_df,
        merged_upstream_df,
    )
]
display(merged_tran_v1_df)
display(merged_upstream_df)

#### 2.1.1 Regulatory Network

In [None]:
# KEGG: build an upper-cased edge list (with and without pathway name) from the signaling pathways
kegg_pathway_df = pd.read_csv('./data/kg_data/KEGG/full_kegg_pathway_list.csv')[['source', 'target', 'pathway_name']]
is_signaling_pathway = kegg_pathway_df['pathway_name'].str.contains('signaling pathway|signaling pathways', case=False)
kegg_df = kegg_pathway_df[is_signaling_pathway]
print(kegg_df['pathway_name'].value_counts())
kegg_df = kegg_df.rename(columns={'source': 'src', 'target': 'dest'})
src_list = kegg_df['src'].tolist()
dest_list = kegg_df['dest'].tolist()
path_list = kegg_df['pathway_name'].tolist()
# Gene symbols are compared in upper case
up_src_list = [gene.upper() for gene in src_list]
up_dest_list = [gene.upper() for gene in dest_list]

# Edge list without pathway information
up_kegg_conn_dict = {'src': up_src_list, 'dest': up_dest_list}
up_kegg_df = pd.DataFrame(up_kegg_conn_dict).drop_duplicates()
up_kegg_df.to_csv('./data/kg_data/KEGG/up_kegg.csv', index=False, header=True)
kegg_gene_list = list(set(up_kegg_df['src'].tolist() + up_kegg_df['dest'].tolist()))
print('----- NUMBER OF GENES IN KEGG: ' + str(len(kegg_gene_list)) + ' -----')
print(up_kegg_df.shape)

# Edge list that also carries the pathway name
up_kegg_path_conn_dict = {'src': up_src_list, 'dest': up_dest_list, 'path': path_list}
up_kegg_path_df = pd.DataFrame(up_kegg_path_conn_dict).drop_duplicates()
up_kegg_path_df.to_csv('./data/kg_data/KEGG/up_kegg_path.csv', index=False, header=True)
kegg_path_gene_list = list(set(up_kegg_path_df['src'].tolist() + up_kegg_path_df['dest'].tolist()))
print('----- NUMBER OF GENES IN KEGG PATH: ' + str(len(kegg_path_gene_list)) + ' -----')
print(up_kegg_path_df.shape)

In [None]:
# BioGRID: build an upper-cased, de-duplicated edge list from the 'e_h' / 'e_t' columns
biogrid_df = pd.read_table('./data/kg_data/BioGrid/BIOGRID-ALL-3.5.174.mitab.Symbol.txt', sep='\t')
eh_list = biogrid_df['e_h'].tolist()
et_list = biogrid_df['e_t'].tolist()
# Gene symbols are compared in upper case
up_eh_list = [gene.upper() for gene in eh_list]
up_et_list = [gene.upper() for gene in et_list]
up_biogrid_conn_dict = {'src': up_eh_list, 'dest': up_et_list}
up_biogrid_df = pd.DataFrame(up_biogrid_conn_dict).drop_duplicates()
print(up_biogrid_df)
up_biogrid_df.to_csv('./data/kg_data/BioGrid/up_biogrid.csv', index=False, header=True)
up_biogrid_gene_list = list(set(up_biogrid_df['src'].tolist() + up_biogrid_df['dest'].tolist()))
print('----- NUMBER OF GENES IN BioGRID: ' + str(len(up_biogrid_gene_list)) + ' -----')
print(up_biogrid_df.shape)

In [None]:
# STRING: build an upper-cased, de-duplicated edge list from the 'Source' / 'Target' columns
string_df = pd.read_csv('./data/kg_data/STRING/9606.protein.links.detailed.v11.0_sym.csv', low_memory=False)
src_list = string_df['Source'].tolist()
tar_list = string_df['Target'].tolist()
# Gene symbols are compared in upper case
up_src_list = [gene.upper() for gene in src_list]
up_tar_list = [gene.upper() for gene in tar_list]
up_string_conn_dict = {'src': up_src_list, 'dest': up_tar_list}
up_string_df = pd.DataFrame(up_string_conn_dict).drop_duplicates()
print(up_string_df)
up_string_df.to_csv('./data/kg_data/STRING/up_string.csv', index=False, header=True)
up_string_gene_list = list(set(up_string_df['src'].tolist() + up_string_df['dest'].tolist()))
print('----- NUMBER OF GENES IN STRING: ' + str(len(up_string_gene_list)) + ' -----')

In [None]:
# Intersect the omics genes with the genes of ONE knowledge base [KEGG / BioGRID / STRING],
# then keep only the edges whose two endpoints are both among those genes.
# Only the tables of the selected database are read, so the cells for the other two may be skipped.
selected_database = 'KEGG'
# selected_database = 'BioGRID'
# selected_database = 'STRING'
intersected_omics_gene_list = list(merged_tran_v1_df['gene_name'])
if selected_database == 'KEGG':
    edge_common_genes = list(set(intersected_omics_gene_list) & set(kegg_gene_list))
elif selected_database == 'BioGRID':
    edge_common_genes = list(set(intersected_omics_gene_list) & set(up_biogrid_gene_list))
elif selected_database == 'STRING':
    edge_common_genes = list(set(intersected_omics_gene_list) & set(up_string_gene_list))
else:
    # Fail loudly: falling through would leave edge_common_genes / all_edge_gene_list
    # undefined, or silently reuse stale values from an earlier run of this cell.
    raise ValueError(
        "Unknown selected_database: " + repr(selected_database)
        + " (expected 'KEGG', 'BioGRID' or 'STRING')"
    )

# Filter the edges of the selected database with the common genes.
# After this step, edge_common_genes / all_edge_gene_list hold the sorted genes that
# actually appear on at least one retained edge.
if selected_database == 'KEGG':
    filtered_up_kegg_df = up_kegg_df[up_kegg_df['src'].isin(edge_common_genes) & up_kegg_df['dest'].isin(edge_common_genes)]
    src_list = list(filtered_up_kegg_df['src'])
    dest_list = list(filtered_up_kegg_df['dest'])
    all_edge_gene_list = sorted(set(src_list + dest_list))
    print('----- NUMBER OF INTERSECTED GENES IN KEGG: ' + str(len(all_edge_gene_list)) + ' -----')
    edge_common_genes = all_edge_gene_list
    filtered_up_kegg_df = filtered_up_kegg_df.drop_duplicates()
    filtered_up_kegg_df = filtered_up_kegg_df.sort_values(by=['src', 'dest']).reset_index(drop=True)
    print('----- NEW KEGG EDGE CONNECTIONS: ' + str(len(filtered_up_kegg_df)) + ' -----')
    # Same filter for the edge list that carries pathway names, using the reduced gene list
    filtered_up_kegg_path_df = up_kegg_path_df[up_kegg_path_df['src'].isin(edge_common_genes) & up_kegg_path_df['dest'].isin(edge_common_genes)]
    filtered_up_kegg_path_df = filtered_up_kegg_path_df.drop_duplicates()
    filtered_up_kegg_path_df = filtered_up_kegg_path_df.sort_values(by=['src', 'dest']).reset_index(drop=True)
    print('----- NEW KEGG PATHWAY CONNECTIONS: ' + str(len(filtered_up_kegg_path_df)) + ' -----')
elif selected_database == 'BioGRID':
    filtered_up_biogrid_df = up_biogrid_df[up_biogrid_df['src'].isin(edge_common_genes) & up_biogrid_df['dest'].isin(edge_common_genes)]
    src_list = list(filtered_up_biogrid_df['src'])
    dest_list = list(filtered_up_biogrid_df['dest'])
    all_edge_gene_list = sorted(set(src_list + dest_list))
    print('----- NUMBER OF INTERSECTED GENES IN BioGRID: ' + str(len(all_edge_gene_list)) + ' -----')
    edge_common_genes = all_edge_gene_list
    filtered_up_biogrid_df = filtered_up_biogrid_df.drop_duplicates()
    filtered_up_biogrid_df = filtered_up_biogrid_df.sort_values(by=['src', 'dest']).reset_index(drop=True)
    print('----- NEW BioGRID EDGE CONNECTIONS: ' + str(len(filtered_up_biogrid_df)) + ' -----')
elif selected_database == 'STRING':
    filtered_up_string_df = up_string_df[up_string_df['src'].isin(edge_common_genes) & up_string_df['dest'].isin(edge_common_genes)]
    src_list = list(filtered_up_string_df['src'])
    dest_list = list(filtered_up_string_df['dest'])
    all_edge_gene_list = sorted(set(src_list + dest_list))
    print('----- NUMBER OF INTERSECTED GENES IN STRING: ' + str(len(all_edge_gene_list)) + ' -----')
    edge_common_genes = all_edge_gene_list
    filtered_up_string_df = filtered_up_string_df.drop_duplicates()
    filtered_up_string_df = filtered_up_string_df.sort_values(by=['src', 'dest']).reset_index(drop=True)
    print('----- NEW STRING EDGE CONNECTIONS: ' + str(len(filtered_up_string_df)) + ' -----')

In [None]:
# Keep, in every omics table, only the genes listed in all_edge_gene_list; rows are renumbered
merged_tran_v1_df = merged_tran_v1_df.loc[
    lambda frame: frame['gene_name'].isin(all_edge_gene_list)].reset_index(drop=True)
merged_core_promoter_df = merged_core_promoter_df.loc[
    lambda frame: frame['gene_name'].isin(all_edge_gene_list)].reset_index(drop=True)
merged_distal_promoter_df = merged_distal_promoter_df.loc[
    lambda frame: frame['gene_name'].isin(all_edge_gene_list)].reset_index(drop=True)
merged_downstream_df = merged_downstream_df.loc[
    lambda frame: frame['gene_name'].isin(all_edge_gene_list)].reset_index(drop=True)
merged_proximal_promoter_df = merged_proximal_promoter_df.loc[
    lambda frame: frame['gene_name'].isin(all_edge_gene_list)].reset_index(drop=True)
merged_upstream_df = merged_upstream_df.loc[
    lambda frame: frame['gene_name'].isin(all_edge_gene_list)].reset_index(drop=True)
display(merged_tran_v1_df)
display(merged_upstream_df)

#### 2.1.2 Important GWAS Genes

In [None]:
# Overlap between the 'Locus' entries of the tab-separated GWAS table and the retained genes
t6_df = pd.read_csv('./data/gwas_data/t6.txt', sep='\t')
t6_genes = list(t6_df['Locus'])
gene_names_list = list(merged_tran_v1_df['gene_name'])
t6_common_genes = list(set(gene_names_list).intersection(set(t6_genes)))
print('t6_common_genes:', len(t6_common_genes))
print(t6_common_genes)

### 2.2 Sample Intersection

In [None]:
# Subjects present in the clinical table, the label table and both omics tables
subject_list = list(phenodata_df['subject'])
t2ds_label_subject_list = list(t2ds_label_df['subject'])
# Omics tables carry 'gene_name' first, then one column per subject
merged_tran_v1_subject_list = merged_tran_v1_df.columns[1:].tolist()
merged_upstream_subject_list = merged_upstream_df.columns[1:].tolist()
intersected_subject_list = sorted(
    set(subject_list).intersection(
        t2ds_label_subject_list,
        merged_tran_v1_subject_list,
        merged_upstream_subject_list,
    )
)
# The IDs are strings, so re-sort them by their integer value
intersected_subject_list = sorted(intersected_subject_list, key=int)
print('Number of intersected subjects:', len(intersected_subject_list))

In [None]:
# Restrict the row-wise tables (clinical, labels) to the intersected subjects
phenodata_df = phenodata_df.loc[phenodata_df['subject'].isin(intersected_subject_list)].reset_index(drop=True)
t2ds_label_df = t2ds_label_df.loc[t2ds_label_df['subject'].isin(intersected_subject_list)].reset_index(drop=True)
# Restrict the column-wise omics tables to 'gene_name' plus the intersected subjects, in that order
kept_columns = ['gene_name'] + intersected_subject_list
merged_tran_v1_df = merged_tran_v1_df[kept_columns]
merged_core_promoter_df = merged_core_promoter_df[kept_columns]
merged_distal_promoter_df = merged_distal_promoter_df[kept_columns]
merged_downstream_df = merged_downstream_df[kept_columns]
merged_proximal_promoter_df = merged_proximal_promoter_df[kept_columns]
merged_upstream_df = merged_upstream_df[kept_columns]
display(phenodata_df)
display(t2ds_label_df)
display(merged_tran_v1_df)
display(merged_upstream_df)

### 2.3 Statistical Analysis on Samples

In [None]:
### Split the subjects into the three label groups
t2ds_df = t2ds_label_df.loc[t2ds_label_df['t2ds'] == 1, 'subject']
pret2ds_df = t2ds_label_df.loc[t2ds_label_df['pret2ds'] == 1, 'subject']
no_t2ds_df = t2ds_label_df.loc[(t2ds_label_df['t2ds'] != 1) & (t2ds_label_df['pret2ds'] != 1), 'subject']

# List and set form of every group
t2ds_list = t2ds_df.tolist()
t2ds_set = set(t2ds_list)
pret2ds_list = pret2ds_df.tolist()
pret2ds_set = set(pret2ds_list)
no_t2ds_list = no_t2ds_df.tolist()
no_t2ds_set = set(no_t2ds_list)

# Report each pair of groups that shares no subject
t2ds_pret2ds_intersection = t2ds_set & pret2ds_set
if not t2ds_pret2ds_intersection:
    print('No intersections of t2ds and pret2ds')
t2ds_no_t2ds_intersection = t2ds_set & no_t2ds_set
if not t2ds_no_t2ds_intersection:
    print('No intersections of t2ds and no_t2ds')
pret2ds_no_t2ds_intersection = pret2ds_set & no_t2ds_set
if not pret2ds_no_t2ds_intersection:
    print('No intersections of pret2ds and no_t2ds')

#### 2.3.1 Cleaning clinical data

In [None]:
### Cleaning clinical data [phenodata_df]
# Keep the clinical rows that have a label row, then discard the label columns themselves
label_phenodata_df = phenodata_df.merge(t2ds_label_df, left_on='subject', right_on='subject', how='inner')
label_phenodata_df = label_phenodata_df.drop(columns=['t2ds', 'pret2ds'])
label_phenodata_df = label_phenodata_df.dropna(subset=['id'])

# Group the columns by the tag found in their name.
# NOTE(review): the tags are matched as substrings ('_v1' in col), not strictly as suffixes,
# while the "real name" lists strip them only when they are a suffix -- confirm that no column
# contains a tag in the middle of its name.
label_phenodata_df_col_name_list = list(label_phenodata_df.columns)
print(len(label_phenodata_df_col_name_list))
print(label_phenodata_df_col_name_list)
# v1 features
label_phenodata_df_v1_col_name_list = [col for col in label_phenodata_df_col_name_list if '_v1' in col]
print('--- Number of v1: ', len(label_phenodata_df_v1_col_name_list))
label_phenodata_df_v1_col_realname_list = [name.removesuffix('_v1') for name in label_phenodata_df_v1_col_name_list]
print(label_phenodata_df_v1_col_realname_list)
# v2 features
label_phenodata_df_v2_col_name_list = [col for col in label_phenodata_df_col_name_list if '_v2' in col]
print('--- Number of v2: ', len(label_phenodata_df_v2_col_name_list))
label_phenodata_df_v2_col_realname_list = [name.removesuffix('_v2') for name in label_phenodata_df_v2_col_name_list]
print(label_phenodata_df_v2_col_realname_list)
# gc [growth curve model] features
label_phenodata_df_gc_col_name_list = [col for col in label_phenodata_df_col_name_list if '_gc' in col]
print('--- Number of gc: ', len(label_phenodata_df_gc_col_name_list))
label_phenodata_df_gc_col_realname_list = [name.removesuffix('_gc') for name in label_phenodata_df_gc_col_name_list]
print(label_phenodata_df_gc_col_realname_list)
# ns [naive slope (simple regression)] features
label_phenodata_df_ns_col_name_list = [col for col in label_phenodata_df_col_name_list if '_ns' in col]
print('--- Number of ns: ', len(label_phenodata_df_ns_col_name_list))
label_phenodata_df_ns_col_realname_list = [name.removesuffix('_ns') for name in label_phenodata_df_ns_col_name_list]
print(label_phenodata_df_ns_col_realname_list)
# Columns that carry none of the four tags
v1_v2_gc_ns_list = label_phenodata_df_v1_col_name_list + label_phenodata_df_v2_col_name_list +\
                         label_phenodata_df_gc_col_name_list + label_phenodata_df_ns_col_name_list
not_known_col_name_list = [name for name in label_phenodata_df_col_name_list if name not in v1_v2_gc_ns_list]
print('--- Number of not known features: ', len(not_known_col_name_list))
print(not_known_col_name_list)
# Features measured at both v1 and v2
v1_v2_intersection_set = set(label_phenodata_df_v1_col_realname_list).intersection(set(label_phenodata_df_v2_col_realname_list))
print('--- Number of intersected features of v1 & v2: ', len(v1_v2_intersection_set))
print(v1_v2_intersection_set)

# Keep only the untagged columns and the first-visit [v1] columns, minus a fixed list of redundant features
label_phenodata_df_id_v1_col_name_list = not_known_col_name_list + label_phenodata_df_v1_col_name_list
redundant_feat_list = ['teststrnz_invn_v1', 'srage_logz_invn_v1', 'il6_logz_invn_v1', 'tg_logz_v1', 'ldlz_v1', 'igf1_invnz_v1', 'hdlz_v1', 'hba1cz_v1', 'glucz_v1', 'dhea_logz_v1', '_ins_logz_v1', 'A1Cz_v1']
label_phenodata_df_id_v1_col_name_list = [item for item in label_phenodata_df_id_v1_col_name_list if item not in redundant_feat_list]
print(label_phenodata_df_id_v1_col_name_list)
# Take an explicit copy: columns are assigned below, and writing into a frame obtained by
# column-selecting another frame triggers pandas' SettingWithCopyWarning.
v1_label_phenodata_df = label_phenodata_df[label_phenodata_df_id_v1_col_name_list].copy()

# Data imputation: replace the NaN values of each affected column by that column's mean
v1_label_phenodata_nan_col_list = v1_label_phenodata_df.columns[v1_label_phenodata_df.isna().any()].tolist()
for nan_col_name in v1_label_phenodata_nan_col_list:
    v1_label_phenodata_df[nan_col_name] = v1_label_phenodata_df[nan_col_name].fillna(v1_label_phenodata_df[nan_col_name].mean())

# Report whether any NaN value is left
if v1_label_phenodata_df.isnull().values.any():
    print('\n--- DataFrame contains NaN values ---\n')
else:
    print('\n--- DataFrame does not contain NaN values ---\n')


In [None]:
# Inspect the cleaned clinical frame: preview (at most 10 rows shown), shape, distinct values per column
pd.options.display.max_rows = 10
display(v1_label_phenodata_df)
print(v1_label_phenodata_df.shape)
print(v1_label_phenodata_df.nunique())

#### 2.3.2 Statistical analysis for clinical features

In [None]:
### Stat analysis for clinical features
# Split the cleaned clinical frame into the three label groups
t2ds_phenodata_df = v1_label_phenodata_df[v1_label_phenodata_df['subject'].isin(t2ds_list)].reset_index(drop=True)
display(t2ds_phenodata_df)
pret2ds_phenodata_df = v1_label_phenodata_df[v1_label_phenodata_df['subject'].isin(pret2ds_list)].reset_index(drop=True)
display(pret2ds_phenodata_df)
no_t2ds_phenodata_df = v1_label_phenodata_df[v1_label_phenodata_df['subject'].isin(no_t2ds_list)].reset_index(drop=True)
display(no_t2ds_phenodata_df)

from scipy.stats import ks_2samp
from scipy.stats import mannwhitneyu
# One p-value per tested feature, for each pair of groups
p_value_ks_t2ds_pret2ds_list = []
p_value_ks_t2ds_no_t2ds_list = []
p_value_ks_pret2ds_no_t2ds_list = []
# Skip the first five columns (presumably the identifier columns plus 'sex' -- TODO confirm against the column list printed below)
label_phenodata_df_rvid_v1_col_name_list = label_phenodata_df_id_v1_col_name_list[5:]
print(label_phenodata_df_id_v1_col_name_list)
print(label_phenodata_df_rvid_v1_col_name_list)
for col_name in label_phenodata_df_rvid_v1_col_name_list:
    t2ds_feature_list = list(t2ds_phenodata_df[col_name])
    pret2ds_feature_list = list(pret2ds_phenodata_df[col_name])
    no_t2ds_feature_list = list(no_t2ds_phenodata_df[col_name])
    # Two-sample Kolmogorov-Smirnov test for every pair of groups
    # [t2ds / pret2ds]
    ks_stat_t2ds_pret2ds, p_value_ks_t2ds_pret2ds = ks_2samp(t2ds_feature_list, pret2ds_feature_list)
    p_value_ks_t2ds_pret2ds_list.append(p_value_ks_t2ds_pret2ds)
    # [t2ds / no_t2ds]
    ks_stat_t2ds_no_t2ds, p_value_ks_t2ds_no_t2ds = ks_2samp(t2ds_feature_list, no_t2ds_feature_list)
    p_value_ks_t2ds_no_t2ds_list.append(p_value_ks_t2ds_no_t2ds)
    # [pret2ds / no_t2ds]
    ks_stat_pret2ds_no_t2ds, p_value_ks_pret2ds_no_t2ds = ks_2samp(pret2ds_feature_list, no_t2ds_feature_list)
    p_value_ks_pret2ds_no_t2ds_list.append(p_value_ks_pret2ds_no_t2ds)

# Sanity check: the four lengths must be equal
print(len(label_phenodata_df_rvid_v1_col_name_list))
print(len(p_value_ks_t2ds_pret2ds_list))
print(len(p_value_ks_t2ds_no_t2ds_list))
print(len(p_value_ks_pret2ds_no_t2ds_list))

label_phenodata_rvid_v1_col_name_pvalue_df = pd.DataFrame({
    'features': label_phenodata_df_rvid_v1_col_name_list,
    't2ds_pret2ds_pvalue': p_value_ks_t2ds_pret2ds_list,
    't2ds_no_t2ds_pvalue': p_value_ks_t2ds_no_t2ds_list,
    'pret2ds_no_t2ds_pvalue': p_value_ks_pret2ds_no_t2ds_list
})
import os
# Create the output directory (and any missing parent); a no-op when it already exists
os.makedirs('./data/stat_data/', exist_ok=True)
label_phenodata_rvid_v1_col_name_pvalue_df.to_csv('./data/stat_data/label_phenodata_rvid_v1_col_name_pvalue.csv', index=False, header=True)

In [None]:
# Per-group mean of every clinical column after the first five (the original note says this removes [sex])
label_phenodata_df_rvid_v1_continous_name_list = label_phenodata_df_id_v1_col_name_list[5:]
(
    t2ds_phenodata_average_list,
    pret2ds_phenodata_average_list,
    no_t2ds_phenodata_average_list,
) = [
    group_df[label_phenodata_df_rvid_v1_continous_name_list].mean().tolist()
    for group_df in (t2ds_phenodata_df, pret2ds_phenodata_df, no_t2ds_phenodata_df)
]

In [None]:
# Show each list followed by its length (the two lengths are expected to match)
for reported_list in (p_value_ks_pret2ds_no_t2ds_list, label_phenodata_df_rvid_v1_continous_name_list):
    print(reported_list)
    print(len(reported_list))

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.colors import TwoSlopeNorm
import pandas as pd

# Grid of equally sized circles: one column per group comparison, one row per clinical feature,
# coloured by the Kolmogorov-Smirnov p-value
cmap = 'Oranges_r'
list1 = p_value_ks_t2ds_pret2ds_list
list2 = p_value_ks_t2ds_no_t2ds_list
list3 = p_value_ks_pret2ds_no_t2ds_list

# Axis labels and the grid position of every circle (column-major: all rows of column 0, then 1, then 2)
ylabels = label_phenodata_df_rvid_v1_continous_name_list
print(len(ylabels))
xlabels = ['T2ds vs Pre_T2ds', 'T2ds vs No_T2ds', 'Pre_T2ds vs No_T2ds']
xn = len(xlabels)
yn = len(ylabels)
ylabels_num_list = list(np.arange(0, len(ylabels))) * 3
xlabels_num_list = [column_index for column_index in range(3) for _ in range(len(ylabels))]

# p-values in the same column-major order as the positions above
all_p_values = list1 + list2 + list3
s = np.array(all_p_values)
c = np.array(all_p_values)

# Figure with one grid cell per (comparison, feature) pair
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_xlim(-0.5, xn - 0.5)
ax.set_ylim(-0.5, yn - 0.5)
ax.set_xticks(np.arange(xn))
ax.set_yticks(np.arange(yn))
ax.set_xticklabels(xlabels)
ax.set_yticklabels(ylabels)
# Minor ticks sit on the cell borders and carry the grid lines
ax.set_xticks(np.arange(xn) - 0.5, minor=True)
ax.set_yticks(np.arange(yn) - 0.5, minor=True)
plt.xticks(rotation=45, ha='right', fontsize=8)
ax.grid(which='minor')
# Equal aspect so that the circles are drawn as circles
ax.set_aspect("equal", "box")

# Constant radius; only the colour encodes the p-value (colour scale centred on 0.1)
R = [0.3] * len(s)
circles = [
    plt.Circle((x_pos, y_pos), radius=r)
    for x_pos, y_pos, r in zip(xlabels_num_list, ylabels_num_list, R)
]
norm = TwoSlopeNorm(vmin=0, vmax=1, vcenter=0.1)
col = PatchCollection(circles, array=c, cmap=cmap, norm=norm)
ax.add_collection(col)
fig.colorbar(col, shrink=0.2, aspect=10)

In [None]:
# Per-group mean of every clinical column after the first six (the original note says this removes [sex, age])
bar_label_phenodata_df_rvid_v1_continous_name_list = label_phenodata_df_id_v1_col_name_list[6:]
(
    bar_t2ds_phenodata_average_list,
    bar_pret2ds_phenodata_average_list,
    bar_no_t2ds_phenodata_average_list,
) = [
    group_df[bar_label_phenodata_df_rvid_v1_continous_name_list].mean().tolist()
    for group_df in (t2ds_phenodata_df, pret2ds_phenodata_df, no_t2ds_phenodata_df)
]

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Group averages (bar lengths) and feature names (bar labels)
x1 = np.array(bar_t2ds_phenodata_average_list)
x2 = np.array(bar_pret2ds_phenodata_average_list)
x3 = np.array(bar_no_t2ds_phenodata_average_list)
y1 = np.array(bar_label_phenodata_df_rvid_v1_continous_name_list)

# Three side-by-side panels that share the feature axis
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 10), sharey=True)
colors = ['C0', 'C1', 'C2']

# One horizontal bar chart per patient group
panels = zip(
    axes,
    (x1, x2, x3),
    colors,
    ('T2ds bar plot', 'Pre_T2ds bar plot', 'No_T2ds bar plot'),
)
for panel_ax, group_averages, bar_color, panel_title in panels:
    panel_ax.barh(y1, group_averages, color=bar_color)
    panel_ax.set_title(panel_title)

# Figure-level axis captions and title
fig.text(0.5, -0.04, 'Average Values for Each Type of Patient', ha='center')
fig.text(-0.01, 0.5, 'Visit 1 Features', va='center', rotation='vertical')
fig.suptitle('Three Classifications of Patients Bar Plots')

plt.tight_layout()

#### 2.3.3 Statistical analysis with Fold-Change for clinical features

In [None]:
import math

def calculate_log2_fold_change(sample1, sample2):
    """Return the log2 fold change from ``sample1`` to ``sample2``.

    Args:
        sample1: Baseline value (denominator of the ratio).
        sample2: Compared value (numerator of the ratio).

    Returns:
        ``math.log2(sample2 / sample1)``.

    Raises:
        ZeroDivisionError: If ``sample1`` is a plain Python number equal to zero.
        ValueError: If the ratio is zero or negative (``math.log2`` domain error).
    """
    fold_change = sample2 / sample1
    log2_fold_change = math.log2(fold_change)
    return log2_fold_change

def sample_log2_fold_change_comparison(list1, list2):
    """Compute, print and return the element-wise log2 fold changes of two lists.

    Elements are paired positionally with ``zip``, so surplus elements of the
    longer list are ignored.

    Args:
        list1: Baseline values.
        list2: Compared values.

    Returns:
        A list with ``log2(list2[i] / list1[i])`` for every pair.

    Raises:
        ZeroDivisionError, ValueError: Propagated from
            ``calculate_log2_fold_change`` for a zero baseline or a
            non-positive ratio.
    """
    log2_fold_change_list = [
        calculate_log2_fold_change(sample1, sample2)
        for sample1, sample2 in zip(list1, list2)
    ]

    print("Log2 fold changes:")
    # Iterate over the list built above (the previous code referenced an undefined name here)
    for i, log2_fc in enumerate(log2_fold_change_list):
        print(f"Sample {i+1}: {log2_fc}")

    return log2_fold_change_list

# # The fold-change comparisons below are disabled: they fail with a math domain error because some of the averaged values in these lists are negative
# t2ds_pret2ds_fold_change_list = sample_log2_fold_change_comparison(t2ds_phenodata_average_list, pret2ds_phenodata_average_list)
# t2ds_no_t2ds_fold_change_list = sample_log2_fold_change_comparison(t2ds_phenodata_average_list, no_t2ds_phenodata_average_list)
# pret2ds_no_t2ds_fold_change_list = sample_log2_fold_change_comparison(pret2ds_phenodata_average_list, no_t2ds_phenodata_average_list)

#### 2.3.4 Statistical analysis for transcriptomics data

In [None]:
def _subjects_as_rows(group_df):
    """Transpose a 'gene_name' + subject-columns frame so that each row is one subject.

    The first transposed row (the gene names) becomes the header and is removed
    from the data; the former column labels are moved out of the index into a
    regular column, and a column called 'index' is renamed to 'subject'.
    """
    flipped_df = group_df.T
    flipped_df.columns = flipped_df.iloc[0]
    flipped_df = flipped_df[1:]
    return flipped_df.reset_index(level=0).rename(columns={'index': 'subject'})


# T2DS subjects: select their columns, then flip to one row per subject
t2ds_merged_tran_v1_df = merged_tran_v1_df[['gene_name'] + t2ds_list]
display(t2ds_merged_tran_v1_df)
t2ds_merged_tran_v1_transposed_df = _subjects_as_rows(t2ds_merged_tran_v1_df)
display(t2ds_merged_tran_v1_transposed_df)

# Pre-T2DS subjects
pret2ds_merged_tran_v1_df = merged_tran_v1_df[['gene_name'] + pret2ds_list]
display(pret2ds_merged_tran_v1_df)
pret2ds_merged_tran_v1_transposed_df = _subjects_as_rows(pret2ds_merged_tran_v1_df)
display(pret2ds_merged_tran_v1_transposed_df)

# Subjects with neither label
no_t2ds_merged_tran_v1_df = merged_tran_v1_df[['gene_name'] + no_t2ds_list]
display(no_t2ds_merged_tran_v1_df)
no_t2ds_merged_tran_v1_transposed_df = _subjects_as_rows(no_t2ds_merged_tran_v1_df)
display(no_t2ds_merged_tran_v1_transposed_df)

In [None]:
from scipy.stats import ks_2samp

# Two-sample Kolmogorov-Smirnov test per gene for each pair of cohorts.
# NOTE(review): the p-values are stored as returned by ks_2samp; this cell
# applies no multiple-testing correction.

# Every column after the leading 'subject' column is a gene.
merged_tran_v1_col_name_list = list(t2ds_merged_tran_v1_transposed_df.columns)[1:]

p_value_ks_t2ds_pret2ds_list = []
p_value_ks_t2ds_no_t2ds_list = []
p_value_ks_pret2ds_no_t2ds_list = []
for col_name in merged_tran_v1_col_name_list:
    t2ds_feature_list = list(t2ds_merged_tran_v1_transposed_df[col_name])
    pret2ds_feature_list = list(pret2ds_merged_tran_v1_transposed_df[col_name])
    no_t2ds_feature_list = list(no_t2ds_merged_tran_v1_transposed_df[col_name])
    # Only the p-value of each pairwise comparison is kept.
    p_value_ks_t2ds_pret2ds_list.append(
        ks_2samp(t2ds_feature_list, pret2ds_feature_list).pvalue)      # [t2ds / pret2ds]
    p_value_ks_t2ds_no_t2ds_list.append(
        ks_2samp(t2ds_feature_list, no_t2ds_feature_list).pvalue)      # [t2ds / no_t2ds]
    p_value_ks_pret2ds_no_t2ds_list.append(
        ks_2samp(pret2ds_feature_list, no_t2ds_feature_list).pvalue)   # [pret2ds / no_t2ds]

# One row per gene, one column per cohort pair.
merged_transcriptomics_pvalue_df = pd.DataFrame({
    'gene_names': merged_tran_v1_col_name_list,
    't2ds_pret2ds_pvalue': p_value_ks_t2ds_pret2ds_list,
    't2ds_no_t2ds_pvalue': p_value_ks_t2ds_no_t2ds_list,
    'pret2ds_no_t2ds_pvalue': p_value_ks_pret2ds_no_t2ds_list
})
display(merged_transcriptomics_pvalue_df)
merged_transcriptomics_pvalue_df.to_csv('./data/stat_data/merged_transcriptomics_pvalue.csv', index=False, header=True)

#### 2.3.5 Statistical analysis for epigenomics data

In [None]:
### Core promoters: split the table by diagnosis group, then KS-test each gene
# Each slice is transposed so rows are subjects and columns are genes; the
# first transposed row (gene names) becomes the header.

# t2ds for core promoters
t2ds_merged_core_promoter_df = merged_core_promoter_df[['gene_name'] + t2ds_list] # [68 t2ds]
t2ds_merged_core_promoter_transposed_df = t2ds_merged_core_promoter_df.T
t2ds_merged_core_promoter_transposed_df.columns = t2ds_merged_core_promoter_transposed_df.iloc[0]
t2ds_merged_core_promoter_transposed_df = (
    t2ds_merged_core_promoter_transposed_df.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)
# pret2ds for core promoters
pret2ds_merged_core_promoter_df = merged_core_promoter_df[['gene_name'] + pret2ds_list] # [105 pret2ds]
pret2ds_merged_core_promoter_transposed_df = pret2ds_merged_core_promoter_df.T
pret2ds_merged_core_promoter_transposed_df.columns = pret2ds_merged_core_promoter_transposed_df.iloc[0]
pret2ds_merged_core_promoter_transposed_df = (
    pret2ds_merged_core_promoter_transposed_df.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)
# no_t2ds for core promoters
no_t2ds_merged_core_promoter_df = merged_core_promoter_df[['gene_name'] + no_t2ds_list] # [642 no_t2ds]
no_t2ds_merged_core_promoter_transposed_df = no_t2ds_merged_core_promoter_df.T
no_t2ds_merged_core_promoter_transposed_df.columns = no_t2ds_merged_core_promoter_transposed_df.iloc[0]
no_t2ds_merged_core_promoter_transposed_df = (
    no_t2ds_merged_core_promoter_transposed_df.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)

from scipy.stats import ks_2samp

# Every column after the leading 'subject' column is a gene.
merged_core_promoter_col_name_list = list(t2ds_merged_core_promoter_transposed_df.columns)[1:]

p_value_ks_t2ds_pret2ds_list = []
p_value_ks_t2ds_no_t2ds_list = []
p_value_ks_pret2ds_no_t2ds_list = []
for col_name in merged_core_promoter_col_name_list:
    t2ds_feature_list = list(t2ds_merged_core_promoter_transposed_df[col_name])
    pret2ds_feature_list = list(pret2ds_merged_core_promoter_transposed_df[col_name])
    no_t2ds_feature_list = list(no_t2ds_merged_core_promoter_transposed_df[col_name])
    # Only the p-value of each pairwise Kolmogorov-Smirnov test is kept.
    p_value_ks_t2ds_pret2ds_list.append(
        ks_2samp(t2ds_feature_list, pret2ds_feature_list).pvalue)      # [t2ds / pret2ds]
    p_value_ks_t2ds_no_t2ds_list.append(
        ks_2samp(t2ds_feature_list, no_t2ds_feature_list).pvalue)      # [t2ds / no_t2ds]
    p_value_ks_pret2ds_no_t2ds_list.append(
        ks_2samp(pret2ds_feature_list, no_t2ds_feature_list).pvalue)   # [pret2ds / no_t2ds]

# One row per gene, one column per cohort pair.
merged_core_promoter_pvalue_df = pd.DataFrame({
    'gene_names': merged_core_promoter_col_name_list,
    't2ds_pret2ds_pvalue': p_value_ks_t2ds_pret2ds_list,
    't2ds_no_t2ds_pvalue': p_value_ks_t2ds_no_t2ds_list,
    'pret2ds_no_t2ds_pvalue': p_value_ks_pret2ds_no_t2ds_list
})
display(merged_core_promoter_pvalue_df)
merged_core_promoter_pvalue_df.to_csv('./data/stat_data/merged_core_promoter_pvalue.csv', index=False, header=True)

In [None]:
# Proximal promoters: split the table by diagnosis group, then KS-test each gene.
# Each slice is transposed so rows are subjects and columns are genes; the
# first transposed row (gene names) becomes the header.

# t2ds for proximal promoters
t2ds_merged_proximal_promoter_df = merged_proximal_promoter_df[['gene_name'] + t2ds_list] # [68 t2ds]
t2ds_merged_proximal_promoter_transposed_df = t2ds_merged_proximal_promoter_df.T
t2ds_merged_proximal_promoter_transposed_df.columns = t2ds_merged_proximal_promoter_transposed_df.iloc[0]
t2ds_merged_proximal_promoter_transposed_df = (
    t2ds_merged_proximal_promoter_transposed_df.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)
# pret2ds for proximal promoters
pret2ds_merged_proximal_promoter_df = merged_proximal_promoter_df[['gene_name'] + pret2ds_list] # [105 pret2ds]
pret2ds_merged_proximal_promoter_transposed_df = pret2ds_merged_proximal_promoter_df.T
pret2ds_merged_proximal_promoter_transposed_df.columns = pret2ds_merged_proximal_promoter_transposed_df.iloc[0]
pret2ds_merged_proximal_promoter_transposed_df = (
    pret2ds_merged_proximal_promoter_transposed_df.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)
# no_t2ds for proximal promoters
no_t2ds_merged_proximal_promoter_df = merged_proximal_promoter_df[['gene_name'] + no_t2ds_list] # [642 no_t2ds]
no_t2ds_merged_proximal_promoter_transposed_df = no_t2ds_merged_proximal_promoter_df.T
no_t2ds_merged_proximal_promoter_transposed_df.columns = no_t2ds_merged_proximal_promoter_transposed_df.iloc[0]
no_t2ds_merged_proximal_promoter_transposed_df = (
    no_t2ds_merged_proximal_promoter_transposed_df.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)

from scipy.stats import ks_2samp

# Every column after the leading 'subject' column is a gene.
merged_proximal_promoter_col_name_list = list(t2ds_merged_proximal_promoter_transposed_df.columns)[1:]

p_value_ks_t2ds_pret2ds_list = []
p_value_ks_t2ds_no_t2ds_list = []
p_value_ks_pret2ds_no_t2ds_list = []
for col_name in merged_proximal_promoter_col_name_list:
    t2ds_feature_list = list(t2ds_merged_proximal_promoter_transposed_df[col_name])
    pret2ds_feature_list = list(pret2ds_merged_proximal_promoter_transposed_df[col_name])
    no_t2ds_feature_list = list(no_t2ds_merged_proximal_promoter_transposed_df[col_name])
    # Only the p-value of each pairwise Kolmogorov-Smirnov test is kept.
    p_value_ks_t2ds_pret2ds_list.append(
        ks_2samp(t2ds_feature_list, pret2ds_feature_list).pvalue)      # [t2ds / pret2ds]
    p_value_ks_t2ds_no_t2ds_list.append(
        ks_2samp(t2ds_feature_list, no_t2ds_feature_list).pvalue)      # [t2ds / no_t2ds]
    p_value_ks_pret2ds_no_t2ds_list.append(
        ks_2samp(pret2ds_feature_list, no_t2ds_feature_list).pvalue)   # [pret2ds / no_t2ds]

# One row per gene, one column per cohort pair.
merged_proximal_promoter_pvalue_df = pd.DataFrame({
    'gene_names': merged_proximal_promoter_col_name_list,
    't2ds_pret2ds_pvalue': p_value_ks_t2ds_pret2ds_list,
    't2ds_no_t2ds_pvalue': p_value_ks_t2ds_no_t2ds_list,
    'pret2ds_no_t2ds_pvalue': p_value_ks_pret2ds_no_t2ds_list
})
display(merged_proximal_promoter_pvalue_df)
merged_proximal_promoter_pvalue_df.to_csv('./data/stat_data/merged_proximal_promoter_pvalue.csv', index=False, header=True)


In [None]:
# Distal promoters: split the table by diagnosis group, then KS-test each gene.
# Each slice is transposed so rows are subjects and columns are genes; the
# first transposed row (gene names) becomes the header.

# t2ds for distal promoters
t2ds_merged_distal_promoter_df = merged_distal_promoter_df[['gene_name'] + t2ds_list] # [68 t2ds]
t2ds_merged_distal_promoter_transposed_df = t2ds_merged_distal_promoter_df.T
t2ds_merged_distal_promoter_transposed_df.columns = t2ds_merged_distal_promoter_transposed_df.iloc[0]
t2ds_merged_distal_promoter_transposed_df = (
    t2ds_merged_distal_promoter_transposed_df.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)
# pret2ds for distal promoters
pret2ds_merged_distal_promoter_df = merged_distal_promoter_df[['gene_name'] + pret2ds_list] # [105 pret2ds]
pret2ds_merged_distal_promoter_transposed_df = pret2ds_merged_distal_promoter_df.T
pret2ds_merged_distal_promoter_transposed_df.columns = pret2ds_merged_distal_promoter_transposed_df.iloc[0]
pret2ds_merged_distal_promoter_transposed_df = (
    pret2ds_merged_distal_promoter_transposed_df.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)
# no_t2ds for distal promoters
no_t2ds_merged_distal_promoter_df = merged_distal_promoter_df[['gene_name'] + no_t2ds_list] # [642 no_t2ds]
no_t2ds_merged_distal_promoter_transposed_df = no_t2ds_merged_distal_promoter_df.T
no_t2ds_merged_distal_promoter_transposed_df.columns = no_t2ds_merged_distal_promoter_transposed_df.iloc[0]
no_t2ds_merged_distal_promoter_transposed_df = (
    no_t2ds_merged_distal_promoter_transposed_df.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)

from scipy.stats import ks_2samp

# Every column after the leading 'subject' column is a gene.
merged_distal_promoter_col_name_list = list(t2ds_merged_distal_promoter_transposed_df.columns)[1:]

p_value_ks_t2ds_pret2ds_list = []
p_value_ks_t2ds_no_t2ds_list = []
p_value_ks_pret2ds_no_t2ds_list = []
for col_name in merged_distal_promoter_col_name_list:
    t2ds_feature_list = list(t2ds_merged_distal_promoter_transposed_df[col_name])
    pret2ds_feature_list = list(pret2ds_merged_distal_promoter_transposed_df[col_name])
    no_t2ds_feature_list = list(no_t2ds_merged_distal_promoter_transposed_df[col_name])
    # Only the p-value of each pairwise Kolmogorov-Smirnov test is kept.
    p_value_ks_t2ds_pret2ds_list.append(
        ks_2samp(t2ds_feature_list, pret2ds_feature_list).pvalue)      # [t2ds / pret2ds]
    p_value_ks_t2ds_no_t2ds_list.append(
        ks_2samp(t2ds_feature_list, no_t2ds_feature_list).pvalue)      # [t2ds / no_t2ds]
    p_value_ks_pret2ds_no_t2ds_list.append(
        ks_2samp(pret2ds_feature_list, no_t2ds_feature_list).pvalue)   # [pret2ds / no_t2ds]

# One row per gene, one column per cohort pair.
merged_distal_promoter_pvalue_df = pd.DataFrame({
    'gene_names': merged_distal_promoter_col_name_list,
    't2ds_pret2ds_pvalue': p_value_ks_t2ds_pret2ds_list,
    't2ds_no_t2ds_pvalue': p_value_ks_t2ds_no_t2ds_list,
    'pret2ds_no_t2ds_pvalue': p_value_ks_pret2ds_no_t2ds_list
})
display(merged_distal_promoter_pvalue_df)
merged_distal_promoter_pvalue_df.to_csv('./data/stat_data/merged_distal_promoter_pvalue.csv', index=False, header=True)

In [None]:
# Upstream regions: split the table by diagnosis group, then KS-test each gene.
# (The earlier comments in this cell were copy-pasted from the distal-promoter
# cell; the data processed here is merged_upstream_df.)
# Each slice is transposed so rows are subjects and columns are genes; the
# first transposed row (gene names) becomes the header. The frames are built
# with non-inplace calls so that no sliced frame is mutated in place.

# t2ds for upstream regions
t2ds_merged_upstream_df = merged_upstream_df[['gene_name'] + t2ds_list] # [68 t2ds]
t2ds_merged_upstream_transposed_df = t2ds_merged_upstream_df.T
t2ds_merged_upstream_transposed_df.columns = t2ds_merged_upstream_transposed_df.iloc[0]
t2ds_merged_upstream_transposed_df = (
    t2ds_merged_upstream_transposed_df.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)
# pret2ds for upstream regions
pret2ds_merged_upstream_df = merged_upstream_df[['gene_name'] + pret2ds_list] # [105 pret2ds]
pret2ds_merged_upstream_transposed_df = pret2ds_merged_upstream_df.T
pret2ds_merged_upstream_transposed_df.columns = pret2ds_merged_upstream_transposed_df.iloc[0]
pret2ds_merged_upstream_transposed_df = (
    pret2ds_merged_upstream_transposed_df.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)
# no_t2ds for upstream regions
no_t2ds_merged_upstream_df = merged_upstream_df[['gene_name'] + no_t2ds_list] # [642 no_t2ds]
no_t2ds_merged_upstream_transposed_df = no_t2ds_merged_upstream_df.T
no_t2ds_merged_upstream_transposed_df.columns = no_t2ds_merged_upstream_transposed_df.iloc[0]
no_t2ds_merged_upstream_transposed_df = (
    no_t2ds_merged_upstream_transposed_df.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)

from scipy.stats import ks_2samp

# Every column after the leading 'subject' column is a gene.
merged_upstream_col_name_list = list(t2ds_merged_upstream_transposed_df.columns)[1:]

p_value_ks_t2ds_pret2ds_list = []
p_value_ks_t2ds_no_t2ds_list = []
p_value_ks_pret2ds_no_t2ds_list = []
for col_name in merged_upstream_col_name_list:
    t2ds_feature_list = list(t2ds_merged_upstream_transposed_df[col_name])
    pret2ds_feature_list = list(pret2ds_merged_upstream_transposed_df[col_name])
    no_t2ds_feature_list = list(no_t2ds_merged_upstream_transposed_df[col_name])
    # Only the p-value of each pairwise Kolmogorov-Smirnov test is kept.
    p_value_ks_t2ds_pret2ds_list.append(
        ks_2samp(t2ds_feature_list, pret2ds_feature_list).pvalue)      # [t2ds / pret2ds]
    p_value_ks_t2ds_no_t2ds_list.append(
        ks_2samp(t2ds_feature_list, no_t2ds_feature_list).pvalue)      # [t2ds / no_t2ds]
    p_value_ks_pret2ds_no_t2ds_list.append(
        ks_2samp(pret2ds_feature_list, no_t2ds_feature_list).pvalue)   # [pret2ds / no_t2ds]

# One row per gene, one column per cohort pair.
merged_upstream_pvalue_df = pd.DataFrame({
    'gene_names': merged_upstream_col_name_list,
    't2ds_pret2ds_pvalue': p_value_ks_t2ds_pret2ds_list,
    't2ds_no_t2ds_pvalue': p_value_ks_t2ds_no_t2ds_list,
    'pret2ds_no_t2ds_pvalue': p_value_ks_pret2ds_no_t2ds_list
})
display(merged_upstream_pvalue_df)
merged_upstream_pvalue_df.to_csv('./data/stat_data/merged_upstream_pvalue.csv', index=False, header=True)

In [None]:
# Downstream regions: split the table by diagnosis group, then KS-test each gene.
# (The earlier comments in this cell were copy-pasted from the distal-promoter
# cell; the data processed here is merged_downstream_df.)
# Each slice is transposed so rows are subjects and columns are genes; the
# first transposed row (gene names) becomes the header. The frames are built
# with non-inplace calls so that no sliced frame is mutated in place.

# t2ds for downstream regions
t2ds_merged_downstream_df = merged_downstream_df[['gene_name'] + t2ds_list] # [68 t2ds]
t2ds_merged_downstream_transposed_df = t2ds_merged_downstream_df.T
t2ds_merged_downstream_transposed_df.columns = t2ds_merged_downstream_transposed_df.iloc[0]
t2ds_merged_downstream_transposed_df = (
    t2ds_merged_downstream_transposed_df.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)
# pret2ds for downstream regions
pret2ds_merged_downstream_df = merged_downstream_df[['gene_name'] + pret2ds_list] # [105 pret2ds]
pret2ds_merged_downstream_transposed_df = pret2ds_merged_downstream_df.T
pret2ds_merged_downstream_transposed_df.columns = pret2ds_merged_downstream_transposed_df.iloc[0]
pret2ds_merged_downstream_transposed_df = (
    pret2ds_merged_downstream_transposed_df.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)
# no_t2ds for downstream regions
no_t2ds_merged_downstream_df = merged_downstream_df[['gene_name'] + no_t2ds_list] # [642 no_t2ds]
no_t2ds_merged_downstream_transposed_df = no_t2ds_merged_downstream_df.T
no_t2ds_merged_downstream_transposed_df.columns = no_t2ds_merged_downstream_transposed_df.iloc[0]
no_t2ds_merged_downstream_transposed_df = (
    no_t2ds_merged_downstream_transposed_df.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)

from scipy.stats import ks_2samp

# Every column after the leading 'subject' column is a gene.
merged_downstream_col_name_list = list(t2ds_merged_downstream_transposed_df.columns)[1:]

p_value_ks_t2ds_pret2ds_list = []
p_value_ks_t2ds_no_t2ds_list = []
p_value_ks_pret2ds_no_t2ds_list = []
for col_name in merged_downstream_col_name_list:
    t2ds_feature_list = list(t2ds_merged_downstream_transposed_df[col_name])
    pret2ds_feature_list = list(pret2ds_merged_downstream_transposed_df[col_name])
    no_t2ds_feature_list = list(no_t2ds_merged_downstream_transposed_df[col_name])
    # Only the p-value of each pairwise Kolmogorov-Smirnov test is kept.
    p_value_ks_t2ds_pret2ds_list.append(
        ks_2samp(t2ds_feature_list, pret2ds_feature_list).pvalue)      # [t2ds / pret2ds]
    p_value_ks_t2ds_no_t2ds_list.append(
        ks_2samp(t2ds_feature_list, no_t2ds_feature_list).pvalue)      # [t2ds / no_t2ds]
    p_value_ks_pret2ds_no_t2ds_list.append(
        ks_2samp(pret2ds_feature_list, no_t2ds_feature_list).pvalue)   # [pret2ds / no_t2ds]

# One row per gene, one column per cohort pair.
merged_downstream_pvalue_df = pd.DataFrame({
    'gene_names': merged_downstream_col_name_list,
    't2ds_pret2ds_pvalue': p_value_ks_t2ds_pret2ds_list,
    't2ds_no_t2ds_pvalue': p_value_ks_t2ds_no_t2ds_list,
    'pret2ds_no_t2ds_pvalue': p_value_ks_pret2ds_no_t2ds_list
})
display(merged_downstream_pvalue_df)
merged_downstream_pvalue_df.to_csv('./data/stat_data/merged_downstream_pvalue.csv', index=False, header=True)

## 3. Knowledge Graph Construction

In [None]:
# Collect the 'subject' column of the labelled clinical table as a plain list.
subject_list = v1_label_phenodata_df['subject'].tolist()
print(subject_list)
print('Number of subjects: ', len(subject_list))

### 3.1 Build Up Graph Features

In [None]:
### Convert categorical feature into one-hot feature
# Drop the 'id', 'fc' and 'gpedid' columns before encoding.
phenodata_without_ids_df = v1_label_phenodata_df.drop(columns=['id', 'fc', 'gpedid'])
# One indicator column per distinct 'sex' value, named 'sex_<value>'.
one_hot_encoded = pd.get_dummies(phenodata_without_ids_df['sex'], prefix='sex', dtype=int)
# Swap the raw 'sex' column for its indicator columns (appended on the right).
v1_label_phenodata_onehot_df = pd.concat(
    [phenodata_without_ids_df.drop(columns=['sex']), one_hot_encoded],
    axis=1,
)
display(v1_label_phenodata_onehot_df)

In [None]:
### Categorize each feature into three levels using its 10th and 90th percentiles
# Level 1: value <= 10th percentile, level 2: (10th, 90th], level 3: > 90th.
feature_10percentile_list = []
feature_90percentile_list = []
# Backward-compatible alias: this list was originally spelled 'fearure_...'.
fearure_90percentile_list = feature_90percentile_list
# NOTE(review): assumes the first four columns are identifier columns and every
# later column is a feature to be binned -- confirm against the table layout.
v1_label_phenodata_feature_list = list(v1_label_phenodata_df.columns)[4:]
print('Number of numerical features: ', len(v1_label_phenodata_feature_list))
# Bin every feature of the table (without the 'id', 'fc', 'gpedid' columns).
v1_label_phenodata_category_df = v1_label_phenodata_df.drop(columns =['id', 'fc', 'gpedid'])
for feature in v1_label_phenodata_feature_list:
    feature_value_array = np.array(v1_label_phenodata_df[feature])
    feature_10percentile = np.percentile(feature_value_array, 10)
    feature_10percentile_list.append(feature_10percentile)
    feature_90percentile = np.percentile(feature_value_array, 90)
    feature_90percentile_list.append(feature_90percentile)
    # NOTE(review): pd.cut requires distinct bin edges, so a feature whose 10th
    # and 90th percentiles coincide will raise here.
    v1_label_phenodata_category_df[feature] = pd.cut(
        v1_label_phenodata_df[feature],
        bins=[float('-inf'), feature_10percentile, feature_90percentile, float('inf')],
        include_lowest=True,
        labels=[1,2,3])
# Keep the 10th / 90th percentile of each feature for reference
feature_percentile_df = pd.DataFrame({
    'features': v1_label_phenodata_feature_list,
    'feature_10percentile': feature_10percentile_list,
    'feature_90percentile': feature_90percentile_list
})
display(v1_label_phenodata_category_df)

# One sub-feature node per level that actually occurs, named '<feature>-<level>'.
# The names are built from the observed level labels (not from 1..nunique) so
# they stay correct when an interior level is empty, e.g. observed levels {1, 3}.
subfeature_list = []
for feature in v1_label_phenodata_feature_list:
    observed_levels = sorted(
        int(level) for level in v1_label_phenodata_category_df[feature].dropna().unique()
    )
    subfeature_list.extend(feature + '-' + str(level) for level in observed_levels)

subfeature_node_idx_list = list(np.arange(len(subfeature_list)))
subfeature_node_dict = {k:v for k, v in zip(subfeature_list, subfeature_node_idx_list)}
subfeature_dict_df = pd.DataFrame({'subfeature_node_idx': subfeature_node_idx_list,
                                'subfeature_names': subfeature_list})
# Create the output folder (and any missing parent); no error if it already exists.
os.makedirs('./data/filtered_data/', exist_ok=True)
subfeature_dict_df.to_csv('./data/filtered_data/subfeature_dict_df.csv', index=False, header=True)
display(subfeature_dict_df)

In [None]:
### Create [node_idx and feature] map
# Sub-feature nodes keep the indices from subfeature_dict_df; subject nodes are
# numbered immediately after them.
num_subfeature = len(subfeature_dict_df)
num_subject = len(v1_label_phenodata_category_df)

# Map between [node_idx] and [subject]
subject_list = list(v1_label_phenodata_category_df.subject)
subject_name_list = [f'subject-{subject}' for subject in subject_list]
subject_node_idx_list = list(np.arange(num_subfeature, num_subfeature+num_subject))

# Lookups: raw subject -> node idx, node name -> node idx, node idx -> node name
subject_node_dict = dict(zip(subject_list, subject_node_idx_list))
subject_node_name_dict = dict(zip(subject_name_list, subject_node_idx_list))
subject_node_index_dict = dict(zip(subject_node_idx_list, subject_name_list))
subject_dict_df = pd.DataFrame({'subject_node_idx': subject_node_idx_list,
                            'subject_number': subject_list,
                            'subject_name': subject_name_list})
print(subject_node_dict)
display(subject_dict_df)
subject_dict_df.to_csv('./data/filtered_data/subject_dict_df.csv', index=False, header=True)

# Stack [subfeature_dict_df] and [subject_dict_df] under shared column names.
# Note that subfeature_dict_df is rebound here with the renamed columns.
shared_column_names = {'subject_node_idx': 'node_idx', 'subject_name': 'node_name',
                       'subfeature_node_idx': 'node_idx', 'subfeature_names': 'node_name'}
subject_number_dict_df = (
    subject_dict_df
    .drop(columns=['subject_number'])
    .rename(columns={key: shared_column_names[key] for key in ('subject_node_idx', 'subject_name')})
)
subfeature_dict_df = subfeature_dict_df.rename(
    columns={key: shared_column_names[key] for key in ('subfeature_node_idx', 'subfeature_names')}
)
node_idx_name_map_df = pd.concat([subfeature_dict_df, subject_number_dict_df], ignore_index=True)
display(node_idx_name_map_df)
node_idx_name_map_df.to_csv('./data/filtered_data/node_idx_name_map_df.csv', index=False, header=True)
# Reverse lookup used to turn node names back into node indices.
node_name_idx_dict = node_idx_name_map_df.set_index('node_name')['node_idx'].to_dict()
print(node_name_idx_dict)

In [None]:
### Formalize the subject and subfeature into the graph
# 'sex' gets node names like every other feature but takes no part in any edge.
# The shared feature list is rebound to a filtered copy instead of being
# mutated with .remove('sex'), so re-running this cell no longer raises
# ValueError; afterwards the list excludes 'sex', exactly as before.
edge_feature_list = [feature for feature in v1_label_phenodata_feature_list if feature != 'sex']

# Convert [v1_label_phenodata_category_df] into node names, then into [node_idx]
v1_label_phenodata_category_name_df = v1_label_phenodata_category_df.copy()
for feature in edge_feature_list + ['sex']:
    v1_label_phenodata_category_name_df[feature] = feature + '-' + v1_label_phenodata_category_df[feature].astype(int).astype(str)
v1_label_phenodata_category_name_df['subject'] = 'subject-' + v1_label_phenodata_category_df['subject'].astype(int).astype(str)
v1_label_phenodata_category_num_df = v1_label_phenodata_category_name_df.replace(node_name_idx_dict)
display(v1_label_phenodata_category_num_df)

# Build up features internal relationship: link each observed level of a
# feature to the next observed level. Using the observed labels (rather than
# assuming levels 1..n) keeps the edge names valid when an interior level is empty.
subfeature_from_list = []
subfeature_to_list = []
v1_label_phenodata_feature_list = edge_feature_list
for feature in v1_label_phenodata_feature_list: # [v1_label_phenodata_feature_list] (without [sex / subject])
    observed_levels = sorted(
        int(level) for level in v1_label_phenodata_category_df[feature].dropna().unique()
    )
    for lower_level, upper_level in zip(observed_levels, observed_levels[1:]):
        subfeature_from_list.append(feature + '-' + str(lower_level))
        subfeature_to_list.append(feature + '-' + str(upper_level))
subfeature_name_edge_df = pd.DataFrame({'subfeature_from_node_idx': subfeature_from_list,
                                    'subfeature_to_node_idx': subfeature_to_list})
# Swap node names for node indices and use the shared edge column names.
subfeature_num_edge_df = (
    subfeature_name_edge_df
    .replace(node_name_idx_dict)
    .rename(columns={'subfeature_from_node_idx': 'from_node_idx',
                     'subfeature_to_node_idx': 'to_node_idx'})
)

# Subject -> sub-feature edges: one edge per (subject, feature) pair.
# Skip the first column of the table and the [sex] column.
v1_label_phenodata_category_num_columns = [
    column_name
    for column_name in list(v1_label_phenodata_category_num_df.columns)[1:]
    if column_name != 'sex'
]
display(v1_label_phenodata_category_num_df)
print(v1_label_phenodata_category_num_columns)
# The subject node indices are the same for every feature column.
subject_nodeidx = list(v1_label_phenodata_category_num_df['subject'])
v1_label_phenodata_category_num_dflist = [
    pd.DataFrame({'subject_node_idx': subject_nodeidx,
                  'subfeature_idx': list(v1_label_phenodata_category_num_df[column_name].astype(int))})
    for column_name in v1_label_phenodata_category_num_columns
]
v1_label_phenodata_category_edge_df = (
    pd.concat(v1_label_phenodata_category_num_dflist)
    .rename(columns={'subject_node_idx': 'from_node_idx',
                     'subfeature_idx': 'to_node_idx'})
)
# Full edge list: level-to-level edges first, then subject-to-level edges.
num_edge_df = pd.concat([subfeature_num_edge_df, v1_label_phenodata_category_edge_df])
display(num_edge_df)
num_edge_df.to_csv('./data/filtered_data/num_edge_df.csv', index=False, header=True)

### 3.1.1 Parse Clinical Feature

In [None]:
# Show the one-hot encoded clinical table before its subject values are swapped.
display(v1_label_phenodata_onehot_df)
# New table whose 'subject' values are replaced through subject_node_dict.
# (pd.set_option('future.no_silent_downcasting', True) was left disabled here.)
v1_label_phenodata_onehot_nodeidx_df = v1_label_phenodata_onehot_df.assign(
    subject=v1_label_phenodata_onehot_df['subject'].replace(subject_node_dict)
)
display(v1_label_phenodata_onehot_nodeidx_df)
v1_label_phenodata_onehot_nodeidx_df.to_csv('./data/filtered_data/v1_label_phenodata_onehot_nodeidx_df.csv', index=False, header=True)

### 3.1.2 Parse Transcriptomics Data

In [None]:
display(merged_tran_v1_df)
# Transpose so the former columns become rows; the first transposed row
# supplies the new column labels and is then dropped from the data.
tran_v1_flipped = merged_tran_v1_df.T
tran_v1_flipped.columns = tran_v1_flipped.iloc[0]
# Move the row labels into a leading [subject] column.
merged_tran_v1_transposed_df = (
    tran_v1_flipped.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)
# Re-key subjects through [subject_node_dict] and expose them as [subject_nodeidx].
merged_tran_v1_nodeidx_df = merged_tran_v1_transposed_df.assign(
    subject=merged_tran_v1_transposed_df['subject'].replace(subject_node_dict)
).rename(columns={'subject': 'subject_nodeidx'})
display(merged_tran_v1_nodeidx_df)
merged_tran_v1_nodeidx_df.to_csv(
    './data/filtered_data/merged_tran_v1_nodeidx_df.csv', index=False, header=True
)

### 3.1.3 Parse Epigenomics Data

In [None]:
# Core promoter
# Transpose so the former columns become rows; the first transposed row
# supplies the new column labels and is then dropped from the data.
core_promoter_flipped = merged_core_promoter_df.T
core_promoter_flipped.columns = core_promoter_flipped.iloc[0]
# Move the row labels into a leading [subject] column.
merged_core_promoter_transposed_df = (
    core_promoter_flipped.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)
# Re-key subjects through [subject_node_dict] and expose them as [subject_nodeidx].
merged_core_promoter_nodeidx_df = merged_core_promoter_transposed_df.assign(
    subject=merged_core_promoter_transposed_df['subject'].replace(subject_node_dict)
).rename(columns={'subject': 'subject_nodeidx'})
merged_core_promoter_nodeidx_df.to_csv(
    './data/filtered_data/merged_core_promoter_nodeidx_df.csv', index=False, header=True
)

In [None]:
# Proximal promoter
# Transpose so the former columns become rows; the first transposed row
# supplies the new column labels and is then dropped from the data.
proximal_promoter_flipped = merged_proximal_promoter_df.T
proximal_promoter_flipped.columns = proximal_promoter_flipped.iloc[0]
# Move the row labels into a leading [subject] column.
merged_proximal_promoter_transposed_df = (
    proximal_promoter_flipped.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)
# Re-key subjects through [subject_node_dict] and expose them as [subject_nodeidx].
merged_proximal_promoter_nodeidx_df = merged_proximal_promoter_transposed_df.assign(
    subject=merged_proximal_promoter_transposed_df['subject'].replace(subject_node_dict)
).rename(columns={'subject': 'subject_nodeidx'})
merged_proximal_promoter_nodeidx_df.to_csv(
    './data/filtered_data/merged_proximal_promoter_nodeidx_df.csv', index=False, header=True
)

In [None]:
# Distal promoter
# Transpose so the former columns become rows; the first transposed row
# supplies the new column labels and is then dropped from the data.
distal_promoter_flipped = merged_distal_promoter_df.T
distal_promoter_flipped.columns = distal_promoter_flipped.iloc[0]
# Move the row labels into a leading [subject] column.
merged_distal_promoter_transposed_df = (
    distal_promoter_flipped.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)
# Re-key subjects through [subject_node_dict] and expose them as [subject_nodeidx].
merged_distal_promoter_nodeidx_df = merged_distal_promoter_transposed_df.assign(
    subject=merged_distal_promoter_transposed_df['subject'].replace(subject_node_dict)
).rename(columns={'subject': 'subject_nodeidx'})
merged_distal_promoter_nodeidx_df.to_csv(
    './data/filtered_data/merged_distal_promoter_nodeidx_df.csv', index=False, header=True
)

In [None]:
# Upstream
# Transpose so the former columns become rows; the first transposed row
# supplies the new column labels and is then dropped from the data.
upstream_flipped = merged_upstream_df.T
upstream_flipped.columns = upstream_flipped.iloc[0]
# Move the row labels into a leading [subject] column.
merged_upstream_transposed_df = (
    upstream_flipped.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)
# Re-key subjects through [subject_node_dict] and expose them as [subject_nodeidx].
merged_upstream_nodeidx_df = merged_upstream_transposed_df.assign(
    subject=merged_upstream_transposed_df['subject'].replace(subject_node_dict)
).rename(columns={'subject': 'subject_nodeidx'})
merged_upstream_nodeidx_df.to_csv(
    './data/filtered_data/merged_upstream_nodeidx_df.csv', index=False, header=True
)

In [None]:
# Downstream
# Transpose so the former columns become rows; the first transposed row
# supplies the new column labels and is then dropped from the data.
downstream_flipped = merged_downstream_df.T
downstream_flipped.columns = downstream_flipped.iloc[0]
# Move the row labels into a leading [subject] column.
merged_downstream_transposed_df = (
    downstream_flipped.iloc[1:]
    .reset_index(level=0)
    .rename(columns={'index': 'subject'})
)
# Re-key subjects through [subject_node_dict] and expose them as [subject_nodeidx].
merged_downstream_nodeidx_df = merged_downstream_transposed_df.assign(
    subject=merged_downstream_transposed_df['subject'].replace(subject_node_dict)
).rename(columns={'subject': 'subject_nodeidx'})
merged_downstream_nodeidx_df.to_csv(
    './data/filtered_data/merged_downstream_nodeidx_df.csv', index=False, header=True
)

### 3.1.4 Intersected Nodes in Gene Regulatory Network

In [None]:
### Formalize the gene node
# Each gene gets the position of its row in [merged_tran_v1_df] as node index.
gene_name_list = merged_tran_v1_df['gene_name'].tolist()
gene_node_idx_list = list(np.arange(len(gene_name_list)))
# Two lookup tables: index -> name and name -> index.
gene_num_dict = dict(zip(gene_node_idx_list, gene_name_list))
gene_name_dict = dict(zip(gene_name_list, gene_node_idx_list))
gene_num_dict_df = pd.DataFrame(
    {
        'gene_node_idx': gene_node_idx_list,
        'gene_node_name': gene_name_list,
    }
)
display(gene_num_dict_df)
gene_num_dict_df.to_csv('./data/filtered_data/gene_num_dict_df.csv', index=False, header=True)

In [None]:
# Filter gene in the [merged_tran_v1_df]
def _index_gene_network_edges(edge_name_df, gene_names, gene_to_idx):
    """Restrict a gene-gene edge table to known genes and re-key it by node index.

    Parameters
    ----------
    edge_name_df : pandas.DataFrame
        Edge table holding gene names in its ``src`` and ``dest`` columns.
    gene_names : list
        Genes that own a node index; edges touching any other gene are dropped.
    gene_to_idx : dict
        Mapping from gene name to node index.

    Returns
    -------
    tuple
        ``(edge_idx_df, covered_genes, symmetric_edge_idx_df)`` where
        ``edge_idx_df`` is the filtered table with ``From``/``To`` index
        columns, ``covered_genes`` lists the distinct gene names that still
        appear in an edge, and ``symmetric_edge_idx_df`` additionally holds
        every edge in the opposite direction (sorted, de-duplicated).
    """
    known_genes = set(gene_names)
    kept_df = edge_name_df.sort_values(by=['src', 'dest'])
    kept_df = kept_df[kept_df['src'].isin(known_genes)]
    kept_df = kept_df[kept_df['dest'].isin(known_genes)]
    kept_df = kept_df.reset_index(drop=True)
    covered_genes = list(set(kept_df['src']) | set(kept_df['dest']))

    # Every remaining name is a key of the mapping (guaranteed by the filter
    # above), so a plain lookup is sufficient. Unlike DataFrame.replace with a
    # large nested dict, Series.map is a single hash lookup per cell and the
    # resulting dtype does not depend on pandas' silent-downcasting setting.
    edge_idx_df = kept_df.assign(
        src=kept_df['src'].map(gene_to_idx),
        dest=kept_df['dest'].map(gene_to_idx),
    ).rename(columns={'src': 'From', 'dest': 'To'})

    flipped_df = edge_idx_df[['To', 'From']].rename(columns={'To': 'From', 'From': 'To'})
    symmetric_edge_idx_df = (
        pd.concat([edge_idx_df, flipped_df], axis=0)
        .sort_values(by=['From', 'To'])
        .drop_duplicates()
        .reset_index(drop=True)
    )
    return edge_idx_df, covered_genes, symmetric_edge_idx_df


# Supported interaction databases: input file and the frame name used in the log line.
_GENE_NETWORK_SOURCES = {
    'KEGG': ('./data/kg_data/KEGG/up_kegg.csv', 'up_kegg_df'),
    'BioGRID': ('./data/kg_data/BioGRID/up_biogrid.csv', 'up_biogrid_df'),
    'STRING': ('./data/kg_data/STRING/up_string.csv', 'up_string_df'),
}
if selected_database not in _GENE_NETWORK_SOURCES:
    # Without this check nothing would be written and any edge files left on
    # disk by an earlier run would silently be reused downstream.
    raise ValueError(
        'Unsupported selected_database: ' + repr(selected_database)
        + ' (expected one of ' + ', '.join(_GENE_NETWORK_SOURCES) + ')'
    )

network_csv_path, network_frame_label = _GENE_NETWORK_SOURCES[selected_database]
selected_gene_edge_df, selected_network_gene_list, reverse_gene_num_edge_df = _index_gene_network_edges(
    pd.read_csv(network_csv_path), gene_name_list, gene_name_dict
)
print('Number of genes in the [' + network_frame_label + ']: ', len(selected_network_gene_list))
selected_gene_edge_df.to_csv('./data/filtered_data/gene_num_edge_df.csv', index=False, header=True)
reverse_gene_num_edge_df.to_csv('./data/filtered_data/reverse_gene_num_edge_df.csv', index=False, header=True)
display(reverse_gene_num_edge_df)

# The directed edge table is also published under its database-specific name,
# which is how the gene edge index cell looks it up.
if selected_database == 'KEGG':
    up_kegg_df, up_kegg_gene_list = selected_gene_edge_df, selected_network_gene_list
elif selected_database == 'BioGRID':
    up_biogrid_df, up_biogrid_gene_list = selected_gene_edge_df, selected_network_gene_list
else:
    up_string_df, up_string_gene_list = selected_gene_edge_df, selected_network_gene_list

### 3.1.5 Formalize Gene Features

In [None]:
import warnings
from pandas.errors import PerformanceWarning
# Create the output folder (and any missing parent) without failing when it
# already exists.
os.makedirs('./data/post_data/', exist_ok=True)


def _minmax_scale_columns(feature_df):
    """Min-max scale every column to [0, 1], leaving constant columns untouched.

    Parameters
    ----------
    feature_df : pandas.DataFrame
        Numeric table with one feature per column.

    Returns
    -------
    pandas.DataFrame
        Same shape and labels as ``feature_df``. Columns whose maximum equals
        their minimum keep their raw values instead of being divided by zero.
    """
    col_min = feature_df.min()
    col_max = feature_df.max()
    is_constant = col_max == col_min
    # Shifting by 0 and dividing by 1 passes a constant column through as-is.
    shift = col_min.mask(is_constant, 0.0)
    scale = (col_max - col_min).mask(is_constant, 1.0)
    return (feature_df - shift) / scale


def _export_gene_modality(nodeidx_df, title, stem):
    """Save the raw and min-max scaled feature matrix of one gene-level modality.

    Parameters
    ----------
    nodeidx_df : pandas.DataFrame
        Table with a ``subject_nodeidx`` column followed by the feature columns.
    title : str
        Human-readable modality name used in the progress messages.
    stem : str
        Modality tag used in the output file names
        (``gene_<stem>_x.npy`` and ``norm_gene_<stem>_x.npy``).

    Returns
    -------
    tuple
        ``(raw_df, raw_x, norm_df, norm_x)``: the feature table, its array,
        the scaled table and its array.
    """
    print('--------------- Gene ' + title + ' ---------------')
    # NOTE(review): the incoming tables appear to come from transposing a frame
    # that mixes a label column with numeric columns, which would leave every
    # column as dtype object; casting to float keeps np.save from pickling an
    # object array. Confirm that all feature values are numeric.
    raw_df = nodeidx_df.drop(columns='subject_nodeidx').astype(float)
    raw_x = raw_df.to_numpy()
    print('----- Gene ' + title + ' X Features -----')
    print(raw_x.shape)
    np.save('./data/post_data/gene_' + stem + '_x.npy', raw_x)

    norm_df = _minmax_scale_columns(raw_df)
    norm_x = norm_df.to_numpy()
    print('----- Gene Norm ' + title + ' X Features -----')
    print(norm_x.shape)
    np.save('./data/post_data/norm_gene_' + stem + '_x.npy', norm_x)
    return raw_df, raw_x, norm_df, norm_x


### Convert every [merged_*_nodeidx_df] to numpy arrays [gene_*_x] / [norm_gene_*_x]
gene_tran_x_df, gene_tran_x, norm_gene_tran_x_df, norm_gene_tran_x = _export_gene_modality(
    merged_tran_v1_nodeidx_df, 'Transcriptomics', 'tran')
merged_tran_v1_nodeidx_numpy_df = gene_tran_x_df

gene_core_promoter_x_df, gene_core_promoter_x, norm_gene_core_promoter_x_df, norm_gene_core_promoter_x = _export_gene_modality(
    merged_core_promoter_nodeidx_df, 'Core Promoter', 'core_promoter')
merged_core_promoter_nodeidx_numpy_df = gene_core_promoter_x_df

gene_proximal_promoter_x_df, gene_proximal_promoter_x, norm_gene_proximal_promoter_x_df, norm_gene_proximal_promoter_x = _export_gene_modality(
    merged_proximal_promoter_nodeidx_df, 'Proximal Promoter', 'proximal_promoter')
merged_proximal_promoter_nodeidx_numpy_df = gene_proximal_promoter_x_df

gene_distal_promoter_x_df, gene_distal_promoter_x, norm_gene_distal_promoter_x_df, norm_gene_distal_promoter_x = _export_gene_modality(
    merged_distal_promoter_nodeidx_df, 'Distal Promoter', 'distal_promoter')
merged_distal_promoter_nodeidx_numpy_df = gene_distal_promoter_x_df

gene_upstream_x_df, gene_upstream_x, norm_gene_upstream_x_df, norm_gene_upstream_x = _export_gene_modality(
    merged_upstream_nodeidx_df, 'Upstream', 'upstream')
merged_upstream_nodeidx_numpy_df = gene_upstream_x_df

gene_downstream_x_df, gene_downstream_x, norm_gene_downstream_x_df, norm_gene_downstream_x = _export_gene_modality(
    merged_downstream_nodeidx_df, 'Downstream', 'downstream')
merged_downstream_nodeidx_numpy_df = gene_downstream_x_df


### Concatenate [gene_tran_x] + [gene_core_promoter_x] + [gene_proximal_promoter_x] + [gene_distal_promoter_x] + [gene_upstream_x] + [gene_downstream_x]
# The six blocks are placed side by side, so they must share the same rows.
print('--------------- Concatenate Gene Features ---------------')
gene_x = np.concatenate(
    (gene_tran_x, gene_core_promoter_x, gene_proximal_promoter_x,
     gene_distal_promoter_x, gene_upstream_x, gene_downstream_x),
    axis=1,
)
print('----- Concatenate Gene Features -----')
print(gene_x.shape)
np.save('./data/post_data/gene_x.npy', gene_x)


### Concatenate [norm_gene_tran_x] + [norm_gene_core_promoter_x] + [norm_gene_proximal_promoter_x] + [norm_gene_distal_promoter_x] + [norm_gene_upstream_x] + [norm_gene_downstream_x]
print('--------------- Concatenate Gene Features ---------------')
norm_gene_x = np.concatenate(
    (norm_gene_tran_x, norm_gene_core_promoter_x, norm_gene_proximal_promoter_x,
     norm_gene_distal_promoter_x, norm_gene_upstream_x, norm_gene_downstream_x),
    axis=1,
)
print('----- Concatenate Gene Features -----')
print(norm_gene_x.shape)
np.save('./data/post_data/norm_gene_x.npy', norm_gene_x)

### 3.1.6 Formalize Gene Graph

In [None]:
# Convert the [gene_num_edge_df] to numpy array as [gene_edge_index]
# The edge table of the selected database is transposed so that each edge is
# one column of the saved array.
if selected_database == 'KEGG':
    gene_edge_index = up_kegg_df.to_numpy().T
elif selected_database == 'BioGRID':
    gene_edge_index = up_biogrid_df.to_numpy().T
elif selected_database == 'STRING':
    gene_edge_index = up_string_df.to_numpy().T
else:
    # Fail loudly: otherwise [gene_edge_index] is either undefined or a stale
    # value from an earlier run that would be written to disk unnoticed.
    raise ValueError(
        'Unsupported selected_database: ' + repr(selected_database)
        + ' (expected KEGG, BioGRID or STRING)'
    )
print('----- Gene Edge Index -----')
print(gene_edge_index.shape)
print(gene_edge_index)
np.save('./data/post_data/gene_edge_index.npy', gene_edge_index)

### 3.1.7 Formalize Key Genes

In [None]:
# Translate the key genes in [t6_common_genes] into node indices via [gene_name_dict]
t6_common_genes = sorted(t6_common_genes)
print('Number of common genes: ', len(t6_common_genes))
print(t6_common_genes)
# A gene missing from [gene_name_dict] raises KeyError here.
t6_common_genes_idx = [gene_name_dict[key_gene] for key_gene in t6_common_genes]
print('Number of common genes: ', len(t6_common_genes_idx))
print(t6_common_genes_idx)

# Persist the indices as a numpy array
t6_common_genes_idx = np.array(t6_common_genes_idx)
np.save('./data/post_data/key_gene_idx.npy', t6_common_genes_idx)

# Persist the (index, name) pairs as a csv file
key_gene_df = pd.DataFrame(
    {'gene_node_idx': t6_common_genes_idx, 'gene_name': t6_common_genes}
)
key_gene_df.to_csv('./data/filtered_data/key_gene_df.csv', index=False, header=True)


### 3.1.8 Formalize Patient Feature

In [None]:
### [x, edge_index]
# [x] = [subfeature_phenodata_x] + [subject_phenodata_x]
# Create the output folder (and any missing parent) without failing when it
# already exists.
os.makedirs('./data/post_data/', exist_ok=True)
x_v1_label_phenodata_onehot_nodeidx_df = v1_label_phenodata_onehot_nodeidx_df.drop(columns=['subject'])
subject_phenodata_x = x_v1_label_phenodata_onehot_nodeidx_df.to_numpy()
subfeature_dict_df = pd.read_csv('./data/filtered_data/subfeature_dict_df.csv')
num_subfeature = subfeature_dict_df.shape[0]
num_feature = x_v1_label_phenodata_onehot_nodeidx_df.shape[1]
# One all-zero feature row per entry of the sub-feature dictionary, stacked
# above the subject rows.
subfeature_phenodata_x = np.zeros((num_subfeature, num_feature))
x = np.vstack((subfeature_phenodata_x, subject_phenodata_x))
print('----- X Shape -----')
print(x.shape)
np.save('./data/post_data/x.npy', x)

# [norm_x] = [subfeature_phenodata_x] + [norm_subject_phenodata_x]
# Min-max scale all columns in one vectorised step instead of growing a frame
# column by column (the pattern that needed PerformanceWarning silenced).
# Constant columns are shifted by 0 and divided by 1, i.e. kept unchanged.
phenodata_col_min = x_v1_label_phenodata_onehot_nodeidx_df.min()
phenodata_col_max = x_v1_label_phenodata_onehot_nodeidx_df.max()
phenodata_col_is_constant = phenodata_col_max == phenodata_col_min
norm_subject_phenodata_x_df = (
    x_v1_label_phenodata_onehot_nodeidx_df - phenodata_col_min.mask(phenodata_col_is_constant, 0)
) / (phenodata_col_max - phenodata_col_min).mask(phenodata_col_is_constant, 1)
norm_subject_phenodata_x = norm_subject_phenodata_x_df.to_numpy()
norm_subfeature_phenodata_x = np.zeros((num_subfeature, num_feature))
norm_x = np.vstack((norm_subfeature_phenodata_x, norm_subject_phenodata_x))
print('----- Norm X Shape -----')
print(norm_x.shape)
np.save('./data/post_data/norm_x.npy', norm_x)

# [edge_index]
# Read the edge list back from disk and store it with one column per edge.
num_edge_df = pd.read_csv('./data/filtered_data/num_edge_df.csv')
display(num_edge_df)
edge_index = np.transpose(num_edge_df.to_numpy())
np.save('./data/post_data/edge_index.npy', edge_index)

### 3.1.9 Formalize Patient Graph

In [None]:
# Inspection only: print how many entries [subject_list] holds, then show the
# label table as the cell output. Nothing is modified here.
print(len(subject_list))
t2ds_label_df

In [None]:
### [label]
### Split the subjects into three one-hot label groups
def _label_group(row_selector, t2ds_flag, pret2ds_flag, no_t2ds_flag):
    """Return the selected subjects with constant one-hot label columns attached."""
    group_df = t2ds_label_df.loc[row_selector, ['subject']].reset_index(drop=True)
    group_size = group_df.shape[0]
    group_df['t2ds'] = group_size * [t2ds_flag]
    group_df['pret2ds'] = group_size * [pret2ds_flag]
    group_df['no_t2ds'] = group_size * [no_t2ds_flag]
    return group_df


# NOTE(review): a subject flagged in both [t2ds] and [pret2ds] would land in
# both of the first two groups - confirm the flags are mutually exclusive.
t2ds_df = _label_group(t2ds_label_df['t2ds'] == 1, 1, 0, 0)
pret2ds_df = _label_group(t2ds_label_df['pret2ds'] == 1, 0, 1, 0)
no_t2ds_df = _label_group(
    (t2ds_label_df['t2ds'] != 1) & (t2ds_label_df['pret2ds'] != 1), 0, 0, 1
)

# Stack the groups and prefix the subject ids so they can be matched against
# the [node_name] column of the node map.
node_idx_name_map_df = pd.read_csv('./data/filtered_data/node_idx_name_map_df.csv')
label_phenodata_onehot_df = pd.concat([t2ds_df, pret2ds_df, no_t2ds_df])
label_phenodata_onehot_df['subject'] = 'subject-' + label_phenodata_onehot_df['subject'].astype(str)

# Keep only subjects present in the node map, ordered by node index.
label_columns = ['node_idx', 't2ds', 'pret2ds', 'no_t2ds']
label_phenodata_onehot_nodeidx_df = label_phenodata_onehot_df.merge(
    node_idx_name_map_df, left_on='subject', right_on='node_name'
)
label_phenodata_onehot_nodeidx_df = (
    label_phenodata_onehot_nodeidx_df[label_columns]
    .sort_values(by=['node_idx'])
    .reset_index(drop=True)
)
display(label_phenodata_onehot_nodeidx_df)
label_phenodata_onehot_nodeidx_df.to_csv(
    './data/filtered_data/label_phenodata_onehot_nodeidx_df.csv', index=False, header=True
)

### 3.2 Build Up Common GNN Graph

#### 3.2.1 Formalize Phenotype x Data

In [None]:
# Phenotype feature matrix: every column of the one-hot table except [subject].
pheno_x_df = v1_label_phenodata_onehot_df.drop(columns=['subject'])
display(pheno_x_df)

pheno_x = pheno_x_df.to_numpy()
print('----- Pheno X Features -----')
print(pheno_x.shape)
print(pheno_x)
np.save('./data/post_data/pheno_x.npy', pheno_x)

# Min-max scale each column to [0, 1].
# A column whose maximum equals its minimum would be 0 / 0 = NaN for every
# subject; such columns are shifted by 0 and divided by 1 instead, i.e. kept
# unchanged, which matches the guard used for [norm_subject_phenodata_x_df].
pheno_col_min = pheno_x_df.min()
pheno_col_max = pheno_x_df.max()
pheno_col_is_constant = pheno_col_max == pheno_col_min
norm_pheno_x_df = (
    pheno_x_df - pheno_col_min.mask(pheno_col_is_constant, 0)
) / (pheno_col_max - pheno_col_min).mask(pheno_col_is_constant, 1)
display(norm_pheno_x_df)

norm_pheno_x = norm_pheno_x_df.to_numpy()
print('----- Norm Pheno X Features -----')
print(norm_pheno_x.shape)
print(norm_pheno_x)
np.save('./data/post_data/norm_pheno_x.npy', norm_pheno_x)


#### 3.2.2 Formalize All x

In [None]:
# One all-zero row is prepended per entry of the sub-feature dictionary.
subfeature_dict_df = pd.read_csv('./data/filtered_data/subfeature_dict_df.csv')
num_subfeature = len(subfeature_dict_df)

# Unnormalized data: gene features and phenotype features side by side
geno_pheno_x = np.concatenate((gene_x, pheno_x), axis=1)
dim_geno_pheno_x = geno_pheno_x.shape[1]
subfeature_all_x = np.zeros(shape=(num_subfeature, dim_geno_pheno_x))
all_x = np.concatenate((subfeature_all_x, geno_pheno_x), axis=0)
print('----- ALL X Features -----')
print(all_x.shape)
np.save('./data/post_data/all_x.npy', all_x)

# Normalized data: same layout built from the normalized matrices
norm_geno_pheno_x = np.concatenate((norm_gene_x, norm_pheno_x), axis=1)
dim_norm_geno_pheno_x = norm_geno_pheno_x.shape[1]
norm_subfeature_all_x = np.zeros(shape=(num_subfeature, dim_norm_geno_pheno_x))
norm_all_x = np.concatenate((norm_subfeature_all_x, norm_geno_pheno_x), axis=0)
print('----- Norm ALL X Features -----')
print(norm_all_x.shape)
np.save('./data/post_data/norm_all_x.npy', norm_all_x)