## 1.Download the datasets from UCSC website

### 1.1 Download the multi-omics data

* Parse the data from the UCSC Xena website in PANCAN cohort:
https://xenabrowser.net/datapages/?cohort=TCGA%20Pan-Cancer%20(PANCAN)&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443

<!-- * Copy number (gene-level) - gene-level copy number (gistic2_thresholded)
    * Dataset: https://xenabrowser.net/datapages/?dataset=TCGA.PANCAN.sampleMap%2FGistic2_mutation_Gistic2_all_thresholded.by_genes&host=https%3A%2F%2Ftcga.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443 -->
* Somatic mutation (SNP and INDEL) - Gene level non-silent mutation
    * Dataset: https://xenabrowser.net/datapages/?dataset=mc3.v0.2.8.PUBLIC.nonsilentGene.xena&host=https%3A%2F%2Fpancanatlas.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443

* DNA methylation (Methylation450K)
    * Dataset: https://xenabrowser.net/datapages/?dataset=jhu-usc.edu_PANCAN_HumanMethylation450.betaValue_whitelisted.tsv.synapse_download_5096262.xena&host=https%3A%2F%2Fpancanatlas.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443
    * ID Map: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL16304

* Gene expression RNAseq - TOIL RSEM fpkm
    * Dataset: https://xenabrowser.net/datapages/?dataset=tcga_RSEM_gene_fpkm&host=https%3A%2F%2Ftoil.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443
    
* Protein expression - RPPA
    * Dataset: https://xenabrowser.net/datapages/?dataset=TCGA-RPPA-pancan-clean.xena&host=https%3A%2F%2Fpancanatlas.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443

### 1.2 Download the clinical data

* Phenotype - Curated clinical data
    * Dataset: https://xenabrowser.net/datapages/?dataset=Survival_SupplementalTable_S1_20171025_xena_sp&host=https%3A%2F%2Fpancanatlas.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443

* Phenotype - Immune subtype
    * Dataset: https://xenabrowser.net/datapages/?dataset=Subtype_Immune_Model_Based.txt&host=https%3A%2F%2Fpancanatlas.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443

* Phenotype - Molecular subtype
    * Dataset: https://xenabrowser.net/datapages/?dataset=TCGASubtype.20170308.tsv&host=https%3A%2F%2Fpancanatlas.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443

* Phenotype - sample type and primary disease
    * Dataset: https://xenabrowser.net/datapages/?dataset=TCGA_phenotype_denseDataOnlyDownload.tsv&host=https%3A%2F%2Fpancanatlas.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443


## 2.Read the files

### 2.1 Read DNA methylation data

In [None]:
import pandas as pd

In [None]:
methylation_value = pd.read_csv('./UCSC-raw/jhu-usc.edu_PANCAN_HumanMethylation450.betaValue_whitelisted.tsv.synapse_download_5096262.xena', delimiter='\t')

In [None]:
methylation_value

### 2.2 Read Platform annotations for 450k methylation 

In [None]:
import pandas as pd
# Load the platform annotation table (tab-separated).
# The read happens outside the try block so that `annotation` is always
# defined by the time the ValueError handler below inspects it.
annotation = pd.read_table('./UCSC-raw/GPL16304-47833.txt', delimiter='\t')

try:
    # Only the integer cast is expected to raise ValueError.
    annotation['Distance_closest_TSS'] = annotation['Distance_closest_TSS'].astype(int)
except ValueError as e:
    print(f"Unable to convert 'Distance_closest_TSS' column to integer: {e}")
    # Values that cannot be parsed as numbers (or are missing) become NaN with
    # errors='coerce'. Unlike str.isnumeric(), this does not flag valid
    # negative distances such as "-120" as problematic.
    problematic_rows = pd.to_numeric(annotation['Distance_closest_TSS'], errors='coerce').isna()
    print("Problematic rows:")
    print(annotation.loc[problematic_rows])
else:
    # Drop rows whose 'Closest_TSS' holds several ';'-separated entries.
    annotation = annotation[~annotation['Closest_TSS'].apply(lambda x: len(str(x).split(';')) > 1)]

annotation

In [None]:
map_annotation= annotation[['ID', 'Closest_TSS','Closest_TSS_gene_name', 'Distance_closest_TSS']]
map_annotation

### 2.3 Read mutation data

In [None]:
# Read the gene-level non-silent mutation matrix (tab-separated) and show it.
# A disabled load of the GISTIC2 copy-number table is kept below for reference.
# copynumber = pd.read_csv('./UCSC-raw/Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes',sep='\t')
# copynumber
mutation = pd.read_csv('./UCSC-raw/mc3.v0.2.8.PUBLIC.nonsilentGene.xena',sep='\t')
mutation

In [None]:
# Snapshot the identifiers before cleaning so the change can be quantified.
original_gene_names = mutation['sample'].copy()

# Cut each identifier at its first '.', discarding the dot and what follows.
stripped_names = mutation['sample'].str.replace(r'\..*', '', regex=True)
mutation['sample'] = stripped_names

# Number of identifiers that differ from their original spelling.
rows_changed = (stripped_names != original_gene_names).sum()

# Show the cleaned table together with the change count.
mutation, rows_changed

### 2.4 Read gene expression data and mapping information, substitute the gene names, and import the Ensembl dataset as a standard

In [None]:
gene_expression = pd.read_csv('./UCSC-raw/tcga_RSEM_gene_fpkm', sep='\t')
gene_expression

In [None]:
gene_expression_map = pd.read_csv('./UCSC-raw/probeMap_gencode.v23.annotation.gene.probemap', sep='\t')

In [None]:
gene_expression_map

In [None]:
expression_merged = pd.merge(gene_expression, gene_expression_map, left_on='sample', right_on='id', how='left')

In [None]:
expression_merged.drop(columns=['sample','chrom', 'chromStart','chromEnd','strand','id'], inplace=True)

In [None]:
# Put 'gene' in front; every other column keeps its relative position.
other_columns = [name for name in expression_merged.columns if name != 'gene']
cols = ['gene', *other_columns]
gene_expression = expression_merged[cols]

In [None]:
gene_expression 

In [None]:
# Snapshot the gene names before cleaning so the change can be quantified.
original_gene_names = gene_expression['gene'].copy()

# Remove the first '.' and everything after it from each gene name.
# `assign` builds a new frame instead of writing into `gene_expression`,
# which is itself the result of a column selection; writing into it in place
# raises SettingWithCopyWarning under classic pandas indexing semantics.
gene_expression = gene_expression.assign(
    gene=gene_expression['gene'].str.replace(r'\..*', '', regex=True)
)

# Determine how many rows were changed by comparing against the snapshot.
rows_changed = (original_gene_names != gene_expression['gene']).sum()

# Output the updated DataFrame and the number of rows that were changed.
gene_expression, rows_changed

In [None]:
# this will be used in part4
ensembl_data_unique_gene = pd.read_csv("./UCSC-raw/Meta-Data/mart_export_unique_gene.txt")
ensembl_data_unique_gene

### 2.5 Read clinical data

In [None]:
survival = pd.read_csv('./UCSC-raw/Survival_SupplementalTable_S1_20171025_xena_sp', sep='\t')

In [None]:
survival

### 2.6 Read immune subtype data

In [None]:
immune_subtype = pd.read_csv('./UCSC-raw/Subtype_Immune_Model_Based.txt',sep='\t')

In [None]:
immune_subtype

### 2.7 Read proteomics data

In [None]:
protein = pd.read_csv('./UCSC-raw/TCGA-RPPA-pancan-clean.xena',sep='\t')

In [None]:
protein

### 2.8 Read molecular subtype

In [None]:
cellsub = pd.read_csv('./UCSC-raw/TCGASubtype.20170308.tsv', sep='\t')

In [None]:
cellsub

### 2.9 Read sample type and primary disease

In [None]:
dense = pd.read_csv('./UCSC-raw/TCGA_phenotype_denseDataOnlyDownload.tsv', sep='\t')

In [None]:
dense

### 2.10 Unify patient samples within methylation, mutation, gene expression, clinical, proteomics, molecular subtype, sample type and primary disease datasets

In [None]:
# molecular subtype
cellsub.rename(columns={'sampleID': 'sample'}, inplace=True)
cellsub

In [None]:
# Tables taking part in the sample intersection:
#   omics matrices   -> methylation_value, mutation, gene_expression, protein
#   phenotype tables -> survival, cellsub, dense, immune_subtype

def _tcga_only(labels):
    """Return the labels that start with 'TCGA', in their original order."""
    return [label for label in labels if label.startswith('TCGA')]

# Omics matrices: the sample identifiers are read from the column names.
tcga_columns_methylation = _tcga_only(methylation_value.columns)
tcga_columns_mutation = _tcga_only(mutation.columns)
tcga_columns_gene_expression = _tcga_only(gene_expression.columns)
tcga_columns_protein = _tcga_only(protein.columns)

# Phenotype tables: the sample identifiers are read from the 'sample' column.
tcga_columns_survival = _tcga_only(survival['sample'])
tcga_columns_cellsub = _tcga_only(cellsub['sample'])
tcga_columns_dense = _tcga_only(dense['sample'])
tcga_columns_immune_subtype = _tcga_only(immune_subtype['sample'])

# Samples present in every one of the eight tables.
common_tcga_columns = set(tcga_columns_methylation).intersection(
    tcga_columns_mutation,
    tcga_columns_gene_expression,
    tcga_columns_survival,
    tcga_columns_protein,
    tcga_columns_cellsub,
    tcga_columns_dense,
    tcga_columns_immune_subtype,
)

# Sorted list form, used for column selection further down.
common_tcga_columns_list = sorted(common_tcga_columns)

# Report how many samples are shared.
print(f"Number of common TCGA columns: {len(common_tcga_columns)}")

In [None]:
# Identifier column retained in front of the shared sample columns.
additional_cols_methylation = ['sample']

# Restrict the methylation matrix to the identifier plus the shared samples.
methylation_columns_to_keep = [*additional_cols_methylation, *common_tcga_columns_list]
methylation_value = methylation_value[methylation_columns_to_keep]


In [None]:
methylation_value

In [None]:
# Identifier column retained in front of the shared sample columns.
additional_cols_mutation = ['sample']

# Restrict the mutation matrix to the identifier plus the shared samples.
mutation_columns_to_keep = [*additional_cols_mutation, *common_tcga_columns_list]
mutation = mutation[mutation_columns_to_keep]
mutation

In [None]:
# Gene column retained in front of the shared sample columns.
additional_cols_gene_expression = ['gene']

# Restrict the expression matrix to the gene column plus the shared samples.
expression_columns_to_keep = [*additional_cols_gene_expression, *common_tcga_columns_list]
gene_expression = gene_expression[expression_columns_to_keep]
gene_expression

In [None]:
# Identifier column retained in front of the shared sample columns.
additional_cols_protein = ['SampleID']

# Restrict the protein matrix to the identifier plus the shared samples.
protein_columns_to_keep = [*additional_cols_protein, *common_tcga_columns_list]
protein = protein[protein_columns_to_keep]
protein

In [None]:
def _rows_for_common_samples(table):
    """Keep rows whose 'sample' is a shared sample, ordered by 'sample'."""
    shared_rows = table[table['sample'].isin(common_tcga_columns_list)]
    return shared_rows.sort_values(by=['sample']).reset_index(drop=True)

immune_subtype_filtered = _rows_for_common_samples(immune_subtype)
survival_filtered = _rows_for_common_samples(survival)

# The rename does nothing if the column is already called 'sample'.
cellsub.rename(columns={'sampleID': 'sample'}, inplace=True)
cellsub_filtered = _rows_for_common_samples(cellsub)

dense_filtered = _rows_for_common_samples(dense)

In [None]:
immune_subtype_filtered

## 3.Methylation data process

### 3.1 Define methylation region

In [None]:
import pandas as pd
import numpy as np

#Define vectorized area determination function for methylation data
def vectorized_determine_region(distances):
    """Label each distance to the closest TSS with a region name.

    The intervals are evaluated on the signed distance ``d``:

    * ``-6000 <= d < -3000`` -> ``'Upstream'``
    * ``-3000 <= d < -250``  -> ``'Distal Promoter'``
    * ``-250 <= d < -50``    -> ``'Proximal Promoter'``
    * ``-50 <= d <= 0``      -> ``'Core Promoter'``
    * ``0 < d <= 3000``      -> ``'Downstream'``

    Parameters
    ----------
    distances : array-like of numbers
        Signed distances. Any array-like is accepted (pandas Series, NumPy
        array, list, tuple or scalar); it is converted to a float array, so
        ``None`` entries are treated like NaN.

    Returns
    -------
    numpy.ndarray
        Object array with the shape of ``distances`` holding the region name,
        or ``None`` where the distance is NaN or outside ``[-6000, 3000]``.
    """
    # Converting up front makes the comparisons below work for plain Python
    # sequences too, which do not support element-wise comparison operators.
    distances = np.asarray(distances, dtype=float)
    regions = ['Upstream', 'Distal Promoter', 'Proximal Promoter', 'Core Promoter', 'Downstream']
    conditions = [
        (-6000 <= distances) & (distances < -3000),
        (-3000 <= distances) & (distances < -250),
        (-250 <= distances) & (distances < -50),
        (-50 <= distances) & (distances <= 0),
        (0 < distances) & (distances <= 3000)
    ]
    # Comparisons against NaN are all False, so NaN falls through to default.
    return np.select(conditions, regions, default=None)

### 3.2 Merge the annotation files to the methylation data and apply region function

In [None]:
# Attach the annotation columns to every methylation row (right join keeps
# all rows of methylation_value).
methylation_merged_df = map_annotation.merge(
    methylation_value,
    how='right',
    left_on='ID',
    right_on='sample',
)

# Label every row with its region in one vectorized call.
methylation_merged_df['Region'] = vectorized_determine_region(methylation_merged_df['Distance_closest_TSS'])

# Rows that received no region label are discarded.
methylation_merged_df = methylation_merged_df.dropna(axis=0, subset=['Region'])

# One empty placeholder frame per region name.
regions_data = dict(
    (region, pd.DataFrame())
    for region in ("Upstream", "Distal Promoter", "Proximal Promoter", "Core Promoter", "Downstream")
)

In [None]:
methylation_merged_df

In [None]:
# Remove the join keys and the raw distance, which are not needed once the
# region label has been assigned.
methylation_merged_df = methylation_merged_df.drop(columns=['sample', 'ID', 'Distance_closest_TSS'])


In [None]:
methylation_merged_df

In [None]:
# Snapshot the gene names before cleaning so the change can be quantified.
original_gene_names = methylation_merged_df['Closest_TSS_gene_name'].copy()

# Cut each gene name at its first '.', discarding the dot and what follows.
stripped_gene_names = methylation_merged_df['Closest_TSS_gene_name'].str.replace(r'\..*', '', regex=True)
methylation_merged_df['Closest_TSS_gene_name'] = stripped_gene_names

# Number of rows whose gene name differs from the snapshot.
rows_changed = (stripped_gene_names != original_gene_names).sum()

# Show the cleaned table together with the change count.
methylation_merged_df, rows_changed

In [None]:
# Cast the three descriptive columns in one call; all other columns are
# left with their current dtype.
methylation_merged_df = methylation_merged_df.astype(
    {
        'Closest_TSS': int,
        'Closest_TSS_gene_name': str,
        'Region': str,
    }
)

In [None]:
print(methylation_merged_df[['Closest_TSS', 'Closest_TSS_gene_name', 'Region']].dtypes)

### 3.3 Calculate the average methylation value of five regions

In [None]:
# find columns not started with 'TCGA'
non_tcga_columns = methylation_merged_df.filter(regex='^(?!TCGA)').columns

print("Columns not starting with TCGA:")
print(non_tcga_columns)

In [None]:
# obtain all regions
regions = methylation_merged_df['Region'].unique()
regions

In [None]:
import pandas as pd
# Averaged frame per region; a region that never occurs in the data keeps
# an empty DataFrame.
averaged_by_region = {
    'Upstream': pd.DataFrame(),
    'Distal Promoter': pd.DataFrame(),
    'Proximal Promoter': pd.DataFrame(),
    'Core Promoter': pd.DataFrame(),
    'Downstream': pd.DataFrame(),
}

for region in regions:
    # Rows belonging to the current region.
    region_data = methylation_merged_df[methylation_merged_df['Region'] == region]

    # Average all remaining columns per gene name. 'Region' is constant within
    # this subset, so it is dropped again right after grouping.
    # NOTE(review): 'Closest_TSS' is still a column at this point and is
    # averaged along with the sample columns - confirm that this is intended.
    grouped = (
        region_data
        .groupby(['Closest_TSS_gene_name', 'Region'], as_index=False)
        .mean()
        .drop(columns=['Region'])
    )

    print(f"Shape of {region}: {grouped.shape}")

    # Only the five known region names are stored.
    if region in averaged_by_region:
        averaged_by_region[region] = grouped

    # Optionally, save the data for this region to a new csv file
    # grouped.to_csv(f"{region}_averaged_tss_data.csv", index=False)

Upstream_df = averaged_by_region['Upstream']
Distal_Promoter_df = averaged_by_region['Distal Promoter']
Proximal_Promoter_df = averaged_by_region['Proximal Promoter']
Core_Promoter_df = averaged_by_region['Core Promoter']
Downstream_df = averaged_by_region['Downstream']


### 3.4 Unify gene and TSS for five methylation value files

In [None]:
import pandas as pd

# The five per-region methylation frames built above.
dfs = [
    Upstream_df,
    Distal_Promoter_df,
    Proximal_Promoter_df,
    Core_Promoter_df,
    Downstream_df,
]

# Stack them and keep the first occurrence of every gene name.
all_genes_tss = pd.concat(dfs, axis=0).loc[:, 'Closest_TSS_gene_name'].drop_duplicates(keep='first')

In [None]:
all_genes_tss

In [None]:
def _align_to_all_genes(region_df, label):
    """Outer-join a region frame onto all_genes_tss, zero-fill, print its shape."""
    aligned = pd.merge(all_genes_tss, region_df, on=['Closest_TSS_gene_name'], how='outer').fillna(0)
    print(f"Shape of {label}: {aligned.shape}")
    return aligned

# After this step every region frame has a row for every gene name; genes
# absent from a region are filled with 0.
Upstream_df = _align_to_all_genes(Upstream_df, 'Upstream_df')
Distal_Promoter_df = _align_to_all_genes(Distal_Promoter_df, 'Distal_Promoter_df')
Proximal_Promoter_df = _align_to_all_genes(Proximal_Promoter_df, 'Proximal_Promoter_df')
Core_Promoter_df = _align_to_all_genes(Core_Promoter_df, 'Core_Promoter_df')
Downstream_df = _align_to_all_genes(Downstream_df, 'Downstream_df')

## 4.Unify genes

### 4.1 Unify gene and TSS for methylation, mutation, gene expression, and proteomics data

In [None]:
# One row per gene: rename the identifier column, drop unnamed rows,
# average duplicates, and order by gene name.
mutation = (
    mutation.rename(columns={'sample': 'gene_name'})
    .dropna(subset=['gene_name'])
    .groupby('gene_name', as_index=False)
    .mean()
    .sort_values(by=['gene_name'])
    .reset_index(drop=True)
)
mutation

In [None]:
# One row per gene: rename the gene column, drop unnamed rows,
# average duplicates, and order by gene name.
gene_expression = (
    gene_expression.rename(columns={'gene': 'gene_name'})
    .dropna(subset=['gene_name'])
    .groupby('gene_name', as_index=False)
    .mean()
    .sort_values(by=['gene_name'])
    .reset_index(drop=True)
)
gene_expression

In [None]:
# One row per name: rename the identifier column, drop unnamed rows,
# average duplicates, and order by name.
protein = (
    protein.rename(columns={'SampleID': 'gene_name'})
    .dropna(subset=['gene_name'])
    .groupby('gene_name', as_index=False)
    .mean()
    .sort_values(by=['gene_name'])
    .reset_index(drop=True)
)
protein

In [None]:
def _collapse_region_by_gene(region_df):
    """Return one averaged row per gene name, without the 'unknown' gene.

    The frame is renamed to use 'gene_name', rows without a gene name are
    dropped, duplicates are averaged, rows are ordered by gene name with a
    fresh index, and finally the row labelled 'unknown' is removed.
    """
    per_gene = (
        region_df.rename(columns={'Closest_TSS_gene_name': 'gene_name'})
        .dropna(subset=['gene_name'])
        .groupby('gene_name', as_index=False)
        .mean()
        .sort_values(by=['gene_name'])
        .reset_index(drop=True)
    )
    unknown_rows = per_gene.index[per_gene['gene_name'] == 'unknown']
    return per_gene.drop(index=unknown_rows)

Upstream_df = _collapse_region_by_gene(Upstream_df)
Distal_Promoter_df = _collapse_region_by_gene(Distal_Promoter_df)
Proximal_Promoter_df = _collapse_region_by_gene(Proximal_Promoter_df)
Core_Promoter_df = _collapse_region_by_gene(Core_Promoter_df)
Downstream_df = _collapse_region_by_gene(Downstream_df)

# Show the five collapsed frames, in the same order as before.
for region_frame in (Upstream_df, Distal_Promoter_df, Proximal_Promoter_df, Core_Promoter_df, Downstream_df):
    display(region_frame)

In [None]:
Upstream_df.head()

In [None]:
# Keep only genes whose name appears in the Ensembl reference list.
ensembl_gene_names = ensembl_data_unique_gene['Gene name']

mutation = mutation[mutation['gene_name'].isin(ensembl_gene_names)]
gene_expression = gene_expression[gene_expression['gene_name'].isin(ensembl_gene_names)]
protein = protein[protein['gene_name'].isin(ensembl_gene_names)]

# Apply the same filter to all five methylation region frames. Filtering only
# Upstream_df would leave the other four with genes outside the reference
# list, which the later outer merges onto the common gene list would carry
# along as extra rows.
Upstream_df = Upstream_df[Upstream_df['gene_name'].isin(ensembl_gene_names)]
Distal_Promoter_df = Distal_Promoter_df[Distal_Promoter_df['gene_name'].isin(ensembl_gene_names)]
Proximal_Promoter_df = Proximal_Promoter_df[Proximal_Promoter_df['gene_name'].isin(ensembl_gene_names)]
Core_Promoter_df = Core_Promoter_df[Core_Promoter_df['gene_name'].isin(ensembl_gene_names)]
Downstream_df = Downstream_df[Downstream_df['gene_name'].isin(ensembl_gene_names)]

In [None]:
import pandas as pd

# Gene names available in each modality (Upstream_df stands in for the
# methylation data).
mutation_genes = set(mutation['gene_name'])
gene_expression_genes = set(gene_expression['gene_name'])
methylation_genes = set(Upstream_df['gene_name'])
protein_genes = set(protein['gene_name'])

for modality, modality_genes in (
    ('mutation', mutation_genes),
    ('gene expression', gene_expression_genes),
    ('methylation', methylation_genes),
    ('protein', protein_genes),
):
    print(f"Number of {modality} genes: {len(modality_genes)}")

# Union of the four gene sets: a gene is kept if it occurs in at least one
# modality.
# NOTE(review): the variable name says "common" although this is a union, not
# an intersection - confirm this is intended.
common_genes = set().union(mutation_genes, gene_expression_genes, methylation_genes, protein_genes)

# List form of the same gene set, if needed.
common_genes_list = list(common_genes)

# Report the size of the combined gene set.
print(f"Number of common genes: {len(common_genes)}")

In [None]:
# Ensembl export with the transcript type of each gene; only the first row
# per gene name is kept.
ensembl_data_type = (
    pd.read_csv("./UCSC-raw/Meta-Data/mart_export.txt")
    .rename(columns={'Gene name': 'gene_name'})
    .drop_duplicates(subset='gene_name', keep='first')
)

def _attach_transcript_type(frame):
    """Left-join the Ensembl columns onto `frame` and report protein-coding rows.

    Returns the merged frame and the number of rows whose 'Transcript type'
    equals 'protein_coding'.
    """
    annotated = pd.merge(frame, ensembl_data_type, on='gene_name', how='left')
    coding_rows = (annotated['Transcript type'] == 'protein_coding').sum()
    print(f'Number of rows with "protein_coding": {coding_rows}')
    return annotated, coding_rows

# Same order as before: methylation (upstream), protein, mutation, expression.
Upstream_df_transcript, protein_coding_count = _attach_transcript_type(Upstream_df)
protein_transcript, protein_coding_count = _attach_transcript_type(protein)
mutation_transcript, protein_coding_count = _attach_transcript_type(mutation)
gene_expression_transcript, protein_coding_count = _attach_transcript_type(gene_expression)

#### 4.1.1 Impute missing genes in the gene expression, mutation, methylation and proteomics data

In [None]:
# Single-column frame of the combined gene names, ordered alphabetically
# with a fresh 0..n-1 index.
common_genes_df = (
    pd.DataFrame({'gene_name': list(common_genes)})
    .sort_values(by=['gene_name'])
    .reset_index(drop=True)
)
common_genes_df

In [None]:
merged_data_transcript_Upstream_df = pd.merge(common_genes_df, ensembl_data_type, on='gene_name', how='left')
protein_coding_count = (merged_data_transcript_Upstream_df['Transcript type'] == 'protein_coding').sum()
print(f'Number of rows with "protein_coding": {protein_coding_count}')

In [None]:
unique_values = merged_data_transcript_Upstream_df['Transcript type'].value_counts()
unique_values

In [None]:
def _onto_common_genes(frame, fill_value):
    """Outer-join `frame` onto common_genes_df and fill the gaps with `fill_value`."""
    return pd.merge(common_genes_df, frame, on='gene_name', how='outer').fillna(fill_value)

# Expression and protein: genes without data are filled with 0.
gene_expression_inputted = _onto_common_genes(gene_expression, 0)
protein_inputted = _onto_common_genes(protein, 0)

# Methylation regions: genes without data are filled with 0.
Upstream_df_inputted = _onto_common_genes(Upstream_df, 0)
Distal_Promoter_df_inputted = _onto_common_genes(Distal_Promoter_df, 0)
Proximal_Promoter_df_inputted = _onto_common_genes(Proximal_Promoter_df, 0)
Core_Promoter_df_inputted = _onto_common_genes(Core_Promoter_df, 0)
Downstream_df_inputted = _onto_common_genes(Downstream_df, 0)

# Mutation: genes without data are marked with -1 instead of 0.
mutation_inputted = _onto_common_genes(mutation, -1)

display(gene_expression_inputted)

#### 4.1.2 Intersecting genes with various databases

In [None]:
import pandas as pd
# Gene names from the network databases [KEGG / BioGRID] are later
# intersected with the common genes.
# KEGG: load the pathway edge list and keep the three columns used below.
kegg_pathway_df = pd.read_csv('./Regulatory-network-data/KEGG/full_kegg_pathway_list.csv')[['source', 'target', 'pathway_name']]

# Keep only pathways whose name matches the signaling pattern (case-insensitive).
is_signaling = kegg_pathway_df['pathway_name'].str.contains('signaling pathway|signaling pathways', case=False)
kegg_df = kegg_pathway_df[is_signaling]
print(kegg_df['pathway_name'].value_counts())

kegg_df = kegg_df.rename(columns={'source': 'src', 'target': 'dest'})
src_list = kegg_df['src'].tolist()
dest_list = kegg_df['dest'].tolist()
path_list = kegg_df['pathway_name'].tolist()

# Upper-case every gene name on both ends of each edge.
up_src_list = [src_gene.upper() for src_gene in src_list]
up_dest_list = [dest_gene.upper() for dest_gene in dest_list]

# Edge list without pathway labels, de-duplicated and written to disk.
up_kegg_conn_dict = {'src': up_src_list, 'dest': up_dest_list}
up_kegg_df = pd.DataFrame(up_kegg_conn_dict).drop_duplicates()
up_kegg_df.to_csv('./Regulatory-network-data/KEGG/up_kegg.csv', index=False, header=True)
kegg_gene_list = list(set([*up_kegg_df['src'], *up_kegg_df['dest']]))
print(f'----- NUMBER OF GENES IN KEGG: {len(kegg_gene_list)} -----')
print(up_kegg_df.shape)

# Edge list with the pathway name attached, de-duplicated and written to disk.
up_kegg_path_conn_dict = {'src': up_src_list, 'dest': up_dest_list, 'path': path_list}
up_kegg_path_df = pd.DataFrame(up_kegg_path_conn_dict).drop_duplicates()
up_kegg_path_df.to_csv('./Regulatory-network-data/KEGG/up_kegg_path.csv', index=False, header=True)
kegg_path_gene_list = list(set([*up_kegg_path_df['src'], *up_kegg_path_df['dest']]))
print(f'----- NUMBER OF GENES IN KEGG PATH: {len(kegg_path_gene_list)} -----')
print(up_kegg_path_df.shape)

In [None]:
# BioGRID: load the tab-separated interaction table.
biogrid_df = pd.read_table('./Regulatory-network-data/BioGrid/BIOGRID-ALL-3.5.174.mitab.Symbol.txt', delimiter = '\t')
eh_list = biogrid_df['e_h'].tolist()
et_list = biogrid_df['e_t'].tolist()

# Upper-case every gene name on both ends of each edge.
up_eh_list = [head_gene.upper() for head_gene in eh_list]
up_et_list = [tail_gene.upper() for tail_gene in et_list]

# Build the edge list, show it, and write it to disk.
up_biogrid_conn_dict = {'src': up_eh_list, 'dest': up_et_list}
up_biogrid_df = pd.DataFrame(up_biogrid_conn_dict)
print(up_biogrid_df)
print(up_biogrid_df.shape)
up_biogrid_df.to_csv('./Regulatory-network-data/BioGrid/up_biogrid.csv', index = False, header = True)

# Distinct gene names that occur on either end of an edge.
up_biogrid_gene_list = list(set([*up_biogrid_df['src'], *up_biogrid_df['dest']]))
print(f'----- NUMBER OF GENES IN BioGRID: {len(up_biogrid_gene_list)} -----')

In [None]:
# STRING: load the interaction table.
string_df = pd.read_csv('./Regulatory-network-data/STRING/9606.protein.links.detailed.v11.0_sym.csv', low_memory=False)
src_list = string_df['Source'].tolist()
tar_list = string_df['Target'].tolist()

# Upper-case every gene name on both ends of each edge.
up_src_list = [source_gene.upper() for source_gene in src_list]
up_tar_list = [target_gene.upper() for target_gene in tar_list]

# Build the edge list, show it, and write it to disk.
up_string_conn_dict = {'src': up_src_list, 'dest': up_tar_list}
up_string_df = pd.DataFrame(up_string_conn_dict)
print(up_string_df)
up_string_df.to_csv('./Regulatory-network-data/STRING/up_string.csv', index = False, header = True)

# Distinct gene names that occur on either end of an edge.
up_string_gene_list = list(set([*up_string_df['src'], *up_string_df['dest']]))
print(f'----- NUMBER OF GENES IN STRING: {len(up_string_gene_list)} -----')

In [None]:
# Choose the network database whose genes are intersected with the common genes.
selected_database = 'KEGG'
# selected_database = 'BioGRID'
# selected_database = 'STRING'

def _edges_within(edge_df, genes):
    """Keep edges with both ends in `genes`, de-duplicated and sorted by (src, dest)."""
    both_ends_kept = edge_df['src'].isin(genes) & edge_df['dest'].isin(genes)
    return (
        edge_df[both_ends_kept]
        .drop_duplicates()
        .sort_values(by=['src', 'dest'])
        .reset_index(drop=True)
    )

# Step 1: genes that are both in the common gene set and in the chosen database.
if selected_database == 'KEGG':
    edge_common_genes = list(set(common_genes) & set(kegg_gene_list))
    print(f'----- NUMBER OF INTERSECTED GENES IN KEGG: {len(edge_common_genes)} -----')
elif selected_database == 'BioGRID':
    edge_common_genes = list(set(common_genes) & set(up_biogrid_gene_list))
    print(f'----- NUMBER OF INTERSECTED GENES IN BioGRID: {len(edge_common_genes)} -----')
elif selected_database == 'STRING':
    edge_common_genes = list(set(common_genes) & set(up_string_gene_list))
    print(f'----- NUMBER OF INTERSECTED GENES IN STRING: {len(edge_common_genes)} -----')

# Step 2: restrict the chosen database's edges to those genes.
if selected_database == 'KEGG':
    filtered_up_kegg_df = _edges_within(up_kegg_df, edge_common_genes)
    print(f'----- NEW KEGG EDGE CONNECTIONS: {len(filtered_up_kegg_df)} -----')
    filtered_up_kegg_path_df = _edges_within(up_kegg_path_df, edge_common_genes)
    print(f'----- NEW KEGG PATHWAY CONNECTIONS: {len(filtered_up_kegg_path_df)} -----')
elif selected_database == 'BioGRID':
    filtered_up_biogrid_df = _edges_within(up_biogrid_df, edge_common_genes)
    print(f'----- NEW BioGRID EDGE CONNECTIONS: {len(filtered_up_biogrid_df)} -----')
elif selected_database == 'STRING':
    filtered_up_string_df = _edges_within(up_string_df, edge_common_genes)
    print(f'----- NEW STRING EDGE CONNECTIONS: {len(filtered_up_string_df)} -----')

In [None]:
# Show the filtered edge table(s) of the database chosen in `selected_database`.
# Only the frames for that database are referenced.
if selected_database == 'KEGG':
    # KEGG has two tables: plain edges and edges with their pathway name.
    display(filtered_up_kegg_df)
    display(filtered_up_kegg_path_df)
elif selected_database == 'BioGRID':
    display(filtered_up_biogrid_df)
elif selected_database == 'STRING':
    display(filtered_up_string_df)

#### 4.1.3 Filtering the gene names across the gene expression, mutation, proteomics and methylation

In [None]:
def _restrict_to_edge_genes(frame):
    """Keep rows whose gene_name is in edge_common_genes, ordered by gene_name."""
    selected = frame.loc[frame['gene_name'].isin(edge_common_genes)]
    return selected.sort_values(by=['gene_name']).reset_index(drop=True)

# Mutation, expression and protein tables.
mutation_filtered = _restrict_to_edge_genes(mutation_inputted)
gene_expression_filtered = _restrict_to_edge_genes(gene_expression_inputted)
protein_filtered = _restrict_to_edge_genes(protein_inputted)

# The five methylation region tables.
Upstream_df_filtered = _restrict_to_edge_genes(Upstream_df_inputted)
Distal_Promoter_df_filtered = _restrict_to_edge_genes(Distal_Promoter_df_inputted)
Proximal_Promoter_df_filtered = _restrict_to_edge_genes(Proximal_Promoter_df_inputted)
Core_Promoter_df_filtered = _restrict_to_edge_genes(Core_Promoter_df_inputted)
Downstream_df_filtered = _restrict_to_edge_genes(Downstream_df_inputted)

In [None]:
mutation_filtered

In [None]:
#count the Transcript type
common_genes_transcript = pd.merge(mutation_filtered, ensembl_data_type, on='gene_name', how='left')
protein_coding_count = (common_genes_transcript['Transcript type'] == 'protein_coding').sum()
print(f'Number of rows with "protein_coding": {protein_coding_count}')

In [None]:
unique_values = common_genes_transcript['Transcript type'].value_counts()
unique_values

## 5.Gene name/patient samples/phenotype file lists

### 5.1 gene name and patient samples lists

In [None]:
gene_list = gene_expression_filtered['gene_name']
gene_list

In [None]:
protein_list = protein_filtered['gene_name'].tolist()
print(len(protein_list))
protein_list

In [None]:
intersection = list(set(gene_list) & set(protein_list))
len(intersection)

In [None]:
patient_sample_list = pd.DataFrame(common_tcga_columns_list,columns=['sample'])
patient_sample_list

### 5.2 phenotype lists

In [None]:
immune_subtype_filtered

In [None]:
survival_filtered

In [None]:
survival_nan_column_proportions = survival_filtered.isna().mean()

# Display the results
print(survival_nan_column_proportions)

In [None]:
# Fraction of missing (NaN) entries in every column of the survival table
survival_nan_column_proportions = survival_filtered.isna().mean()

# Columns whose missing fraction exceeds one third are considered too sparse
columns_to_drop = [
    column
    for column, nan_fraction in survival_nan_column_proportions.items()
    if nan_fraction > 1/3
]

# Remove the sparse columns, then report which ones were removed
survival_filtered = survival_filtered.drop(columns=columns_to_drop)
print("Columns dropped:", columns_to_drop)

In [None]:
cellsub_nan_column_proportions = cellsub_filtered.isna().mean()

# Display the results
print(cellsub_nan_column_proportions)

In [None]:
# Fraction of missing (NaN) entries in every column of the cellsub table
cellsub_nan_column_proportions = cellsub_filtered.isna().mean()

# Columns whose missing fraction exceeds one third are considered too sparse
columns_to_drop = [
    column
    for column, nan_fraction in cellsub_nan_column_proportions.items()
    if nan_fraction > 1/3
]

# Remove the sparse columns, then report which ones were removed
cellsub_filtered = cellsub_filtered.drop(columns=columns_to_drop)
print("Columns dropped:", columns_to_drop)

In [None]:
cellsub_filtered

In [None]:
import pandas as pd

# Phenotype column names of each table. The leading identifier columns are
# skipped: one column for the immune / cellsub tables, two for the survival
# table (the original note says _PATIENT is not needed) and for the dense
# table (the original note says sample_type_id is constant).
immune_phenotypes = immune_subtype_filtered.columns[1:].to_list()
survival_phenotypes = survival_filtered.columns[2:].to_list()
dense_phenotypes = dense_filtered.columns[2:].to_list()
cellsub_phenotypes = cellsub_filtered.columns[1:].to_list()

# (phenotype name, source table) pairs, grouped by source in a fixed order
phenotype_list = [
    (phenotype, source)
    for source, phenotypes in (
        ('immunesub', immune_phenotypes),
        ('survival', survival_phenotypes),
        ('dense', dense_phenotypes),
        ('cellsub', cellsub_phenotypes),
    )
    for phenotype in phenotypes
]

# Tabular view of the pairs
phenotype_lists = pd.DataFrame(phenotype_list, columns=['Phenotype_Name', 'Phenotype_Source'])
phenotype_lists

## 6.Save processed datasets

### 6.1 Keep the consistency for dataframes on genes and samples

In [None]:
# [gene_list]
# Gene names in sorted order; this order is shared by all three node layers
sorted_gene_list = gene_list.sort_values()
sorted_gene = sorted_gene_list.tolist()

# One node label per gene and layer: transcript, methylation, protein
sorted_gene_tran = [gene + '-TRAN' for gene in sorted_gene]
sorted_gene_methy = [gene + '-METH' for gene in sorted_gene]
sorted_gene_protein = [gene + '-PROT' for gene in sorted_gene]
# Concatenation order (TRAN, METH, PROT) defines the global node numbering
sorted_gene_all = sorted_gene_tran + sorted_gene_methy + sorted_gene_protein

# Single-column ('Gene') tables for each label list
sorted_gene_tran_df = pd.DataFrame({'Gene': sorted_gene_tran})
sorted_gene_methy_df = pd.DataFrame({'Gene': sorted_gene_methy})
sorted_gene_protein_df = pd.DataFrame({'Gene': sorted_gene_protein})
sorted_all_gene_df = pd.DataFrame({'Gene': sorted_gene_all})

display(sorted_gene_tran_df)
display(sorted_gene_methy_df)
display(sorted_gene_protein_df)
display(sorted_all_gene_df)

In [None]:
# [patient-sample-list]
# Sort the sample ids once; the list view is derived from the sorted table
sorted_patient_sample_df = patient_sample_list.sort_values(by='sample').reset_index(drop=True)
sorted_patient_sample_list = sorted_patient_sample_df['sample'].tolist()
print(sorted_patient_sample_list)
display(sorted_patient_sample_df)

In [None]:
# Shared column layout for every methylation region table:
# the gene identifier first, then the samples in sorted order
methylation_column_order = ['gene_name'] + sorted_patient_sample_list

Upstream_df_filtered = Upstream_df_filtered[methylation_column_order]
Distal_Promoter_df_filtered = Distal_Promoter_df_filtered[methylation_column_order]
Proximal_Promoter_df_filtered = Proximal_Promoter_df_filtered[methylation_column_order]
Core_Promoter_df_filtered = Core_Promoter_df_filtered[methylation_column_order]
Downstream_df_filtered = Downstream_df_filtered[methylation_column_order]

display(Upstream_df_filtered)

In [None]:
mutation_filtered = mutation_filtered[['gene_name'] + sorted_patient_sample_list].sort_values(by='gene_name').reset_index(drop=True)
mutation_filtered

In [None]:
gene_expression_filtered = gene_expression_filtered[['gene_name'] + sorted_patient_sample_list].sort_values(by='gene_name').reset_index(drop=True)
gene_expression_filtered

In [None]:
protein_filtered = protein_filtered[['gene_name'] + sorted_patient_sample_list].sort_values(by='gene_name').reset_index(drop=True)
protein_filtered

In [None]:
immune_subtype_filtered = immune_subtype_filtered.sort_values(by='sample').reset_index(drop=True)
immune_subtype_filtered

In [None]:
survival_filtered = survival_filtered.sort_values(by='sample').reset_index(drop=True)
survival_filtered

In [None]:
dense_filtered = dense_filtered.sort_values(by='sample').reset_index(drop=True)
dense_filtered

In [None]:
cellsub_filtered = cellsub_filtered.sort_values(by='sample').reset_index(drop=True)
cellsub_filtered

### 6.2 Create output folder and save processed datasets

In [None]:
import os

# Folder that receives every processed CSV file
output_folder = 'UCSC-process'
# exist_ok=True makes the call idempotent and avoids the race between
# checking for the folder and creating it
os.makedirs(output_folder, exist_ok=True)

In [None]:
# Output file name -> table to write. The same base names (without '.csv')
# are listed again in section 7 when the files are reloaded, so the two lists
# have to be kept in sync.
dataframes = {
    'gene-tran-list.csv': sorted_gene_tran_df,
    'gene-methy-list.csv': sorted_gene_methy_df,
    'gene-protein-list.csv': sorted_gene_protein_df,
    'gene-all-list.csv': sorted_all_gene_df,
    'gene-kegg-edge-list.csv': filtered_up_kegg_df,
    'gene-kegg-path-edge-list.csv': filtered_up_kegg_path_df,
    # 'gene-biogrid-edge-list.csv': filtered_up_biogrid_df,
    # 'gene-string-edge-list.csv': filtered_up_string_df,
    'patient-sample-list.csv': sorted_patient_sample_df,
    'phenotype-lists.csv': phenotype_lists,
    'processed-genotype-methy-Upstream.csv': Upstream_df_filtered,
    'processed-genotype-methy-Distal-Promoter.csv': Distal_Promoter_df_filtered,
    'processed-genotype-methy-Proximal-Promoter.csv': Proximal_Promoter_df_filtered,
    'processed-genotype-methy-Core-Promoter.csv': Core_Promoter_df_filtered,
    'processed-genotype-methy-Downstream.csv': Downstream_df_filtered,
    'processed-genotype-mutation.csv': mutation_filtered,
    'processed-genotype-gene-expression.csv': gene_expression_filtered,
    'processed-genotype-proteomics.csv': protein_filtered,
    'processed-phenotype-immune-subtype-transposed.csv': immune_subtype_filtered,
    'processed-phenotype-survival-transposed.csv': survival_filtered,
    'processed-phenotype-dense-transposed.csv': dense_filtered,
    'processed-phenotype-cellsub-transposed.csv': cellsub_filtered
}

# Write every table into the output folder; index=False keeps the row index
# out of the CSV files
for file_name, df in dataframes.items():
    df.to_csv(os.path.join(output_folder, file_name), index=False)

## 7.Convert the processed data into node dictionary

In [1]:
# Reload the processed tables from disk
import pandas as pd
import os

# Folder in which the processed CSV files are stored
output_folder = 'UCSC-process'

# Base names (without the '.csv' extension) of the files saved earlier
file_names = [
    'gene-tran-list',
    'gene-methy-list',
    'gene-protein-list',
    'gene-all-list',
    'gene-kegg-edge-list',
    'gene-kegg-path-edge-list',
    # 'gene-biogrid-edge-list',
    # 'gene-string-edge-list',
    'patient-sample-list',
    'phenotype-lists',
    'processed-genotype-methy-Upstream',
    'processed-genotype-methy-Distal-Promoter',
    'processed-genotype-methy-Proximal-Promoter',
    'processed-genotype-methy-Core-Promoter',
    'processed-genotype-methy-Downstream',
    'processed-genotype-mutation',
    'processed-genotype-gene-expression',
    'processed-genotype-proteomics',
    'processed-phenotype-immune-subtype-transposed',
    'processed-phenotype-survival-transposed',
    'processed-phenotype-dense-transposed',
    'processed-phenotype-cellsub-transposed',
]

# Base name -> loaded table
dataframes = {}

# Load each CSV under its base name
for file_name in file_names:
    full_path = os.path.join(output_folder, f'{file_name}.csv')
    dataframes[file_name] = pd.read_csv(full_path)

In [2]:
# Re-bind each reloaded table to the variable name used by the following cells
sorted_gene_tran_df = dataframes['gene-tran-list']
sorted_gene_methy_df = dataframes['gene-methy-list']
sorted_gene_protein_df = dataframes['gene-protein-list']
sorted_all_gene_df = dataframes['gene-all-list']
filtered_up_kegg_df = dataframes['gene-kegg-edge-list']
filtered_up_kegg_path_df = dataframes['gene-kegg-path-edge-list']
# filtered_up_biogrid_df = dataframes['gene-biogrid-edge-list']
# filtered_up_string_df = dataframes['gene-string-edge-list']
sorted_patient_sample_df = dataframes['patient-sample-list']
phenotype_lists = dataframes['phenotype-lists']
Upstream_df_filtered = dataframes['processed-genotype-methy-Upstream']
Distal_Promoter_df_filtered = dataframes['processed-genotype-methy-Distal-Promoter']
Proximal_Promoter_df_filtered = dataframes['processed-genotype-methy-Proximal-Promoter']
Core_Promoter_df_filtered = dataframes['processed-genotype-methy-Core-Promoter']
Downstream_df_filtered = dataframes['processed-genotype-methy-Downstream']
# NOTE(review): this is the mutation table (saved from mutation_filtered in
# section 6.2) but it is bound to the name copynumber_filtered here - confirm
# which name the later cells expect.
copynumber_filtered = dataframes['processed-genotype-mutation']
gene_expression_filtered = dataframes['processed-genotype-gene-expression']
protein_filtered = dataframes['processed-genotype-proteomics']
immune_subtype_filtered = dataframes['processed-phenotype-immune-subtype-transposed']
survival_filtered = dataframes['processed-phenotype-survival-transposed']
dense_filtered = dataframes['processed-phenotype-dense-transposed']
cellsub_filtered = dataframes['processed-phenotype-cellsub-transposed']

In [3]:
# Folder that receives the graph-formatted outputs (node maps, edge lists, labels)
graph_output_folder = 'UCSC-graph-data'
# exist_ok=True makes the call idempotent and avoids the race between
# checking for the folder and creating it
os.makedirs(graph_output_folder, exist_ok=True)

### 7.0 Adding Sequence Genes

In [31]:
gene_expression_filtered

Unnamed: 0,gene_name,TCGA-05-4384-01,TCGA-05-4396-01,TCGA-05-4405-01,TCGA-05-4410-01,TCGA-05-4417-01,TCGA-05-5423-01,TCGA-05-5429-01,TCGA-05-5715-01,TCGA-06-0125-01,...,TCGA-Z6-A8JE-01,TCGA-Z6-A9VB-01,TCGA-Z6-AAPN-01,TCGA-Z7-A8R5-01,TCGA-Z7-A8R6-01,TCGA-ZA-A8F6-01,TCGA-ZG-A8QW-01,TCGA-ZG-A8QX-01,TCGA-ZG-A8QY-01,TCGA-ZG-A8QZ-01
0,ABL1,4.3667,4.6657,4.4764,4.1400,4.3779,2.9544,3.1876,3.4504,5.1264,...,4.0037,4.7361,4.3421,3.6748,4.8934,4.6201,4.3385,4.3992,4.6456,4.2350
1,ABL2,2.0465,1.1706,2.6325,1.6187,2.2723,0.7748,-0.1993,1.5266,1.8119,...,2.8301,2.2723,2.7572,1.0222,1.1447,1.7660,0.9716,0.8805,0.8246,0.7832
2,ACAA1,6.4613,6.3759,5.6124,5.0930,4.6259,5.9072,5.7616,4.5330,6.0470,...,4.3695,3.8827,4.6399,6.1488,5.5142,4.7587,6.9933,6.9110,6.4041,6.0798
3,ACACA,4.7005,3.7678,4.3385,4.9510,4.0926,3.3307,3.3938,4.5910,4.3060,...,4.3924,4.5224,3.8611,3.4411,4.4101,3.1620,5.6038,6.1530,6.7101,5.6062
4,ACACB,2.0707,0.9493,0.6969,0.4967,0.1257,-0.3566,-0.7588,-0.6193,1.4704,...,0.1776,-0.3566,0.7233,0.7664,0.2029,3.6905,1.6374,1.6187,2.0076,2.2633
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2112,ZFYVE16,3.3321,3.1653,3.1278,2.7594,2.8819,3.1360,2.6278,2.3308,3.1475,...,3.2080,2.0430,2.7336,3.0038,3.1620,2.9875,3.1523,3.4183,3.3321,3.7094
2113,ZFYVE9,2.0395,3.3856,0.7579,0.0990,1.6234,1.2756,1.5758,1.8078,3.1733,...,2.1988,2.7889,2.3479,1.3109,2.4412,2.1541,2.3048,3.1028,3.6042,2.7465
2114,ZMAT3,2.4623,1.5013,1.9786,1.9601,1.5013,0.5470,1.9675,1.4174,3.8471,...,1.7273,3.2870,0.8726,1.9111,0.2762,2.8974,1.2333,0.7999,1.6964,1.2023
2115,ZNF274,3.5742,3.3911,3.4451,3.3675,3.3364,3.3773,3.3278,1.9675,2.6873,...,3.6520,2.5658,2.1047,2.8137,3.4700,2.7292,3.2617,2.9013,3.1844,3.3134


In [25]:
seq_df = pd.read_csv('./DNA-sequence/Chunk1_gene_transcript_protein_sequences.csv')
display(seq_df)

Unnamed: 0,gene_id,gene_name,contig,start,end,strand,dna_sequence,transcript_sequence,protein_sequence
0,ENSG00000000003,TSPAN6,X,100627108,100639991,-1,AGCTCTTCAGTAGTTTCTGAACATCTAGACGGTAGGATGTAGAATA...,AGTTGTGGACGCTCGTAAGTTTTCGGCAGTTTCCGGGGAGACTCGG...,MASPSRRLQTKPVITCFKSVLLIYTFIFWITGVILLAVGIWGKVSL...
1,ENSG00000000005,TNMD,X,100584936,100599885,1,AGCCGACTCACTTGCAACTCCACCTCAGCAGTGGTCTCTCAGTCCT...,AGCCGACTCACTTGCAACTCCACCTCAGCAGTGGTCTCTCAGTCCT...,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...
2,ENSG00000000419,DPM1,20,50934867,50959140,-1,ACTGCGCTAGTGGACAGCCGAGCCCACCGCAGCCCACGATTAGCAC...,AGTTCCGCCATGGCCTCCTTGGAAGTCAGTCGTAGTCCTCGCAGGT...,MASLEVSRSPRRSRRELEVRSPRQNKYSVLLPTYNERENLPLIVWL...
3,ENSG00000000457,SCYL3,1,169849631,169894267,-1,GCACCTCTACTGTTTGCTACAAGTGGCCAGCAGCCATTTTGGATTT...,GTAGTGGCCACAGCCTTACAGGCAGGCAGGGGTGGTTGGTGTCAAC...,MGSENSALKSYTLREPPFTLPSGLAVYPAVLQDGKFASVFVYKREN...
4,ENSG00000000460,FIRRM,1,169662007,169854080,1,AACCCGCTCGGGTCCCCTTCCACACTGTGGAAGCTTTGTTCTTTCG...,ACTGCGAGTTTCCGGTCTGGGCTTTGGCGGGTCTGGTTTGAAGCTC...,MFLPHMNHLTLEQTFFSQVLPKTVKLFDDMMYELTSQARGLSSQNL...
...,...,...,...,...,...,...,...,...,...
1569,ENSG00000082497,SERTAD4,1,210232796,210246631,1,ATTACCATAACCGTCTGCAGCGACGGCGGCGCAGCGCCCCAGTCGC...,ATTACCATAACCGTCTGCAGCGACGGCGGCGCAGCGCCCCAGTCGC...,MTLVLSMNRFCEPIVSEGAAEIAGYQTLWEADSYGGPSPPGPAQAP...
1570,ENSG00000082512,TRAF5,1,211326615,211374946,1,AGACGCACGTGAGGGAAATCAGATGACTGGACTTGTAGATACTAAC...,AGGAGCAGCAGCCGCGCCTGCAGACCGGCCTCGCGGAGCCCGCGCG...,MAYSEEHKGMPCGFIRQNSGNSISLDFEPSIEYQFVERLEERYKCA...
1571,ENSG00000082515,MRPL22,5,154941073,154969411,1,GCTTGAACTCGGCGGCTTCCGTAGCGGGAGGGCGAAAGATGGCGGC...,GCTTGAACTCGGCGGCTTCCGTAGCGGGAGGGCGAAAGATGGCGGC...,MAAAVLGQLGALWIHNLRSRGKLALGVLPQSYIHTSASLDISRKWE...
1572,ENSG00000082516,GEMIN5,5,154887411,154938211,-1,GCCCCGCTCCCTACCTAAGGCGTGAGGCTACGAGCGGTCGGCTGTG...,GCCCCGCTCCCTACCTAAGGCGTGAGGCTACGAGCGGTCGGCTGTG...,MGQEPRTLPPSPNWYCARCSDAVPGGLFGFAARTSVFLVRVGPGAG...


In [28]:
# Recover plain gene symbols by removing the '-TRAN' marker.
# regex=False forces a literal match, so the result does not depend on the
# pandas version's default for `regex`.
# NOTE: this rebinds sorted_gene_tran to the symbols WITHOUT the suffix.
sorted_gene_tran = sorted_gene_tran_df['Gene'].str.replace('-TRAN', '', regex=False).tolist()
print(sorted_gene_tran)
seq_gene_name = seq_df['gene_name'].tolist()
print(seq_gene_name)
# Genes present in both the graph and the sequence table. Sorting makes the
# order reproducible: iterating a set of strings can differ between runs.
intersection = sorted(set(sorted_gene_tran) & set(seq_gene_name))
print(intersection)
print(len(intersection))

['ABL1', 'ABL2', 'ACAA1', 'ACACA', 'ACACB', 'ACADL', 'ACADM', 'ACOX1', 'ACOX2', 'ACOX3', 'ACSBG1', 'ACSBG2', 'ACSL1', 'ACSL3', 'ACSL4', 'ACSL5', 'ACSL6', 'ACTA2', 'ACTB', 'ACTG1', 'ACVR1', 'ACVR1B', 'ACVR1C', 'ACVR2A', 'ACVR2B', 'ADAM17', 'ADCY1', 'ADCY10', 'ADCY2', 'ADCY3', 'ADCY4', 'ADCY5', 'ADCY6', 'ADCY7', 'ADCY8', 'ADCY9', 'ADCYAP1', 'ADCYAP1R1', 'ADGRB1', 'ADIPOQ', 'ADIPOR1', 'ADIPOR2', 'ADORA1', 'ADORA2A', 'ADORA2B', 'ADORA3', 'ADRA1A', 'ADRA1B', 'ADRA1D', 'ADRA2A', 'ADRA2B', 'ADRA2C', 'ADRB1', 'ADRB2', 'ADRB3', 'AFP', 'AGAP2', 'AGER', 'AGRP', 'AGT', 'AGTR1', 'AIFM2', 'AIM2', 'AJUBA', 'AKT1', 'AKT1S1', 'AKT2', 'AKT3', 'ALDOA', 'ALDOB', 'ALDOC', 'AMH', 'AMHR2', 'AMOT', 'ANAPC5', 'ANGPT1', 'ANGPT2', 'ANGPT4', 'ANGPTL4', 'AOX1', 'APAF1', 'APBB1IP', 'APC', 'APC2', 'APH1A', 'APH1B', 'APLN', 'APLNR', 'APOA1', 'APOA2', 'APOA5', 'APOC3', 'AQP7', 'ARAF', 'ARAP3', 'AREG', 'ARF1', 'ARF6', 'ARHGDIA', 'ARHGDIB', 'ARHGDIG', 'ARHGEF12', 'ARNT', 'ARRB1', 'ARRB2', 'ASPH', 'ATF2', 'ATF4', 'ATF6B'

In [30]:
suffix_intersected_gene = [gene + '-TRAN' for gene in intersection] + [gene + '-PROT' for gene in intersection] + [gene + '-METH' for gene in intersection]
print(len(suffix_intersected_gene))

759


### 7.1 Make nodes dictionary

In [4]:
# Node index -> node label, plus the inverse lookup (label -> index)
sorted_all_gene_dict = sorted_all_gene_df['Gene'].to_dict()
sorted_all_gene_name_dict = {label: index for index, label in sorted_all_gene_dict.items()}

# Node type per row, in the same layer order as the concatenated gene list:
# transcript nodes, methylation nodes, protein nodes
num_gene = len(sorted_gene_tran_df)
num_gene_protein = len(sorted_gene_protein_df)
nodetype_list = (
    ['Gene-TRAN'] * num_gene
    + ['Gene-METH'] * num_gene
    + ['Gene-PROT'] * num_gene_protein
)

map_all_gene_df = pd.DataFrame({
    'Gene_num': list(sorted_all_gene_dict.keys()),
    'Gene_name': list(sorted_all_gene_dict.values()),
    'NodeType': nodetype_list,
})
display(map_all_gene_df)
map_all_gene_df.to_csv(os.path.join(graph_output_folder, 'map-all-gene.csv'), index=False)

Unnamed: 0,Gene_num,Gene_name,NodeType
0,0,ABL1-TRAN,Gene-TRAN
1,1,ABL2-TRAN,Gene-TRAN
2,2,ACAA1-TRAN,Gene-TRAN
3,3,ACACA-TRAN,Gene-TRAN
4,4,ACACB-TRAN,Gene-TRAN
...,...,...,...
6346,6346,ZFYVE16-PROT,Gene-PROT
6347,6347,ZFYVE9-PROT,Gene-PROT
6348,6348,ZMAT3-PROT,Gene-PROT
6349,6349,ZNF274-PROT,Gene-PROT


In [5]:
# Human-readable node names: '-METH' becomes ' promoter', '-PROT' becomes
# ' protein', and '-TRAN' is removed (all literal, non-regex replacements)
map_text_all_gene_df = map_all_gene_df.copy()
map_text_all_gene_df['Textual_gene_name'] = (
    map_text_all_gene_df['Gene_name']
    .str.replace('-METH', ' promoter', regex=False)
    .str.replace('-PROT', ' protein', regex=False)
    .str.replace('-TRAN', '', regex=False)
)
display(map_text_all_gene_df)
map_text_all_gene_df.to_csv(os.path.join(graph_output_folder, 'map-text-all-gene.csv'), index=False)

Unnamed: 0,Gene_num,Gene_name,NodeType,Textual_gene_name
0,0,ABL1-TRAN,Gene-TRAN,ABL1
1,1,ABL2-TRAN,Gene-TRAN,ABL2
2,2,ACAA1-TRAN,Gene-TRAN,ACAA1
3,3,ACACA-TRAN,Gene-TRAN,ACACA
4,4,ACACB-TRAN,Gene-TRAN,ACACB
...,...,...,...,...
6346,6346,ZFYVE16-PROT,Gene-PROT,ZFYVE16 protein
6347,6347,ZFYVE9-PROT,Gene-PROT,ZFYVE9 protein
6348,6348,ZMAT3-PROT,Gene-PROT,ZMAT3 protein
6349,6349,ZNF274-PROT,Gene-PROT,ZNF274 protein


### 7.2 Create the edge connections between promoter methylation, transcripts, and proteins

In [6]:
# Node labels of each layer, all in the same gene order
sorted_gene_methy = sorted_gene_methy_df['Gene'].to_list()
sorted_gene_list = sorted_gene_tran_df['Gene'].to_list()
sorted_gene_protein = sorted_gene_protein_df['Gene'].to_list()
# Transcript label matching each protein label
sorted_intersection = [protein_label.replace('-PROT', '-TRAN') for protein_label in sorted_gene_protein]

# [Gene-METH - Gene] one edge from each methylation node to its transcript node
gene_meth_edge_df = pd.DataFrame({'src': sorted_gene_methy, 'dest': sorted_gene_list})
# [Gene - Gene-PROT] one edge from each transcript node to its protein node
gene_protein_edge_df = pd.DataFrame({'src': sorted_intersection, 'dest': sorted_gene_protein})

display(gene_meth_edge_df)
display(gene_protein_edge_df)

Unnamed: 0,src,dest
0,ABL1-METH,ABL1-TRAN
1,ABL2-METH,ABL2-TRAN
2,ACAA1-METH,ACAA1-TRAN
3,ACACA-METH,ACACA-TRAN
4,ACACB-METH,ACACB-TRAN
...,...,...
2112,ZFYVE16-METH,ZFYVE16-TRAN
2113,ZFYVE9-METH,ZFYVE9-TRAN
2114,ZMAT3-METH,ZMAT3-TRAN
2115,ZNF274-METH,ZNF274-TRAN


Unnamed: 0,src,dest
0,ABL1-TRAN,ABL1-PROT
1,ABL2-TRAN,ABL2-PROT
2,ACAA1-TRAN,ACAA1-PROT
3,ACACA-TRAN,ACACA-PROT
4,ACACB-TRAN,ACACB-PROT
...,...,...
2112,ZFYVE16-TRAN,ZFYVE16-PROT
2113,ZFYVE9-TRAN,ZFYVE9-PROT
2114,ZMAT3-TRAN,ZMAT3-PROT
2115,ZNF274-TRAN,ZNF274-PROT


In [7]:
print(sorted_all_gene_name_dict['ABL1-TRAN'])
print(sorted_all_gene_name_dict['ABL1-METH'])
print(sorted_all_gene_name_dict['ABL1-PROT'])

0
2117
4234


In [8]:
# Translate node labels to node indices for both edge tables
gene_meth_num_edge_df = gene_meth_edge_df.assign(
    src=gene_meth_edge_df['src'].map(sorted_all_gene_name_dict),
    dest=gene_meth_edge_df['dest'].map(sorted_all_gene_name_dict),
)
display(gene_meth_num_edge_df)

gene_protein_num_edge_df = gene_protein_edge_df.assign(
    src=gene_protein_edge_df['src'].map(sorted_all_gene_name_dict),
    dest=gene_protein_edge_df['dest'].map(sorted_all_gene_name_dict),
)
display(gene_protein_num_edge_df)

Unnamed: 0,src,dest
0,2117,0
1,2118,1
2,2119,2
3,2120,3
4,2121,4
...,...,...
2112,4229,2112
2113,4230,2113
2114,4231,2114
2115,4232,2115


Unnamed: 0,src,dest
0,0,4234
1,1,4235
2,2,4236
3,3,4237
4,4,4238
...,...,...
2112,2112,6346
2113,2113,6347
2114,2114,6348
2115,2115,6349


### 7.3 Concat all of the edges

In [None]:
# Interaction database that supplies the protein-protein edges. A choice made
# earlier in the session is respected; otherwise fall back to 'KEGG', whose
# edge list is the only one reloaded in section 7 (the BioGRID / STRING
# assignments there are commented out).
selected_database = globals().get('selected_database', 'KEGG')
if selected_database == 'KEGG':
    filtered_up_num_df = filtered_up_kegg_df.copy()
elif selected_database == 'BioGRID':
    filtered_up_num_df = filtered_up_biogrid_df.copy()
elif selected_database == 'STRING':
    filtered_up_num_df = filtered_up_string_df.copy()
else:
    # Fail with a clear message instead of a NameError further down
    raise ValueError(f'Unsupported selected_database: {selected_database!r}')

# Database edges connect protein nodes: append '-PROT' to both endpoints
filtered_up_num_df['src'] = filtered_up_num_df['src'].apply(lambda x: x + '-PROT')
filtered_up_num_df['dest'] = filtered_up_num_df['dest'].apply(lambda x: x + '-PROT')

# Translate node labels to node indices
filtered_up_num_df['src'] = filtered_up_num_df['src'].map(sorted_all_gene_name_dict)
filtered_up_num_df['dest'] = filtered_up_num_df['dest'].map(sorted_all_gene_name_dict)
display(filtered_up_num_df)
all_gene_edge_num_df = pd.concat([filtered_up_num_df, gene_meth_num_edge_df, gene_protein_num_edge_df])
display(all_gene_edge_num_df)

# Edge-type labels are laid out in the same order as the concatenation above,
# so they must be attached before the table is re-sorted. The label strings
# are matched literally in section 8.4 and must not be changed here alone.
num_gene_edge = filtered_up_num_df.shape[0]
num_gene_meth_edge = gene_meth_num_edge_df.shape[0]
num_gene_protein_edge = gene_protein_num_edge_df.shape[0]
edgetype_list = ['Gene-PROT-Gene-PROT'] * num_gene_edge + ['Gene-TRAN-Gene-METH'] * num_gene_meth_edge + ['Gene-TRAN-Gene-PROT'] * num_gene_protein_edge
all_gene_edge_num_df['EdgeType'] = edgetype_list
all_gene_edge_num_df = all_gene_edge_num_df.sort_values(by=['src', 'dest']).reset_index(drop=True)
display(all_gene_edge_num_df)
all_gene_edge_num_df.to_csv(os.path.join(graph_output_folder, 'all-gene-edge-num.csv'), index=False)

In [None]:
# Edge table with node labels instead of node indices.
# Only the two endpoint columns hold node indices, so they are translated
# column-wise with `map`; a whole-frame `replace` with a dictionary of this
# size is far slower and also scans the EdgeType column for no benefit.
all_gene_edge_df = all_gene_edge_num_df.copy()
all_gene_edge_df['src'] = all_gene_edge_df['src'].map(sorted_all_gene_dict)
all_gene_edge_df['dest'] = all_gene_edge_df['dest'].map(sorted_all_gene_dict)

# Edge counts per edge family (kept as notebook-level names)
num_gene_edge = filtered_up_num_df.shape[0]
num_gene_meth_edge = gene_meth_edge_df.shape[0]
num_gene_protein_edge = gene_protein_edge_df.shape[0]
# Row order is inherited from all_gene_edge_num_df (sorted by numeric src, dest)
all_gene_edge_df.to_csv(os.path.join(graph_output_folder, 'all-gene-edge.csv'), index=False)
display(all_gene_edge_df)

## 8.Load data into graph format

### 8.1 Form up the input samples

The curated TCGA clinical data resource recommends using the endpoints OS, PFI, DFI, and DSS for each TCGA cancer type:

* OS: overall survival
* PFI: progression-free interval
* DSS: disease-specific survival
* DFI: disease-free interval

In [None]:
survival_filtered

In [None]:
# Keep only the columns needed for the survival label. Copying AFTER the
# column selection gives an independent frame, so the assignment below does
# not trigger pandas' chained-assignment (SettingWithCopy) warning.
survival_filtered_feature_df = survival_filtered[['sample', 'cancer type abbreviation', 'OS', 'vital_status']].copy()
display(survival_filtered_feature_df)

# Missing values per selected column
nan_counts = survival_filtered_feature_df.isna().sum()  # or df.isnull()
print(nan_counts)

# Encode vital status numerically: 'Alive' -> 0.0, 'Dead' -> 1.0
# (any other value, including a missing one, becomes NaN)
survival_filtered_feature_df['vital_status'] = survival_filtered_feature_df['vital_status'].map({'Alive': 0.0, 'Dead': 1.0})
display(survival_filtered_feature_df)
# Row-wise check whether OS agrees with the encoded vital status
survival_filtered_feature_df['OS'] == survival_filtered_feature_df['vital_status']


In [None]:
# Check if each row in Column1 and Column2 have the same value
rows_same = (survival_filtered_feature_df['OS'] == survival_filtered_feature_df['vital_status']).all()
print("All rows have the same value in column 'OS' and column 'vital_status' :", rows_same)

In [None]:
survival_filtered_feature_df = survival_filtered_feature_df[['sample', 'OS', 'cancer type abbreviation']]
display(survival_filtered_feature_df)
survival_filtered_feature_df.to_csv(os.path.join(graph_output_folder, 'survival-label.csv'), index=False)

### 8.2 Randomize the input label

In [None]:
# Randomize the survival label
def input_random(randomized, graph_output_folder, random_state=None):
    """Create or reload the shuffled survival label table and display it.

    Args:
        randomized: If truthy, shuffle the rows of the notebook-level
            ``survival_filtered_feature_df`` and write the result to
            ``random-survival-label.csv`` in ``graph_output_folder``.
            Otherwise read the previously written file.
        graph_output_folder: Folder holding ``random-survival-label.csv``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` (the default) keeps the
            original unseeded behaviour.
    """
    label_path = os.path.join(graph_output_folder, 'random-survival-label.csv')
    if randomized:
        # frac=1 returns every row exactly once, in random order
        random_survival_filtered_feature_df = (
            survival_filtered_feature_df
            .sample(frac=1, random_state=random_state)
            .reset_index(drop=True)
        )
        random_survival_filtered_feature_df.to_csv(label_path, index=False)
    else:
        random_survival_filtered_feature_df = pd.read_csv(label_path)
    display(random_survival_filtered_feature_df)

input_random(randomized=False, graph_output_folder='UCSC-graph-data')

### 8.3 Split the randomized input into 5-fold

In [None]:
# Split deep learning input into training and test
def split_k_fold(k, graph_output_folder):
    """Split the shuffled survival labels into ``k`` consecutive folds.

    Reads ``random-survival-label.csv`` from ``graph_output_folder`` and
    writes fold ``i`` (1-based) to ``split-random-survival-label-<i>.csv`` in
    the same folder. Each of the first ``k - 1`` folds holds
    ``num_rows // k`` rows; the last fold also takes the remainder.

    Args:
        k: Number of folds; must be a positive integer.
        graph_output_folder: Folder containing the input file and receiving
            the fold files.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f'k must be a positive integer, got {k!r}')
    random_survival_filtered_feature_df = pd.read_csv(os.path.join(graph_output_folder, 'random-survival-label.csv'))
    num_points = random_survival_filtered_feature_df.shape[0]
    # Integer division: the nominal fold size without a float round trip
    num_div = num_points // k
    # Fold boundaries; the final boundary is the row count, so the last fold
    # absorbs whatever the integer division left over
    num_div_list = [i * num_div for i in range(k)] + [num_points]
    # Split [random_survival_filtered_feature_df] into [k] folds
    for place_num, (low_idx, high_idx) in enumerate(zip(num_div_list, num_div_list[1:]), start=1):
        print('\n--------TRAIN-TEST SPLIT WITH TEST FROM ' + str(low_idx) + ' TO ' + str(high_idx) + '--------')
        split_input_df = random_survival_filtered_feature_df.iloc[low_idx:high_idx]
        split_input_df.to_csv(os.path.join(graph_output_folder, 'split-random-survival-label-' + str(place_num) + '.csv'), index=False)
        print(split_input_df.shape)

split_k_fold(k=5, graph_output_folder='UCSC-graph-data')

### 8.4 Reprocess the edge_index file after loading

In [None]:
import os
import numpy as np
import pandas as pd

# Load the edge index array saved under <graph_output_folder>/form_data
graph_output_folder = 'UCSC-graph-data'
form_data_path = './' + graph_output_folder + '/form_data'
edge_index = np.load(form_data_path + '/edge_index.npy')
# Convert the 2D array into a DataFrame: the transpose must have exactly two
# columns, i.e. edge_index holds one row of sources and one row of destinations
edge_index_df = pd.DataFrame(edge_index.T, columns=['src', 'dest'])

# Typed edge list written in section 7.3, plus its reversed copy. A reversed
# edge keeps the EdgeType label of the edge it was derived from.
gene_edge_num_df = pd.read_csv(os.path.join(graph_output_folder, 'all-gene-edge-num.csv'))
src_gene_list = list(gene_edge_num_df['src'])
dest_gene_list = list(gene_edge_num_df['dest'])
edgetype_list = list(gene_edge_num_df['EdgeType'])
gene_edge_num_reverse_df = pd.DataFrame({'src': dest_gene_list, 'dest': src_gene_list, 'EdgeType': edgetype_list})
# Both directions together, exact duplicate rows removed, ordered by endpoints
gene_edge_num_all_df = pd.concat([gene_edge_num_df, gene_edge_num_reverse_df]).drop_duplicates().sort_values(by=['src', 'dest']).reset_index(drop=True)

display(edge_index_df)
display(gene_edge_num_all_df)
# Inner join: keep only typed edges whose (src, dest) pair occurs in edge_index.
# NOTE(review): a pair that occurs several times on either side yields several
# output rows - confirm edge_index has no repeated pairs.
merged_gene_edge_num_all_df = pd.merge(gene_edge_num_all_df, edge_index_df, on=['src', 'dest'], how='inner')
display(merged_gene_edge_num_all_df)
merged_gene_edge_num_all_df.to_csv(os.path.join(graph_output_folder, 'merged-gene-edge-num-all.csv'), index=False)

# Same table with node indices translated to node labels. replace() is applied
# to the whole frame; sorted_all_gene_dict (index -> label) comes from section 7.1.
merged_gene_edge_name_all_df = merged_gene_edge_num_all_df.replace(sorted_all_gene_dict)
display(merged_gene_edge_name_all_df)
merged_gene_edge_name_all_df.to_csv(os.path.join(graph_output_folder, 'merged-gene-edge-name-all.csv'), index=False)

In [None]:
# Add a human-readable edge description to the index-based edge table
# (literal, non-regex replacements of the three EdgeType labels), then
# overwrite the CSV written in the previous cell
merged_gene_edge_num_all_df['Textual_edge_type'] = (
    merged_gene_edge_num_all_df['EdgeType']
    .str.replace('Gene-TRAN-Gene-METH', 'promoting transcription', regex=False)
    .str.replace('Gene-TRAN-Gene-PROT', 'transcripting protein', regex=False)
    .str.replace('Gene-PROT-Gene-PROT', 'protein protein interaction', regex=False)
)
display(merged_gene_edge_num_all_df)
merged_gene_edge_num_all_df.to_csv(os.path.join(graph_output_folder, 'merged-gene-edge-num-all.csv'), index=False)

# Same description column for the label-based edge table
merged_gene_edge_name_all_df['Textual_edge_type'] = (
    merged_gene_edge_name_all_df['EdgeType']
    .str.replace('Gene-TRAN-Gene-METH', 'promoting transcription', regex=False)
    .str.replace('Gene-TRAN-Gene-PROT', 'transcripting protein', regex=False)
    .str.replace('Gene-PROT-Gene-PROT', 'protein protein interaction', regex=False)
)
display(merged_gene_edge_name_all_df)
merged_gene_edge_name_all_df.to_csv(os.path.join(graph_output_folder, 'merged-gene-edge-name-all.csv'), index=False)

## 9.Generate the knowledge graph embeddings

In [None]:
display(map_text_all_gene_df)
display(merged_gene_edge_name_all_df)
display(merged_gene_edge_num_all_df)

In [None]:
merged_gene_edge_num_all_df

In [None]:
# Dictionary mapping 'Textual_gene_name' to 'Gene_num'
textual_gene_name_to_num_dict = pd.Series(map_text_all_gene_df['Gene_num'].values, index=map_text_all_gene_df['Textual_gene_name']).to_dict()
print(textual_gene_name_to_num_dict)

# Dictionary mapping each 'Textual_edge_type' to an integer id; ids follow
# the alphabetical order of the edge-type strings
edge_type_list = sorted(set(merged_gene_edge_name_all_df['Textual_edge_type']))
edge_type_to_num_dict = {edge_type: edge_num for edge_num, edge_type in enumerate(edge_type_list)}
print(edge_type_to_num_dict)

triplet_merged_gene_edge_num_all_df = merged_gene_edge_num_all_df.copy()
display(triplet_merged_gene_edge_num_all_df)
# `map` returns a numeric column directly. `replace` from strings to integers
# relies on pandas' implicit downcasting, which is deprecated, and would
# otherwise leave an object-typed column for the triplet array.
triplet_merged_gene_edge_num_all_df['Textual_edge_type'] = triplet_merged_gene_edge_num_all_df['Textual_edge_type'].map(edge_type_to_num_dict)
display(triplet_merged_gene_edge_num_all_df)

In [None]:
# (src, relation, dest) triplets as a plain array, printed for inspection
triplet_array = triplet_merged_gene_edge_num_all_df[['src', 'Textual_edge_type', 'dest']].to_numpy()
print(triplet_array)

# The same edges as a table with the columns 'from', 'to', 'rel'
triplet_df_array = triplet_merged_gene_edge_num_all_df[['src', 'dest', 'Textual_edge_type']].to_numpy()
triplet_df = pd.DataFrame(triplet_df_array, columns=['from', 'to', 'rel'])
display(triplet_df)

In [None]:
from torchkge.data_structures import KnowledgeGraph
import torch

kg = KnowledgeGraph(df=triplet_df)
print(kg)

In [None]:
from torch import cuda
from torch.optim import Adam

from torchkge.models import TransEModel, TransRModel
from torchkge.sampling import BernoulliNegativeSampler
from torchkge.utils import MarginLoss, DataLoader
from torchkge.utils.datasets import load_fb15k

from tqdm.autonotebook import tqdm

# Define some hyper-parameters for training
emb_dim = 100      # entity embedding size (first argument to TransRModel)
rel_emb_dim = 100  # relation embedding size (second argument to TransRModel)
lr = 0.0004        # learning rate passed to Adam
n_epochs = 1000    # number of passes over the dataloader
b_size = 640       # batch size passed to DataLoader
margin = 0.5       # margin passed to MarginLoss

# Define the model and criterion
# The full knowledge graph is used for training; no split is made in this cell.
kg_train = kg

# model = TransEModel(emb_dim, kg_train.n_ent, kg_train.n_rel, dissimilarity_type='L2')
model = TransRModel(emb_dim, rel_emb_dim, kg_train.n_ent, kg_train.n_rel)
criterion = MarginLoss(margin)

# Move everything to CUDA if available
use_cuda = cuda.is_available()
if use_cuda:
    cuda.empty_cache()
    model.cuda()
    criterion.cuda()

# Define the torch optimizer to be used
optimizer = Adam(model.parameters(), lr=lr, weight_decay=1e-5)

sampler = BernoulliNegativeSampler(kg_train)
# Only ask the DataLoader to place the dataset on the GPU when one exists, so the
# batches live on the same device as the model. Without CUDA, fall back to None
# (no CUDA use) so the cell also runs on CPU-only machines.
dataloader = DataLoader(kg_train, batch_size=b_size,
                        use_cuda='all' if use_cuda else None)

iterator = tqdm(range(n_epochs), unit='epoch')
for epoch in iterator:
    running_loss = 0.0
    for batch in dataloader:
        h, t, r = batch[0], batch[1], batch[2]
        # Negative samples: corrupted heads/tails for the positive triplets of this batch
        n_h, n_t = sampler.corrupt_batch(h, t, r)

        optimizer.zero_grad()

        # forward + backward + optimize
        pos, neg = model(h, t, r, n_h, n_t)
        loss = criterion(pos, neg)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    # Show the epoch number (1-based) and the loss averaged over the batches
    iterator.set_description(
        f'Epoch {epoch + 1} | mean loss: {running_loss / len(dataloader):.5f}')

model.normalize_parameters()

In [None]:
# Fetch the embeddings once and keep the first two entries of the returned
# sequence (used below as entity and relation embeddings, in that order).
embeddings = model.get_embeddings()
entity_embeddings = embeddings[0]
relation_embeddings = embeddings[1]
# outputfile name
graph_output_folder = 'UCSC-graph-data'
embed_folder_path = './' + graph_output_folder + '/embeddings'
# Create the folder if it does not exist; exist_ok avoids a separate
# check-then-create step.
os.makedirs(embed_folder_path, exist_ok=True)
# Save the entity embeddings (moved to host memory and converted to a NumPy array)
entity_embeddings = entity_embeddings.cpu().numpy()
print(entity_embeddings.shape)
np.save(os.path.join(embed_folder_path, 'entity_embeddings.npy'), entity_embeddings)
# Save the edge embeddings
relation_embeddings = relation_embeddings.cpu().numpy()
np.save(os.path.join(embed_folder_path, 'relation_embeddings.npy'), relation_embeddings)
print(relation_embeddings.shape)