In [14]:
import pandas as pd
import json
import os
import gzip
import numpy as np

In [15]:
# Switch the working directory to the BioMart data folder and echo it,
# so the relative script path used in the next cell resolves from there.
# NOTE(review): this is an absolute, machine-specific Windows path — adjust
# (or derive it from a configurable base directory) before running elsewhere.
os.chdir(r'C:\RAG_LUMC\Data\biomart')
print(os.getcwd())

C:\RAG_LUMC\Data\biomart


In [None]:
%%bash
# Run reorder_mart.sh, resolved relative to the current working directory.
# NOTE(review): the script is not visible here; presumably it produces the
# reordered_*.txt.gz table read by a later cell — confirm.
bash reorder_mart.sh

In [16]:
# Move two directory levels up from the current working directory; the
# './Data/...' paths used by the following cells are relative to that location.
os.chdir('../..')

In [17]:
# Echo the working directory that all relative paths below resolve against.
print(os.getcwd())

C:\RAG_LUMC


In [18]:
# Presumably a switch for the synonym-expansion step.
# NOTE(review): no visible cell reads this flag — confirm it is still needed.
Add_Synonyms = True

In [19]:

# GMT file whose gene IDs are to be converted, and where the converted copy goes.
# The commented-out lines are the Homo sapiens equivalents of the active paths.
input_file = './Data/biomart/to_be_converted/wikipathways-20241210-gmt-Rattus_norvegicus.gmt'
#input_file = './Data/biomart/to_be_converted/wikipathways-20240910-gmt-Homo_sapiens.gmt'
#output_file = './Data/biomart/to_be_converted/converted_wikipathways-20240910-gmt-Homo_sapiens.gmt'
output_file = './Data/biomart/to_be_converted/converted_wikipathways-20241210-gmt-Rattus_norvegicus.gmt'

json_file = './Data/JSON/ncbi_id_to_symbol.json'

# Load the gene ID to symbol dictionary from JSON.
# The encoding is given explicitly (as for the other JSON files in this notebook)
# so the read does not depend on the platform's default code page.
with open(json_file, 'r', encoding='utf-8') as f:
    gene_dict = json.load(f)

def replace_gene_ids(gene_id):
    """Return the symbol ``gene_dict`` maps ``gene_id`` to, or ``gene_id`` itself when unmapped."""
    return gene_dict.get(gene_id, gene_id)


def convert_gmt_gene_ids(input_path, output_path):
    """Copy a GMT file, translating every gene ID through ``replace_gene_ids``.

    A GMT line is ``<name> TAB <url/description> TAB <gene> TAB <gene> ...`` with a
    different number of genes on every line.  The file is therefore handled line
    by line instead of being forced into a fixed-width table: there is no upper
    limit on the number of genes per pathway, the first two fields are copied
    verbatim (no CSV quoting is applied), and no trailing empty fields are
    written to pad short rows.

    Parameters
    ----------
    input_path : str
        Path of the tab-separated GMT file to read.
    output_path : str
        Path of the converted GMT file to write (overwritten if present).

    Returns
    -------
    int
        Number of pathway lines written.
    """
    written = 0
    # newline='\n' keeps Unix line endings in the output on every platform.
    with open(input_path, 'r', encoding='utf-8') as gmt_in, \
            open(output_path, 'w', encoding='utf-8', newline='\n') as gmt_out:
        for raw_line in gmt_in:
            fields = raw_line.rstrip('\r\n').split('\t')
            if not any(field.strip() for field in fields):
                continue  # blank line: nothing to convert

            header = fields[0]
            url = fields[1] if len(fields) > 1 else ''

            symbols = []
            for gene_id in fields[2:]:
                gene_id = gene_id.strip()
                if not gene_id:
                    continue  # empty gene slot: drop it rather than emit an empty field
                symbol = replace_gene_ids(gene_id)
                if symbol is None or symbol == '':
                    continue  # ID mapped to nothing usable
                symbols.append(str(symbol))

            gmt_out.write('\t'.join([header, url] + symbols) + '\n')
            written += 1
    return written


convert_gmt_gene_ids(input_file, output_file)

print(f'File conversion completed! Output saved to {output_file}')


File conversion completed! Output saved to ./Data/biomart/to_be_converted/converted_wikipathways-20241210-gmt-Rattus_norvegicus.gmt


In [20]:
# Paths for the gene-consolidation cell that follows: gzip-compressed input table,
# JSON output, and gzip-compressed text output.
# Note: `input_file` and `output_file` are rebound here, replacing the GMT paths
# assigned in the earlier conversion cell.
input_file = './Data/biomart/to_be_converted/reordered_rat_data.txt.gz'  
output_json = './Data/JSON/genes.json' 
output_file = './Data/biomart/rat_genes_consolidated.txt.gz' #remove rat

In [21]:
def ensure_dir(file_path):
    """Create the parent directory of ``file_path`` if it is not there yet.

    Nothing happens when ``file_path`` has no directory component or when that
    directory already exists.  Otherwise the directory (including any missing
    intermediate levels) is created and a confirmation line is printed.
    """
    parent = os.path.dirname(file_path)
    if not parent:
        # Bare file name: it lives in the current working directory.
        return
    if os.path.exists(parent):
        return
    os.makedirs(parent)
    print(f"Created directory: {parent}")

# Both output locations must exist before anything is written to them.
ensure_dir(output_json)
ensure_dir(output_file)

# Load the gzip-compressed gene table.
df = pd.read_csv(input_file, compression='gzip')
print(f"Successfully read input file: {input_file}")

# Normalise the two key columns in one step: missing synonyms become empty
# strings and are trimmed; stable IDs are trimmed and upper-cased.
df = df.assign(**{
    'Gene Synonym': lambda frame: frame['Gene Synonym'].fillna('').astype(str).str.strip(),
    'Gene stable ID': lambda frame: frame['Gene stable ID'].str.strip().str.upper(),
})

# Count of distinct (non-missing) stable IDs.
unique_gene_ids = df['Gene stable ID'].nunique()
print(f"Number of unique 'Gene stable ID's: {unique_gene_ids}")
# Collapse all rows that share a 'Gene stable ID' into one record: the first
# non-missing value is kept for the scalar columns, and the non-empty synonyms
# are gathered into a sorted list of unique values.
grouped = (
    df.groupby('Gene stable ID')
    .agg({
        'Gene name': 'first',
        'Gene description': 'first',
        'Gene Synonym': lambda values: sorted(s for s in values.unique() if s),
        'NCBI gene (formerly Entrezgene) description': 'first',
        'NCBI gene (formerly Entrezgene) ID': 'first',
    })
    .reset_index()
    .rename(columns={'Gene Synonym': 'Gene Synonyms'})
)

# Render each synonym list as the text "[a,b,c]"; an empty list becomes "[]".
grouped['Gene Synonyms'] = grouped['Gene Synonyms'].map(
    lambda names: '[' + ','.join(names) + ']'
)

# 1. Use the NCBI description only as a fallback: it fills in rows whose
#    'Gene description' is missing; existing descriptions are left untouched.
grouped['Gene description'] = grouped['Gene description'].fillna(
    grouped['NCBI gene (formerly Entrezgene) description']
)

# Remove every bracketed "[...]" fragment from the description, then trim.
grouped['Gene description'] = (
    grouped['Gene description']
    .str.replace(r'\[.*?\]', '', regex=True)
    .str.strip()
)

# The NCBI description column has served its purpose; drop it and store the
# NCBI ID as a nullable integer.
grouped = (
    grouped
    .drop(columns=['NCBI gene (formerly Entrezgene) description'])
    .astype({'NCBI gene (formerly Entrezgene) ID': 'Int64'})
)

# Sanity check: grouping must yield exactly one row per distinct stable ID.
consolidated_entries = len(grouped)
print(f"Number of consolidated entries: {consolidated_entries}")

if consolidated_entries == unique_gene_ids:
    print("Success: The number of consolidated entries matches the number of unique 'Gene stable ID's.")
else:
    print("Warning: The number of consolidated entries does not match the number of unique 'Gene stable ID's.")
    print(f"Unique 'Gene stable ID's: {unique_gene_ids}, Consolidated entries: {consolidated_entries}")

# Convert the table to a list of per-gene dicts.  Missing values (float NaN,
# None, pd.NA) are normalised to None so they are written as JSON `null`:
# json.dump would otherwise emit the non-standard token `NaN` for float NaN and
# raise TypeError for pd.NA, which the nullable-integer ID column can contain.
genes_list = [
    {column: (None if pd.isna(value) else value) for column, value in record.items()}
    for record in grouped.to_dict(orient='records')
]

with open(output_json, 'w', encoding='utf-8') as f_json:
    # allow_nan=False guarantees the file contains only standard JSON.
    json.dump(genes_list, f_json, indent=4, allow_nan=False)
print(f"Consolidated JSON data has been saved to '{output_json}'.")

# Columns (and their order) written to the consolidated text file.
# The commented-out entries are deliberately excluded from this export.
ordered_columns = [
    'Gene stable ID',
    'Gene name',
    'Gene description',
    #'Gene Synonyms',
    #'NCBI gene (formerly Entrezgene) ID'
]

# Fail loudly if an expected column is absent.  An exception is raised instead
# of calling exit(): inside a notebook exit() does not reliably stop the cell,
# so execution would continue into the column selection below and fail there
# with a less helpful error.
missing_columns = set(ordered_columns) - set(grouped.columns)
if missing_columns:
    raise KeyError(f"Missing columns in the DataFrame: {sorted(missing_columns)}")

grouped_ordered = grouped[ordered_columns]

# Comma-separated, gzip-compressed, without the index column.
grouped_ordered.to_csv(output_file, index=False, sep=',', compression='gzip')
print(f"Consolidated TXT.GZ data has been saved to '{output_file}'.")


Successfully read input file: ./Data/biomart/to_be_converted/reordered_rat_data.txt.gz
Number of unique 'Gene stable ID's: 30562
Number of consolidated entries: 30562
Success: The number of consolidated entries matches the number of unique 'Gene stable ID's.
Consolidated JSON data has been saved to './Data/JSON/genes.json'.
Consolidated TXT.GZ data has been saved to './Data/biomart/rat_genes_consolidated.txt.gz'.


In [22]:
# Paths for the synonym-expansion step: the consolidated gene JSON, the converted
# GMT file to read, and the gzip-compressed GMT file to write.
# The commented-out lines are the Homo sapiens equivalents of the active paths.
genes_json_path = './Data/JSON/genes.json'  
#input_gmt_path = './Data/biomart/to_be_converted/converted_wikipathways-20240910-gmt-Homo_sapiens.gmt'
input_gmt_path = './Data/biomart/to_be_converted/converted_wikipathways-20241210-gmt-Rattus_norvegicus.gmt'
#output_gmt_path = './Data/biomart/wikipathways_synonyms_Homo_sapiens.gmt.gz' 
output_gmt_path = './Data/biomart/wikipathways_synonyms_Rattus_norvegicus.gmt.gz'

In [23]:
ensure_dir(output_gmt_path)

# Load genes data
with open(genes_json_path, 'r', encoding='utf-8') as f_json:
    genes_data = json.load(f_json)

# Create gene to synonyms mapping: gene name -> list of synonym strings.
# NOTE(review): when several entries share a gene name, the last entry's
# synonyms replace the earlier ones — confirm this is intended.
gene_to_synonyms = {}
for entry in genes_data:
    gene_name = entry.get('Gene name')
    # A missing name is loaded as None (JSON null) or as a float NaN (the `NaN`
    # token); neither can be stripped or used as a lookup key, so skip both.
    if not isinstance(gene_name, str):
        continue
    gene_name = gene_name.strip()

    synonyms_str = entry.get('Gene Synonyms')
    if not isinstance(synonyms_str, str):
        synonyms_str = ''
    synonyms_str = synonyms_str.strip()
    if synonyms_str.startswith('[') and synonyms_str.endswith(']'):
        synonyms_str = synonyms_str[1:-1]

    # The synonyms are stored as "[a,b,c]", i.e. comma-separated, so split on
    # ','.  (Splitting on '_' left the whole list as a single "synonym" and
    # broke apart any synonym that itself contains an underscore.)
    synonyms = [syn.strip() for syn in synonyms_str.split(',') if syn.strip()]

    gene_to_synonyms[gene_name] = synonyms

print(f"Loaded {len(gene_to_synonyms)} genes with synonyms.")

# Process GMT file and save as .gmt.gz
def process_gmt(input_path, output_path, gene_synonyms_map):
    """Rewrite a GMT file with shortened pathway fields and synonym-expanded genes.

    Each non-blank input line is split on tabs into a pathway name, a pathway
    URL and the gene entries.  Lines with fewer than three fields are reported
    and skipped.  For every kept line:

    * the pathway name is cut at the first '%' and trimmed;
    * the WikiPathways instance prefix is removed from the URL when the URL
      starts with it;
    * every non-empty gene becomes "[gene]" or "[gene, syn1, syn2, ...]" using
      ``gene_synonyms_map``;
    * repeated gene entries are dropped, keeping the first occurrence.

    Parameters
    ----------
    input_path : str
        Path of the tab-separated GMT text file to read (UTF-8).
    output_path : str
        Path of the gzip-compressed GMT file to write (UTF-8 text).
    gene_synonyms_map : dict
        Maps a gene name to a list of synonym strings.

    Returns
    -------
    None
    """
    base_url = "https://www.wikipathways.org/instance/"

    def _bracketed(symbol):
        # "[gene]" when no synonyms are known, otherwise "[gene, s1, s2, ...]".
        extra = gene_synonyms_map.get(symbol, [])
        if not extra:
            return f"[{symbol}]"
        return f"[{symbol}, " + ", ".join(extra) + "]"

    with open(input_path, 'r', encoding='utf-8') as infile, gzip.open(output_path, 'wt', encoding='utf-8') as outfile:
        for line_number, raw in enumerate(infile, 1):
            stripped = raw.strip()
            if stripped == '':
                continue

            fields = stripped.split('\t')
            if len(fields) < 3:
                print(f"Warning: Line {line_number} in GMT file does not have enough columns. Skipping.")
                continue

            name_field, url_field = fields[0], fields[1]

            # Everything before the first '%' (the whole field when there is none).
            pathway_name = name_field.partition('%')[0].strip()

            # Only URLs that begin with the instance prefix are shortened.
            shortened = url_field.replace(base_url, '') if url_field.startswith(base_url) else url_field
            pathway_url = shortened.strip()

            trimmed = (token.strip() for token in fields[2:])
            labelled = [_bracketed(token) for token in trimmed if token]

            # dict.fromkeys keeps the first occurrence of each entry, in order.
            distinct = list(dict.fromkeys(labelled))

            outfile.write('\t'.join([pathway_name, pathway_url, *distinct]) + '\n')

            # Progress message every 1000th input line that was written out.
            if line_number % 1000 == 0:
                print(f"Processed {line_number} lines.")
    print(f"Finished processing GMT file. Output saved to '{output_path}'.")

# Expand the converted GMT file with the loaded synonyms and write the gzipped result.
process_gmt(input_gmt_path, output_gmt_path, gene_to_synonyms)

Loaded 25598 genes with synonyms.
Finished processing GMT file. Output saved to './Data/biomart/wikipathways_synonyms_Rattus_norvegicus.gmt.gz'.
