## Dataset Cleaning
--- 
Gives stats and cleans datasets

In [2]:
import pandas as pd
import json


In [4]:
DGIDB_PATH = "./DGIDB/interactions.tsv"
DGIDB = pd.read_csv(DGIDB_PATH, sep="\t")

HUMANNET_PATH = "./HumanNet/HumanNet-GSP.tsv"
HUMANNET = pd.read_csv(HUMANNET_PATH, sep="\t")

NCBI_PATH = "./ncbi/gene2refseq.gz"
NCBI_INFO = pd.read_csv(NCBI_PATH, sep='\t', compression='gzip')

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Filter for Homo sapiens genes (tax_id = 9606)
human_gene2refseq = NCBI_INFO[NCBI_INFO['#tax_id'] == 9606]

# Create a mapping from gene claim names (Symbol) to NCBI gene IDs (GeneID)
gene_claim_to_id = pd.Series(human_gene2refseq.GeneID.values, index=human_gene2refseq.Symbol).to_dict()
def get_ncbi_gene_id(gene_claim_name):
    return gene_claim_to_id.get(gene_claim_name, None)

gene_claim_name = "HTR2C"
ncbi_gene_id = get_ncbi_gene_id(gene_claim_name)
print(f"NCBI Gene ID for {gene_claim_name}: {ncbi_gene_id}")
id_to_gene_claim = {v: k for k, v in gene_claim_to_id.items()}

NCBI Gene ID for HTR2C: 3358


In [11]:
print(id_to_gene_claim.get("1565"))

None


In [7]:
# Open the JSON file and load its content into a dictionary
with open("../Gen_Hypergraph/output/DGIDB_BIPOLAR/gene_to_index.json", "r") as file:
    dgidb = json.load(file)

In [12]:
converted_dgidb = {
    id_to_gene_claim.get(int(ncbi_id), f"UNKNOWN_{ncbi_id}"): idx
    for ncbi_id, idx in dgidb.items()
}

# Optionally save the new dictionary
with open("gene_claim_to_index.json", "w") as file:
    json.dump(converted_dgidb, file, indent=2)

In [None]:
# Step 3: Map NCBI IDs to claim names
humannet_claim = HUMANNET.copy()
humannet_claim['Gene1'] = HUMANNET['Gene1'].map(id_to_gene_claim)
humannet_claim['Gene2'] = HUMANNET['Gene2'].map(id_to_gene_claim)

# Step 4: Drop rows where mapping failed
humannet_claim.dropna(inplace=True)

# Step 5: Save to new file
humannet_claim.to_csv("humannet_claim.tsv", sep="\t", index=False)

In [None]:
DGIDB['ncbi_gene_id'] = DGIDB['gene_name'].apply(get_ncbi_gene_id)
DGIDB['ncbi_gene_id'] = DGIDB['ncbi_gene_id'].apply(lambda x: str(int(x)) if pd.notnull(x) else None)

# Count rows where 'ncbi_gene_id' is None
none_count = DGIDB['ncbi_gene_id'].isna().sum()
# Print the count
print(f"Number of rows with None in 'ncbi_gene_id': {none_count}")
DGIDB = DGIDB.dropna(subset=['ncbi_gene_id'])
DGIDB.to_csv("./DGIDB/DrugToGene.tsv", sep="\t", index=False)
DGIDB

Number of rows with None in 'ncbi_gene_id': 9755


Unnamed: 0,gene_claim_name,gene_concept_id,gene_name,interaction_source_db_name,interaction_source_db_version,interaction_type,interaction_score,drug_claim_name,drug_concept_id,drug_name,approved,immunotherapy,anti_neoplastic,ncbi_gene_id
0,CYP2D6,hgnc:2625,CYP2D6,DTC,9/2/20,,0.017709,RACLOPRIDE,ncit:C152139,RACLOPRIDE,False,False,False,1565
1,PPARG,hgnc:9236,PPARG,DTC,9/2/20,,0.840123,KALOPANAX-SAPONIN F,chembl:CHEMBL1833984,CHEMBL:CHEMBL1833984,False,False,False,5468
2,ATAD5,hgnc:25752,ATAD5,DTC,9/2/20,,0.177992,RO-5-3335,chembl:CHEMBL91609,CHEMBL:CHEMBL91609,False,False,False,79915
3,RGS4,hgnc:10000,RGS4,DTC,9/2/20,,0.034319,"3,4-DICHLOROISOCOUMARIN",drugbank:DB04459,"3,4-DICHLOROISOCOUMARIN",False,False,False,5999
4,MAPK1,hgnc:6871,MAPK1,DTC,9/2/20,,0.050007,WITHAFERIN A,iuphar.ligand:13097,WITHAFERIN A,False,False,False,5594
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98234,TP53,hgnc:11998,TP53,JAX-CKB,10-Apr-24,,0.014489,PD-0325901,ncit:C52195,MIRDAMETINIB,False,False,True,7157
98235,TP53,hgnc:11998,TP53,JAX-CKB,10-Apr-24,,0.115911,ALRN-6924,ncit:C118669,SULANEMADLIN,False,False,False,7157
98236,TP53,hgnc:11998,TP53,JAX-CKB,10-Apr-24,,0.025758,LY3009120,iuphar.ligand:8943,LY3009120,False,False,False,7157
98237,TP53,hgnc:11998,TP53,JAX-CKB,10-Apr-24,,0.010537,Cabozantinib,rxcui:1363267,CABOZANTINIB S-MALATE,True,False,True,7157


### MSigDB cleaning

In [4]:
MSIGDB_PATH = "./MSigDB/c2.all.v2024.1.Hs.jsonl"
with open(MSIGDB_PATH, 'r') as file:
    MSIGDB = json.load(file)

In [None]:
pathway_to_genes = {pathway: data["geneSymbols"] for pathway, data in MSIGDB.items()}
MSIGDB_df = pd.DataFrame(list(pathway_to_genes.items()), columns=['pathway', 'gene_names'])
MSIGDB_df['ncbi_gene_ids'] = MSIGDB_df['gene_names'].apply(lambda genes: [get_ncbi_gene_id(gene) for gene in genes if get_ncbi_gene_id(gene) is not None])
MSIGDB_df.to_csv("./MSigDB/PathwayToGene.tsv", sep="\t", index=False)


NameError: name 'get_ncbi_gene_id' is not defined

In [2]:
import pandas as pd
MSIGDB_df = pd.read_csv("./MSigDB/PathwayToGene.tsv", sep="\t")

In [4]:
import ast

# Flatten the gene lists and extract unique gene names
unique_genes = set()
for gene_list in MSIGDB_df['gene_names']:
    genes = ast.literal_eval(gene_list)  # Safely parse the string list
    unique_genes.update(genes)

# Sort the unique gene names
sorted_genes = sorted(unique_genes)

# Write to a text file
with open("unique_genes.txt", "w") as f:
    for gene in sorted_genes:
        f.write(f"{gene}\n")

### DDDB Cleaning

In [2]:
import pandas as pd
DDDB = pd.read_csv("../Data/DDDB/NIHMS851432-supplement-1.csv")
NDFRT = pd.read_csv("../Data/NDF-RT/NDF-RT.csv.gz", compression='gzip')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
print(NDFRT.columns)

Index(['Class ID', 'Preferred Label', 'Synonyms', 'Definitions', 'Obsolete',
       'CUI', 'Semantic Types', 'Parents', 'CI_ChemClass', 'CI_MoA',
       ...
       'UMLS_CUI', 'UMLS_CUI.1', 'Units', 'Units.1',
       'VA_National_Formulary_Name', 'VA_National_Formulary_Name.1',
       'VANDF_Record', 'VANDF_Record.1', 'VUID', 'VUID.1'],
      dtype='object', length=116)


In [4]:
def get_ndfrt_name(ndfrt_code):
    match_row = NDFRT[NDFRT['Class ID'].str.endswith(ndfrt_code)]
    if not match_row.empty:
        return match_row.iloc[0]['Preferred Label']
    return "Unknown Drug"

# Try again
name = get_ndfrt_name("N0000004713")
print(name)


donepezil


In [5]:
DDDB['ndfrt_preferred_label'] = DDDB['NDF-RT'].apply(get_ndfrt_name)
print(DDDB)

           NDF-RT     SNOMED ndfrt_preferred_label
0     N0000004713   26929004             donepezil
1     N0000004713   56267009             donepezil
2     N0000004713   80098002             donepezil
3     N0000004713  386806002             donepezil
4     N0000004713  425390006             donepezil
...           ...        ...                   ...
3534  N0000146103   19905009          TRIMETHOPRIM
3535  N0000146103  423561003          TRIMETHOPRIM
3536  N0000146103  429271009          TRIMETHOPRIM
3537  N0000146103  192701001          TRIMETHOPRIM
3538  N0000148372  235049008            TACROLIMUS

[3539 rows x 3 columns]


In [6]:
num_nones = DDDB['ndfrt_preferred_label'].isna().sum()

# Print the result
print(f"Number of None values in 'ndfrt_preferred_label': {num_nones}")

Number of None values in 'ndfrt_preferred_label': 0


In [7]:
SNOMED = pd.read_csv("./SNOMED/sct2_Description_Full-en_US1000124_20250301.tsv", sep="\t")

In [8]:
def get_snomed_term(concept_id):
    match_row = SNOMED[SNOMED['conceptId'].astype(str) == str(concept_id)]
    if not match_row.empty:
        return match_row.iloc[0]['term']
    return "Unknown Concept"
# Try again
name = get_snomed_term("13746004")
print(name)

Bipolar disorder


In [9]:
from tqdm import tqdm
tqdm.pandas()

DDDB['snomed_disease'] = DDDB['SNOMED'].progress_apply(get_snomed_term)
DDDB

100%|██████████| 3539/3539 [1:22:31<00:00,  1.40s/it]


Unnamed: 0,NDF-RT,SNOMED,ndfrt_preferred_label,snomed_disease
0,N0000004713,26929004,donepezil,Alzheimer's disease
1,N0000004713,56267009,donepezil,Multi-infarct dementia
2,N0000004713,80098002,donepezil,Diffuse Lewy body disease
3,N0000004713,386806002,donepezil,Impaired cognition (finding)
4,N0000004713,425390006,donepezil,Dementia associated with Parkinson's Disease (...
...,...,...,...,...
3534,N0000146103,19905009,TRIMETHOPRIM,Chronic prostatitis
3535,N0000146103,423561003,TRIMETHOPRIM,Community-acquired methicillin resistant Staph...
3536,N0000146103,429271009,TRIMETHOPRIM,Ventilator-acquired pneumonia (disorder)
3537,N0000146103,192701001,TRIMETHOPRIM,Toxoplasma encephalitis


In [10]:
DDDB.to_csv("./DDDB/DrugToDisease.tsv", sep="\t", index=False)

In [12]:
print(len(DDDB["snomed_disease"].unique()))

1321


In [None]:
# Group by both SNOMED code and disease name to preserve both
grouped = DDDB.groupby(["SNOMED", "snomed_disease"])["ndfrt_preferred_label"].apply(lambda x: x.dropna().unique())

# Initialize a counter
count = 0

# Filter and print those with more than 8 unique drug labels
for (snomed_code, disease_name), drugs in grouped.items():
    if len(drugs) > 8:
        count += 1
        print(f"SNOMED Code: {snomed_code}")
        print(f"Disease Name: {disease_name}")
        print(f"Number of Drugs: {len(drugs)}")
        print("Drugs:", ", ".join(drugs))
        print("-" * 50)

# Print the total count
print(f"Total SNOMED diseases with >8 unique drugs: {count}")

Sep

In [1]:
def write_genes_not_in_exclude(all_genes_path, exclude_genes_path, output_path):
    # Read all genes
    with open(all_genes_path, "r") as f:
        all_genes = set(line.strip() for line in f if line.strip())

    # Read genes to exclude
    with open(exclude_genes_path, "r") as f:
        exclude_genes = set(line.strip() for line in f if line.strip())

    # Compute difference
    remaining_genes = sorted(all_genes - exclude_genes)

    # Write to output
    with open(output_path, "w") as f:
        for gene in remaining_genes:
            f.write(f"{gene}\n")

    print(f"Wrote {len(remaining_genes)} genes to {output_path}")


write_genes_not_in_exclude(
    "../Data/low_conf_genes.txt",
    "../Data/GNN/BipolarOTGenes.txt",
    "../Data/GNN/BipolarFalseGenes.txt"
)

Wrote 8637 genes to ../Data/GNN/BipolarFalseGenes.txt


In [2]:
import pandas as pd
HUMANNET_PATH = "./HumanNet/HumanNet-GSP.tsv"
HUMANNET = pd.read_csv(HUMANNET_PATH, sep="\t")

In [6]:
unique_genes = set(HUMANNET["Gene1"].unique()) | set(HUMANNET["Gene2"].unique())
num_unique_genes = len(unique_genes)
print(num_unique_genes)

8779
