In [6]:
# Make the PYTHONPATH the root directory of the project on the notebook
import os
import sys

os.chdir("..")
sys.path.append("..")

In [24]:
import pandas as pd

drug_combs = pd.read_csv(
    "./data/drug_combs.csv", usecols=["ID", "Drug1", "Drug2", "Cell line", "ZIP", "Bliss", "Loewe", "HSA", "classification"]
)
cell_lines = pd.read_csv("./data/cell_Line.csv")
drugs = pd.read_csv("./data/drug_chemical_info.csv", na_values=["#N/A", "none", "None", 0])

In [21]:
# 1. Verify Drugs
# Combine Drug1 and Drug2 into a single unique set of drugs used in combinations
used_drugs = set(drug_combs['Drug1']).union(set(drug_combs['Drug2']))
reference_drugs = set(drugs['drugName'])

# Find missing drugs (present in combs but NOT in reference list)
missing_drugs = used_drugs - reference_drugs

if len(missing_drugs) == 0:
    print("✅ All drugs in 'drug_combs' exist in the 'drugs' table.")
else:
    print(f"❌ Found {len(missing_drugs)} drugs missing from reference table:")
    print(missing_drugs)

print("-" * 30)

# 2. Verify Cell Lines
used_cells = set(drug_combs['Cell line'])
reference_cells = set(cell_lines['cellName'])

# Find missing cell lines
missing_cells = used_cells - reference_cells

if len(missing_cells) == 0:
    print("✅ All cell lines in 'drug_combs' exist in the 'cell_lines' table.")
else:
    print(f"❌ Found {len(missing_cells)} cell lines missing from reference table:")
    print(missing_cells)

❌ Found 2433 drugs missing from reference table:
{'HALOPERIDOL', 'DIHYDROARTEMISININ (DHA)', 'DRONEDARONE HCL', 'PROCHLORPERAZINE DIMALEATE', 'CEPHALOMANNINE', 'AEE788 (NVP-AEE788)', 'GSK 2250665A', 'NSC 405020', 'LY 2365109 HYDROCHLORIDE', 'LAMIVUDINE', '717906-29-1', 'CCT137690', 'TAK-632', 'METHAZOLAMIDE', 'CHEMBL3348930', 'SELUMETINIB (AZD6244)', '1169562-71-3', 'SIROLIMUS', 'ALISERTIB (MLN8237)', 'ACADESINE', 'AGN-PC-0MU5N5', 'VLX1570', 'SIRTINOL', 'ARSENIC TRIOXIDE', 'TCID', 'ZM 323881 HCL', 'METHYLTHIOURACIL', 'NCGC00162383-02', 'CORTISONE ACETATE', 'LOTEPREDNOL ETABONATE', 'HG6-64-1', '303727-31-3', 'NCGC00263108-01', 'CHEMBL1824695', 'ALFUZOSIN HCL', 'SERATRODAST', 'CHEMBL256963', 'MITOXANTRONE', 'CYROMAZINE', 'KRP-297', '8-AZAGUANINE', 'GDC-0941', 'KPT-8602', 'dimethyl but-2-enedioate', 'ZINC17545571', 'NVP-BSK805 2HCL', 'DESVENLAFAXINE', '(-)-MK 801 MALEATE', 'RIMONABANT', 'PAROXETINE HCL', '431979-47-4', 'NCGC00346563-01', 'FOSTAMATINIB (R788)', '3-methyladenine', 'SODIUM A

In [25]:
cell_lines = cell_lines.dropna(subset=['cellName', 'cosmicId'])
drugs = drugs.dropna(subset=['drugName', 'cIds'])

In [None]:
import sqlite3


drug_combs.columns = ["id", "drug1", "drug2", "cell_line", "zip", "bliss", "loewe", "hsa", "classification"]

# Add a status column to track whether the combination has been processed
drug_combs["status"] = "pending"
    
with sqlite3.connect("./data/drugcombs.sqlite") as conn:
    drug_combs.to_sql("drug_combinations", conn, if_exists="replace", index=False)
    cell_lines.to_sql("cell_lines", conn, if_exists="replace", index=False)
    drugs.to_sql("drugs", conn, if_exists="replace", index=False)
