In [1]:
import os
import shutil
import zipfile
import subprocess
import json

# --- Paths and corpus catalogue ---------------------------------------------
corpus_base_path = "duke_corpus"
dataset_path = "dataset/data"

# Display name of each selectable corpus -> its sub-folder under corpus_base_path.
corpus_mapping = {
    "Duke - All Trials": "all",
    "Duke - Oncology Trials": "onco",
    "Duke - Copd Trials": "copd",
}

# --- Run inputs -------------------------------------------------------------
corpus = "Duke - Copd Trials"
patient_id = "kp"
k = 20  # Number of results

# Free-text patient note. The line breaks and the space before each of them are
# spelled out explicitly so the exact text survives editors that strip trailing
# whitespace.
queries = (
    "CC: increased SOB. HPI: Pt is a 53 yo man with a 50 pack-year history of smoking, \n"
    "and severe COPD who returns to the clinic for follow up after a 4 day hospitalization for a COPD exacerbation. \n"
    "Currently he takes Spiriva 1 puff qd, salmeterol 1 puff q12, fluticasone 2 puffs q 4-6 hours,, \n"
    "but he remains symptomatic and not fully controlled on his current medications. \n"
    "LFTs: Normal Albumiin: 4mg/dl Total bilirubin: 1mg/dl INR: Normal"
)

# --- Derived locations ------------------------------------------------------
# An unknown corpus name raises KeyError here.
selected_corpus_folder = corpus_mapping[corpus]
corpus_file_path = os.path.join(
    corpus_base_path,
    selected_corpus_folder,
    "corpus.jsonl",
)

# Create the working folder up front; a pre-existing folder is not an error.
os.makedirs(dataset_path, exist_ok=True)


In [2]:
# Copy the selected corpus into the dataset folder under the fixed name
# corpus.jsonl; stop immediately if the source corpus is missing.
corpus_destination = os.path.join(dataset_path, "corpus.jsonl")

if os.path.exists(corpus_file_path):
    shutil.copy(corpus_file_path, corpus_destination)
else:
    raise FileNotFoundError(f"Corpus file not found at {corpus_file_path}")

print(f"Copied {corpus_file_path} → {dataset_path}/corpus.jsonl")

Copied duke_corpus/copd/corpus.jsonl → dataset/data/corpus.jsonl


In [3]:
def generate_jsonl_file(patient_id, queries, output_dir=None):
    """Write one patient query to ``queries.jsonl`` and return the file path.

    The file holds a single JSON object, ``{"_id": patient_id, "text": queries}``,
    followed by a newline. Any existing ``queries.jsonl`` in the target folder
    is overwritten.

    Args:
        patient_id: Identifier stored under the ``"_id"`` key.
        queries: Free-text patient note stored under the ``"text"`` key.
        output_dir: Folder to write into. When ``None`` (the default) the
            notebook-level ``dataset_path`` is used, which is the original
            behaviour.

    Returns:
        The path of the written ``queries.jsonl`` file.
    """
    # Resolve the folder at call time so the global is only required when the
    # caller does not supply an explicit location.
    target_dir = dataset_path if output_dir is None else output_dir
    # Do not rely on another cell having created the folder already.
    os.makedirs(target_dir, exist_ok=True)

    query_data = {"_id": patient_id, "text": queries}
    jsonl_file = os.path.join(target_dir, "queries.jsonl")

    # Pin the encoding so the file is identical regardless of platform locale.
    with open(jsonl_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(query_data) + "\n")

    return jsonl_file

# Persist the patient note as queries.jsonl and report where it was written.
queries_file = generate_jsonl_file(patient_id=patient_id, queries=queries)
print(f"Saved queries to {queries_file}")

Saved queries to dataset/data/queries.jsonl


In [4]:
# Keyword generation step: run the script as a child process and keep its
# stdout/stderr as text so the outcome can be reported below.
command = ["python3", "trialgpt_retrieval/keyword_generation.py"]
result = subprocess.run(
    command,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)

# Exit status 0 is treated as success; anything else prints the captured stderr.
if result.returncode == 0:
    print("Keyword generation successful:", result.stdout)
else:
    print("Error in keyword generation:", result.stderr)

Keyword generation successful: 


In [5]:
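# NOTE: blocking subprocess.run variant of the hybrid fusion retrieval step,
# kept commented out for reference only; the cell that follows runs the same
# command with subprocess.Popen and streams its output.
# command = [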
#     "python3", "trialgpt_retrieval/hybrid_fusion_retrieval.py", 
#     "data", "gpt-4o", str(k), "1", "1"
# ]

# result = subprocess.run(command, capture_output=True, text=True)

# if result.returncode != 0:
#     print("Error in hybrid fusion retrieval:", result.stderr)
# else:
#     print("Hybrid fusion retrieval successful:", result.stdout)


Error in hybrid fusion retrieval: Traceback (most recent call last):
  File "/home/jmjl/HealthUniverse/apps/HealthUniverse_triapgpt_duke_lite/trialgpt_retrieval/hybrid_fusion_retrieval.py", line 182, in <module>
    print(process_queries(corpus, q_type, k, bm25_wt, medcpt_wt))
  File "/home/jmjl/HealthUniverse/apps/HealthUniverse_triapgpt_duke_lite/trialgpt_retrieval/hybrid_fusion_retrieval.py", line 144, in process_queries
    medcpt, medcpt_nctids = get_medcpt_corpus_index(corpus)
ValueError: too many values to unpack (expected 2)



In [13]:
# Hybrid fusion retrieval step. The positional arguments are passed through to
# the script unchanged: "data", "gpt-4o", the result count k, and two "1"s.
# NOTE(review): the trailing "1", "1" are presumably the BM25 and MedCPT
# weights -- confirm against hybrid_fusion_retrieval.py.
command = [
    "python3", "trialgpt_retrieval/hybrid_fusion_retrieval.py",
    "data", "gpt-4o", str(k), "1", "1"
]

# stderr is merged into stdout so that a single reader drains everything the
# child writes. Reading only stdout while stderr sits in its own unread pipe
# can block the child (and therefore this cell) once that pipe's buffer fills.
streamed_line_count = 0
with subprocess.Popen(
    command,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    bufsize=1,  # line-buffered, so lines are shown as soon as they arrive
) as process:
    # Print output in real-time
    for line in process.stdout:
        print(line, end="")
        streamed_line_count += 1

# Leaving the `with` block closes the pipe and waits for the child, so
# process.returncode is populated here. The output has already been shown
# above, so only the outcome is reported instead of re-printing a stream that
# was consumed by the loop.
if process.returncode != 0:
    print(
        "Error in hybrid fusion retrieval:",
        f"exit code {process.returncode} (see streamed output above)",
    )
else:
    print(
        "Hybrid fusion retrieval successful:",
        f"{streamed_line_count} line(s) of output streamed above",
    )


Optimized query processing complete.
Hybrid fusion retrieval successful: 


In [14]:
# Trial retrieval step: the script receives the path of the
# qid2nctids_results.json file as its only argument.
command = [
    "python3",
    "trialgpt_retrieval/retrieval.py",
    "dataset/data/qid2nctids_results.json",
]

# Capture both streams as text so they can be echoed once the script exits.
result = subprocess.run(
    command,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)

# Exit status 0 is treated as success; anything else prints the captured stderr.
if result.returncode == 0:
    print("Trial retrieval successful:", result.stdout)
else:
    print("Error in trial retrieval:", result.stderr)

Trial retrieval successful: kp
Results saved to dataset/data/retrieved_trials.json



In [23]:
# Matching step: run_matching.py is invoked with the arguments "data" and "gpt-4o".
command = [
    "python3",
    "trialgpt_matching/run_matching.py",
    "data",
    "gpt-4o",
]

# Capture both streams as text so they can be echoed once the script exits.
result = subprocess.run(
    command,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)

# Exit status 0 is treated as success; anything else prints the captured stderr.
if result.returncode == 0:
    print("Matching successful:", result.stdout)
else:
    print("Error in matching:", result.stderr)

Matching successful: 


In [24]:
# Trial-info step: generate_trial_info.py is invoked with the single argument "data".
command = [
    "python3",
    "trialgpt_matching/generate_trial_info.py",
    "data",
]

# Capture both streams as text so they can be echoed once the script exits.
result = subprocess.run(
    command,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)

# Exit status 0 is treated as success; anything else prints the captured stderr.
if result.returncode == 0:
    print("Trial info generation successful:", result.stdout)
else:
    print("Error in generating trial info:", result.stderr)


Trial info generation successful: 


In [25]:
# Aggregation step: run_aggregation.py is invoked with "data", "gpt-4o" and the
# path of the matching results file.
command = [
    "python3",
    "trialgpt_ranking/run_aggregation.py",
    "data",
    "gpt-4o",
    "dataset/data/matching_results.json",
]

# Capture both streams as text so they can be echoed once the script exits.
result = subprocess.run(
    command,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)

# Exit status 0 is treated as success; anything else prints the captured stderr.
if result.returncode == 0:
    print("Aggregation successful:", result.stdout)
else:
    print("Error in aggregation:", result.stderr)


Aggregation successful: 


In [26]:
# Ranking step: rank_results.py is given the matching results file followed by
# the aggregation results file.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError for
# 'dataset/data/aggregation_results.json' -- confirm which file name the
# aggregation step actually writes before relying on this path.
command = [
    "python3",
    "trialgpt_ranking/rank_results.py",
    "dataset/data/matching_results.json",
    "dataset/data/aggregation_results.json",
]

# Capture both streams as text so they can be echoed once the script exits.
result = subprocess.run(
    command,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)

# Exit status 0 is treated as success; anything else prints the captured stderr.
if result.returncode == 0:
    print("Ranking successful:", result.stdout)
else:
    print("Error in ranking:", result.stderr)


Error in ranking: Traceback (most recent call last):
  File "/home/jmjl/HealthUniverse/apps/trialgpt_duke/trialgpt_ranking/rank_results.py", line 63, in <module>
    agg_results = json.load(open(agg_results_path))
FileNotFoundError: [Errno 2] No such file or directory: 'dataset/data/aggregation_results.json'



In [None]:
# Display the final ranking text file if the ranking step produced one.
ranking_path = "dataset/data/1_FINAL_ranking_results.txt"

if not os.path.exists(ranking_path):
    print("Ranking results file not found.")
else:
    # Read the whole file in one go and echo it beneath a heading.
    with open(ranking_path, "r") as f:
        rankings = f.read()
    print("Ranking Results:\n", rankings)


In [None]:
# Bundle every file under dataset_path into a single ZIP archive, storing each
# entry under its path relative to dataset_path.
zip_path = "dataset/data/results.zip"

# The archive is created inside the folder being walked, so os.walk will list
# it too. Resolve its absolute path once so it can be recognised and skipped;
# otherwise the half-written archive would be added to itself.
zip_abspath = os.path.abspath(zip_path)

with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, _, files in os.walk(dataset_path):
        for file in files:
            file_path = os.path.join(root, file)
            if os.path.abspath(file_path) == zip_abspath:
                continue  # never add the archive to itself
            arcname = os.path.relpath(file_path, dataset_path)
            zipf.write(file_path, arcname)

print(f"Results ZIP file created at {zip_path}")
