Parse the JSON extracted directly from ClinicalTrials.gov

In [None]:
import json

# Locations of the raw studies export (input) and of the flattened,
# one-record-per-line file produced by this cell (output).
input_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/ctg-studies_523k_complete.json'
output_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/ctg-studies_structured.json'

# Decode explicitly as UTF-8 so that loading does not depend on the locale of
# the machine running the notebook (JSON interchange text is UTF-8).
with open(input_file, 'r', encoding='utf-8') as file:
    data = json.load(file)

print(f"Length of input file: {len(data)}")

# Parse the JSON data
def parse_clinical_trials(data):
    """Flatten study records into simple dictionaries.

    For every record in ``data`` the sub-modules of ``protocolSection`` are
    read with ``dict.get`` so that a missing section falls back to an empty
    value instead of raising.

    Returns:
        A list with one dictionary per input record, holding the keys ``id``,
        ``title``, ``summary``, ``condition``, ``keywords``, ``intervention``,
        ``intervention_description`` and ``outcome_measure``.
    """
    flattened = []
    for study in data:
        protocol = study.get("protocolSection", {})
        ident_module = protocol.get("identificationModule", {})
        descr_module = protocol.get("descriptionModule", {})
        cond_module = protocol.get("conditionsModule", {})
        arms = protocol.get("armsInterventionsModule", {}).get("interventions", [])
        primary_outcomes = protocol.get("outcomesModule", {}).get("primaryOutcomes", [])

        # One "name: description" line per named intervention (just the name
        # when there is no description); unnamed interventions are skipped.
        described = []
        for arm in arms:
            arm_name = arm.get("name", "")
            if not arm_name:
                continue
            arm_desc = arm.get("description", "")
            described.append(f"{arm_name}: {arm_desc}" if arm_desc else f"{arm_name}")
        description_text = "\n".join(described).strip()

        # Names only, leaving out interventions whose type is "OTHER"
        named_interventions = [
            arm.get("name", "")
            for arm in arms
            if arm.get("type") != "OTHER" and arm.get("name")
        ]

        flattened.append({
            "id": ident_module.get("nctId", ""),
            # The official title wins; the brief title is the fallback
            "title": ident_module.get("officialTitle") or ident_module.get("briefTitle", ""),
            # Paragraphs of the summary, split on blank lines
            "summary": descr_module.get("briefSummary", "").split('\n\n'),
            "condition": cond_module.get("conditions", []),
            "keywords": cond_module.get("keywords", []),
            "intervention": named_interventions,
            "intervention_description": description_text,
            "outcome_measure": [item.get("measure", "") for item in primary_outcomes],
        })

    return flattened

# Flatten every study, then serialise the result as JSON Lines
# (one JSON object per line).
parsed_data = parse_clinical_trials(data)

with open(output_file, 'w') as file:
    file.writelines(json.dumps(entry) + '\n' for entry in parsed_data)

print(f"Length of output file: {len(parsed_data)}")
print(f"Parsed data saved to {output_file}")

# Sanity check: echo the beginning of the file just written (lines 1 to 21)
with open(output_file, 'r') as file:
    for line_number, line in enumerate(file, start=1):
        print(f"Line {line_number}: {line.strip()}")
        if line_number > 20:  # Line 21 was the last one to show
            break


Generate Triplets

In [None]:
import json

# Input: the flattened studies, one JSON object per line.
# Output: the generated triplets, one JSON object per line.
input_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/ctg-studies_structured.json'
output_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/ctg-triplets.json'

# Accumulates every triplet generated across all studies
triplets = []

with open(input_file, "r") as infile:
    for raw_line in infile:
        record = json.loads(raw_line)

        # Normalise the fields: strip whitespace and drop values that are
        # empty once stripped.
        title = record.get("title", "").strip()
        summary = [s.strip() for s in record.get("summary", []) if s.strip()]
        keywords = [k.strip() for k in record.get("keywords", []) if k.strip()]
        condition = [c.strip() for c in record.get("condition", []) if c.strip()]
        intervention = [i.strip() for i in record.get("intervention", []) if i.strip()]
        intervention_description = record.get("intervention_description", "").strip()
        outcome_measure = [o.strip() for o in record.get("outcome_measure", []) if o.strip()]

        # Pair every condition with every intervention, then with every
        # keyword. The lists above only hold non-empty strings, so no further
        # emptiness check is needed here.
        for category, positives in (("intervention", intervention), ("keyword", keywords)):
            triplets.extend(
                {"query": cond, "pos": [positive], "neg": [""], "category": category}
                for cond in condition
                for positive in positives
            )

# Disabled categories, kept for reference. title, summary,
# intervention_description and outcome_measure are only consumed by this code.
#
#        # Create triplets: intervention_description with each element of outcome_measure
#        for out in outcome_measure:
#            if intervention_description and out:
#                triplet = {
#                    "query": intervention_description,
#                    "pos": [out],
#                    "neg": [""],
#                    "category": "outcome"
#                }
#                triplets.append(triplet)
#
#        # Create triplets: title with each element of summary
#        for summ in summary:
#            if title and summ:
#                triplet = {
#                    "query": title,
#                    "pos": [summ],
#                    "neg": [""],
#                    "category": "summary"
#                }
#                triplets.append(triplet)

# Persist the triplets as JSON Lines
with open(output_file, "w") as outfile:
    outfile.writelines(json.dumps(triplet) + "\n" for triplet in triplets)

# Show up to the first ten triplets, pretty-printed
for triplet in triplets[:10]:
    print(json.dumps(triplet, indent=2))

# Report how many triplets were generated
print(f"Total number of triplets: {len(triplets)}")


Clean

In [None]:
import json
import re
import os
# Define the cleaning function
def clean_text(text):
    """Return a normalised copy of ``text``.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is deleted, runs of whitespace are collapsed to a single space
    and leading/trailing whitespace is removed.
    """
    lowered = text.lower()
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    return re.sub(r'\s+', ' ', alnum_only).strip()

# Raw triplets in, cleaned triplets out (both one JSON object per line)
input_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/ctg-triplets.json'
output_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/ctg-triplets.jsonl'

cleaned_entries = []

with open(input_file, 'r') as f:
    for line in f:
        if not line.strip():
            continue  # Ignore empty lines
        entry = json.loads(line)
        # Normalise the query and every positive; 'neg' is left untouched
        entry['query'] = clean_text(entry['query'])
        entry['pos'] = list(map(clean_text, entry['pos']))
        # entry['neg'] = [clean_text(text) for text in entry['neg']]
        cleaned_entries.append(entry)

# Write the cleaned triplets as JSON Lines
with open(output_file, 'w') as f:
    f.writelines(json.dumps(entry) + '\n' for entry in cleaned_entries)

print(f'Cleaned data saved to {output_file}')

# Remove the intermediate files: the raw triplets read above, then the
# structured studies file.
for file_to_delete in (
    '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/ctg-triplets.json',
    '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/ctg-studies_structured.json',
):
    if os.path.exists(file_to_delete):
        os.remove(file_to_delete)
        print(f"Deleted file: {file_to_delete}")
    else:
        print(f"File not found: {file_to_delete}")

Remove duplicates (triplets whose query is identical to one of its positives)

In [13]:
import json

# JSONL triplet file that this cell rewrites in place
file_path = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/ctg-triplets.jsonl'

def modify_file(file_path):
    """Drop, in place, every triplet whose query also appears in its positives.

    The file at ``file_path`` is read as JSON Lines. An entry is removed when
    its ``query`` equals, ignoring case, any non-empty string in ``pos``. The
    remaining entries are re-serialised and written back to the same path.

    Blank lines are skipped rather than passed to ``json.loads`` (which would
    raise ``json.JSONDecodeError``); they are not written back.

    Args:
        file_path: Path of the JSONL file to filter.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry lacks the ``query`` or ``pos`` key.
    """
    # The whole file is read before it is reopened for writing, because input
    # and output are the same path.
    with open(file_path, 'r') as file:
        lines = file.readlines()

    kept_lines = []
    for line in lines:
        if not line.strip():
            continue
        entry = json.loads(line)
        query = entry['query'].lower()
        # Empty positives never count as a match
        if any(pos and pos.lower() == query for pos in entry['pos']):
            continue
        kept_lines.append(json.dumps(entry) + '\n')

    with open(file_path, 'w') as file:
        file.writelines(kept_lines)

# Run the filter on the triplet file; the file is overwritten with the result
modify_file(file_path)

Filter size (random subsample)

In [None]:
import random

# Full triplet file in, random subsample out
input_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/ctg-triplets_v3.2.jsonl'
output_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/SMALL_ctg-triplets_v3.2.jsonl'

# Maximum number of entries to keep
limit = 100000

# Load every line into memory
with open(input_file, 'r') as infile:
    lines = list(infile)

# Shuffle in place and keep the leading slice, i.e. a random selection of at
# most `limit` lines.
random.shuffle(lines)
selected_lines = lines[:limit]

# Write the selection out unchanged
with open(output_file, 'w') as outfile:
    outfile.writelines(selected_lines)

print(f"Extracted {len(selected_lines)} entries to {output_file}")


Filter by maximum token length

In [None]:
import json
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer

# Load the tokenizer used to count tokens for the length filter
model_name = "BAAI/bge-base-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# JSONL triplet file to filter.
# NOTE: input_file and output_file are the same path, so the filtered result
# overwrites the input file.
input_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/ctg-triplets_v3.jsonl'
output_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/ctg-triplets_v3.jsonl'

# Parameters for token length filtering: upper bounds (inclusive) on the token
# count of the query and of each positive, and the DataLoader batch size
max_query_tokens = 256
max_pos_tokens = 256
batch_size = 1024

# Counter for removed triplets (incremented by filter_batch)
removed_triplets_count = 0

def filter_batch(batch_data):
    """Parse a batch of JSON strings and keep the entries within the limits.

    An entry is kept when its ``query`` has at most ``max_query_tokens``
    tokens and every string in ``pos`` has at most ``max_pos_tokens`` tokens,
    as counted by the module-level ``tokenizer``. Each rejected entry
    increments the global ``removed_triplets_count``.

    Args:
        batch_data: Iterable of JSON-encoded triplets (one string each).

    Returns:
        The kept entries as dictionaries, in input order.
    """
    global removed_triplets_count
    kept = []
    for raw in batch_data:
        record = json.loads(raw)  # JSON string -> dictionary
        query_len = len(tokenizer.tokenize(record.get("query", "")))
        pos_lens = [len(tokenizer.tokenize(text)) for text in record.get("pos", [])]

        too_long = query_len > max_query_tokens or any(n > max_pos_tokens for n in pos_lens)
        if too_long:
            removed_triplets_count += 1
            continue
        kept.append(record)
    return kept

class TripletsDataset(Dataset):
    """Dataset exposing the non-blank lines of a text file as raw strings.

    Each item is one line with surrounding whitespace removed. Lines are not
    parsed here; items are plain ``str`` values.
    """

    def __init__(self, file_path):
        """Read ``file_path`` eagerly into memory.

        Blank lines are dropped: kept as ``''`` items they would make a later
        ``json.loads`` on the item raise ``json.JSONDecodeError``.

        Args:
            file_path: Path of the line-oriented (JSONL) file to load.
        """
        with open(file_path, 'r') as file:
            stripped_lines = (line.strip() for line in file)
            self.data = [line for line in stripped_lines if line]

    def __len__(self):
        """Return the number of stored lines."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the stored line at position ``idx``."""
        return self.data[idx]

# Batch the raw lines with a PyTorch DataLoader (order preserved, 10 workers)
dataset = TripletsDataset(input_file)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=10)

filtered_data = []
max_query_length = 0
max_pos_length = 0

for batch in data_loader:
    survivors = filter_batch(batch)
    filtered_data += survivors

    # Track the longest query and the longest positive among the kept entries
    for record in survivors:
        query_length = len(tokenizer.tokenize(record.get("query", "")))
        longest_pos = max(
            (len(tokenizer.tokenize(text)) for text in record.get("pos", [])),
            default=0,  # An entry without positives leaves the maximum unchanged
        )

        if query_length > max_query_length:
            max_query_length = query_length
        if longest_pos > max_pos_length:
            max_pos_length = longest_pos

# Persist the kept entries as JSON Lines
with open(output_file, 'w') as file:
    file.writelines(json.dumps(record) + '\n' for record in filtered_data)

print(f"Filtered triplets data saved to {output_file}")
print(f"Length of final filtered data: {len(filtered_data)}")
print(f"Maximum query token length in filtered data: {max_query_length}")
print(f"Maximum positive passage token length in filtered data: {max_pos_length}")
print(f"Number of triplets removed due to exceeding token limits: {removed_triplets_count}")


VIEW JSONL

In [None]:
import json

# JSONL file to inspect
input_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/ctg-triplets.jsonl'

# Pull every line into memory
with open(input_file, 'r') as f:
    lines = list(f)

# Report how many lines the file has
print(f'Number of entries in the file: {len(lines)}')

# Preview: parse and print up to the first 100 entries
for raw_line in lines[:100]:
    print(json.loads(raw_line))

VIEW JSON

In [None]:
import json

# JSON file to inspect
input_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/unique_training_terms.json'

# Parse the file as a single JSON document (a list or a dict)
with open(input_file, 'r') as f:
    data = json.load(f)

if isinstance(data, list):
    # For a list: its length, then the first five items
    print(f'Number of entries in the JSON file: {len(data)}')
    for item in data[:5]:
        print(item)
elif isinstance(data, dict):
    # For a dict: the first five key/value pairs
    for key in list(data)[:5]:
        print(f'{key}: {data[key]}')

List of unique training terms 

In [None]:
import json

# Triplet file (JSON Lines) whose vocabulary is collected
input_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/ctg-triplets_v3.2.jsonl'

# Read all lines from the input file
with open(input_file, 'r') as f:
    data = f.readlines()

# Unique whitespace-separated words found in the 'query' and 'pos' fields
terms = set()

# Parse each line and collect its words; malformed lines are reported and
# skipped instead of aborting the run
for line in data:
    try:
        record = json.loads(line)
        # Words of the query
        terms.update(record['query'].split())

        # Words of every positive
        for item in record['pos']:
            terms.update(item.split())
    except json.JSONDecodeError:
        print(f"Error parsing line: {line}")
    except KeyError as e:
        print(f"Missing key {e} in line: {line}")

# Remove any empty strings that might have been added
terms.discard('')

# Write the terms as a JSON list. They are sorted so that the file is
# identical from run to run (set iteration order is not stable).
output_file = '/n/data1/hsph/biostat/celehs/lab/jh537/Retrivial_task/DATA/unique_ct_terms_weight.json'
with open(output_file, 'w') as file:
    json.dump(sorted(terms), file, indent=4)

# Report the path actually written
print(f"Unique terms have been written to {output_file}")


Top Query Analysis 

In [None]:
import os
import json
from collections import Counter

# JSONL triplet file to analyse
file_path = "/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/ctg-triplets_v3.2.jsonl"

if os.path.exists(file_path):
    # How often each string occurs as a query or as a positive
    term_counter = Counter()

    with open(file_path, 'r') as file:
        for line in file:
            data = json.loads(line.strip())
            # The query and all positives of this entry count once each
            term_counter.update([data['query']] + data.get('pos', []))

    # The 200 most frequent strings, most frequent first
    top_200_terms = term_counter.most_common(200)

    print("Top 200 most frequent unique terms in 'query' or 'pos':")
    for term, count in top_200_terms:
        print(f"{term}: {count}")
else:
    print(f"File not found: {file_path}")




Convert to the expected format (JSON to JSONL)

In [None]:
import json
import os
# Paths for input and output files (empty placeholders: set both before running)
input_file = ''
output_file = ''


# Load the whole input JSON document (an iterable of triplet objects) and
# rewrite it as JSON Lines, one triplet per line
with open(input_file, 'r') as infile:
    data = json.load(infile)  # Load the entire JSON file
    with open(output_file, 'w') as outfile:
        for trial in data:
            transformed_entry = {
                "query": trial["query"],
                "pos": trial["pos"],
                "neg": trial["neg"],
                # Was `tiral["category"]`, which raised NameError on the first entry
                "category": trial["category"]
            }
            outfile.write(json.dumps(transformed_entry) + '\n')

print(f"Transformation complete. Transformed data saved to {output_file}")

# Delete the specified file
file_to_delete = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/ctg-triplets_v3.json'
if os.path.exists(file_to_delete):
    os.remove(file_to_delete)
    print(f"Deleted file: {file_to_delete}")
else:
    print(f"File not found: {file_to_delete}")