Load CT title + brief summary + detailed description + keywords

In [None]:
import json
import os

# Paths only; nothing is read here. input_path is the studies JSON that
# create_output_data parses, output_path is where the resulting records
# are written.
input_path = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Retrievial/v2_/ctg-studies (5).json'
output_path = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Retrievial/v2_/LONG_CTG_id_text.json'

def create_output_data(input_path):
    """Build one ``{'summary', 'ntcId'}`` record per study in a JSON file.

    The summary is made of the title (official title, falling back to the
    brief title), the brief summary, the detailed description and each
    keyword, joined with ``'. '``; empty pieces are skipped so the text never
    starts with a stray separator. Asterisks are removed and every run of
    whitespace (including line breaks) collapses to a single space, so words
    separated only by a newline are not fused together.

    Args:
        input_path: Path to a UTF-8 JSON file containing a list of study
            dicts. Missing or ``null`` sections and fields are treated as
            empty.

    Returns:
        A list of dicts, in input order, with keys ``'summary'`` and
        ``'ntcId'``. The key spelling ``'ntcId'`` is intentional: the
        later processing steps in this file look records up by that name.
    """
    with open(input_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    result = []
    for study in data:
        # `or {}` also covers keys that are present but set to null.
        protocol_section = study.get('protocolSection') or {}
        identification_module = protocol_section.get('identificationModule') or {}
        description_module = protocol_section.get('descriptionModule') or {}
        conditions_module = protocol_section.get('conditionsModule') or {}

        # Use official title if available, otherwise use brief title
        title = (
            identification_module.get('officialTitle')
            or identification_module.get('briefTitle')
            or ''
        )
        brief_summary = description_module.get('briefSummary') or ''
        detailed_description = description_module.get('detailedDescription') or ''
        keywords = conditions_module.get('keywords') or []

        # Each keyword becomes its own "sentence" after the descriptive text.
        parts = [title, brief_summary, detailed_description, *keywords]
        summary = '. '.join(part for part in parts if part)

        # Drop asterisks, then turn line breaks / repeated whitespace into
        # single spaces (deleting newlines outright would glue words together).
        summary = ' '.join(summary.replace('*', '').split())

        result.append({
            'summary': summary,
            'ntcId': identification_module.get('nctId') or '',
        })

    return result

if __name__ == "__main__":
    # The destination folder may not exist yet; create it (and any parents).
    destination_dir = os.path.dirname(output_path)
    os.makedirs(destination_dir, exist_ok=True)

    # Build the records from the input file.
    output_data = create_output_data(input_path)

    # Persist them as pretty-printed JSON.
    with open(output_path, 'w') as sink:
        json.dump(output_data, sink, indent=4)

    print("JSON file created successfully.")

    # Show the first five records as a quick visual check.
    head = output_data[:5]
    print(head)


Create CT id List 

In [None]:
import pandas as pd
import json

file_path = "/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Retrievial/v2_/paired_CT_PM.csv"
df = pd.read_csv(file_path)

# Number of trailing CSV rows that are left out of the ID list.
N_TRAILING_EXCLUDED = 90

# Trial identifiers come from the 'nCTid' column (not the PMid column),
# converted to strings.
nct_id_list = df['nCTid'].astype(str).tolist()

# Drop the trailing entries. The slice already yields an empty list when there
# are N_TRAILING_EXCLUDED or fewer items, so no explicit length check is needed.
nct_id_list = nct_id_list[:-N_TRAILING_EXCLUDED]

# Save the ID list as a JSON array
output_json_path = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Retrievial/v2_/CTG_id_list.json'
with open(output_json_path, 'w') as json_file:
    json.dump(nct_id_list, json_file)

# NOTE(review): df is not modified anywhere above, so this rewrites the input
# CSV with the same rows as parsed by pandas. Kept to preserve the existing
# side effect -- confirm whether it is actually needed.
df.to_csv(file_path, index=False)
print(df.head(1))


Keep only the trials in the list 

In [None]:
import json
# Input locations. The filtered result is written back over the records file
# (output_filtered_path is the same path as ctg_id_text_path).
ctid_list_path = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Retrievial/v2_/CTG_id_list.json'
ctg_id_text_path = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Retrievial/v2_/LONG_CTG_id_text.json'
output_filtered_path = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Retrievial/v2_/LONG_CTG_id_text.json'

# IDs to keep, held in a set so each membership test is constant time.
with open(ctid_list_path) as id_fh:
    ctid_set = set(json.load(id_fh))

# Records to filter.
with open(ctg_id_text_path) as records_fh:
    ctg_data = json.load(records_fh)

# Keep a record only when its 'ntcId' appears in the ID set.
filtered_data = []
for record in ctg_data:
    if record.get('ntcId') in ctid_set:
        filtered_data.append(record)

# Write the surviving records out.
with open(output_filtered_path, 'w') as out_fh:
    json.dump(filtered_data, out_fh)

print(f"Filtered data saved to {output_filtered_path}")

Add the respective reference (PMid) to each trial

In [None]:
import json
import pandas as pd

# Inputs and output of this step
paired_csv_path = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Retrievial/v2_/paired_CT_PM.csv'
filtered_json_path = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Retrievial/v2_/LONG_CTG_id_text.json'
output_json_path = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Retrievial/v2_/LONG_CTG_id_text_refs.json'

# Lookup table nCTid -> PMid, built from the pairing CSV
paired_df = pd.read_csv(paired_csv_path)
pmid_by_nctid = pd.Series(paired_df['PMid'].values, index=paired_df['nCTid'])
nctid_to_pmid = pmid_by_nctid.to_dict()

# Trial records to annotate
with open(filtered_json_path) as src:
    filtered_data = json.load(src)

# Attach the paired PMid under 'ref'
for entry in filtered_data:
    nctid = entry.get('ntcId')
    if nctid not in nctid_to_pmid:
        continue  # no pairing known: the entry is left without a 'ref' key
    entry['ref'] = nctid_to_pmid[nctid]

# Write the annotated records to the output file
with open(output_json_path, 'w') as sink:
    json.dump(filtered_data, sink, indent=4)

print(f"Updated data with references saved to {output_json_path}")


Keep only the trials whose reference is found in PubMed

In [None]:
import json

# The refs file is filtered in place: it is read here and overwritten below.
ctg_refs_path = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Retrievial/v2_/LONG_CTG_id_text_refs.json'
pm_text_path = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Retrievial/v2_/PM_id_text.json'

# Load data from files
with open(ctg_refs_path, 'r') as ctg_file:
    ctg_data = json.load(ctg_file)

with open(pm_text_path, 'r') as pm_file:
    pm_data = json.load(pm_file)

# Article IDs are normalised to strings so that they compare correctly with
# str(ref) whether the JSON stores them as numbers or as strings.
article_ids = {str(entry['article_id']) for entry in pm_data}

# Keep only entries whose 'ref' is a known article ID. Entries with no 'ref'
# (or a null one) are dropped instead of raising KeyError.
filtered_ctg_data = [
    entry
    for entry in ctg_data
    if entry.get('ref') is not None and str(entry['ref']) in article_ids
]

# Save the filtered data back to the same file
with open(ctg_refs_path, 'w') as output_file:
    json.dump(filtered_ctg_data, output_file, indent=4)

# Report the path that was actually written.
print(f"Filtered data saved to {ctg_refs_path}")


Split into train and test sets

In [None]:
import json
import os

# Read the full set of records
input_file_path = "/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Retrievial/v2_/LONG_CTG_id_text_refs.json"  # Replace with your actual input file path
with open(input_file_path) as src:
    data = json.load(src)

# The first 80% of the entries (in file order) form the training set and the
# remainder the test set; no shuffling is applied.
split_index = int(len(data) * 0.8)
train_data, test_data = data[:split_index], data[split_index:]

# Destination folder and file names
output_folder = '/n/data1/hsph/biostat/celehs/lab/jh537/Retrivial_task/DATA/'
os.makedirs(output_folder, exist_ok=True)  # create the folder when missing
train_file_path = os.path.join(output_folder, 'LONG_CTG_id_text_refs_train.json')
test_file_path = os.path.join(output_folder, 'LONG_CTG_id_text_refs_test.json')

# Write the training file first, then the testing file
for destination, subset in ((train_file_path, train_data), (test_file_path, test_data)):
    with open(destination, 'w') as sink:
        json.dump(subset, sink, indent=2)

print(f"Training and testing files have been successfully created:\n{train_file_path}\n{test_file_path}")

In [None]:
##########################################################

import json

output_path = '/n/data1/hsph/biostat/celehs/lab/jh537/Retrivial_task/DATA/LONG_CTG_id_text_refs_test.json'

# Read the test-split file back in for a quick inspection.
with open(output_path) as split_fh:
    data = json.load(split_fh)

# Report how many entries it holds
entry_count = len(data)
print(f"Number of entries in the file: {entry_count}")

# Show the first two entries
head = data[:2]
print(head)