Load MRCONSO and keep only the English entries

In [None]:
# Define the input file path (used as both input and output)
file_path = "/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/MRCONSO_filtered.RRF"

# Function to filter and overwrite the input file with lines where the second column is 'ENG'
def modify_eng_entries(file_path):
    """Rewrite ``file_path`` in place, keeping only rows whose second field is 'ENG'.

    Each row is split on '|' after stripping surrounding whitespace. Rows that
    have no second field (blank lines, lines without a delimiter) are dropped
    instead of raising ``IndexError``; previously one such row aborted the
    whole run and left the file unfiltered.

    The file is read and written as UTF-8, matching how the other RRF files
    in this notebook are opened.

    Args:
        file_path: Path of the pipe-delimited file; used as both input and
            output.

    Returns:
        None. Failures are reported with ``print`` and are not re-raised.
    """
    try:
        kept_lines = []
        # The whole filter pass finishes before the file is reopened for
        # writing, so a read error can never leave a truncated file behind.
        with open(file_path, 'r', encoding='utf-8') as infile:
            for line in infile:
                fields = line.strip().split('|')
                # Guard against rows with no second field.
                if len(fields) > 1 and fields[1] == 'ENG':
                    kept_lines.append(line)

        # Overwrite the original with the surviving rows, byte-for-byte.
        with open(file_path, 'w', encoding='utf-8') as outfile:
            outfile.writelines(kept_lines)

        print("File has been modified to include only entries with 'ENG' in the second column.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Apply the English-only filter to the file in place.
modify_eng_entries(file_path)

# Tally of every distinct value found at field index 11 (the 12th field).
# NOTE(review): in the standard MRCONSO layout index 11 appears to be the
# source vocabulary (SAB), of which "MSH" is just one value - confirm. The
# "msh" names and printed label are kept unchanged.
msh_counts = {}

try:
    with open(file_path, 'r') as file:
        for line in file:
            fields = line.strip().split('|')
            # Rows too short to carry the 12th field are ignored.
            if len(fields) <= 11:
                continue
            key = fields[11]
            if key in msh_counts:
                msh_counts[key] += 1
            else:
                msh_counts[key] = 1

    # Most frequent value first; ties keep their first-seen order.
    msh_counts_sorted = list(msh_counts.items())
    msh_counts_sorted.sort(key=lambda pair: pair[1], reverse=True)

    print("MSH Column Unique Elements with Occurrences:")
    for value, occurrences in msh_counts_sorted:
        print(f"{value}: {occurrences}")

except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please check the file path and try again.")
except Exception as e:
    print(f"An error occurred: {e}")


KEEP only one term per CUI 

In [None]:
# File paths
input_file = "/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/MRCONSO.RRF"
output_file = "/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/MRCONSO_filtered.RRF"

# Function to filter unique C codes and save to a subfile
def filter_unique_c_codes_to_subfile(input_path, output_path):
    """Write one row per concept code from ``input_path`` to ``output_path``.

    Rows are pipe-split and grouped by their first field. A group is a run of
    consecutive rows sharing that field; ``select_best_entry`` chooses the row
    to keep for each run. If the same code shows up again in a later,
    non-adjacent run, the later selection replaces the earlier one.

    Kept rows are re-joined with '|' and written one per line. Any exception
    is printed and not re-raised.

    Args:
        input_path: Pipe-delimited source file.
        output_path: Destination file, overwritten.
    """
    try:
        chosen = {}
        group = []
        group_code = None

        with open(input_path, 'r') as infile:
            for raw in infile:
                fields = raw.strip().split('|')
                code = fields[0]

                # A change of code closes the run collected so far.
                starts_new_group = group_code is not None and code != group_code
                if starts_new_group:
                    chosen[group_code] = select_best_entry(group)
                    group = []

                group.append(fields)
                group_code = code

            # The final run has no following row to trigger its selection.
            if group:
                chosen[group_code] = select_best_entry(group)

        with open(output_path, 'w') as outfile:
            outfile.writelines('|'.join(fields) + '\n' for fields in chosen.values())

        print(f"Filtered entries saved to {output_path}.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Function to select the best entry for a given C code
def select_best_entry(entries):
    """Choose the single row to keep out of the candidate rows of one concept.

    Each row is a list of pipe-split fields. Tiers are tried in order, and
    within a tier the first matching row in input order wins (the choice is
    deterministic, not random):

    1. field 12 is 'PT';
    2. field 2 is 'P', field 6 is 'Y' and field 11 is 'MSH';
    3. field 2 is 'P' and field 6 is 'Y';
    4. otherwise the first row.

    NOTE(review): in the standard MRCONSO layout these indices appear to be
    TTY (12), TS (2), ISPREF (6) and SAB (11) - confirm against the UMLS docs.

    A row too short to contain a tested field simply fails that test instead
    of raising ``IndexError``, so one malformed row no longer aborts the
    selection for the whole group.

    Args:
        entries: Non-empty list of rows (lists of strings).

    Returns:
        The selected row (the same list object that was passed in).

    Raises:
        IndexError: If ``entries`` is empty.
    """
    def field(entry, index):
        # Missing fields read as None, which never equals a wanted value.
        return entry[index] if len(entry) > index else None

    def is_preferred(entry):
        return field(entry, 2) == 'P' and field(entry, 6) == 'Y'

    tiers = (
        lambda entry: field(entry, 12) == 'PT',
        lambda entry: is_preferred(entry) and field(entry, 11) == 'MSH',
        is_preferred,
    )

    for matches in tiers:
        # Stop at the first hit rather than materialising every match.
        for entry in entries:
            if matches(entry):
                return entry

    # No tier matched: fall back to the first row as given.
    return entries[0]

# Run the function
filter_unique_c_codes_to_subfile(input_file, output_file)


Load MRREL and count the relation occurrences

In [None]:
from collections import Counter
# Location of the MRREL relation file to inspect.
file_path = "/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/MRREL.RRF"

try:
    # Frequency of each value found at field index 3, i.e. the fourth
    # pipe-separated field of every row.
    column_3_counter = Counter()

    with open(file_path, "r", encoding="utf-8") as file:
        # Rows with fewer than four fields are left out of the tally.
        column_3_counter.update(
            fields[3]
            for fields in (line.strip().split("|") for line in file)
            if len(fields) > 3
        )

    # Values are listed in first-seen order, not by frequency.
    print("Occurrences of unique entries in column 4:")
    for entry, count in column_3_counter.items():
        print(f"{entry}: {count}")

except FileNotFoundError:
    print(f"File not found: {file_path}")
except Exception as e:
    print(f"An error occurred: {e}")

Create triplet file 

In [None]:
import json

# Define input and output file paths
input_file = "/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/MRREL.RRF"
output_file = "/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/triplet_UMLS.jsonl"

# Function to process the file and generate the JSONL output
def process_file(input_path, output_path):
    """Convert a pipe-delimited relation file into a JSONL triplet file.

    For every input row with at least six fields, one JSON object is written
    with these keys, in this order:

    - ``query``: field 0
    - ``pos``: one-element list holding field 4
    - ``neg``: ``['']`` (placeholder)
    - ``category``: field 3

    Shorter rows are skipped. Any exception is printed and not re-raised.

    Args:
        input_path: Pipe-delimited source file.
        output_path: JSONL destination, overwritten.
    """
    try:
        with open(input_path, 'r') as infile, open(output_path, 'w') as outfile:
            for raw in infile:
                fields = raw.strip().split('|')
                if len(fields) < 6:
                    continue
                record = dict(
                    query=fields[0],
                    pos=[fields[4]],
                    neg=[''],
                    category=fields[3],
                )
                print(json.dumps(record), file=outfile)
        print(f"Processing complete. Output saved to {output_path}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Run the function
process_file(input_file, output_file)


DELETE rows 

In [None]:
import json

# Define the input file path (used as both input and output in this case)
file_path = "/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/triplet_UMLS.jsonl"

# Define the categories to keep
categories_to_keep = {'SY', 'RB', 'PAR'}

# Filter the JSONL file and overwrite the input file
def modify_jsonl(file_path, categories):
    """Filter a JSONL file in place by the ``category`` value of each record.

    A line is kept, byte-for-byte, when its JSON object's ``'category'`` is a
    member of ``categories``. Blank or whitespace-only lines are dropped
    without being parsed; previously a single blank line made ``json.loads``
    fail and the file was silently left unfiltered.

    Args:
        file_path: JSONL file used as both input and output.
        categories: Container of category values to keep.

    Returns:
        None. Failures (unreadable file, malformed JSON, missing
        ``'category'`` key) are printed, not re-raised, and leave the file
        untouched.
    """
    try:
        kept_lines = []
        # Filtering completes before the file is reopened for writing, so a
        # parse error cannot truncate the data.
        with open(file_path, 'r') as infile:
            for line in infile:
                payload = line.strip()
                if not payload:
                    continue  # blank line: nothing to parse or keep
                if json.loads(payload)['category'] in categories:
                    kept_lines.append(line)

        with open(file_path, 'w') as outfile:
            outfile.writelines(kept_lines)

        print("File has been modified with filtered entries.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Run the modification function
modify_jsonl(file_path, categories_to_keep)


DELETE DUPLICATE CUIs (rows where the query equals its first pos)

In [None]:
import json

# File path
triplet_file = "/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/triplet_UMLS.jsonl"

# Function to remove entries where query equals the first pos
def remove_duplicate_query_pos(triplet_path):
    """Drop triplets whose query is identical to their first positive.

    The JSONL file at ``triplet_path`` is loaded completely, records with a
    non-empty ``pos`` whose first element equals ``query`` are discarded, and
    the remaining records are re-serialised over the same file. Records with
    an empty ``pos`` are kept. Any exception is printed and not re-raised.

    Args:
        triplet_path: JSONL file used as both input and output.
    """
    def is_self_pair(record):
        # 'pos' is checked first so an empty list never gets indexed.
        positives = record['pos']
        return bool(positives) and record['query'] == positives[0]

    try:
        with open(triplet_path, 'r') as infile:
            records = [json.loads(raw.strip()) for raw in infile]

        survivors = [record for record in records if not is_self_pair(record)]

        with open(triplet_path, 'w') as outfile:
            outfile.writelines(json.dumps(record) + '\n' for record in survivors)

        print("Triplet file successfully updated. Entries with duplicate query and first pos removed.")
    except Exception as e:
        print(f"Error updating triplet file: {e}")

# Run the function
remove_duplicate_query_pos(triplet_file)


REPLACE CUI CODES WITH THEIR TERM STRINGS

In [None]:
import json

# File paths
triplet_file = "/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/triplet_UMLS.jsonl"
mrconso_file = "/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/MRCONSO_filtered.RRF"

# Function to create a mapping from C code to string
def create_c_code_mapping(mrconso_path):
    """Build a ``{code: term}`` dictionary from a pipe-delimited file.

    For each row with at least 15 fields, field 0 becomes the key and field 14
    the value; when a code occurs more than once the last row wins. Shorter
    rows are ignored.

    Args:
        mrconso_path: Path of the pipe-delimited file to read.

    Returns:
        The mapping built so far. If reading fails the error is printed and
        whatever was collected before the failure (possibly nothing) is
        returned.
    """
    mapping = {}
    try:
        with open(mrconso_path, 'r') as file:
            for raw in file:
                fields = raw.strip().split('|')
                if len(fields) <= 14:
                    continue  # row too short to carry the term field
                code, term = fields[0], fields[14]
                mapping[code] = term
    except Exception as e:
        print(f"Error creating C code mapping: {e}")
    return mapping

# Function to replace query and pos in the triplet file
def replace_c_codes_in_triplet(triplet_path, c_code_map):
    """Swap the codes in a JSONL triplet file for their mapped strings, in place.

    Every record's ``query`` and each element of ``pos`` is looked up in
    ``c_code_map``. A record is dropped when its query has no mapping or when
    none of its positives has one; unmapped positives of a surviving record
    are simply left out. The surviving records are written back over the same
    file. Any exception is printed and not re-raised.

    Args:
        triplet_path: JSONL file used as both input and output.
        c_code_map: Dictionary from code to replacement string.
    """
    try:
        with open(triplet_path, 'r') as infile:
            records = [json.loads(raw.strip()) for raw in infile]

        resolved = []
        for record in records:
            query_code = record['query']
            if query_code not in c_code_map:
                continue  # unknown query: drop the record
            record['query'] = c_code_map[query_code]

            positives = [c_code_map[code] for code in record['pos'] if code in c_code_map]
            if positives:
                record['pos'] = positives
                resolved.append(record)

        with open(triplet_path, 'w') as outfile:
            outfile.writelines(json.dumps(record) + '\n' for record in resolved)

        print("Triplet file successfully updated. Entries without corresponding query or pos were removed.")
    except Exception as e:
        print(f"Error updating triplet file: {e}")

# Run the script
c_code_mapping = create_c_code_mapping(mrconso_file)
replace_c_codes_in_triplet(triplet_file, c_code_mapping)


Remove duplicate triplets

In [None]:
import json

# File path
triplet_file = "/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/triplet_UMLS.jsonl"

# Function to remove duplicates
def remove_duplicates_from_triplet_file(triplet_path):
    """Remove repeated records from a JSONL triplet file, in place.

    Two records count as duplicates when their key-sorted JSON serialisations
    are equal, so key order inside a line does not matter. The first
    occurrence of each record is kept, in its original position and key
    order. Any exception is printed and not re-raised.

    Args:
        triplet_path: JSONL file used as both input and output.
    """
    try:
        # Maps the canonical serialisation to the first record that produced
        # it; dict insertion order preserves the original record order.
        first_seen = {}

        with open(triplet_path, 'r') as infile:
            for raw in infile:
                record = json.loads(raw.strip())
                fingerprint = json.dumps(record, sort_keys=True)
                first_seen.setdefault(fingerprint, record)

        with open(triplet_path, 'w') as outfile:
            outfile.writelines(json.dumps(record) + '\n' for record in first_seen.values())

        print(f"Duplicates removed. {len(first_seen)} unique entries saved.")
    except Exception as e:
        print(f"Error removing duplicates: {e}")

# Run the script
remove_duplicates_from_triplet_file(triplet_file)


Clean 

In [None]:
import json
import re
import os
# Define the cleaning function
def clean_text(text):
    """Return a normalised copy of ``text``.

    The text is lowercased, every character other than ``a-z``, ``0-9`` and
    whitespace is deleted (not replaced by a space, so "non-small" becomes
    "nonsmall"), runs of whitespace collapse to a single space, and leading
    and trailing whitespace is trimmed.
    """
    alnum_only = re.sub(r'[^a-z0-9\s]', '', text.lower())
    return re.sub(r'\s+', ' ', alnum_only).strip()

# The triplet file is cleaned in place: input and output are the same path.
input_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/triplet_UMLS.jsonl'
output_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/triplet_UMLS.jsonl'

# Parse every non-blank line up front, so the file is fully read before it is
# reopened for writing.
with open(input_file, 'r') as f:
    cleaned_entries = [json.loads(line) for line in f if line.strip()]

# Only 'query' and 'pos' are normalised; 'neg' is left exactly as loaded.
for entry in cleaned_entries:
    entry['query'] = clean_text(entry['query'])
    entry['pos'] = [clean_text(text) for text in entry['pos']]

# Write the cleaned records back, one JSON object per line.
with open(output_file, 'w') as f:
    for entry in cleaned_entries:
        f.write(json.dumps(entry) + '\n')

print(f'Cleaned data saved to {output_file}')

PREVIEW FILES

In [None]:
# Path of the MRCONSO file to preview (the original, not the filtered copy).
file_path = "/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/MRCONSO.RRF"

try:
    with open(file_path, "r", encoding="utf-8") as file:
        # Echo at most the first 1000 lines, then stop reading.
        for i, line in enumerate(file):
            if i >= 1000:
                break
            print(line.strip())

except FileNotFoundError:
    print(f"File not found: {file_path}")
except Exception as e:
    print(f"An error occurred: {e}")



In [None]:
import json

# JSONL file of test triplets to preview.
input_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Test/SY_triplets.jsonl'

# Load every line so the total can be reported before previewing.
with open(input_file, 'r') as f:
    lines = list(f)

print(f'Number of entries in the file: {len(lines)}')

# Show the parsed form of up to the first 200 entries.
for line in lines[:200]:
    print(json.loads(line))

Top query 

In [None]:
import os
import json
from collections import Counter

# JSONL triplet file whose terms are tallied.
file_path = "/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Train/triplet_UMLS.jsonl"

if os.path.exists(file_path):
    # Frequency of every string seen as a 'query' or inside a 'pos' list.
    term_counter = Counter()

    with open(file_path, 'r') as file:
        for line in file:
            data = json.loads(line.strip())
            term_counter.update([data['query']] + data.get('pos', []))

    # The 200 most frequent terms, highest count first.
    top_200_terms = term_counter.most_common(200)

    # Up to 200 terms that occur exactly once, in first-seen order.
    unique_terms = []
    for term, count in term_counter.items():
        if count == 1:
            unique_terms.append(term)
            if len(unique_terms) == 200:
                break

    print("Top 200 most frequent terms in 'query' or 'pos':")
    for term, count in top_200_terms:
        print(f"{term}: {count}")

    print("\n200 unique terms that appear only once:")
    for term in unique_terms:
        print(term)
else:
    print(f"File not found: {file_path}")
