Add random negatives 

In [1]:
import json
import random

# Define file paths
input_file_path = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Test/SY_triplets.jsonl'
output_file_path = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Test/SY_triplets_w_neg.jsonl'

# Private, seeded generator: re-running the cell reproduces the same negatives
# and the global `random` state used by other cells is left untouched.
SEED = 42
rng = random.Random(SEED)


def _is_valid_negative(entry, term):
    """Return True when `term` may be used as a negative for `entry`.

    A term is rejected when it is one of the entry's own positives or when it
    is identical to the entry's query; either case would yield a "negative"
    pair that is not actually negative.
    """
    return term not in entry['pos'] and term != entry.get('query')


def _shuffle_negatives_within_category(entries, rng):
    """Assign each entry a negative taken from the shuffled first positives of its category.

    The first positive of every entry forms the pool; the pool is shuffled and
    handed back position by position, so each term is used exactly once.
    Positions where the shuffle produced an invalid negative (see
    `_is_valid_negative`) are repaired by swapping with another position, as
    long as the swap leaves both entries with a valid negative.

    Entries whose `pos` list is empty get `neg = []` and do not take part.

    Returns the list of entries for which no valid in-category negative could
    be found (their `neg` is left as `[]`).
    """
    for entry in entries:
        entry['neg'] = []

    eligible = [entry for entry in entries if entry['pos']]
    pool = [entry['pos'][0] for entry in eligible]
    rng.shuffle(pool)
    size = len(pool)

    for i, entry in enumerate(eligible):
        if _is_valid_negative(entry, pool[i]):
            continue
        # Scan the other positions starting at a random offset and stop at the
        # first swap partner that keeps both sides valid. Positions already
        # processed stay valid because the swap is checked in both directions.
        start = rng.randrange(size)
        for step in range(size):
            j = (start + step) % size
            if (j != i
                    and _is_valid_negative(entry, pool[j])
                    and _is_valid_negative(eligible[j], pool[i])):
                pool[i], pool[j] = pool[j], pool[i]
                break

    unresolved = []
    for entry, term in zip(eligible, pool):
        if _is_valid_negative(entry, term):
            entry['neg'] = [term]
        else:
            unresolved.append(entry)
    return unresolved


def _draw_fallback_negative(entry, terms, rng, max_tries=50):
    """Draw a valid negative for `entry` from `terms` (pool spanning all categories).

    Tries a bounded number of random draws first and only then falls back to a
    full scan. Returns a one-element list, or `[]` when no valid term exists.
    """
    if not terms:
        return []
    for _ in range(max_tries):
        term = rng.choice(terms)
        if _is_valid_negative(entry, term):
            return [term]
    candidates = [term for term in terms if _is_valid_negative(entry, term)]
    return [rng.choice(candidates)] if candidates else []


# Load the input data (blank lines are skipped instead of crashing json.loads)
with open(input_file_path, 'r', encoding='utf-8') as infile:
    data = [json.loads(line) for line in infile if line.strip()]

# Group entries by category
categories = {}
for entry in data:
    categories.setdefault(entry['category'], []).append(entry)

# Shuffle negatives within each category; entries that cannot get a valid
# in-category negative (e.g. single-entry categories) draw from all categories.
all_terms = [entry['pos'][0] for entry in data if entry['pos']]
n_fallback = 0
for category, entries in categories.items():
    for entry in _shuffle_negatives_within_category(entries, rng):
        entry['neg'] = _draw_fallback_negative(entry, all_terms, rng)
        n_fallback += 1
n_without_neg = sum(1 for entry in data if not entry['neg'])

# Write the modified data to the output file
with open(output_file_path, 'w', encoding='utf-8') as outfile:
    for entry in data:
        json.dump(entry, outfile)
        outfile.write('\n')

print(f"Entries written: {len(data)}")
print(f"Entries that needed a cross-category negative: {n_fallback}")
print(f"Entries left without a negative: {n_without_neg}")

Create test files (positive pairs, negative pairs, and unique terms as CSV)

In [None]:
import pandas as pd
import json
import csv
import random

# File paths
input_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Test/SY_triplets_w_neg.jsonl'
output_pairs_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Triplet_AUC/pos_SY_triplets.csv'
output_neg_pairs_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Triplet_AUC/neg_SY_triplets.csv'
output_unique_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Triplet_AUC/U_terms_SY_triplets.csv'

# Upper bound on the number of positive pairs and of negative pairs written out
max_entries = 200000

# Private, seeded generator so the sampled subset is the same on every run
SEED = 42
rng = random.Random(SEED)


def _string_pairs(query, candidates):
    """Return `[query, candidate]` for every string in `candidates`.

    Yields an empty list when `candidates` is not a list; non-string members
    of the list are ignored.
    """
    if not isinstance(candidates, list):
        return []
    return [[query, candidate] for candidate in candidates if isinstance(candidate, str)]


def _clean_pairs(pairs):
    """Remove every '.' from both sides of each pair and drop pairs with an empty side.

    A side counts as empty when nothing but whitespace is left after the dots
    are removed. The surviving text is otherwise kept unchanged.
    """
    cleaned = []
    for first, second in pairs:
        first = first.replace('.', '')
        second = second.replace('.', '')
        if first.strip() and second.strip():
            cleaned.append([first, second])
    return cleaned


# Reading JSONL and extracting query, positive pairs, and negative pairs
data = []
neg_data = []
skipped_lines = 0
with open(input_file, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # blank line, nothing to parse
        try:
            item = json.loads(line)
        except json.JSONDecodeError:
            skipped_lines += 1
            continue
        if not isinstance(item, dict):
            skipped_lines += 1
            continue
        query = item.get('query')
        if not isinstance(query, str):
            continue
        data.extend(_string_pairs(query, item.get('pos')))
        neg_data.extend(_string_pairs(query, item.get('neg')))

# Cleaning text - removing dots and dropping pairs that end up empty.
# Done before sampling so that up to `max_entries` usable pairs are kept.
data = _clean_pairs(data)
neg_data = _clean_pairs(neg_data)

# A negative pair that also appears as a positive pair, or that pairs a term
# with itself, would carry a contradictory label; drop those.
positive_keys = {tuple(pair) for pair in data}
neg_before = len(neg_data)
neg_data = [
    pair for pair in neg_data
    if pair[0] != pair[1] and tuple(pair) not in positive_keys
]
contradictory_negatives = neg_before - len(neg_data)

# Select n random entries for positive and negative pairs
if len(data) > max_entries:
    data = rng.sample(data, max_entries)
if len(neg_data) > max_entries:
    neg_data = rng.sample(neg_data, max_entries)

# Creating DataFrames and saving to CSV
df = pd.DataFrame(data, columns=['desc1', 'desc2'])
df_neg = pd.DataFrame(neg_data, columns=['desc1', 'desc2'])

# Save the cleaned positive pairs to CSV
df.to_csv(output_pairs_file, index=False, quoting=csv.QUOTE_ALL)

# Save the cleaned negative pairs to CSV
df_neg.to_csv(output_neg_pairs_file, index=False, quoting=csv.QUOTE_ALL)

# Print the length of the cleaned pairs
print(f"Number of cleaned positive pairs: {len(df)}")
print(f"Number of cleaned negative pairs: {len(df_neg)}")
print(f"Number of unparsable input lines skipped: {skipped_lines}")
print(f"Number of contradictory negative pairs dropped: {contradictory_negatives}")

# Extract unique elements from both desc1 and desc2 (for positives and negatives)
unique_terms = pd.concat([df['desc1'], df['desc2'], df_neg['desc1'], df_neg['desc2']]).unique()
unique_df = pd.DataFrame(unique_terms, columns=['Common_Groups'])

# Saving unique elements to CSV
unique_df = unique_df.dropna(subset=['Common_Groups'])  # Drop rows with NaN
unique_df.to_csv(output_unique_file, index=False, quoting=csv.QUOTE_ALL)

# Print the length of unique elements (counted on what was actually saved)
print(f"Number of unique elements: {len(unique_df)}")

print("Processing complete. Files saved.")


View the source triplet file (entry count and first records)

In [None]:
import json

# Location of the triplet JSONL file to inspect
input_file = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Clinical_Trial_Triplet_v3/Test/SY_triplets.jsonl'

# Pull every raw line into memory; one list element per line of the file
with open(input_file, 'r') as f:
    lines = list(f)

# Report how many lines (entries) the file holds
entry_count = len(lines)
print(f'Number of entries in the file: {entry_count}')

# Preview: decode and print the leading entries, at most 200 of them
for raw_entry in lines[:200]:
    print(json.loads(raw_entry))