Test the BOOSTED model with pre-generated negative pairs

In [None]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import roc_auc_score, precision_recall_curve, average_precision_score, f1_score, precision_score, recall_score, roc_curve
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt

# Encoder under test: tokenizer and weights come from the same checkpoint
# directory. The model is put in eval mode and moved to the GPU in one chain
# (both `eval()` and `cuda()` return the module itself).
model_name = '/n/data1/hsph/biostat/celehs/lab/jh537/Models/Flarge_V5'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).eval().cuda()

# Positive and negative description pairs; rows missing either description
# are discarded right after loading.
positive_pairs_df = pd.read_csv(
    '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Thomas_AUC/df_pairs.csv'
).dropna(subset=['desc1', 'desc2'])
negative_pairs_df = pd.read_csv(
    '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Thomas_AUC/negative_pairs.csv'
).dropna(subset=['desc1', 'desc2'])

# Stack positives first, then negatives, with a fresh 0..N-1 index.
all_pairs = pd.concat([positive_pairs_df, negative_pairs_df], axis=0, ignore_index=True)

# Ground truth in the same order as `all_pairs`: 1.0 for positives, 0.0 for negatives.
labels = np.repeat([1.0, 0.0], [len(positive_pairs_df), len(negative_pairs_df)])

# Number of pairs encoded per forward pass (tune to the available GPU memory).
batch_size = 32

# Dataset class for batch processing
class PairsDataset(Dataset):
    """Expose the `desc1` / `desc2` columns of a DataFrame as indexable text pairs."""

    def __init__(self, df):
        # Materialise both columns as plain Python lists so that item access
        # does not go through pandas indexing.
        first_column, second_column = df['desc1'], df['desc2']
        self.desc1 = first_column.tolist()
        self.desc2 = second_column.tolist()

    def __len__(self):
        # The dataset size is the number of entries in the first column.
        n_pairs = len(self.desc1)
        return n_pairs

    def __getitem__(self, idx):
        # One sample is the (desc1, desc2) tuple at position `idx`.
        left_text = self.desc1[idx]
        right_text = self.desc2[idx]
        return left_text, right_text

# Custom collate function to properly batch text pairs
def collate_fn(batch):
    """Turn a list of (desc1, desc2) tuples into a pair of parallel lists.

    Returns a 2-tuple ``(desc1_list, desc2_list)`` whose elements keep the
    order of the samples in `batch`.
    """
    # Transpose the batch, then convert each resulting column to a list.
    first_texts, second_texts = map(list, zip(*batch))
    return first_texts, second_texts

# Wrap the combined pairs in a dataset and serve them in fixed-size batches.
# Shuffling is disabled so the batches follow the row order of `all_pairs`.
pairs_dataset = PairsDataset(df=all_pairs)
pairs_loader = DataLoader(
    dataset=pairs_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=False,
)

# Function to compute batched embeddings
def compute_embeddings(batch_texts):
    """Encode a list of texts and return one embedding per text on the CPU.

    The texts are tokenized with padding and truncation (at most 512 tokens),
    run through the module-level `model` on the GPU without gradient
    tracking, and the hidden state at sequence position 0 (the CLS token) is
    returned for each text.
    """
    tokens = tokenizer(
        batch_texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )
    # Every tokenizer output tensor has to live on the same device as the model.
    gpu_tokens = {name: tensor.cuda() for name, tensor in tokens.items()}
    with torch.no_grad():
        hidden_states = model(**gpu_tokens).last_hidden_state
    cls_embeddings = hidden_states[:, 0]
    return cls_embeddings.cpu()

# Score every pair with the cosine similarity of its two embeddings.
# One float32 array is collected per batch and they are joined once at the end.
similarity_chunks = []

for texts1, texts2 in tqdm(pairs_loader, desc="Computing similarities in batches"):
    # Embed the left and right descriptions of the batch separately.
    embeddings1 = compute_embeddings(texts1)
    embeddings2 = compute_embeddings(texts2)

    # Row-wise cosine similarity. The library routine clamps the norms with a
    # small epsilon, so a zero-norm embedding yields 0 instead of NaN (a NaN
    # score would make the metric functions below raise).
    cosine_similarities = torch.nn.functional.cosine_similarity(embeddings1, embeddings2, dim=1)

    similarity_chunks.append(cosine_similarities.numpy())

# Single flat array of scores, in the same order as `labels`.
# An empty loader produces an empty array rather than a concatenate error.
if similarity_chunks:
    similarities = np.concatenate(similarity_chunks)
else:
    similarities = np.empty(0, dtype=np.float32)

# Local import: `auc` is not among the names imported at the top of the cell.
from sklearn.metrics import auc

# Compute AUC-ROC (Threshold-independent)
auc_roc = roc_auc_score(labels, similarities)
print(f"AUC-ROC: {auc_roc:.4f}")

# Compute PR-AUC (Threshold-independent)
# The area under the PR curve integrates precision (y) over recall (x).
# `auc` takes (x, y) and accepts the monotonically decreasing recall values
# that `precision_recall_curve` returns.
precision, recall, _ = precision_recall_curve(labels, similarities)
pr_auc = auc(recall, precision)
print(f"PR-AUC: {pr_auc:.4f}")

# Compute Average Precision (AP) (Threshold-independent)
ap_score = average_precision_score(labels, similarities)
print(f"Average Precision (AP): {ap_score:.4f}")

# Compute optimal threshold using Youden's J statistic (J = TPR - FPR):
# pick the ROC threshold at which J is largest.
fpr, tpr, thresholds = roc_curve(labels, similarities)
j_scores = tpr - fpr
optimal_threshold = thresholds[np.argmax(j_scores)]
print(f"Optimal Threshold: {optimal_threshold:.4f}")

# Binarise the scores at the optimal threshold and report F1 / precision / recall.
binary_predictions = (similarities >= optimal_threshold).astype(int)
f1 = f1_score(labels, binary_predictions)
precision_at_optimal = precision_score(labels, binary_predictions)
recall_at_optimal = recall_score(labels, binary_predictions)

print(f"F1-score (Optimal Threshold): {f1:.4f}")
print(f"Precision (Optimal Threshold): {precision_at_optimal:.4f}")
print(f"Recall (Optimal Threshold): {recall_at_optimal:.4f}")

# Plot ROC Curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC={auc_roc:.4f})', color='b')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # Chance-level diagonal
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.grid(True)
plt.show()

# Plot Precision-Recall Curve
plt.figure(figsize=(10, 6))
plt.plot(recall, precision, label=f'PR Curve (AP={ap_score:.4f})', color='g')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.grid(True)
plt.show()


False positive check

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# Load the model used for the false-positive check (a Hugging Face Hub id,
# not a local path).
model_name = "BAAI/bge-base-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()  # Set the model to evaluation mode
model = model.cuda()  # Move model to GPU

# Load the negative pairs DataFrame and drop rows missing either description.
negative_pairs_df = pd.read_csv('/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Thomas_AUC/negative_pairs.csv')
negative_pairs_df.dropna(subset=['desc1', 'desc2'], inplace=True)

# Embedding helper: tokenizes texts and returns one vector per text.
def compute_embeddings(text_list):
    """Return the position-0 hidden state of the model's first output for each text.

    Texts are padded and truncated to at most 512 tokens, the forward pass
    runs on the GPU with gradients disabled, and the result is moved back to
    the CPU before being returned.
    """
    tokens = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )
    inputs_on_gpu = {}
    for key, tensor in tokens.items():
        inputs_on_gpu[key] = tensor.cuda()
    with torch.no_grad():
        model_outputs = model(**inputs_on_gpu)
        # Index 0 selects the model's first output; [:, 0] keeps sequence position 0.
        first_token_states = model_outputs[0][:, 0]
    return first_token_states.cpu()

def cosine_similarity_matrix(embeddings_1, embeddings_2):
    """Return the matrix of cosine similarities between two sets of row vectors.

    Entry ``[i, j]`` is the cosine similarity between row ``i`` of
    `embeddings_1` and row ``j`` of `embeddings_2`. No epsilon is applied, so
    a zero-norm row produces NaN / inf entries.
    """
    # Column vector of row norms for the first set, row vector for the second;
    # their broadcast product is the matrix of pairwise norm products.
    row_norms = torch.norm(embeddings_1, dim=1, keepdim=True)
    col_norms = torch.norm(embeddings_2, dim=1, keepdim=True).T
    pairwise_dots = torch.mm(embeddings_1, embeddings_2.T)
    return pairwise_dots / (row_norms * col_norms)

# Process negative pairs in batches and keep those the model scores as similar.
batch_size = 16
# A negative pair whose cosine similarity exceeds this value counts as a false positive.
similarity_threshold = 0.55
false_positive_pairs = []
for start_idx in tqdm(range(0, len(negative_pairs_df), batch_size), desc="Processing negative pairs in batches"):
    # `iloc` slicing clamps at the end of the frame, so the last batch may be shorter.
    batch = negative_pairs_df.iloc[start_idx:start_idx + batch_size]
    texts_1 = batch['desc1'].tolist()
    texts_2 = batch['desc2'].tolist()
    embeddings_1 = compute_embeddings(texts_1)
    embeddings_2 = compute_embeddings(texts_2)
    # Only the similarity of each row with its own partner is needed, so compute
    # it row-wise instead of building the full batch-by-batch similarity matrix.
    pair_similarities = torch.nn.functional.cosine_similarity(embeddings_1, embeddings_2, dim=1).tolist()
    false_positive_pairs.extend(
        [desc1, desc2, similarity]
        for desc1, desc2, similarity in zip(texts_1, texts_2, pair_similarities)
        if similarity > similarity_threshold
    )

# Create DataFrame with false positive pairs
false_positive_pairs_df = pd.DataFrame(false_positive_pairs, columns=['desc1', 'desc2', 'cosine_similarity'])

# Save false positive pairs to CSV
false_positive_csv_path = '/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Thomas_AUC/false_positive_pairs.csv'
false_positive_pairs_df.to_csv(false_positive_csv_path, index=False)

print(f"CSV file with false positive pairs saved to: {false_positive_csv_path}")
