In [None]:
!pip install transformers safetensors pandas numpy scikit-learn scipy tqdm

In [None]:
# Shell out to nvidia-smi to check which GPU (if any) is visible on this machine.
!nvidia-smi

In [None]:
!pip uninstall torch torchvision torchaudio -y

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [None]:
# Print the installed torch version and whether it can see a CUDA device.
# NOTE(review): `!python` launches whichever interpreter is first on PATH —
# presumably the kernel's own; confirm, otherwise this checks a different install.
!python -c "import torch; print(torch.__version__); print(torch.cuda.is_available())"

In [None]:
import pandas as pd
import numpy as np
from transformers import BertTokenizerFast, BertForSequenceClassification
from safetensors.torch import load_file
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix)
from scipy.stats import bootstrap
from tqdm import tqdm

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Directory holding the fine-tuned checkpoint (model.safetensors is read from it below).
model_dir = "bert_base_v2_Jun26"
# NOTE(review): the tokenizer is loaded from the stock 'bert-base-uncased' files, not
# from model_dir — confirm the checkpoint was fine-tuned with this vocabulary.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(model_dir, state_dict=None)
# Load the safetensors weights explicitly into the model.
# NOTE(review): from_pretrained(model_dir) presumably loads these weights already —
# confirm whether this second load is needed.
state_dict = load_file(f"{model_dir}/model.safetensors")
model.load_state_dict(state_dict)
model.to(device)
# Switch to evaluation mode before running inference.
model.eval()

In [None]:
# Load the dataset (text snippets plus their labels) and preview the first rows.
df = pd.read_csv("DATA-batch-2-comparison-2-final-for-NLP-classifier-JC-allData-v2.csv")
df.head()

In [None]:
# Model inputs (raw text as a Python list) and the ground-truth labels, both in row order.
texts = df["Text_snippet"].to_list()
labels = df["label"].values

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the partition reproducible.
# NOTE(review): the same split is recomputed further down with the row indices attached.
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [None]:
def encode(texts):
    """Tokenize a batch of strings with the notebook-level `tokenizer`.

    Sequences are truncated and padded, and the result is returned as
    PyTorch tensors ready to be passed to the model.
    """
    tokenizer_options = {"truncation": True, "padding": True, "return_tensors": "pt"}
    return tokenizer(texts, **tokenizer_options)

In [None]:
def predict(model, texts, batch_size=16, threshold=0.5):
    """Run batched inference and return hard predictions and class-1 probabilities.

    Parameters
    ----------
    model : sequence-classification model whose output exposes `.logits`
    texts : list of str
        Inputs, tokenized batch by batch with `encode`.
    batch_size : int
        Number of texts scored per forward pass.
    threshold : float
        A text is predicted as class 1 when its class-1 probability is strictly
        greater than this value (default 0.5, the previous hard-coded cut-off).

    Returns
    -------
    (preds, probs) : tuple of np.ndarray
        `preds` holds 0/1 integers, `probs` the softmax probability of class 1,
        both in the order of `texts`.
    """
    preds, probs = [], []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]
        # Move every tokenizer output tensor to the device the model runs on.
        inputs = {key: tensor.to(device) for key, tensor in encode(batch).items()}
        with torch.no_grad():
            logits = model(**inputs).logits
            # Column 1 of the softmax is the probability of the positive class.
            probs_batch = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
        preds.extend((probs_batch > threshold).astype(int))
        probs.extend(probs_batch)
    # dtype=int keeps the prediction array integral even when `texts` is empty.
    return np.array(preds, dtype=int), np.array(probs)

In [None]:
# Score both splits: predict returns the hard 0/1 predictions and the class-1 probabilities.
y_pred_train, y_proba_train = predict(model, X_train)
y_pred_test, y_proba_test = predict(model, X_test)

In [None]:
# Training split: start from the texts and gold labels, then attach the model outputs.
df_train = pd.DataFrame({"Text_snippet": X_train, "labels": y_train})
df_train["preds"] = y_pred_train
df_train["probs"] = y_proba_train
df_train.to_csv("train_with_preds.csv", index=False)

# Test split: same column layout, written to its own file.
df_test = pd.DataFrame({"Text_snippet": X_test, "labels": y_test})
df_test["preds"] = y_pred_test
df_test["probs"] = y_proba_test
df_test.to_csv("test_with_preds.csv", index=False)

In [None]:
# Repeat the split with the same test_size and random_state, this time also splitting
# df.index, so every prediction can be written back to the row it came from.
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    texts, labels, df.index, test_size=0.2, random_state=42
)

In [None]:
# Run inference again on the re-created splits.
# NOTE(review): the split parameters are unchanged, so this appears to repeat the
# earlier predict calls — confirm before removing either one.
y_pred_train, y_proba_train = predict(model, X_train)
y_pred_test, y_proba_test = predict(model, X_test)

In [None]:
# Re-read the dataset from disk so the prediction columns are added to a freshly loaded frame.
df = pd.read_csv("DATA-batch-2-comparison-2-final-for-NLP-classifier-JC-allData-v2.csv")

In [None]:
# Create placeholder columns (NaN / empty string); they are filled per split in the next cells.

df["preds"] = np.nan
df["probs"] = np.nan
df["split"] = ""

In [None]:
# Write the training-split outputs back to their source rows, located via idx_train.

df.loc[idx_train, "preds"] = y_pred_train
df.loc[idx_train, "probs"] = y_proba_train
df.loc[idx_train, "split"] = "train"

In [None]:
# Same for the test split, located via idx_test.
df.loc[idx_test, "preds"] = y_pred_test
df.loc[idx_test, "probs"] = y_proba_test
df.loc[idx_test, "split"] = "test"

In [None]:
# Persist the full dataset together with predictions, probabilities and split assignment.
df.to_csv("all_data_v2_with_preds.csv", index=False)

In [None]:
# Preview the first rows of the augmented frame.
df.head()

In [None]:
# List the column names now present in the frame.
df.columns

In [None]:
# Ensure correct types: "preds" was created as a NaN (float) placeholder column, so it
# is cast back to int here. astype(int) raises if any row is still NaN, i.e. if a row
# was assigned to neither split.
df["label"] = df["label"].astype(int)
df["preds"] = df["preds"].astype(int)
df["probs"] = df["probs"].astype(float)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report
)

from scipy import stats
import warnings
warnings.filterwarnings('ignore')

In [None]:
def bootstrap_metric(y_true, y_pred, y_proba, metric_func, n_bootstrap=1000, confidence_level=0.95,
                     use_proba=None):
    """
    Calculate a percentile-bootstrap confidence interval for a given metric.

    Parameters
    ----------
    y_true, y_pred : array-like or pandas Series
        Ground-truth labels and hard predictions, aligned by position.
    y_proba : array-like, pandas Series or None
        Scores/probabilities aligned with `y_true`; only used for score-based metrics.
    metric_func : callable(y_true, y_score) -> float
        Metric evaluated on every resample.
    n_bootstrap : int
        Number of resamples drawn with replacement.
    confidence_level : float
        Coverage of the interval, e.g. 0.95 for a 95% interval.
    use_proba : bool or None
        Whether `metric_func` receives the resampled `y_proba` instead of `y_pred`.
        None (default) keeps the historical rule: probabilities are used only when
        the function is literally named ``roc_auc_score``. Pass True explicitly for
        lambdas or partials that wrap a score-based metric.

    Returns
    -------
    (ci_lower, ci_upper) : tuple of float
        Lower and upper percentile bounds. (nan, nan) when no resample produced a
        usable score (empty input, missing probabilities, or the metric failing on
        every resample).
    """
    # Private generator seeded exactly like the former np.random.seed(42) call: the
    # resamples are unchanged, but the caller's global NumPy RNG state is not touched.
    rng = np.random.RandomState(42)
    n_samples = len(y_true)

    if use_proba is None:
        # getattr: functools.partial objects and other callables may lack __name__.
        use_proba = getattr(metric_func, '__name__', None) == 'roc_auc_score'

    def _take(values, indices):
        # Positional selection for pandas objects, NumPy arrays and plain sequences.
        if hasattr(values, 'iloc'):
            return values.iloc[indices]
        return np.asarray(values)[indices]

    y_score = y_proba if use_proba else y_pred

    bootstrap_scores = []
    if n_samples > 0 and y_score is not None:
        for _ in range(n_bootstrap):
            indices = rng.choice(n_samples, n_samples, replace=True)
            y_true_boot = _take(y_true, indices)
            y_score_boot = _take(y_score, indices)
            try:
                score = float(metric_func(y_true_boot, y_score_boot))
            except Exception:
                # Best effort: a resample on which the metric is undefined (e.g. AUC
                # with a single class) is skipped. Unlike a bare `except`, this still
                # lets KeyboardInterrupt stop a long-running bootstrap.
                continue
            if not np.isnan(score):
                bootstrap_scores.append(score)

    if not bootstrap_scores:
        return np.nan, np.nan

    alpha = 1 - confidence_level
    lower_percentile = (alpha / 2) * 100
    upper_percentile = (1 - alpha / 2) * 100

    ci_lower = np.percentile(bootstrap_scores, lower_percentile)
    ci_upper = np.percentile(bootstrap_scores, upper_percentile)

    return ci_lower, ci_upper


In [None]:
def calculate_binary_metrics(y_true, y_pred, y_proba=None, pos_label=1):
    """
    Calculate binary classification metrics from 0/1 labels and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like of 0/1
        Ground truth and hard predictions; 1 is treated as the positive class.
    y_proba : array-like or None
        Scores for the positive class. When given, the ROC AUC is computed from them.
    pos_label : int
        Currently unused; kept so existing callers that pass it keep working.

    Returns
    -------
    dict
        Keys 'precision', 'recall', 'f1', 'npv', 'ppv', 'sensitivity',
        'specificity' and 'auc'. A ratio whose denominator is zero is reported
        as 0. 'auc' is NaN when `y_proba` is None or when `y_true` contains
        fewer than two classes (the AUC is undefined in that case).
    """
    # Confusion matrix; labels=[0, 1] fixes the 2x2 layout even if a class is absent.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # Basic metrics
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # PPV is the same as precision, sensitivity is the same as recall
    ppv = precision
    sensitivity = recall

    # AUC: only defined when scores are available and both classes occur in y_true.
    # Checking this up front reports NaN instead of letting roc_auc_score fail on a
    # single-class sample.
    auc = np.nan
    if y_proba is not None and np.unique(np.asarray(y_true)).size > 1:
        auc = roc_auc_score(y_true, y_proba)

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'npv': npv,
        'ppv': ppv,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'auc': auc
    }

In [None]:
def calculate_metrics_with_ci(df, split_type='train'):
    """
    Calculate per-label and weighted metrics with bootstrap confidence intervals
    for one split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'label', 'preds' and 'split'. An optional 'probs' column is
        taken to be the probability of label 1 (this is what `predict` stores);
        AUC is only reported when it is present.
    split_type : str
        Value of the 'split' column to evaluate.

    Returns
    -------
    dict or None
        None when the split has no rows. Otherwise one 'label_<value>' entry per
        label found in the split plus a 'weighted_average' entry. Every entry maps
        a metric name to a dict with 'value', 'ci_lower', 'ci_upper' and
        'ci_formatted'.

    Notes
    -----
    AUC is computed one-vs-rest from a single probability column, so for every
    label other than 1 the score is flipped to ``1 - probs``. This is exact for a
    binary task; a single column cannot support AUC for more than two labels.
    """
    # Filter data by split
    data = df[df['split'] == split_type].copy()

    if len(data) == 0:
        return None

    y_true = data['label']
    y_pred = data['preds']
    y_proba = data['probs'] if 'probs' in data.columns else None

    unique_labels = sorted(y_true.unique())
    results = {}

    # Threshold-based metrics, evaluated on a one-vs-rest 0/1 encoding.
    # Built once: the mapping does not depend on the label being processed.
    metric_names = ['precision', 'recall', 'f1', 'npv', 'ppv', 'sensitivity', 'specificity']
    metric_funcs = {
        'precision': lambda yt, yp: precision_score(yt, yp, zero_division=0),
        'recall': lambda yt, yp: recall_score(yt, yp, zero_division=0),
        'f1': lambda yt, yp: f1_score(yt, yp, zero_division=0),
        'npv': lambda yt, yp: calculate_binary_metrics(yt, yp)['npv'],
        'ppv': lambda yt, yp: precision_score(yt, yp, zero_division=0),  # Same as precision
        'sensitivity': lambda yt, yp: recall_score(yt, yp, zero_division=0),  # Same as recall
        'specificity': lambda yt, yp: calculate_binary_metrics(yt, yp)['specificity']
    }

    def make_entry(value, ci_lower, ci_upper):
        # Common result layout shared by per-label and weighted metrics.
        return {
            'value': value,
            'ci_lower': ci_lower,
            'ci_upper': ci_upper,
            'ci_formatted': f"{value:.3f} ({ci_lower:.3f}-{ci_upper:.3f})"
        }

    # Calculate metrics for each label
    for label in unique_labels:
        # Convert to binary classification (current label vs rest)
        y_true_binary = (y_true == label).astype(int)
        y_pred_binary = (y_pred == label).astype(int)

        metrics = calculate_binary_metrics(y_true_binary, y_pred_binary)

        label_results = {}
        for metric_name in metric_names:
            ci_lower, ci_upper = bootstrap_metric(
                y_true_binary, y_pred_binary, None, metric_funcs[metric_name]
            )
            label_results[metric_name] = make_entry(metrics[metric_name], ci_lower, ci_upper)

        # AUC (only if probabilities are available)
        if y_proba is not None:
            # 'probs' is P(label == 1). When another label is the one-vs-rest target
            # the score must be flipped; otherwise its AUC comes out as 1 - AUC.
            y_score = y_proba if label == 1 else 1 - y_proba
            try:
                auc_value = roc_auc_score(y_true_binary, y_score)
                # roc_auc_score is passed directly (not wrapped in a lambda) so that
                # bootstrap_metric recognises it and resamples the scores.
                ci_lower, ci_upper = bootstrap_metric(
                    y_true_binary, y_pred_binary, y_score, roc_auc_score
                )
                label_results['auc'] = make_entry(auc_value, ci_lower, ci_upper)
            except Exception:
                # AUC is undefined for this label (e.g. only one class in the split).
                label_results['auc'] = {
                    'value': np.nan,
                    'ci_lower': np.nan,
                    'ci_upper': np.nan,
                    'ci_formatted': "N/A"
                }

        results[f'label_{label}'] = label_results

    # Calculate weighted averages (weights = label frequencies in this split)
    label_counts = y_true.value_counts()
    total_count = len(y_true)

    def make_weighted_metric(metric_name):
        # Bind metric_name now so the returned function does not depend on the loop variable.
        per_label_metric = metric_funcs[metric_name]

        def weighted_metric_func(y_true_sample, y_pred_sample):
            label_counts_sample = pd.Series(y_true_sample).value_counts()
            total_count_sample = len(y_true_sample)
            weighted_sum_sample = 0

            for label in unique_labels:
                # A label can be missing from an individual resample.
                if label in label_counts_sample:
                    weight = label_counts_sample[label] / total_count_sample
                    y_true_binary_sample = (y_true_sample == label).astype(int)
                    y_pred_binary_sample = (y_pred_sample == label).astype(int)
                    weighted_sum_sample += weight * per_label_metric(
                        y_true_binary_sample, y_pred_binary_sample
                    )

            return weighted_sum_sample

        return weighted_metric_func

    weighted_results = {}
    for metric_name in metric_names + ['auc']:
        if metric_name == 'auc' and y_proba is None:
            continue

        weighted_sum = 0
        for label in unique_labels:
            weight = label_counts[label] / total_count
            metric_value = results[f'label_{label}'][metric_name]['value']
            if not np.isnan(metric_value):
                weighted_sum += weight * metric_value

        if metric_name != 'auc':
            ci_lower, ci_upper = bootstrap_metric(
                y_true, y_pred, None, make_weighted_metric(metric_name)
            )
        else:
            # With scores oriented per label, the one-vs-rest AUCs of a binary task
            # coincide, so the weighted AUC equals the AUC of label 1 against the
            # probabilities. Bootstrap that quantity, on probabilities rather than
            # on hard predictions.
            try:
                ci_lower, ci_upper = bootstrap_metric(
                    (y_true == 1).astype(int), y_pred, y_proba, roc_auc_score
                )
            except Exception:
                ci_lower, ci_upper = np.nan, np.nan

        weighted_results[metric_name] = make_entry(weighted_sum, ci_lower, ci_upper)

    results['weighted_average'] = weighted_results

    return results

In [None]:
# Main function to calculate all metrics

def calculate_all_metrics(df):
    """
    Compute the metric report for the 'train' and 'test' splits of `df`.

    Raises ValueError when any of the 'label', 'preds' or 'split' columns is
    absent. A split that yields no results is left out of the returned dict.
    """
    absent = [name for name in ('label', 'preds', 'split') if name not in df.columns]
    if absent:
        raise ValueError(f"Missing required columns: {absent}")

    per_split = {}
    for split_name in ('train', 'test'):
        split_metrics = calculate_metrics_with_ci(df, split_name)
        # Skip splits for which nothing was computed (None or empty result).
        if split_metrics:
            per_split[split_name] = split_metrics

    return per_split



In [None]:
# Function to display results in a formatted way

def display_results(results):
    """
    Print the metric report produced by `calculate_all_metrics`.

    For every split a banner is printed, followed by one section per label and,
    when present, a final section with the weighted averages. Each metric line
    shows the pre-formatted 'ci_formatted' string.
    """
    banner = '=' * 60
    rule = "-" * 40

    def print_metric_lines(metric_table):
        # One right-aligned, upper-cased metric name per line.
        for name, entry in metric_table.items():
            print(f"{name.upper():>12}: {entry['ci_formatted']}")

    for split_name, split_results in results.items():
        print(f"\n{banner}")
        print(f"RESULTS FOR {split_name.upper()} SET")
        print(banner)

        # Per-label sections first; the weighted averages are shown last.
        per_label = {
            key: table for key, table in split_results.items()
            if key != 'weighted_average'
        }
        for label_key, label_table in per_label.items():
            print(f"\n{label_key.upper()}:")
            print(rule)
            print_metric_lines(label_table)

        if 'weighted_average' in split_results:
            print("\nWEIGHTED AVERAGES:")
            print(rule)
            print_metric_lines(split_results['weighted_average'])

 



In [None]:
# Calculate all metrics (per label and weighted, each with a bootstrap CI) for the train and test splits

results = calculate_all_metrics(df)

In [None]:
# Display results: print the formatted metric table for every split

display_results(results)

In [None]:
# Ignore the AUC values above: they are inconsistent, and the probabilities are likely inverted.