In [None]:
from collections import Counter

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Function for active learning main loop
def active_learning_loop(data, data_size, initial_size, batch_size, n_iterations):
    """Run a diversity-sampling active learning experiment and track F1 scores.

    Each iteration trains a TF-IDF + one-vs-rest logistic regression model on
    the current labeled set, scores it on a held-out validation split, and then
    moves a batch chosen by ``diversity_sampling_kmeans`` from the unlabeled
    pool into the labeled set.

    Args:
        data: DataFrame with a ``'TEXT'`` column and an ``'ICD9_CODE'`` column
            whose entries are iterables of label codes. It is not modified.
        data_size: Maximum number of pool samples (labeled + unlabeled) to use.
        initial_size: Number of pool samples that start out labeled.
        batch_size: Number of samples requested from the sampler per iteration.
        n_iterations: Maximum number of train/evaluate rounds.

    Returns:
        A tuple ``(micro_f1_list, macro_f1_list)`` with one validation score
        per completed round. The lists are shorter than ``n_iterations`` if
        the unlabeled pool runs out first.
    """
    # === Step 1 + 2: Select top-N labels (e.g., top 100) ===
    label_counts = Counter(
        code for label_list in data['ICD9_CODE'] for code in label_list
    )
    top_labels = {label for label, _ in label_counts.most_common(100)}

    # Build the filtered labels as a separate Series instead of assigning a
    # new column, so the caller's DataFrame is left untouched.
    filtered_labels = data['ICD9_CODE'].apply(
        lambda codes: [c for c in codes if c in top_labels]
    )
    has_labels = filtered_labels.map(len) > 0  # Remove samples with no labels

    # === Step 3: Construct training and validation sets ===
    texts = data.loc[has_labels, 'TEXT'].tolist()
    labels = filtered_labels[has_labels].tolist()
    X_pool, X_val, y_pool, y_val = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    # Optional: Limit dataset size for debugging
    X_pool = X_pool[:data_size]
    y_pool = y_pool[:data_size]

    # === Step 4: Initialize labeled / unlabeled pool ===
    X_labeled = X_pool[:initial_size]
    y_labeled = y_pool[:initial_size]
    X_unlabeled = X_pool[initial_size:]
    y_unlabeled = y_pool[initial_size:]

    # === Step 5: Initialize evaluation components ===
    vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
    # Fix the label space up front. Re-fitting the binarizer on the labeled
    # set each round would drop validation labels that have not been seen yet
    # and change the set of classes the F1 scores are averaged over, making
    # scores from different rounds incomparable.
    mlb = MultiLabelBinarizer(classes=sorted(top_labels))
    mlb.fit(labels)
    Y_val_bin = mlb.transform(y_val)  # Loop-invariant, so computed once
    micro_f1_list = []
    macro_f1_list = []

    # === Step 6: Active Learning main loop ===
    for i in range(n_iterations):
        print(f"Iter {i+1}/{n_iterations} | Labeled size: {len(X_labeled)}")

        # Train TF-IDF + LR model
        X_vec = vectorizer.fit_transform(X_labeled)
        Y_bin = mlb.transform(y_labeled)
        clf = OneVsRestClassifier(
            LogisticRegression(C=1.0, solver='saga', max_iter=1000), n_jobs=-1
        )
        clf.fit(X_vec, Y_bin)

        # Validate model on validation set
        X_val_vec = vectorizer.transform(X_val)
        Y_val_pred = clf.predict(X_val_vec)
        micro_f1 = f1_score(Y_val_bin, Y_val_pred, average='micro')
        macro_f1 = f1_score(Y_val_bin, Y_val_pred, average='macro')

        micro_f1_list.append(micro_f1)
        macro_f1_list.append(macro_f1)
        print(f"Micro-F1: {micro_f1:.4f}")
        print(f"Macro-F1: {macro_f1:.4f}")
        print()

        # A batch selected after the final evaluation would never be trained
        # on, and an empty pool has nothing left to select from.
        if i == n_iterations - 1 or not X_unlabeled:
            break

        # Convert unlabeled samples to TF-IDF vectors
        # NOTE(review): densifying the whole pool is memory-heavy; kept because
        # the sampler is passed a dense array here -- confirm whether it can
        # accept a sparse matrix.
        X_unlabeled_vec = vectorizer.transform(X_unlabeled).toarray()

        # Never ask for more samples (or clusters) than the pool still holds.
        query_size = min(batch_size, len(X_unlabeled))

        # === Diversity Sampling (KMeans) ===
        selected_idx = diversity_sampling_kmeans(
            embeddings=X_unlabeled_vec,
            batch_size=query_size,
            n_clusters=query_size,
            mode='centroid'  # You can replace this with 'border' or 'random'
        )

        # Add selected samples to labeled set. Pop from the highest index down
        # so earlier removals do not shift the positions still to be removed.
        for idx in sorted(selected_idx, reverse=True):
            X_labeled.append(X_unlabeled.pop(idx))
            y_labeled.append(y_unlabeled.pop(idx))

    return micro_f1_list, macro_f1_list

# Example usage:
# data = pd.read_pickle('path_to_data.pkl')
# micro_f1_list, macro_f1_list = active_learning_loop(data, data_size=5000, initial_size=300, batch_size=10, n_iterations=10)
