In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.utils import resample
from strategies import *
import random
from collections import Counter
from Classifier import TFIDF_LR_Classifier

In [None]:
# Function for active learning main loop
def active_learning_loop(data, data_size, initial_size, batch_size, n_iterations, method='diversity sampling', n_committee=None, p_min=0.1):
    """Run a pool-based active learning experiment and record validation F1.

    The 100 most frequent codes found in ``data['ICD9_CODE']`` are kept as the
    label set; samples left without any label are dropped. The remaining
    samples are split 80/20 (``random_state=42``) into a pool and a validation
    set. Each iteration trains a fresh ``TFIDF_LR_Classifier`` on the labeled
    set, evaluates it on the validation set, then moves the samples chosen by
    the query strategy from the unlabeled pool to the labeled set.

    Args:
        data: Table-like object with a ``'TEXT'`` column and an
            ``'ICD9_CODE'`` column whose entries are iterables of codes.
            It is not modified.
        data_size: Maximum number of pool samples to use (the pool is
            truncated to its first ``data_size`` entries).
        initial_size: Number of pool samples that start out labeled.
        batch_size: Number of samples requested from the strategy per
            iteration (clipped to the remaining pool size).
        n_iterations: Number of train/evaluate/query rounds.
        method: One of ``'diversity sampling'``, ``'uncertainty sampling'``,
            ``'query by committee'``, ``'expected model change'``, ``'iwal'``.
        n_committee: Committee size for ``'query by committee'``; defaults
            to 3 when ``None``. Ignored by the other methods.
        p_min: Forwarded to ``iwal_sampling``; only used by ``'iwal'``.

    Returns:
        A tuple ``(micro_f1_list, macro_f1_list)`` with one entry per
        completed iteration. Fewer than ``n_iterations`` entries are returned
        if the unlabeled pool runs out.

    Raises:
        ValueError: If ``method`` is not one of the supported strategies.
    """
    # === Step 1: Select top-N labels (e.g., top 100) ===
    all_labels = [code for label_list in data['ICD9_CODE'] for code in label_list]
    top_labels = {label for label, _ in Counter(all_labels).most_common(100)}

    # === Step 2: Filter labels and drop samples with no labels ===
    # Built as plain lists so the caller's DataFrame is never written to.
    texts = []
    labels = []
    for text, codes in zip(data['TEXT'], data['ICD9_CODE']):
        kept_codes = [c for c in codes if c in top_labels]
        if kept_codes:
            texts.append(text)
            labels.append(kept_codes)

    # === Step 3: Construct training and validation sets ===
    X_pool, X_val, y_pool, y_val = train_test_split(texts, labels, test_size=0.2, random_state=42)

    # Optional: Limit dataset size for debugging
    X_pool = X_pool[:data_size]
    y_pool = y_pool[:data_size]

    # === Step 4: Initialize labeled / unlabeled pool ===
    # Slicing creates new lists, so the in-place pops/appends below do not
    # touch X_pool / y_pool.
    X_labeled = X_pool[:initial_size]
    y_labeled = y_pool[:initial_size]
    X_unlabeled = X_pool[initial_size:]
    y_unlabeled = y_pool[initial_size:]

    # === Step 5: Initialize evaluation components ===
    micro_f1_list = []
    macro_f1_list = []

    # === Step 6: Active Learning main loop ===
    for i in range(n_iterations):
        print(f"Iter {i+1}/{n_iterations} | Labeled size: {len(X_labeled)}")

        # Train TF-IDF + LR model
        clf = TFIDF_LR_Classifier()
        clf.fit(X_labeled, y_labeled)

        # Validate model on validation set
        eval_metrics = clf.evaluate(X_val, y_val)
        micro_f1_list.append(eval_metrics['micro_f1'])
        macro_f1_list.append(eval_metrics['macro_f1'])
        print(f"Micro-F1: {eval_metrics['micro_f1']:.4f} | Macro-F1: {eval_metrics['macro_f1']:.4f}\n")

        # Nothing left to query: further rounds would retrain on the same data.
        if not X_unlabeled:
            print("Unlabeled pool exhausted; stopping early.")
            break

        # Never request more samples than the pool holds (KMeans, for one,
        # requires n_clusters <= n_samples).
        query_size = min(batch_size, len(X_unlabeled))

        if method == 'diversity sampling':
            # === Diversity Sampling (KMeans) ===
            embeddings = clf.vectorizer.transform(X_unlabeled).toarray()
            selected_idx = diversity_sampling_kmeans(
                embeddings=embeddings,
                batch_size=query_size,
                n_clusters=query_size,
                mode='centroid'  # You can replace this with 'border' or 'random'
            )

        elif method == 'uncertainty sampling':
            # === Uncertainty Sampling ===
            probs = clf.predict_proba(X_unlabeled)
            selected_idx = uncertainty_sampling(
                probabilities=probs,
                batch_size=query_size,
                mode='least_confidence')  # You can replace this with 'binary_entropy' or 'smallest_margin'

        elif method == 'query by committee':
            # === QBC: train multiple committee models on bootstrap resamples ===
            if n_committee is None:
                n_committee = 3
            comm_probs_list = []
            for _ in range(n_committee):
                # Sample the labeled set with replacement so members differ.
                indices = np.random.choice(len(X_labeled), size=len(X_labeled), replace=True)
                X_sample = [X_labeled[j] for j in indices]
                y_sample = [y_labeled[j] for j in indices]
                member = TFIDF_LR_Classifier()
                member.fit(X_sample, y_sample)
                comm_probs_list.append(member.predict_proba(X_unlabeled))

            # === QBC ===
            selected_idx = query_by_committee(
                comm_probs_list=comm_probs_list,
                batch_size=query_size,
                mode='vote_entropy'  # or 'kl_divergence'
            )

        elif method == 'expected model change':
            # === Expected Model Change ===
            X_unlabeled_vec = clf.vectorizer.transform(X_unlabeled)
            selected_idx = expected_model_change(
                clf=clf.model,
                X_unlabeled_vec=X_unlabeled_vec,
                batch_size=query_size
            )

        elif method == 'iwal':
            # === IWAL ===
            # May not successfully run since base learner LR is very slow in
            # this AL strategy. SGDClassifier is better.
            X_unlabeled_vec = clf.vectorizer.transform(X_unlabeled)
            selected_idx, _ = iwal_sampling(clf.model, X_unlabeled_vec, batch_size=query_size, p_min=p_min)

        else:
            raise ValueError(f"Unknown method: {method}")

        # Add selected samples to labeled set. Indices are de-duplicated and
        # popped from highest to lowest so earlier pops do not shift the
        # positions of indices still to be removed.
        for idx in sorted({int(k) for k in selected_idx}, reverse=True):
            X_labeled.append(X_unlabeled.pop(idx))
            y_labeled.append(y_unlabeled.pop(idx))

    return micro_f1_list, macro_f1_list

# Example usage (requires `import pandas as pd`; `data` must provide the
# 'TEXT' and 'ICD9_CODE' columns):
# data = pd.read_pickle('path_to_data.pkl')
# micro_f1_list, macro_f1_list = active_learning_loop(data, data_size=5000, initial_size=300, batch_size=10, n_iterations=10)
