<a href="https://colab.research.google.com/github/isnanmulia/colab-machinelearning/blob/main/ML_SemiSupervised_PseudoLabeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This tutorial uses code from the following source, with several adjustments:
- https://www.geeksforgeeks.org/machine-learning/pseudo-labelling-semi-supervised-learning/

In [8]:
# Import required libraries
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [5]:
# Load the Iris dataset (150 samples, 4 features, 3 classes)
iris = load_iris()
X, y = iris.data, iris.target

# First split: keep 20% of the samples as the labeled set; the remaining 80%
# form the pool treated as unlabeled. Stratified on y so class proportions are kept.
X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(
    X, y, test_size=0.8, stratify=y, random_state=42
)

# Second split: carve 25% of the unlabeled pool off as a held-out validation set.
# X_unlabeled / y_unlabeled are rebound to the remaining 75% of that pool, so the
# final proportions of the full dataset are 20% labeled / 60% unlabeled / 20% validation.
X_unlabeled, X_val, y_unlabeled, y_val = train_test_split(
    X_unlabeled, y_unlabeled, test_size=0.25, stratify=y_unlabeled, random_state=42
)

# Summarize the sizes of the labeled and unlabeled training sets
print('Labeled Train Set:', X_labeled.shape, y_labeled.shape)
print('Unlabeled Train Set:', X_unlabeled.shape, y_unlabeled.shape)
# Summarize the held-out set size (printed as 'Test Set'; it is the validation split X_val / y_val)
print('Test Set:', X_val.shape, y_val.shape)

Labeled Train Set: (30, 4) (30,)
Unlabeled Train Set: (90, 4) (90,)
Test Set: (30, 4) (30,)


In [14]:
# Definition of pseudo labeling procedure
def pseudo_labeling(X_labeled, y_labeled, X_unlabeled, model, threshold=0.95, max_iters=5):
    """Iteratively grow the labeled set with confident predictions (pseudo-labels).

    Each iteration fits ``model`` on the current labeled set, predicts class
    probabilities for the remaining unlabeled samples, and moves every sample
    whose highest class probability is at least ``threshold`` into the labeled
    set, using the predicted class as its label.

    Parameters
    ----------
    X_labeled : array-like, one row per labeled sample
    y_labeled : array-like, one label per row of ``X_labeled``
    X_unlabeled : array-like, one row per unlabeled sample
    model : estimator exposing ``fit(X, y)`` and ``predict_proba(X)``; if it
        also exposes ``classes_`` after fitting, that attribute is used to
        translate probability columns back into class labels.
    threshold : float, minimum top-class probability for a pseudo-label
    max_iters : int, maximum number of fit/pseudo-label rounds

    Returns
    -------
    The same ``model`` object, fitted on the final labeled set (original
    labels plus every accepted pseudo-label). The caller's arrays are not
    modified; the grown sets live only inside this function.
    """
    # Work on arrays so boolean-mask indexing below behaves the same for
    # list inputs as for ndarray inputs.
    X_labeled = np.asarray(X_labeled)
    y_labeled = np.asarray(y_labeled)
    X_unlabeled = np.asarray(X_unlabeled)

    # True while pseudo-labels have been added that the model has not yet seen.
    pending_refit = False

    for iteration in range(max_iters):
        print(f"\nIteration {iteration + 1}")

        # Train model on labeled data
        model.fit(X_labeled, y_labeled)
        pending_refit = False

        # Nothing left to pseudo-label: predicting on an empty array would
        # raise in most estimators, so stop cleanly instead.
        if len(X_unlabeled) == 0:
            print("No unlabeled samples left. Stopping.")
            break

        # Predict on unlabeled data
        probs = model.predict_proba(X_unlabeled)
        best_columns = np.argmax(probs, axis=1)
        confidences = np.max(probs, axis=1)

        # predict_proba columns follow model.classes_, so map the column index
        # back to the actual label; a bare argmax is only correct when the
        # labels happen to be 0..k-1.
        classes = getattr(model, "classes_", None)
        preds = best_columns if classes is None else np.asarray(classes)[best_columns]

        # Select high-confidence pseudo-labels
        mask = confidences >= threshold
        X_pseudo = X_unlabeled[mask]
        y_pseudo = preds[mask]

        if len(X_pseudo) == 0:
            print("No high-confidence pseudo-labels found. Stopping.")
            break

        # Combine with original labeled data
        X_labeled = np.vstack([X_labeled, X_pseudo])
        y_labeled = np.concatenate([y_labeled, y_pseudo])

        # Remove pseudo-labeled examples from unlabeled set
        X_unlabeled = X_unlabeled[~mask]
        pending_refit = True

        print(f"Added {len(X_pseudo)} pseudo-labeled samples.")
        print('Labeled Train Set:', X_labeled.shape, y_labeled.shape)

    # If max_iters ran out right after adding pseudo-labels, the model has not
    # been trained on the last batch yet; fit once more so the returned model
    # reflects the full labeled set.
    if pending_refit:
        model.fit(X_labeled, y_labeled)

    return model

In [15]:
# Initialize model. A fixed random_state (matching the data splits) makes the
# forest, the selected pseudo-labels and the reported metrics reproducible.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Run pseudo-labelling
final_model = pseudo_labeling(X_labeled, y_labeled, X_unlabeled, model, threshold=0.95)

# Evaluate on held-out validation set
y_pred = final_model.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, y_pred))
# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_val, y_pred)
print(cm)


Iteration 1
Added 66 pseudo-labeled samples.
Labeled Train Set: (96, 4) (96,)

Iteration 2
No high-confidence pseudo-labels found. Stopping.

Validation Accuracy: 0.9666666666666667
[[10  0  0]
 [ 0 10  0]
 [ 0  1  9]]
