<a href="https://colab.research.google.com/github/isnanmulia/colab-machinelearning/blob/main/ML_SemiSupervised_LabelSpreading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This tutorial uses codes from these sources, with several adjustments:
- https://sparkbyexamples.com/machine-learning/semi-supervised-learning-with-example/

In [21]:
# Import necessary modules
from numpy import concatenate
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import accuracy_score, confusion_matrix

In [23]:
# Load the iris dataset
iris = load_iris()

# Split the data into labeled and unlabeled subsets
X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(iris.data, iris.target, random_state=1, test_size=0.8, stratify=iris.target)

In [24]:
# Create the Label Spreading model
model = LabelSpreading(kernel='knn', alpha=0.8)

# Fit the model using both labeled and unlabeled data
model.fit(X_labeled, y_labeled)

In [25]:
# Predict labels for the unlabeled data
y_pred = model.predict(X_unlabeled)

# Compute the accuracy of the model
accuracy = accuracy_score(y_unlabeled, y_pred)
print('Accuracy:', accuracy)
# confusion matrix
cm = confusion_matrix(y_unlabeled, y_pred)
print(cm)

Accuracy: 0.9166666666666666
[[40  0  0]
 [ 0 38  2]
 [ 0  8 32]]


In [27]:
# Define dataset (just like on LabelPropagation)
# split into train and test
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.50, random_state=1, stratify=iris.target)
# split train into labeled and unlabeled
X_train_lab, X_test_unlab, y_train_lab, y_test_unlab = train_test_split(X_train, y_train, test_size=0.50, random_state=1, stratify=y_train)
# summarize training set size
print('Labeled Train Set:', X_train_lab.shape, y_train_lab.shape)
print('Unlabeled Train Set:', X_test_unlab.shape, y_test_unlab.shape)
# summarize test set size
print('Test Set:', X_test.shape, y_test.shape)

Labeled Train Set: (37, 4) (37,)
Unlabeled Train Set: (38, 4) (38,)
Test Set: (75, 4) (75,)


In [28]:
# Label Spreading
# Prepare the data
# create the training dataset input
X_train_mixed = concatenate((X_train_lab, X_test_unlab))
# create "no label" for unlabeled data
nolabel = [-1 for _ in range(len(y_test_unlab))]
# recombine training dataset labels
y_train_mixed = concatenate((y_train_lab, nolabel))

# define model
model = LabelSpreading(kernel='knn', alpha=0.8)
# fit model on training dataset
model.fit(X_train_mixed, y_train_mixed)

print(y_train_mixed)
print(model.transduction_)

[ 1  1  1  1  2  1  1  0  2  0  0  0  2  2  1  2  1  0  2  1  2  2  0  2
  2  1  2  0  0  0  0  0  1  2  1  0  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1]
[1 1 1 1 2 1 1 0 2 0 0 0 2 2 1 2 1 0 2 1 2 2 0 2 2 1 2 0 0 0 0 0 1 2 1 0 0
 1 2 1 2 0 2 2 1 0 2 1 1 2 0 0 0 0 0 2 1 2 2 2 1 1 0 1 0 1 2 1 2 0 0 2 1 1
 0]


In [29]:
# make predictions on hold out test set
yhat = model.predict(X_test)
# calculate score for test set
score = accuracy_score(y_test, yhat)
# summarize score
print('Accuracy: %.3f' % (score*100))
# confusion matrix
cm = confusion_matrix(y_test, yhat)
print(cm)

Accuracy: 96.000
[[25  0  0]
 [ 0 23  2]
 [ 0  1 24]]
