# Semi-supervised Data

In [None]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.semi_supervised import SelfTrainingClassifier, LabelPropagation, LabelSpreading

# Synthetic binary classification problem: 1000 samples with 20 features
# (15 of them informative), split 70/30 into train and test sets.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_classes=2,
    random_state=42,
)
X_train, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Build the semi-supervised targets: the first n_labeled entries of a seeded
# random ordering of the training rows keep their true label; every other
# row is overwritten with -1, which marks it as unlabeled.
n_labeled = 100
rng = np.random.RandomState(42)
indices = rng.permutation(len(X_train))
y_train = y_train_full.copy()
y_train[indices[n_labeled:]] = -1

# Supervised baseline: fit an SVC on the n_labeled labeled rows only and
# ignore the unlabeled remainder of the training set entirely.
labeled_rows = indices[:n_labeled]
svc_naive = SVC(probability=True).fit(
    X_train[labeled_rows], y_train_full[labeled_rows]
)
y_pred_naive = svc_naive.predict(X_test)
naive_accuracy = accuracy_score(y_test, y_pred_naive)
print("Naive SVC accuracy:", naive_accuracy)

# Self-training: wrap an SVC so that it is refit iteratively, with the
# k_best=50 unlabeled samples it predicts most confidently being
# pseudo-labeled and added to the training set on each iteration.
#
# The base estimator is passed positionally on purpose: the keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.6, and the
# positional form is accepted on both sides of that rename.
#
# probability=True is needed because self-training ranks samples with
# predict_proba. The SVC is seeded because its probability calibration is
# randomized; without a seed the pseudo-label selection, and therefore the
# reported accuracy, can change from run to run.
self_training = SelfTrainingClassifier(
    SVC(probability=True, random_state=42),
    criterion='k_best',
    k_best=50,
)
self_training.fit(X_train, y_train)
y_pred_self = self_training.predict(X_test)
print("Self-training accuracy:", accuracy_score(y_test, y_pred_self))

# Label Propagation with an RBF kernel (gamma=0.25), fitted on the full
# training set where unlabeled rows carry the -1 marker in y_train.
label_prop = LabelPropagation(kernel='rbf', gamma=0.25).fit(X_train, y_train)
y_pred_lp = label_prop.predict(X_test)
lp_accuracy = accuracy_score(y_test, y_pred_lp)
print("Label Propagation accuracy:", lp_accuracy)

# Label Spreading with an RBF kernel (gamma=0.25), fitted on the full
# training set where unlabeled rows carry the -1 marker in y_train.
label_spread = LabelSpreading(kernel='rbf', gamma=0.25).fit(X_train, y_train)
y_pred_ls = label_spread.predict(X_test)
ls_accuracy = accuracy_score(y_test, y_pred_ls)
print("Label Spreading accuracy:", ls_accuracy)


Naive SVC accuracy: 0.82
Self-training accuracy: 0.82
Label Propagation accuracy: 0.7733333333333333
Label Spreading accuracy: 0.7933333333333333


