# Active Learning using KMeans Clustering

In [1]:
import numpy as np
from sklearn.datasets import fetch_covtype
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

### Load the Forest Covertype Dataset

In [2]:
# load dataset
x, y = fetch_covtype(return_X_y=True)
NUM_CLASSES = 7

In [3]:
# pre-processing, scale features between 0 and 1
scaler = MinMaxScaler()
x = scaler.fit_transform(x)

In [4]:
# divide the whole dataset in train & test splits
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.40, random_state = 4)

# further divide the train data in labelled & unlabelled data
# retain only 3% of labelled points for training
NUM_EXAMPLES = len(X_train)
X_labelled, X_unlabelled, y_labelled, y_oracle = train_test_split(X_train, y_train, test_size = 0.90, random_state = 4)
X_cluster, _, y_cluster, _ = train_test_split(X_unlabelled, y_oracle, test_size = 0.60, random_state = 4)

In [5]:
### Function for finding cluster labels by labelling 20% points randomly

In [6]:
def get_labels_for_cluster(kmeans, labels):
    cluster_labels = [None] * NUM_CLASSES

    for label in range(NUM_CLASSES):
        # get args for all points with the current label
        args = np.argwhere(kmeans.labels_== label)
        # randomly choose 20% points of the cluster to label
        indices = np.random.choice(args.shape[0], int(len(args) * 0.2), replace=False).reshape(-1, 1)
        indices = np.sort(indices)
        args = args[indices].reshape(-1)
        # get labels for the chosen points
        random_labels = y_cluster[args]
        # find the cluster label and add to the list
        counts = np.bincount(random_labels)
        cluster_labels[label] = np.argmax(counts)

    return labels[cluster_labels]

In [7]:
labels = np.unique(y_cluster)
kmeans = KMeans(n_clusters=NUM_CLASSES, random_state=0).fit(X_cluster)
cluster_labels = get_labels_for_cluster(kmeans, labels)

In [8]:
### Performance Evaluation

In [9]:
y_pred = kmeans.predict(X_test)
y_pred = cluster_labels[y_pred]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.062442718530152104

Confusion Matrix:
[[     0  13332  71482      0      0      0      0]
 [     0   7632 104498   1252      0      0      0]
 [     0      0   5790   8478      0      0      0]
 [     0      0      0   1090      0      0      0]
 [     0     98   3724      0      0      0      0]
 [     0      0   2974   3815      0      0      0]
 [     0   1086   7154      0      0      0      0]]
