In [53]:
from sklearn.datasets import load_digits
import numpy as np 
from sklearn.cluster import KMeans
import statistics
from sklearn.metrics import accuracy_score

# Load the dataset once; every experiment below reads these module-level
# arrays: X holds the feature matrix and y the corresponding target labels.
digits = load_digits()
X, y = digits.data, digits.target

def hide_percent(p, seed=None):
    """Return a copy of the global labels ``y`` with a fraction of them hidden.

    Hidden entries are replaced by the sentinel value ``-1``; the global ``y``
    itself is left untouched because the masking is applied to a copy.

    Parameters
    ----------
    p : float
        Fraction of labels to hide, in ``[0, 1]``. Exactly
        ``int(p * len(y))`` positions are masked.
    seed : int or None, optional
        When given, the hidden positions are drawn from a dedicated generator
        seeded with this value, so repeated calls hide the same positions.
        When ``None`` (the default) numpy's global random state is used, as
        before.

    Returns
    -------
    A copy of ``y`` with the selected positions set to ``-1``.

    Raises
    ------
    ValueError
        If ``p`` lies outside ``[0, 1]``.
    """
    if not 0 <= p <= 1:
        # numpy would also raise ValueError here, but with a message about
        # sample sizes that does not point at the real cause.
        raise ValueError(f"p must be between 0 and 1, got {p}")

    n = int(p * len(y))
    # Fall back to the global numpy state when no seed is requested so that
    # an external np.random.seed(...) call keeps working.
    rng = np.random if seed is None else np.random.default_rng(seed)
    indices = rng.choice(len(y), size=n, replace=False)

    y_hidden = y.copy()
    y_hidden[indices] = -1
    return y_hidden

def task1(p):
    """Propagate labels through k-means clusters and print the accuracy.

    A fraction ``p`` of the labels in the global ``y`` is hidden, the global
    feature matrix ``X`` is clustered into 10 groups, and every hidden sample
    receives the majority label among the revealed samples of its cluster.
    A cluster that contains no revealed sample borrows the majority label of
    the labelled cluster whose centroid is closest to its own.

    The reported accuracy is computed over all samples, including the ones
    whose true label was never hidden.

    Parameters
    ----------
    p : float
        Fraction of labels to hide before propagation.

    Returns
    -------
    None. The result is printed.
    """
    n_clusters = 10

    print(f"------------- Hidden percentage: {p * 100}%")
    y_hidden = hide_percent(p)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X)
    labels = kmeans.labels_
    centroids = kmeans.cluster_centers_

    # Group the revealed labels by the cluster their sample was assigned to.
    cluster_label_map = {}
    for cluster, label in zip(labels, y_hidden):
        if label != -1:
            cluster_label_map.setdefault(int(cluster), []).append(label)

    # Majority vote inside every cluster that has at least one revealed label.
    maj_labels = [-1] * n_clusters
    for cluster, known_labels in cluster_label_map.items():
        maj_labels[cluster] = statistics.mode(known_labels)

    # Clusters without any revealed label take the majority label of the
    # nearest labelled cluster, measured between centroids. If nothing at all
    # is revealed there is no donor, and the sentinel -1 is kept.
    labelled_clusters = sorted(cluster_label_map)
    if labelled_clusters:
        for cluster in range(n_clusters):
            if cluster in cluster_label_map:
                continue
            distances = np.linalg.norm(
                centroids[labelled_clusters] - centroids[cluster], axis=1
            )
            nearest = labelled_clusters[int(np.argmin(distances))]
            maj_labels[cluster] = maj_labels[nearest]

    # Keep revealed labels as they are; fill hidden ones from their cluster.
    y_propagated1 = np.where(
        y_hidden != -1, y_hidden, np.asarray(maj_labels)[labels]
    )

    print (f"Accuracy score for the majority propagation: {accuracy_score(y, y_propagated1)}")

# Run the cluster-based propagation while hiding fewer and fewer labels.
for hidden_fraction in (0.99, 0.95, 0.90, 0.80):
    task1(hidden_fraction)


------------- Hidden percentage: 99.0%
Accuracy score for the majority propagation: 0.773511407902059
------------- Hidden percentage: 95.0%
Accuracy score for the majority propagation: 0.8703394546466333
------------- Hidden percentage: 90.0%
Accuracy score for the majority propagation: 0.8786867000556483
------------- Hidden percentage: 80.0%
Accuracy score for the majority propagation: 0.889259877573734


In [56]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

def knn_percent(p):
    """Fit a 5-nearest-neighbour classifier and print its held-out accuracy.

    The global ``X`` / ``y`` are shuffled and split with a fixed random state;
    ``p`` is passed as ``test_size``, so it is the share of the data that is
    held out for evaluation rather than used for fitting.

    Parameters
    ----------
    p : float
        Value forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    None. The accuracy is printed.
    """
    split = train_test_split(X, y, test_size=p, shuffle=True, random_state=42)
    train_features, test_features, train_targets, test_targets = split

    classifier = KNeighborsClassifier(metric='euclidean', n_neighbors=5)
    classifier.fit(train_features, train_targets)
    predictions = classifier.predict(test_features)

    score = accuracy_score(predictions, test_targets)
    print(f"Accuracy score for {p*100}% data using knn: {score}")

# Same held-out fractions as the propagation experiment, followed by a
# conventional 30% test split ("normal knn") as a reference point.
for test_fraction in (0.99, 0.95, 0.90, 0.80, 0.3):
    knn_percent(test_fraction)

Accuracy score for 99.0% data using knn: 0.3814606741573034
Accuracy score for 95.0% data using knn: 0.788056206088993
Accuracy score for 90.0% data using knn: 0.911619283065513
Accuracy score for 80.0% data using knn: 0.9568845618915159
Accuracy score for 30.0% data using knn: 0.9925925925925926
