In [43]:
from sklearn.datasets import fetch_openml 
from sklearn.cluster import KMeans
import time

In [44]:
def get_data():
    """Download MNIST from OpenML and split it into train and test portions.

    The first 60000 samples form the training set and everything after
    them forms the test set.

    Returns:
        tuple: ``(images_train, image_test, labels_train, labels_test)``.
    """
    # as_frame=False requests plain arrays. Recent scikit-learn releases
    # otherwise hand back pandas objects for this dataset, on which the
    # positional ``x[index]`` lookups used elsewhere in this notebook fail.
    mnist = fetch_openml('mnist_784', version=1, as_frame=False)
    images, labels = mnist["data"], mnist["target"]

    split = 60000  # boundary between the train and test portions
    return images[:split], images[split:], labels[:split], labels[split:]

In [45]:
def classification(images, labels, n_clusters=69, random_state=None):
    """Cluster the images with k-means and name each cluster by majority vote.

    Each training sample votes for its true label inside the cluster it was
    assigned to; the label with the most votes becomes that cluster's
    interpretation. Ties go to the label that was seen first.

    Args:
        images: Training samples passed to ``KMeans.fit``.
        labels: True labels, one per sample; each must be convertible
            with ``int()``.
        n_clusters: Number of k-means clusters (defaults to the previously
            hard-coded 69).
        random_state: Optional seed forwarded to ``KMeans`` so a run can be
            reproduced; ``None`` keeps the unseeded behaviour.

    Returns:
        tuple: ``(model, real_interpretation)`` where ``model`` is the fitted
        estimator and ``real_interpretation`` maps cluster id -> voted label.

    Raises:
        ValueError: If there are fewer labels than clustered samples.
    """
    clustering = KMeans(n_clusters=n_clusters, init='random', random_state=random_state)
    model = clustering.fit(images)

    if len(labels) < len(model.labels_):
        raise ValueError("labels has fewer entries than images")

    # Tally votes per cluster. setdefault() creates the per-cluster dict AND
    # lets the very first sample of a cluster be counted; previously that
    # first vote was dropped, which could flip the majority and crashed on
    # clusters with a single member.
    votes_by_cluster = {}
    for cluster, label in zip(model.labels_, labels):
        votes = votes_by_cluster.setdefault(cluster, {})
        real_label = int(label)
        votes[real_label] = votes.get(real_label, 0) + 1

    # max() returns the first label reaching the top count, matching the
    # tie-breaking of a stable descending sort.
    real_interpretation = {
        cluster: max(votes, key=votes.get)
        for cluster, votes in votes_by_cluster.items()
    }

    return model, real_interpretation

In [46]:
def eval(model, test_images, test_labels, key):
    """Return the fraction of test images whose cluster maps to the true label.

    Note: the name shadows the built-in ``eval``; it is kept so existing
    callers keep working.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_images: Samples to classify.
        test_labels: True labels, one per sample; each must be convertible
            with ``int()``.
        key: Mapping of cluster id -> interpreted label.

    Returns:
        float: Accuracy in ``[0, 1]``; ``0.0`` when there are no test images.

    Raises:
        ValueError: If there are fewer labels than images.
        KeyError: If a predicted cluster has no entry in ``key``.
    """
    total = len(test_images)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    if len(test_labels) < total:
        raise ValueError("test_labels has fewer entries than test_images")

    # One batched predict() call instead of one call per image.
    predictions = model.predict(test_images)

    # Pairing by iteration (rather than positional indexing) also works when
    # the inputs are pandas objects whose index does not start at zero.
    correct_amount = sum(
        1
        for cluster, label in zip(predictions, test_labels)
        if key[cluster] == int(label)
    )
    return correct_amount / total


In [47]:
def printOutput(model, accuracy):
    """Report the accuracy and dump the model's cluster centers to disk.

    Args:
        model: Object exposing ``cluster_centers_`` (an iterable of centers).
        accuracy: Fraction in ``[0, 1]``; printed as a percentage.

    Side effects:
        Overwrites ``clustercenters.txt`` in the current working directory,
        writing the string form of each center followed by a newline.
    """
    percent = accuracy * 100
    print("Predicited with an accuracy of {} percent".format(percent))

    with open("clustercenters.txt", 'w') as out:
        for centroid in model.cluster_centers_:
            out.write(str(centroid) + "\n")

    print("Written Cluster Centers to pre-exisitng file called clustercenters.txt")





In [48]:
# Driver cell: load MNIST, fit the clustering, score it on the test split,
# and report the result together with the elapsed time in seconds.

# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments while the run is in progress.
start_time = time.perf_counter()

print("This program takes about 1.5 minutes to train and test")
print("I have attached a file called clustercenters.txt which contains my cluster centers")
# The cluster count below must match the one classification() actually uses
# (69); the banner previously claimed 100.
# NOTE(review): the 88.43% figure refers to the attached centers file and
# cannot be verified from this notebook -- confirm it is still accurate.
print("I am using 69 clusters. 10 clusters were not enough to classify the MNIST numbers")
print("with high enough accuracy. The cluster centers in the file produce an accuracy")
print("of about 88.43%. Every time you run this program it will create and output new cluster centers.")
print("------------------------------------------------------------------------------------------")

images_train, image_test , labels_train, labels_test = get_data()
model, key = classification(images_train, labels_train)
accuracy = eval(model, image_test, labels_test, key)
printOutput(model, accuracy)

print(time.perf_counter() - start_time)




This program takes about 1.5 minutes to train and test
I have attached a file called clustercenters.txt which contains my cluster centers
I am using 100 clusters. 10 clusters were not enough to classify the MNIST numbers
with high enough accuracy. The cluster centers in the file produce an accuracy
of about 88.43%. Every time you run this program it will create and output new cluster centers.
------------------------------------------------------------------------------------------
Predicited with an accuracy of 85.28 percent
Written Cluster Centers to pre-exisitng file called clustercenters.txt
73.61689782142639
