In [1]:
import urllib.request
import gzip
import os
import os.path

In [2]:
savepath = "./mnist"
baseurl = "http://yann.lecun.com/exdb/mnist"
files = [
    "train-images-idx3-ubyte.gz",
    "train-labels-idx1-ubyte.gz",
    "t10k-images-idx3-ubyte.gz",
    "t10k-labels-idx1-ubyte.gz",
]

In [3]:
if not os.path.exists(savepath):
    os.mkdir(savepath)

In [5]:
for f in files:
    url = baseurl + "/" + f
    loc = savepath + "/" + f
    print("download:", url)
    if not os.path.exists(loc):
        urllib.request.urlretrieve(url, loc)

download: http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
download: http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
download: http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
download: http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz


In [6]:
for f in files:
    gz_file = savepath + "/" + f
    raw_file = savepath + "/" + f.replace(".gz", "")
    print("gzip:", f)
    with gzip.open(gz_file, "rb") as fp:
        body = fp.read()
        with open(raw_file, "wb") as w:
            w.write(body)
print("ok")

gzip: train-images-idx3-ubyte.gz
gzip: train-labels-idx1-ubyte.gz
gzip: t10k-images-idx3-ubyte.gz
gzip: t10k-labels-idx1-ubyte.gz
ok


In [7]:
import struct

In [10]:
def to_csv(name, maxdata):
    lbl_f = open("./mnist/" + name + "-labels-idx1-ubyte", "rb")
    img_f = open("./mnist/" + name + "-images-idx3-ubyte", "rb")
    csv_f = open("./mnist/" + name + ".csv", "w", encoding="utf-8")
    mag, lbl_count = struct.unpack(">II", lbl_f.read(8))
    mag, img_count = struct.unpack(">II", img_f.read(8))
    rows, cols = struct.unpack(">II", img_f.read(8))
    pixels = rows * cols
    res = []
    for idx in range(lbl_count):
        if idx > maxdata:
            break
        label = struct.unpack("B", lbl_f.read(1))[0]
        bdata = img_f.read(pixels)
        sdata = list(map(lambda n: str(n), bdata))
        csv_f.write(str(label) + ",")
        csv_f.write(",".join(sdata) + "\r\n")
        if idx < 10:
            s = "P2 28 28 255\n"
            s += " ".join(sdata)
            iname = "./mnist/{0}-{1}-{2}.pgm".format(name, idx, label)
            with open(iname, "w", encoding="utf-8") as f:
                f.write(s)
    csv_f.close()
    lbl_f.close()
    img_f.close()

In [11]:
to_csv("train", 1000)

In [12]:
to_csv("t10k", 500)

In [13]:
from sklearn import svm, metrics

In [16]:
def load_csv(fname):
    labels = []
    images = []
    with open(fname, "r") as f:
        for line in f:
            cols = line.split(",")
            if len(cols) < 2:
                continue
            labels.append(int(cols.pop(0)))
            vals = list(map(lambda n: int(n) / 256, cols))
            images.append(vals)
    return {"labels":labels, "images":images}

In [17]:
data = load_csv("./mnist/train.csv")

In [18]:
test = load_csv("./mnist/t10k.csv")

In [19]:
clf = svm.SVC()
clf.fit(data["images"], data["labels"])

SVC()

In [20]:
predict = clf.predict(test["images"])

In [21]:
ac_score = metrics.accuracy_score(test["labels"], predict)
cl_report = metrics.classification_report(test["labels"], predict)

In [22]:
ac_score

0.8842315369261478

In [23]:
cl_report

'              precision    recall  f1-score   support\n\n           0       0.87      0.98      0.92        42\n           1       0.99      1.00      0.99        67\n           2       0.91      0.89      0.90        55\n           3       0.94      0.72      0.81        46\n           4       0.86      0.93      0.89        55\n           5       0.75      0.82      0.78        50\n           6       0.95      0.81      0.88        43\n           7       0.79      0.94      0.86        49\n           8       0.94      0.82      0.88        40\n           9       0.89      0.87      0.88        54\n\n    accuracy                           0.88       501\n   macro avg       0.89      0.88      0.88       501\nweighted avg       0.89      0.88      0.88       501\n'