In [7]:
%matplotlib inline
import math
import time 
import numpy as np
import torch as t
import matplotlib.pyplot as plt



In [3]:
## vectorization is faster!
# Times an element-by-element Python loop against a single vectorized tensor
# addition of the same two vectors.
# time.perf_counter() is used instead of time.time(): it is monotonic and uses
# the highest-resolution clock available, which matters for intervals this short.

n = 1000
a = t.ones(n)
b = t.ones(n)
c = t.zeros(n)

# Explicit loop: one indexed read/add/write per element.
t_0 = time.perf_counter()
for i in range(n):
    c[i] = a[i] + b[i]
print(f"{time.perf_counter() - t_0} sec")

# Vectorized: the whole addition is one tensor operation.
t_0 = time.perf_counter()
d = a + b
print(f"{time.perf_counter() - t_0} sec")
## not as big of a difference on my computer... 1 order of magnitude instead of 3

0.005253314971923828 sec
0.0002167224884033203 sec


Building a simple Fashion-MNIST classifier for two classes (labels 0 and 1): a crude estimate using a hyperplane decision boundary between the two class means. (Take the vector between the means as the normal of the hyperplane!)

In [None]:
## Load the dataset
import torchvision
from torchvision import transforms

# Preprocessing pipeline: a single ToTensor step, shown first as a plain list
# and then wrapped in a Compose object.
trans = [transforms.ToTensor()]
print(trans)
trans = transforms.Compose(trans)
print(trans)

# Both splits share the same root directory, transform and download flag;
# only the `train` switch differs.
shared_kwargs = dict(root = "../data", transform = trans, download = True)
train = torchvision.datasets.FashionMNIST(train = True, **shared_kwargs)
test = torchvision.datasets.FashionMNIST(train = False, **shared_kwargs)


In [None]:
### separate into categories 0 and 1 based on label
# Every traversal of a dataset visits all of its samples, so each split is
# walked exactly once and the samples with label 0 or 1 are collected on the way
# (the previous version walked `train` twice and `test` twice).

train_by_label = {0: [], 1: []}
for image, label in train:
    if label in train_by_label:
        train_by_label[label].append(image)

X_train_0 = t.stack(train_by_label[0]).type(t.float32)
X_train_1 = t.stack(train_by_label[1]).type(t.float32)

# Keep test images and labels paired so they stay in the same order.
test_pairs = [(image, label) for image, label in test if label in train_by_label]
X_test = t.stack([image for image, _ in test_pairs]).type(t.float32)
y_test = t.tensor([label for _, label in test_pairs], dtype = t.float32)
print(X_train_0.shape, X_train_1.shape, X_test.shape, y_test.shape)

## compute means of each dataset
# Average over the sample dimension (dim 0), leaving one mean image per class.

ave_0 = t.mean(X_train_0, dim = 0)
ave_1 = t.mean(X_train_1, dim = 0)

In [None]:
# Show the two class-mean images side by side as 28x28 greyscale pictures.
mean_image_0 = ave_0.reshape(28,28).tolist()
mean_image_1 = ave_1.reshape(28,28).tolist()

_, axes = plt.subplots(1,2)
left_ax, right_ax = axes
## shirts
left_ax.imshow(mean_image_0, cmap = 'Greys')
## pants
right_ax.imshow(mean_image_1, cmap = 'Greys')


In [None]:
#### So, instead of learning the boundary, just take the difference between their means as the discriminant vector
##
# Keep w in the same pixel layout as the images. Applying `.T` here would
# reverse the dimension order, so w.flatten() would list the pixels in a
# different order than the row-major flatten of X_test and the dot product
# would pair up the wrong pixels.
w = ave_1 - ave_0

# One row per test image; the row count is taken from the data, not hard-coded.
weighted = X_test.reshape(X_test.shape[0], -1) @ w.flatten()
plt.plot(weighted)
plt.xlabel("test examples")
plt.ylabel("y_guess")

## Decision threshold: the projection onto w of the midpoint between the two
## class means, i.e. the hyperplane halfway between the means.
## The median of the projections is printed as a sanity check against it.
threshold = t.dot(w.flatten(), ((ave_0 + ave_1) / 2).flatten())
print(f"median = {t.median(weighted)}")
print(f"midpoint threshold = {threshold}")

# Projections above the threshold are predicted as label 1, the rest as label 0.
acc = t.mean(((weighted > threshold).type(y_test.dtype) == y_test).float(), dtype = t.float32)
print(f"Accuracy is not bad! {acc}")

In [None]:
### More systematically: let's try it for a few thresholds and see if we can do better
thresholds = t.arange(-100,5,1)

## The projection of the test images onto w does not depend on the threshold,
## so compute it once instead of once per threshold.
scores = X_test.reshape(X_test.shape[0], -1) @ w.flatten()

preds_list = [(scores > thresh).type(y_test.dtype) for thresh in thresholds]
## average number of predictions above the threshold
preds_mean = [t.mean(preds) for preds in preds_list]
## accuracy
acc_list = [t.mean((preds == y_test).float(), dtype = t.float32) for preds in preds_list]
max_pos = t.argmax(t.stack(acc_list))
## threshold with the highest accuracy, as a plain number for plotting
best_threshold = thresholds[max_pos.item()].item()

print(f"threshold = {thresholds[max_pos.item()]}, max accuracy = {acc_list[max_pos.item()]}")

_, ax = plt.subplots(1,2)
ax[0].plot(thresholds, preds_mean)
## dashed line marks the best threshold found by the sweep
ax[0].axvline(x=best_threshold,color = "m",ls = "--")
ax[0].set_xlabel("thresholds")
# should be roughly half for a balanced dataset
ax[0].set_ylabel("average positive predictions above threshold")
ax[1].plot(thresholds, acc_list)
ax[1].axvline(x=best_threshold,color = 'm',ls = "--")
ax[1].set_xlabel("thresholds")
ax[1].set_ylabel("Accuracy")


