In [371]:
# Libraries
import numpy as np
# Seed the generator so that "Restart Kernel -> Run All" reproduces the same
# simulated data, the same shuffle, and the same random initial weights.
SEED = 12345
rng = np.random.default_rng(SEED)
# Print floats in fixed-point notation with 4 decimals.
np.set_printoptions(suppress=True, precision=4)
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from scipy.special import logsumexp
from bokeh.plotting import figure, output_notebook, show
output_notebook()
from tqdm import tqdm

## Multiclass Regression

### Simulate data

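We will simulate some tiny image data, where each image is an array of five values in the range 0-255.
There are 4 different image patterns; below, `x` marks a low-intensity pixel (mean 30) and `0` marks a high-intensity pixel (mean 200):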
x0x0x
00xxx
xxx00
x000x

In [372]:
# Number of simulated images per class and the standard deviation of the
# Gaussian noise added to every pixel.
samples=[60,40,80,120]
variability=40

# Mean pixel intensities of the four image patterns (one list per class).
x0=[30,200,30,200,30]
x1=[200,200,30,30,30]
x2=[30,30,30,200,200]
x3=[30,200,200,200,30]

# One block of noisy rows per class, stacked in class order.
data = np.concatenate([rng.normal(loc=x,scale=variability,size=(n,5)).astype(int) for n,x in zip(samples,[x0,x1,x2,x3])])

# cut off entries outside the valid pixel range: the noise can push values
# above 255 and also below 0
data = np.clip(data, 0, 255)

# Class k is repeated samples[k] times, matching the row order of `data`.
labels = np.repeat(np.arange(len(samples)), samples)

# rearrange the rows: apply one random permutation to both arrays so that
# every row keeps its own label
rows = rng.permutation(sum(samples))
data = data[rows,:]
labels=labels[rows]


In [373]:
# Inspect the shuffled class labels (integers 0-3, one per simulated image).
labels

array([3, 1, 2, 0, 2, 3, 2, 3, 3, 3, 3, 0, 1, 2, 3, 3, 2, 2, 1, 3, 3, 3,
       2, 2, 2, 2, 3, 1, 3, 2, 3, 0, 0, 0, 2, 3, 1, 3, 2, 0, 3, 2, 3, 0,
       3, 3, 2, 2, 0, 2, 0, 3, 2, 3, 1, 0, 1, 0, 1, 0, 3, 1, 2, 2, 3, 2,
       2, 3, 1, 2, 3, 1, 3, 2, 3, 0, 3, 3, 3, 1, 2, 0, 2, 3, 3, 3, 0, 1,
       2, 3, 2, 3, 2, 3, 3, 1, 3, 3, 0, 2, 3, 3, 0, 2, 0, 1, 3, 1, 0, 3,
       1, 3, 3, 0, 2, 0, 1, 1, 2, 0, 2, 2, 2, 2, 3, 3, 3, 2, 2, 1, 3, 3,
       2, 0, 2, 0, 3, 0, 3, 2, 0, 1, 0, 3, 3, 3, 2, 2, 3, 0, 3, 3, 0, 1,
       3, 2, 0, 1, 2, 2, 1, 2, 0, 3, 2, 3, 2, 2, 3, 2, 3, 3, 0, 2, 1, 2,
       2, 3, 3, 3, 0, 1, 3, 3, 3, 0, 0, 2, 3, 0, 3, 0, 3, 0, 3, 3, 2, 2,
       0, 0, 3, 0, 0, 0, 3, 2, 0, 2, 3, 3, 3, 3, 3, 1, 3, 2, 3, 1, 2, 0,
       1, 0, 0, 1, 2, 3, 3, 3, 2, 0, 3, 3, 2, 2, 1, 2, 0, 0, 0, 3, 3, 3,
       0, 1, 1, 2, 2, 3, 2, 0, 3, 1, 3, 3, 3, 0, 3, 1, 3, 3, 3, 3, 3, 2,
       3, 3, 3, 1, 2, 0, 3, 3, 2, 3, 0, 3, 1, 1, 2, 3, 2, 3, 0, 3, 3, 3,
       1, 0, 1, 2, 0, 3, 2, 2, 3, 0, 2, 3, 2, 3])

In [374]:
### Test the sklearn routines

#### standardize every feature column to zero mean and unit variance
#### (no constant column is appended; the fitted model reports its own
#### intercepts, printed below)
column_means = data.mean(axis=0)
column_stds = data.std(axis=0)
data = (data - column_means) / column_stds

#### fit the reference classifier on the standardized data
L = LogisticRegression(solver='liblinear')
L.fit(data, labels)

print("The coefficient matrix")
print(L.coef_)
print("The intercepts")
print(L.intercept_)


The coefficient matrix
[[-0.7448  1.4407 -3.0763  1.2578 -1.5464]
 [ 1.6059  0.2207 -0.3796 -1.8167 -0.3021]
 [-0.2022 -2.2679 -0.5316  0.2552  2.3482]
 [-0.3142  0.7471  4.2188  0.5531 -0.6291]]
The intercepts
[-2.7463 -3.6326 -2.4263 -1.3264]


In [375]:
# The model is scored on the same (data, labels) it was fit on, so this number
# is training accuracy; no held-out test set exists in this notebook.
print("Accuracy on the training data is {} percent".format(100*L.score(data,labels)))

Accuracy on the test data is 99.0 percent


In [422]:
def sigma(x,m):
    """Compute the softmax of the scores ``x @ m`` along each row.

    Parameters
    ----------
    x : 2-D array; each row is one sample's feature vector.
    m : 2-D weight array with one column per class; ``x @ m`` must be defined.

    Returns
    -------
    2-D array with the shape of ``x @ m``; every row is non-negative and
    sums to 1.
    """
    y = x @ m
    # Softmax is unchanged when a constant is subtracted from a whole row.
    # Removing the row maximum makes the largest exponent exactly 0, so
    # np.exp can no longer overflow to inf (which would produce nan rows).
    y = y - y.max(axis=1, keepdims=True)
    j = np.exp(y)
    p = j/j.sum(axis=1,keepdims=True)
    return p
# One-hot encode the integer labels with sklearn: the encoder expects a column
# vector, and its sparse result is converted to a dense array.

E = OneHotEncoder()
E.fit(labels.reshape(-1, 1))
Y = E.transform(labels.reshape(-1, 1)).toarray()

def descent(x,y, max_iter=10000,nu=.000001,M=None):
    """Fit softmax-regression weights by fixed-step gradient ascent on the likelihood.

    Parameters
    ----------
    x : 2-D array (samples x features), the data matrix.
    y : 2-D array (samples x classes), one-hot encoded labels.
    max_iter : number of gradient steps to take.
    nu : step size (learning rate).
    M : optional (features x classes) starting weights; when None the weights
        are drawn from a standard normal using the module-level ``rng``.

    Returns
    -------
    (M, grads) : the final weight matrix and a list holding, for every
    iteration, the largest absolute entry of the gradient.
    """
    features = x.shape[1]
    classes = y.shape[1]
    grads=[]
    if M is None:
        M = rng.normal(loc=0,scale=1,size=(features,classes))
    # The transpose does not change between iterations, so take it once.
    x_t = x.transpose()
    for _ in tqdm(range(max_iter)):
        P=sigma(x,M)
        # Gradient of the log-likelihood, computed from the arguments x and y
        # (not from the notebook globals `data` and `Y`).
        grad = x_t @ (y-P)
        # A new array is bound on every step, so a caller-supplied M is never
        # modified in place.
        M = M+nu*grad
        grads.append(np.max(np.abs(grad)))

    return M,grads

In [423]:
# First run: 10,000 gradient steps with step size 1e-5 from random initial
# weights; g records the largest absolute gradient entry at each step.
M,g=descent(data,Y,max_iter=10000,nu=1e-5)
print(M)

100%|██████████| 10000/10000 [00:00<00:00, 34120.70it/s]

[[ 0.312   2.2551  0.0543 -0.1579]
 [-0.0391 -1.282  -2.323  -0.3414]
 [-1.993  -0.7251 -0.9839  2.2387]
 [ 1.1905  0.005   0.8452  0.7962]
 [-1.095  -0.8986  0.8361 -1.2064]]





In [424]:
### the predicted label is the class whose softmax probability is largest
class_probabilities = sigma(data, M)
predictions = class_probabilities.argmax(axis=1)

### accuracy is the fraction of predictions that agree with the true labels
correct = np.mean(predictions == labels)

print("Accuracy here is {} percent".format(100* correct))

Accuracy here is 98.33333333333333 percent


In [425]:
# Convergence trace for the run above: largest absolute gradient entry at each
# iteration. Axis labels let the figure be read on its own.
f=figure(title='Size of largest coefficient in gradient vs. number of iterations',
         x_axis_label='iteration',
         y_axis_label='largest |gradient entry|')
f.line(x=list(range(len(g))),y=g)
show(f)

In [426]:
# Smallest value, over all iterations of the last run, of the largest absolute
# gradient entry; values far from 0 mean the ascent has not converged.
np.min(g)

8.84359279700982

In [443]:
# Same step size, ten times as many iterations, restarting from fresh random
# weights (no M is passed in).
M,g=descent(data,Y,max_iter=100000,nu=1e-5)

100%|██████████| 100000/100000 [00:02<00:00, 34717.29it/s]


In [444]:
# Convergence trace for the 100,000-iteration run: largest absolute gradient
# entry at each iteration. Axis labels let the figure be read on its own.
f=figure(title='Size of largest coefficient in gradient vs. number of iterations',
         x_axis_label='iteration',
         y_axis_label='largest |gradient entry|')
f.line(x=list(range(len(g))),y=g)
show(f)

In [445]:
# Largest absolute gradient entry at the final iteration of the long run.
g[-1]

1.0148803989102506

In [455]:
# Warm-up run from random weights at the original step size.
M,g=descent(data,Y,max_iter=10000,nu=1e-5)
# Gradient trace of the continuation runs only.
# NOTE(review): the warm-up trace is not included, as in the original cell;
# confirm that is intended.
bigg=[]
for i in range(50):
    # Continue from the current weights with half the step size.
    M,g=descent(data,Y,max_iter=10000,nu=(1e-5)/2,M=M)
    # extend appends in place; `bigg = bigg + g` re-copied the whole
    # accumulated trace on every pass.
    bigg.extend(g)
    print(g[-1])
    

 

100%|██████████| 10000/10000 [00:00<00:00, 31629.11it/s]
100%|██████████| 10000/10000 [00:00<00:00, 34738.17it/s]


5.239145689147484


100%|██████████| 10000/10000 [00:00<00:00, 34483.44it/s]


3.9878568557014606


100%|██████████| 10000/10000 [00:00<00:00, 35510.37it/s]


3.2370838044507746


100%|██████████| 10000/10000 [00:00<00:00, 35127.73it/s]


2.7618220273640244


100%|██████████| 10000/10000 [00:00<00:00, 34602.99it/s]


2.417622645230818


100%|██████████| 10000/10000 [00:00<00:00, 35075.82it/s]


2.1562056444175015


100%|██████████| 10000/10000 [00:00<00:00, 35889.06it/s]


1.9505204967695517


100%|██████████| 10000/10000 [00:00<00:00, 34017.56it/s]


1.7842047392362406


100%|██████████| 10000/10000 [00:00<00:00, 35588.83it/s]


1.6467654804026377


100%|██████████| 10000/10000 [00:00<00:00, 35080.20it/s]


1.5311563800187553


100%|██████████| 10000/10000 [00:00<00:00, 34496.66it/s]


1.43246554273079


100%|██████████| 10000/10000 [00:00<00:00, 34886.48it/s]


1.3471615772581536


100%|██████████| 10000/10000 [00:00<00:00, 31951.29it/s]


1.27263878965863


100%|██████████| 10000/10000 [00:00<00:00, 35169.68it/s]


1.206931444571136


100%|██████████| 10000/10000 [00:00<00:00, 34542.03it/s]


1.1485279437627884


100%|██████████| 10000/10000 [00:00<00:00, 34223.60it/s]


1.0962463437324548


100%|██████████| 10000/10000 [00:00<00:00, 34882.36it/s]


1.0491487854491288


100%|██████████| 10000/10000 [00:00<00:00, 33995.12it/s]


1.0064813257026652


100%|██████████| 10000/10000 [00:00<00:00, 35182.01it/s]


0.9676307733987306


100%|██████████| 10000/10000 [00:00<00:00, 34914.21it/s]


0.9320931668975794


100%|██████████| 10000/10000 [00:00<00:00, 35915.98it/s]


0.8994503810866581


100%|██████████| 10000/10000 [00:00<00:00, 35561.07it/s]


0.8693525147731155


100%|██████████| 10000/10000 [00:00<00:00, 35689.16it/s]


0.8415044551252755


100%|██████████| 10000/10000 [00:00<00:00, 32756.48it/s]


0.8156555053762804


100%|██████████| 10000/10000 [00:00<00:00, 34761.26it/s]


0.7915912893861966


100%|██████████| 10000/10000 [00:00<00:00, 32572.49it/s]


0.769127369513328


100%|██████████| 10000/10000 [00:00<00:00, 33165.34it/s]


0.7481041684163334


100%|██████████| 10000/10000 [00:00<00:00, 33007.30it/s]


0.7283828936511145


100%|██████████| 10000/10000 [00:00<00:00, 31578.88it/s]


0.7098422409711951


100%|██████████| 10000/10000 [00:00<00:00, 34350.23it/s]


0.6923757077739388


100%|██████████| 10000/10000 [00:00<00:00, 34952.53it/s]


0.6758893886374271


100%|██████████| 10000/10000 [00:00<00:00, 34262.23it/s]


0.6603001547529692


100%|██████████| 10000/10000 [00:00<00:00, 34585.07it/s]


0.6455341413016176


100%|██████████| 10000/10000 [00:00<00:00, 34083.82it/s]


0.6315254835463484


100%|██████████| 10000/10000 [00:00<00:00, 34610.32it/s]


0.6182152551008164


100%|██████████| 10000/10000 [00:00<00:00, 34106.83it/s]


0.6055505715424204


100%|██████████| 10000/10000 [00:00<00:00, 34464.91it/s]


0.5934838300222136


100%|██████████| 10000/10000 [00:00<00:00, 34865.89it/s]


0.5819720613386885


100%|██████████| 10000/10000 [00:00<00:00, 34017.34it/s]


0.570976375491053


100%|██████████| 10000/10000 [00:00<00:00, 34391.47it/s]


0.56046148530921


100%|██████████| 10000/10000 [00:00<00:00, 33111.30it/s]


0.5503952955957444


100%|██████████| 10000/10000 [00:00<00:00, 34807.88it/s]


0.5407485474780324


100%|██████████| 10000/10000 [00:00<00:00, 35055.80it/s]


0.5314945094822844


100%|██████████| 10000/10000 [00:00<00:00, 35515.36it/s]


0.5226087083028782


100%|██████████| 10000/10000 [00:00<00:00, 35320.13it/s]


0.514068693424929


100%|██████████| 10000/10000 [00:00<00:00, 35421.01it/s]


0.505853830722268


100%|██████████| 10000/10000 [00:00<00:00, 33214.08it/s]


0.49794512094101095


100%|██████████| 10000/10000 [00:00<00:00, 34978.59it/s]


0.49032503962727825


100%|██████████| 10000/10000 [00:00<00:00, 34968.82it/s]


0.48297739559155944


100%|██████████| 10000/10000 [00:00<00:00, 34965.30it/s]

0.4758872054455315





In [456]:
# Weight matrix after the repeated continuation runs
# (rows = features, columns = classes).
M

array([[-1.3558,  1.4395, -0.5234, -0.7165],
       [ 2.073 ,  0.6709, -2.088 ,  2.2809],
       [-4.5633,  0.0461, -0.9698,  5.4576],
       [ 1.6446, -1.8624,  0.5536,  0.871 ],
       [-1.89  ,  0.1597,  3.23  , -1.538 ]])