# Adagrad-based matrix factorization

In a [previous post](./nnmf-tensorflow.html), we saw how to perform non-negative matrix factorization (NNMF) using Tensorflow. In [another previous post](./linear-regression-adagrad-vs-gd.html), I showed how to use Adagrad for linear regression. This post can be considered an extension of the linear-regression-with-Adagrad post: the same optimizer is applied here to matrix factorization.

### Customary imports

In [1]:
import autograd.numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Creating the matrix to be decomposed

In [2]:
# The literal is written as 3 rows of 4 values and then transposed, so A is a
# 4 x 3 float32 matrix.
A = np.transpose(
    np.array(
        [
            [3, 4, 5, 2],
            [4, 4, 3, 3],
            [5, 5, 4, 3],
        ],
        dtype=np.float32,
    )
)

### Masking one entry

In [3]:
# Mark the top-left entry as missing. np.nan is the canonical spelling; the
# upper-case np.NAN alias was removed in NumPy 2.0.
A[0, 0] = np.nan

In [4]:
A

array([[ nan,   4.,   5.],
       [  4.,   4.,   5.],
       [  5.,   3.,   4.],
       [  2.,   3.,   3.]], dtype=float32)

### Defining the cost function

In [5]:
def cost(W, H):
    """Return the root-mean-square error of np.dot(W, H) against A.

    Only entries of the module-level matrix A that are not NaN contribute
    to the error; NaN entries are treated as missing and skipped.
    """
    observed = ~np.isnan(A)
    residual = np.dot(W, H) - A
    squared_error = residual[observed].flatten() ** 2
    return np.sqrt(squared_error.mean(axis=None))

### Decomposition params

In [6]:
# Inner dimension shared by the two factors W and H.
rank = 2
# NOTE: learning_rate and n_steps are assigned again in the training-loop
# cell further down, so the values below are not the ones used there.
learning_rate = 0.01
n_steps = 10000

### Gradient of cost wrt params W and H

In [7]:
from autograd import grad, multigrad
# Differentiate cost with respect to both positional arguments (index 0 is W,
# index 1 is H); grad_cost(W, H) is unpacked below as (del_W, del_H).
# NOTE(review): multigrad may be unavailable in newer autograd releases - confirm.
grad_cost = multigrad(cost, argnums=[0, 1])

### Main gradient descent routine

In [39]:
# Random non-negative starting factors, sized so that np.dot(W, H) has the
# same shape as A. H is drawn before W, which fixes the order in which the
# random number generator is consumed.
shape = A.shape
H = np.absolute(np.random.randn(rank, shape[1]))
W = np.absolute(np.random.randn(shape[0], rank))

In [40]:
H

array([[ 1.97468221,  1.23326011,  0.83383108],
       [ 1.42192561,  0.81458234,  0.62578017]])

In [41]:
W

array([[ 0.04468053,  1.67663752],
       [ 1.38696491,  0.70903679],
       [ 0.36964657,  1.41336227],
       [ 0.45439383,  0.52227639]])

In [42]:
cost(W, H)

2.3529817156814383

In [43]:
del_W, del_H = grad_cost(W, H)

In [44]:
del_W

array([[-0.24896756, -0.17578989],
       [-0.21039854, -0.1499752 ],
       [-0.32925597, -0.23588712],
       [-0.19735456, -0.13864165]])

In [45]:
del_H*del_H

array([[ 0.00272064,  0.02293448,  0.07252781],
       [ 0.01894105,  0.109342  ,  0.29837662]])

In [46]:
np.square(del_H)

array([[ 0.00272064,  0.02293448,  0.07252781],
       [ 0.01894105,  0.109342  ,  0.29837662]])

In [47]:
# Adagrad with projection onto the non-negative orthant.
# gt_w / gt_h accumulate the squared gradients seen so far for every entry of
# W / H; each entry is then updated with its own step size
# learning_rate / sqrt(accumulated + eps).
gt_w = np.zeros_like(W)
gt_h = np.zeros_like(H)
eps = 1e-8  # keeps the denominator non-zero while an accumulator is still 0

# NOTE: these override the values set in the "Decomposition params" cell.
learning_rate = 1
n_steps = 10

# print(...) with one pre-formatted string emits the same text on Python 2 as
# the former print statements and is also valid syntax on Python 3.
print("Iteration, Cost")
for i in range(n_steps):

    # Logging interval: i % 1 is always 0, so the cost is reported every step.
    if i % 1 == 0:
        print("*" * 20)
        print("%d , %s" % (i, cost(W, H)))

    del_W, del_H = grad_cost(W, H)
    gt_w += np.square(del_W)
    gt_h += np.square(del_H)

    # Per-entry step sizes: they shrink as squared gradients accumulate.
    mod_learning_rate_W = np.divide(learning_rate, np.sqrt(gt_w + eps))
    mod_learning_rate_H = np.divide(learning_rate, np.sqrt(gt_h + eps))
    W = W - del_W * mod_learning_rate_W
    H = H - del_H * mod_learning_rate_H

    # Ensuring that W, H remain non-negative. This is also called projected gradient descent
    W[W < 0] = 0
    H[H < 0] = 0

Iteration, Cost
********************
0 , 2.35298171568
********************
1 , 4.13541533267
********************
2 , 1.72807149365
********************
3 , 0.935067707119
********************
4 , 1.72791208109
********************
5 , 0.517553294197
********************
6 , 0.938428667411
********************
7 , 1.24599961017
********************
8 , 1.33172716019
********************
9 , 0.529911681635


In [48]:
np.zeros_like(0)

array(0)

In [20]:
del_W*mod_learning_rate_W

array([[-0.01      , -0.01      ],
       [-0.01      , -0.01      ],
       [-0.01      , -0.01      ],
       [-0.00999547, -0.00999991]])

In [21]:
del_W

array([[-0.30195177, -0.1565043 ],
       [-0.30630729, -0.19443085],
       [-0.22483067, -0.27178401],
       [-0.00332272, -0.02337082]])

In [22]:
mod_learning_rate_W

array([[ 0.03311787,  0.06389599],
       [ 0.03264695,  0.05143216],
       [ 0.04447791,  0.03679392],
       [ 3.00821902,  0.4278801 ]])

In [23]:
mod_learning_rate_W*del_W

array([[-0.01      , -0.01      ],
       [-0.01      , -0.01      ],
       [-0.01      , -0.01      ],
       [-0.00999547, -0.00999991]])

In [24]:
np.multiply(mod_learning_rate_W, del_W)

array([[-0.01      , -0.01      ],
       [-0.01      , -0.01      ],
       [-0.01      , -0.01      ],
       [-0.00999547, -0.00999991]])

In [54]:
pd.DataFrame(W)

Unnamed: 0,0,1
0,1.729911,1.104788
1,1.191779,1.333739
2,2.018141,0.570662
3,0.355614,1.08309


In [55]:
pd.DataFrame(H)

Unnamed: 0,0,1,2
0,2.157829,0.783644,1.312347
1,1.101042,2.393814,2.470611


In [56]:
# Reconstruct the matrix from the learned factors and round every entry to a
# whole number so it can be compared with A by eye.
pred = np.dot(W, H)
pred_df = pd.DataFrame(data=pred).round(decimals=0)
pred_df

Unnamed: 0,0,1,2
0,5.0,4.0,5.0
1,4.0,4.0,5.0
2,5.0,3.0,4.0
3,2.0,3.0,3.0


In [57]:
pd.DataFrame(A)

Unnamed: 0,0,1,2
0,,4.0,5.0
1,4.0,4.0,5.0
2,5.0,3.0,4.0
3,2.0,3.0,3.0


In [58]:
gt_w

array([[  2.44242307e-01,   5.63349664e-01],
       [  2.99507989e+02,   2.51168511e+02],
       [  3.31684975e+02,   2.53301910e+01],
       [  3.29582472e+01,   3.93531649e+01]])

In [59]:
gt_h

array([[ 527.75972021,   13.77089759,   27.28864048],
       [ 303.27252759,    5.94133343,   23.39096651]])

In [60]:
mod_learning_rate_H

array([[ 0.00043529,  0.00269475,  0.0019143 ],
       [ 0.00057423,  0.00410259,  0.00206764]])

In [61]:
mod_learning_rate_W

array([[ 0.02023436,  0.01332327],
       [ 0.00057782,  0.00063098],
       [ 0.00054908,  0.00198692],
       [ 0.00174188,  0.00159408]])