In [43]:
import numpy as np
import math

In [138]:
def preprocessing(M): 
    """
    This function applies the first preprocessing method described in paragraph 9.4.5 of the book 
    """
    
    num_rows = len(M[:,0])
    num_col = len(M[0])
    
    #First we subtract the mean of row i of each element m_ij 
    for i in range(num_rows):
        row_mean = np.nanmean(M[i,:]) #compute the mean for all non-NaN elements 

        for j in range(num_col):
            if not math.isnan(M[i,j]): #check if an element is NaN
                M[i,j] -= row_mean
            
    #Next we subtract the mean of column j of each element m_ji
    for j in range(num_col):
        column_mean = np.nanmean(M[:,j])

        for i in range(num_rows):
            if not math.isnan(M[j,i]):
                M[j,i] -= column_mean
    return M 

H = np.array([[5,2,4,4,3],[3,1,2,4,1],[2,np.nan,3,1,4],[2,5,4,3,5],[4,4,5,4,np.nan]]) #let the blank elements be NaN
print(H)
J = preprocessing(H)
print(J)

[[ 5.  2.  4.  4.  3.]
 [ 3.  1.  2.  4.  1.]
 [ 2. nan  3.  1.  4.]
 [ 2.  5.  4.  3.  5.]
 [ 4.  4.  5.  4. nan]]
[[ 1.47   -1.1375  0.07    0.47   -0.825 ]
 [ 0.87   -0.7375 -0.2     1.87   -1.425 ]
 [-0.43       nan  0.17   -1.43    1.275 ]
 [-1.73    1.6625 -0.13   -0.73    0.975 ]
 [-0.18    0.2125  0.42   -0.18       nan]]


In [123]:
M = np.array([[5,2,4,4,3],[3,1,2,4,1],[2,np.nan,3,1,4],[2,5,4,3,5],[4,4,5,4,np.nan]]) #let the blank elements be NaN
M_original = np.copy(M)
print(M)

#Preprocessing:

#first, loop through rows
for i in range(len(M[0,:])):
    row_mean = np.nanmean(M[i,:]) #compute the mean for all non-NaN elements 
    print('mean of row',i,row_mean)
    
    for j in range(len(M[0])):
        if not math.isnan(M[i,j]): #check if an element is NaN
            M[i,j] -= row_mean
    
print(M)
#next, the columns
for j in range(len(M[0])):
    column_mean = np.nanmean(M[:,j])
    print('mean of column',j,column_mean)
    
    for i in range(len(M[0,:])):
        if not math.isnan(M[j,i]):
            M[j,i] -= column_mean
    
print(M)

print(np.nanmean(M_original))
print(np.nanmean(M))

[[ 5.  2.  4.  4.  3.]
 [ 3.  1.  2.  4.  1.]
 [ 2. nan  3.  1.  4.]
 [ 2.  5.  4.  3.  5.]
 [ 4.  4.  5.  4. nan]]
mean of row 0 3.6
mean of row 1 2.2
mean of row 2 2.5
mean of row 3 3.8
mean of row 4 4.25
[[ 1.4  -1.6   0.4   0.4  -0.6 ]
 [ 0.8  -1.2  -0.2   1.8  -1.2 ]
 [-0.5    nan  0.5  -1.5   1.5 ]
 [-1.8   1.2   0.2  -0.8   1.2 ]
 [-0.25 -0.25  0.75 -0.25   nan]]
mean of column 0 -0.07000000000000002
mean of column 1 -0.44500000000000006
mean of column 2 0.433
mean of column 3 -0.0536
mean of column 4 0.2589
[[ 1.47   -1.53    0.47    0.47   -0.53  ]
 [ 1.245  -0.755   0.245   2.245  -0.755 ]
 [-0.933      nan  0.067  -1.933   1.067 ]
 [-1.7464  1.2536  0.2536 -0.7464  1.2536]
 [-0.5089 -0.5089  0.4911 -0.5089     nan]]
3.260869565217391
0.003278260869565223


In [126]:
#Initialization

init_value = np.sqrt(np.nanmean(M)/2) #assuming d = 2, try many d values later 
print(init_value)
#dont use the init value for now, try later 

U = np.ones((5,2))
V = np.ones((2,5))

print(U)
print(V)

0.0404861758478448
[[1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]]
[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]


In [127]:
#Optimization
print(M_original)
print(U)
print(V)

print(np.count_nonzero(np.isnan(M_original)))
print(M_original.size)

SE = 0
P = np.zeros((5,5))
diff = np.zeros((5,5))

for r in range(5):
    
    for j in range(5):
        
        print(M_original[r,j])
        
        for k in range(2):

            if k != j:
                P[r,j] = P[r,j] + U[r,k]*V[k,j] 
            else:
                P[r,j] = P[r,j] +  U[r,j]*V[k,j] 
                
        if not math.isnan(M_original[r,j]):        
            SE = SE + (M_original[r,j] - P[r,j])**2
            
            
        #P[r,j] = P[r,j] + 1
        #print(M[r,j])
        
print(SE)
RMSE = np.sqrt(SE/23)
print(RMSE)
print(P)

[[ 5.  2.  4.  4.  3.]
 [ 3.  1.  2.  4.  1.]
 [ 2. nan  3.  1.  4.]
 [ 2.  5.  4.  3.  5.]
 [ 4.  4.  5.  4. nan]]
[[1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]]
[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]
2
25
5.0
2.0
4.0
4.0
3.0
3.0
1.0
2.0
4.0
1.0
2.0
nan
3.0
1.0
4.0
2.0
5.0
4.0
3.0
5.0
4.0
4.0
5.0
4.0
nan
75.0
1.805787796286538
[[2. 2. 2. 2. 2.]
 [2. 2. 2. 2. 2.]
 [2. 2. 2. 2. 2.]
 [2. 2. 2. 2. 2.]
 [2. 2. 2. 2. 2.]]


In [129]:
#update U: 
U_new = np.copy(U)
V_new = np.copy(V)
M_new = np.copy(M_original)
count = 0

for i in range(1000):
    #First, we do it for U
    total_denom = 0
    total_nom = 0
    #there are r = 5 rows and s = 2 columns
    for r in range(5):

        for s in range(2):

            #now, we apply the main formula from page 346: 
            for j in range(5):
                if not math.isnan(M_new[r,j]):

                    eps = 0
                    for k in range(2):
                        if k != s: 
                            eps += U[r][k]*V[k][j]

                    total_nom += V_new[s][j] * (M_new[r][j] - eps)

                    denom = (V_new[s][j])**2
                    total_denom += denom 

            U_new[r][s] = total_nom/total_denom
            
    

    #print(count)
    #print(U_new)

    #Next, we do it for V
    total_denom = 0
    total_nom = 0
    #there are r = 2 rows and s = 5 columns
    for r in range(2):

        for s in range(5):

            #now, we apply the main formula from page 346: 
            for i in range(5):
                if not math.isnan(M_new[i,s]):

                    eps = 0
                    for k in range(2):
                        if k != r: 
                            eps += U[i][k]*V[k][s]

                    total_nom += U_new[i][r] * (M_new[i][s] - eps)

                    denom = (U_new[i][r])**2
                    total_denom += denom 

            V_new[r][s] = total_nom/total_denom
        
print(V_new)

print(M_original)
print(np.matmul(U,V))
print(np.matmul(U_new,V_new))

[[2.48251656e+17 2.23550409e+17 2.42828178e+17 2.43237349e+17
  2.40741056e+17]
 [2.42472181e+17 2.37667713e+17 2.44155076e+17 2.44398240e+17
  2.44280675e+17]]
[[ 5.  2.  4.  4.  3.]
 [ 3.  1.  2.  4.  1.]
 [ 2. nan  3.  1.  4.]
 [ 2.  5.  4.  3.  5.]
 [ 4.  4.  5.  4. nan]]
[[2. 2. 2. 2. 2.]
 [2. 2. 2. 2. 2.]
 [2. 2. 2. 2. 2.]
 [2. 2. 2. 2. 2.]
 [2. 2. 2. 2. 2.]]
[[5.57449857 5.23815142 5.53158393 5.53900377 5.50917051]
 [4.31904128 4.04909958 4.28241577 4.28824024 4.26400352]
 [3.84668007 3.61320265 3.81656786 3.8216991  3.80094679]
 [4.22173923 3.97259861 4.19125642 4.19683046 4.17490765]
 [4.6817751  4.40423166 4.64751748 4.65370907 4.62924671]]
