In [58]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds

# Sample dataset
data = np.array([
    [5, 3, 0, 0],
    [4, 0, 0, 5],
    [0, 0, 3, 4],
    [0, 5, 0, 0],
    [0, 0, 4, 0]
],dtype=float)

data_n = data / 5.0

# Perform SVD
U, sigma, Vt = svds(data, k=3)

# Convert sigma into a diagonal matrix
sigma = np.diag(sigma)

# Print the resulting matrices
print("User latent factors (U):")
print(np.round(U, 2))
print("\nSingular values (Sigma):")
print(np.round(sigma, 2))
print("\nItem latent factors (V^T):")
print(np.round(Vt, 2))

print("\nOriginal matrix (R):")
print(np.round(data, 2))

reconstructed_R = np.dot(np.dot(U, sigma), Vt)
print("\nReconstructed matrix (R):")
print(np.round(reconstructed_R, 2))

User latent factors (U):
[[-0.05  0.54 -0.54]
 [-0.39 -0.24 -0.72]
 [ 0.39 -0.53 -0.38]
 [ 0.56  0.54 -0.2 ]
 [ 0.61 -0.27 -0.09]]

Singular values (Sigma):
[[4.86 0.   0.  ]
 [0.   6.33 0.  ]
 [0.   0.   8.14]]

Item latent factors (V^T):
[[-0.38  0.54  0.75 -0.08]
 [ 0.28  0.69 -0.42 -0.53]
 [-0.69 -0.32 -0.18 -0.63]]

Original matrix (R):
[[5. 3. 0. 0.]
 [4. 0. 0. 5.]
 [0. 0. 3. 4.]
 [0. 5. 0. 0.]
 [0. 0. 4. 0.]]

Reconstructed matrix (R):
[[ 4.05  3.62 -0.82  0.96]
 [ 4.33 -0.22  0.29  4.66]
 [ 0.45 -0.3   3.39  3.54]
 [ 1.02  4.34  0.88 -1.03]
 [-1.08  0.71  3.06  1.1 ]]


In [59]:
from keras.layers import Input, Dense
from keras.models import Model

#normalize the data
data_n = data / 5.0

#define input layer
input_layer = Input(shape=(data_n.shape[1],))

#encode the layer to 3 latent factors
encoding_layer = Dense(3, activation='tanh')(input_layer)

#decode
decoding_layer = Dense(data_n.shape[1], activation='sigmoid')(encoding_layer)

#define model
autoencoder = Model(input_layer, decoding_layer)

autoencoder.compile(optimizer='adam', loss='mse')

autoencoder.fit(data_n, data_n, epochs=1000, batch_size=2,verbose=0)


<keras.src.callbacks.history.History at 0x79a69d783f10>

In [60]:
#lower dimension representation
# Use the encoder to obtain the lower-dimensional representation
encoder = Model(input_layer, encoding_layer)
encoded_data = encoder.predict(data)
print('Lower dimension representation')
print(encoded_data)
#original data
print('Original data')
print(data)
#use autoencoder to get lower dimension representation
reconstructed_data = autoencoder.predict(data_n) * 5
print('reconstructed data')
print(np.round(reconstructed_data,1))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Lower dimension representation
[[ 0.99999964  0.18238753  0.99992496]
 [-0.03884642 -1.          1.        ]
 [-0.99999785 -0.97128886 -0.7388996 ]
 [ 1.          0.99999964 -0.7738645 ]
 [-0.99999917  0.99708664 -0.9999854 ]]
Original data
[[5. 3. 0. 0.]
 [4. 0. 0. 5.]
 [0. 0. 3. 4.]
 [0. 5. 0. 0.]
 [0. 0. 4. 0.]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
reconstructed data
[[3.7 3.3 0.3 0.9]
 [4.2 0.4 0.8 4.7]
 [1.  0.4 3.2 3.5]
 [1.3 4.4 0.8 0.1]
 [0.2 0.7 3.8 0.8]]


Lets create a sparse matrix for represneting missing user/item ratings

In [34]:
import numpy as np

# Create a random 20x20 matrix with values representing user-item ratings
np.random.seed(42)  # For reproducibility
user_item = np.random.randint(1, 11, size=(10, 10)).astype(float)  # Ratings between 1 and 10

# Introduce missing values (NaN) to simulate unavailable ratings
num_missing_values = 50  # Define how many values to remove
indices = np.random.choice(user_item.size, num_missing_values, replace=False)  # Randomly select indices

# Set these indices to NaN to represent missing values
user_item.ravel()[indices] = np.nan

user_item = np.nan_to_num(user_item, nan=0.0)

# Print the generated matrix with missing values
print("Generated 20x20 matrix with missing values (0 representing unavailable ratings):")
print(user_item)



Generated 20x20 matrix with missing values (NaN representing unavailable ratings):
[[ 7.  4.  8.  5.  7. 10.  0.  0.  8.  0.]
 [ 0.  8.  8.  3.  6.  0.  0.  0.  0.  2.]
 [ 0.  1. 10.  0.  0.  0. 10.  3.  7.  0.]
 [ 0.  3.  5.  3.  7.  5.  9.  0.  0.  0.]
 [ 0.  2. 10.  9. 10.  0.  2.  4.  0.  0.]
 [ 3.  1.  4.  0.  8.  0.  2.  6.  6.  0.]
 [ 0.  6.  2.  0.  2.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0. 10.  9.  9.  1.  9.]
 [ 0.  0.  0.  0.  0.  8.  3.  1.  0.  0.]
 [ 0.  1.  0.  0.  7. 10.  0.  0.  0.  0.]]


In [61]:
# Perform SVD
U_user, sigma_weight, Vt_items = svds(user_item, k=5)

# Convert sigma into a diagonal matrix
sigma_weight = np.diag(sigma_weight)

# Print the resulting matrices
print("User latent factors (U):")
print(np.round(U_user, 2))
print("\nSingular values (Sigma):")
print(np.round(sigma_weight, 2))
print("\nItem latent factors (V^T):")
print(np.round(Vt_items, 2))

print("\nOriginal matrix (R):")
print(np.round(user_item, 2))

reconstructed_data = np.dot(np.dot(U_user, sigma_weight), Vt_items)
print("\nReconstructed matrix (R):")
print(np.round(reconstructed_data, 1))

User latent factors (U):
[[-0.06 -0.59  0.43 -0.09  0.49]
 [-0.21  0.3   0.01 -0.34  0.29]
 [-0.33 -0.38 -0.67  0.04  0.33]
 [-0.41  0.29 -0.05  0.06  0.37]
 [ 0.33  0.43 -0.08 -0.38  0.42]
 [ 0.63 -0.25 -0.12 -0.1   0.3 ]
 [-0.2   0.12  0.04 -0.14  0.1 ]
 [ 0.27  0.25 -0.16  0.75  0.3 ]
 [-0.23 -0.01  0.19  0.32  0.13]
 [-0.04  0.1   0.53  0.18  0.22]]

Singular values (Sigma):
[[ 9.49  0.    0.    0.    0.  ]
 [ 0.   11.49  0.    0.    0.  ]
 [ 0.    0.   15.38  0.    0.  ]
 [ 0.    0.    0.   19.42  0.  ]
 [ 0.    0.    0.    0.   33.43]]

Item latent factors (V^T):
[[ 0.16 -0.36 -0.22  0.09  0.33 -0.23 -0.35  0.66  0.14  0.21]
 [-0.42  0.18 -0.09  0.24  0.26 -0.08  0.13  0.12 -0.75  0.25]
 [ 0.17  0.09 -0.31  0.08  0.31  0.6  -0.55 -0.28 -0.14 -0.09]
 [-0.05 -0.22 -0.37 -0.24 -0.3   0.58  0.4   0.26 -0.01  0.31]
 [ 0.13  0.23  0.51  0.25  0.48  0.39  0.33  0.22  0.25  0.1 ]]

Original matrix (R):
[[ 7.  4.  8.  5.  7. 10.  0.  0.  8.  0.]
 [ 0.  8.  8.  3.  6.  0.  0.  0.  0.  2.]


In [46]:
from keras.layers import Input, Dense
from keras.models import Model

#normalize user_item
user_item_n = user_item / 10.0

input_layer = Input(shape=(user_item_n.shape[1],))

#encode the layer to 3 latent factors
encoding_layer = Dense(5, activation='tanh')(input_layer)

#decode
decoding_layer = Dense(user_item_n.shape[1], activation='sigmoid')(encoding_layer)

#define model
autoencoder = Model(input_layer, decoding_layer)

autoencoder.compile(optimizer='adam', loss='mse')

autoencoder.fit(user_item_n, user_item_n, epochs=1000, batch_size=4,verbose=0)

reconstructed_data = autoencoder.predict(user_item_n) * 10
print('reconstructed data')
print(np.round(reconstructed_data,1))


<keras.src.callbacks.history.History at 0x79a69b8053f0>

In [47]:
encoder = Model(input_layer, encoding_layer)
encoded_data = encoder.predict(user_item_n)
print('Lower dimension representation')
print(encoded_data)
#original data
print('Original data')
print(user_item)
#use autoencoder to get lower dimension representation
reconstructed_data = autoencoder.predict(user_item_n) * 10
print('reconstructed data')
print(np.round(reconstructed_data,1))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Lower dimension representation
[[-0.48024788  0.67335963  0.17243196 -0.31488127 -0.9798216 ]
 [-0.91600406  0.27495766  0.6429624   0.9140307   0.18351303]
 [-0.72465414  0.03569224  0.91072655 -0.9667925   0.89887357]
 [-0.4745828  -0.5723395   0.8958237   0.08178291  0.4914223 ]
 [-0.974562   -0.15137216 -0.20821458  0.8228244   0.7921172 ]
 [-0.85846317 -0.85498154 -0.5042633  -0.3900629  -0.21004905]
 [-0.58132887 -0.39902332  0.7733504   0.8310442  -0.31509763]
 [ 0.96280134 -0.98203486  0.43898824 -0.7220377   0.8308267 ]
 [ 0.7797876  -0.78616387  0.9058016   0.26461664 -0.5912529 ]
 [ 0.40184987 -0.84573287  0.78622895  0.85611105 -0.92317677]]
Original data
[[ 7.  4.  8.  5.  7. 10.  0.  0.  8.  0.]
 [ 0.  8.  8.  3.  6.  0.  0.  0.  0.  2.]
 [ 0.  1. 10.  0.  0.  0. 10.  3.  7.  0.]
 [ 0.  3.  5.  3.  7.  5.  9.  0.  0.  0.]
 [ 0.  2. 10.  9. 10.  0.  2.  4.  0.  0.]
 [ 3.  1.  4.  0.  8.  0.  2.  6.  6.

In [56]:
#find new user rating where all values are missing for ratings
new_user_ratings = np.zeros((1, 10))
print(new_user_ratings)
latent_feature_new = np.dot(new_user_ratings, Vt_items.T)
predicted_svd_new_user = np.dot(latent_feature_new, Vt_items)

print("Predicted ratings for the new user (SVD):")
print(predicted_svd_new_user)
predicted_ae_new_user = autoencoder.predict(new_user_ratings)

print("Predicted ratings for the new user (Autoencoder):")
print(predicted_ae_new_user)


[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Predicted ratings for the new user (SVD):
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Predicted ratings for the new user (Autoencoder):
[[0.04385865 0.22426341 0.18491752 0.12395512 0.5133232  0.51758754
  0.22713345 0.11031058 0.05107613 0.080647  ]]
