In [1]:
import numpy as np
import torch
from torch import nn
import torchvision
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd

### Test Matrix

In [2]:
# Set random seed for reproducibility
np.random.seed(42)
torch.manual_seed(42)

# Create a user-item interaction matrix (100 users, 50 items) with sparsity
num_users = 100
num_items = 50
interaction_matrix = np.random.randint(0, 6, size=(num_users, num_items))  # Random interactions from 0 to 5

# Introduce sparsity by setting a high percentage of interactions to 0
sparsity = 0.8  # 80% of the interactions will be set to 0
mask = np.random.rand(*interaction_matrix.shape) < sparsity
interaction_matrix[mask] = 0

# Convert to PyTorch tensor
interaction_tensor = torch.tensor(interaction_matrix, dtype=torch.float32)

# Add noise to the input data
def add_noise(data, noise_factor=0.3):
    noisy_data = data + noise_factor * torch.randn_like(data)
    noisy_data = torch.clamp(noisy_data, 0., 5.)  # Ensure values stay within the interaction range
    return noisy_data

noisy_interaction_tensor = add_noise(interaction_tensor)

# Print the original and noisy matrices (first 10 users for brevity)
print("Original Interaction Matrix (first 10 users):")
print(interaction_matrix[:10])  # Print only the first 10 users for brevity
print("\nNoisy Interaction Matrix (first 10 users):")
print(noisy_interaction_tensor[:10].numpy())


Original Interaction Matrix (first 10 users):
[[0 0 2 0 0 0 0 2 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 5 0
  0 0 0 0 0 0 4 0 4 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 3 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 4]
 [4 0 0 0 0 3 0 0 0 0 2 0 0 4 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 0 0 0
  1 0 0 0 0 0 0 0 0 0 3 4 0 0]
 [0 4 2 0 3 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 2 5 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 3 0 0 0 1 0 0 0 1 2 0 4 0 0 0 0 0 0 0 5 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  4 0 0 0 0 0 0 0 0 0 0 0 0 3]
 [0 0 3 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 3 4 4 0 0 0
  0 0 0 3 0 0 0 0 0 0 0 0 5 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 2 0 0 0 0 0 5 0 0 0 2
  0 0 0 0 0 0 0 0 3 0 0 0 0 0]
 [0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
  1 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 4 0 0 0 0 0 0 3 1 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 2 3 0 0 4 0 0
  0 0 0 5 0 0 0 0 0 0 0 0 1 5]
 [0 0 0 0

In [21]:
class AutoEncoder(nn.Module):
	def __init__(self, input_dim, bottleneck_size, device='cpu'):
		super(AutoEncoder, self).__init__()
		self.device = device
		self.encoder = nn.Sequential(
			nn.Linear(input_dim, 128),
			nn.ReLU(),
			nn.Linear(128, 64),
			nn.ReLU(),
			nn.Linear(64, bottleneck_size)
		)
		self.decoder = nn.Sequential(
			nn.Linear(bottleneck_size, 64),
			nn.ReLU(),
			nn.Linear(64, 128),
			nn.ReLU(),
			nn.Linear(128, input_dim)
		)
	
	def forward(self, x):
		x = self.encoder(x)
		x = self.decoder(x)
		return x
	
	def fit(self, batches, n_epochs=100, min_delta=0.0001, lr=0.001, patience=10):
		optimizer = torch.optim.Adam(self.parameters(), lr=lr)
		criterion = nn.MSELoss()
		best_loss = float('inf')
		patience_counter = 0

		for epoch in range(n_epochs):
			epoch_loss = 0.0
			for batch in batches:
				batch = batch[0].to(self.device)  # Move batch to device
				optimizer.zero_grad()
				output = self.forward(batch)
				loss = criterion(output, batch)
				loss.backward()
				optimizer.step()
				epoch_loss += loss.item()

			epoch_loss /= len(batches)

			if epoch_loss < best_loss - min_delta:
				best_loss = epoch_loss
				patience_counter = 0
			else:
				patience_counter += 1

			if patience_counter >= patience:
				print(f"Early stopping at epoch {epoch+1} with loss {epoch_loss:.4f}")
				break

			print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {epoch_loss:.4f}')
		return


In [24]:
batch_size = 32 
train = TensorDataset(noisy_interaction_tensor)
batches = DataLoader(train, batch_size=batch_size, shuffle=True)

In [26]:
test = AutoEncoder(50, 10)
test.fit(batches, 1000, 0.0001, 0.0001, 10)

Epoch [1/1000], Loss: 1.7445
Epoch [2/1000], Loss: 1.7712
Epoch [3/1000], Loss: 1.6870
Epoch [4/1000], Loss: 1.7873
Epoch [5/1000], Loss: 1.7704
Epoch [6/1000], Loss: 1.7228
Epoch [7/1000], Loss: 1.5618
Epoch [8/1000], Loss: 1.7257
Epoch [9/1000], Loss: 1.7183
Epoch [10/1000], Loss: 1.5668
Epoch [11/1000], Loss: 1.6816
Epoch [12/1000], Loss: 1.7005
Epoch [13/1000], Loss: 1.6839
Epoch [14/1000], Loss: 1.6275
Epoch [15/1000], Loss: 1.6543
Epoch [16/1000], Loss: 1.6534
Early stopping at epoch 17 with loss 1.8313


In [5]:
user_ratings = pd.read_csv('matrix.csv.gzip', compression='gzip', index_col=0)


In [6]:
user_ratings.notna().count().sum()

1869924201

In [7]:
user_ratings = user_ratings.fillna(0)

In [None]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [14]:
tensors = torch.tensor(user_ratings.to_numpy(), dtype=torch.float32, device=device)
batch_size = 32 
train = TensorDataset(tensors)
batches = DataLoader(train, batch_size=batch_size, shuffle=True)

In [23]:
model = AutoEncoder(tensors.shape[-1], 10)
model.fit(batches, 1000, 0.0001, 0.0001, 10)

Epoch [1/1000], Loss: 0.0056
Epoch [2/1000], Loss: 0.0054
Epoch [3/1000], Loss: 0.0053
Epoch [4/1000], Loss: 0.0052


KeyboardInterrupt: 