In [2]:
import os

import numpy as np
import pandas as pd

from autoencoder import AutoEncoder

In [53]:
from torch.utils.data import DataLoader, TensorDataset
import torch
from torch import nn

In [42]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
	device = 'cuda:0'
else:
	device = 'cpu'

In [47]:
# Switch for the model cell below: False loads the saved model and weights from disk,
# True trains a new AutoEncoder on sample_matrix and saves it.
training = False

In [50]:
# Switch for the data cell below: True rebuilds the rating matrix from predictions.csv,
# False reads the previously stored sample_matrix.csv.
generate_data = False

In [None]:
if generate_data:
	# Build a user x book rating matrix from the long-format ratings file.
	sample = pd.read_csv('predictions.csv')
	# One (book_id, rating) frame per user; the dict keys become the matrix rows.
	res = {k: v.get(["book_id", "rating"]) for k, v in sample.groupby('user_id')}
	# Start from an all-NaN frame; cells with no rating stay NaN in this branch.
	sample_matrix = pd.DataFrame(index=res.keys(), columns=sample['book_id'].unique())
	for k, v in res.items():
		# A single .loc[row, cols] assignment writes into sample_matrix itself
		# (chained `.loc[k][cols] = ...` may only modify a temporary copy).
		# Ratings are passed as a plain array so they are assigned positionally
		# instead of being aligned on the group's row index.
		sample_matrix.loc[k, v["book_id"].to_numpy()] = v["rating"].to_numpy()
	# Persist only after the matrix has been filled; writing it earlier stores an empty matrix.
	# NOTE(review): this writes a gzip file named 'sample_matrix.csv.gzip', while the
	# else-branch reads the uncompressed 'sample_matrix.csv' -- confirm which file is the cache.
	sample_matrix.to_csv('sample_matrix.csv.gzip', index=True, columns=sample_matrix.columns, chunksize=1000, compression='gzip')
else:
	sample_matrix = pd.read_csv('sample_matrix.csv', index_col=0)

In [51]:
# Bottleneck size of the autoencoder; it is also part of the artifact file names,
# so the load and save branches below always refer to the same files.
bottleneck = 32
model_dir = 'sample_model'
model_path = f'{model_dir}/sample_model_k={bottleneck}.pt'
weights_path = f'{model_dir}/sample_model_k_weights_{bottleneck}.pt'

if not training:
	# map_location remaps the stored tensors onto `device`, so a checkpoint saved on a
	# GPU can still be loaded on a CPU-only machine.
	# NOTE: torch.load unpickles arbitrary Python objects -- only load files you trust.
	# Recent PyTorch releases default to weights_only=True, which rejects whole-module
	# pickles like this one; pass weights_only=False there if the file is trusted.
	model = torch.load(model_path, map_location=device)
	model.load_state_dict(torch.load(weights_path, map_location=device))
else:
	tensors = torch.tensor(sample_matrix.T.to_numpy(), dtype=torch.float32, device=device)
	batch_size = 32
	train = TensorDataset(tensors)
	batches = DataLoader(train, batch_size=batch_size, shuffle=True)
	# NOTE(review): each training row (a row of sample_matrix.T) has sample_matrix.shape[0]
	# entries, while the model is built with sample_matrix.shape[1] -- confirm which
	# dimension AutoEncoder expects as its first argument.
	model = AutoEncoder(sample_matrix.shape[1], bottleneck)
	epochs, losses = model.fit(batches, n_epochs=1000, lr=0.001, patience=10)

	# Save the trained model first, so a later failure cannot discard the training run,
	# and save it to the same paths the loading branch reads from.
	os.makedirs(model_dir, exist_ok=True)
	torch.save(model, model_path)
	torch.save(model.state_dict(), weights_path)

	# to_csv does not create missing directories, so make sure the target exists.
	loss_dir = 'models/training_loss'
	os.makedirs(loss_dir, exist_ok=True)
	pd.DataFrame({"epochs": epochs, "training losses": losses}).to_csv(f'{loss_dir}/{bottleneck}.csv')

In [54]:
def mask_test_model(model: nn.Module, mask_fraction: float, row: torch.Tensor, device: torch.device):
    """Hide a random subset of a row's non-zero entries and reconstruct the row.

    Every non-zero entry of ``row`` is independently set to zero with
    probability ``mask_fraction``; the masked row is then fed through ``model``.

    Args:
        model: Module called on the masked row. It is switched to eval mode
            and left in that mode.
        mask_fraction: Probability of hiding each non-zero entry.
        row: Input tensor, expected to be 1-D (only first-axis indices of the
            non-zero entries are used). The caller's tensor is not modified.
        device: Device on which masking and the forward pass run. ``model`` is
            expected to already live on this device.

    Returns:
        ``(predictions, mask)`` as NumPy arrays. ``predictions`` is the model
        output for the masked row. ``mask`` is boolean with one element per
        non-zero entry of ``row`` (not per row element), True where the entry
        was hidden.
    """
    model.eval()
    with torch.no_grad():
        # Move the row first so the indices, the mask and the data all share a device;
        # indexing with tensors on different devices raises.
        row = row.to(device)

        # Positions of the observed (non-zero) entries.
        non_zero_indices = row.nonzero(as_tuple=True)[0]

        # Draw the random mask directly on the target device.
        mask = torch.rand(len(non_zero_indices), device=device) < mask_fraction

        # Zero the selected entries on a copy so the input stays intact.
        masked_row = row.clone()
        masked_row[non_zero_indices[mask]] = 0

        predictions = model(masked_row)

        return predictions.cpu().numpy(), mask.cpu().numpy()

In [55]:
def test_sample_model(model: AutoEncoder, sample:pd.DataFrame, device: torch.device, mask_fraction: float = 0.2):
    """Reconstruct every row of ``sample`` from a randomly masked copy and score it.

    Each row is converted to a float32 tensor on ``device`` and passed to
    ``mask_test_model``; the predictions are collected into a frame with the
    same labels as ``sample``.

    Args:
        model: Model under test; switched to eval mode.
        sample: Frame whose rows are reconstructed one at a time.
        device: Device used for the row tensors and the forward pass.
        mask_fraction: Probability of hiding each non-zero entry of a row.
            Defaults to 0.2, the value this function used before the
            parameter existed.

    Returns:
        ``(loss, reconstructed, sample)`` where ``loss`` is the RMSE between
        ``sample`` and the predictions taken over *all* entries (not only the
        masked ones), ``reconstructed`` is a DataFrame of predictions with
        ``sample``'s index and columns, and ``sample`` is the input frame.

    Raises:
        ValueError: If ``sample`` has no rows (nothing to stack).
    """
    model.eval()

    # Convert once; each row of this array is one model input.
    sample_values = sample.to_numpy()

    # One prediction vector per row, stacked into a (rows x columns) array.
    predictions = np.stack([
        mask_test_model(
            model,
            mask_fraction,
            torch.tensor(row_values, dtype=torch.float32, device=device),
            device,
        )[0]
        for row_values in sample_values
    ])

    reconstructed = pd.DataFrame(predictions, index=sample.index, columns=sample.columns)

    # Root-mean-square error over every cell of the matrix.
    mse = np.mean((sample_values - predictions) ** 2)
    loss = np.sqrt(mse)

    return loss, reconstructed, sample

In [95]:
# The masks drawn inside mask_test_model are random; seed PyTorch so this cell
# yields the same reconstruction on every run.
torch.manual_seed(42)
# test_sample_model returns (rmse, reconstruction, input). Keep the reconstruction and
# transpose it back so it has the same orientation as sample_matrix.
result = test_sample_model(model, sample_matrix.T, device)[1].T

In [78]:
# Zero out every prediction that does not exceed the 0.1 threshold (NaN becomes 0 as well).
# DataFrame.where does this in one vectorised step and replaces the element-wise
# applymap call, which is deprecated in recent pandas versions.
result2 = result.where(result > 0.1, 0)

  result2 = result.applymap(lambda x: x if x > 0.1 else 0)


In [98]:
# Entries of column 'B00086PL00' whose predicted value is greater than 0.01
result.loc[result['B00086PL00'] > 0.01, 'B00086PL00']

A10A1S5NAQBT21    0.011993
A11GO5VA74HD8K    0.010153
A12A08OL0TZY0W    0.019975
A13F2IV3ME23R     0.044718
A14OJS0VWMOSWO    0.020910
                    ...   
AUM3YMZ0YRJE0     0.016829
AVXXGV0UD721E     0.011293
AW1D2TDTE17QL     0.020270
AWLFVCT9128JV     0.013132
AYHVXPT15XU66     0.014390
Name: B00086PL00, Length: 155, dtype: float32