In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from cf_model import CollaborativeFiltering
from dataset import CFDataset, Encoder, split_data
import import_ipynb
import db
from torch.utils.data import DataLoader
from tqdm import tqdm
import random

importing Jupyter notebook from db.ipynb
cf_model.py	main.ipynb	   __pycache__	     user.py
dataset.py	menv		   README.md	     wandb
db.ipynb	model_weights.pth  requirements.txt  your_database.db
filemanager.py	movusers.db	   train.ipynb


In [2]:
db.display_size()

Users: 100

Movies: 30

Reviews: 713


In [3]:
# print(db.display_size())
#TODO change shuffle and not shuffle

data = db.get_table_values("Reviews")


list_of_lists = [list(t) for t in data]

# Shuffle the list of lists
random.shuffle(list_of_lists)

# Convert the shuffled list of lists back to a list of tuples
data = [tuple(lst) for lst in list_of_lists]

# print(data[0:10])

SELECT * FROM Reviews


In [4]:
encoder = Encoder([did[2] for did in data],[did[1] for did in data])
# print(encoder.user_to_idx)
# print(encoder.idx_to_user)

In [5]:
all_ds = CFDataset(data,encoder)

In [6]:
train_data, test_data = split_data(data)
train_data, validation_data = split_data(train_data)
train_ds = CFDataset(train_data,encoder)
test_ds = CFDataset(test_data,encoder)
validation_ds = CFDataset(validation_data,encoder)


BATCH_SIZE = 16
train_data_loader = DataLoader(train_ds,batch_size=BATCH_SIZE,shuffle=False,drop_last=True)
validation_data_loader = DataLoader(validation_ds,batch_size=BATCH_SIZE,shuffle=False,drop_last=True)
test_data_loader = DataLoader(test_ds,batch_size=BATCH_SIZE,shuffle=False,drop_last=True)

In [7]:
all_ds[20]

(tensor(19), tensor(15), tensor(10.))

In [8]:
# for ds in train_data_loader:
#     print(ds)

In [9]:
max([ds[1] for ds in train_ds.dataset])

tensor(29)

In [10]:
num_users = len(db.get_table_values("Users"))
num_movies = len(db.get_table_values("Movies"))
FEATURES=700
model = CollaborativeFiltering(num_users, num_movies, embedding_dim=FEATURES)


LEARNING_RATE = 0.001
criterion = nn.MSELoss()
optimiser = optim.SGD(model.parameters(), lr=0.001)
EPOCHS = 25

SELECT * FROM Users
SELECT * FROM Movies


# PRETEST

In [11]:
#test data
model.eval()  # Switch to evaluation mode
test_loss = 0
correct_predictions = 0  # Fixed variable name for clarity
total_contexts = 0

with torch.no_grad():  # Disable gradient computation
    for user_ids, movie_ids, ratings in test_data_loader:
        predictions = model(user_ids,movie_ids)  # Generate predictions
        # Calculate and accumulate loss
        loss = criterion(predictions, ratings)
        test_loss += loss.item()
        
        # # Reshape predictions to match [batch_size, context_size, vocab_size]
        # predictions = predictions.view(-1, context_size, VOCAB_SIZE)
        
        # # Get top prediction for each context position
        # top_predictions = predictions.argmax(dim=2)
        
        # # Calculate correct predictions
        # correct_preds = (top_predictions == context).float().sum()
        # correct_predictions += correct_preds.item()  # Accumulate correct predictions
        
        # total_contexts += context.numel()  # Total number of context word positions evaluated

# Calculate final metrics
test_loss /= len(test_data_loader)  # Average loss per batch
# print('correct predictions = ',correct_predictions)
# print('out of  = ',total_contexts)
# accuracy = correct_predictions / total_contexts  # Compute accuracy

# print(f'Test set: Average loss: {test_loss:.4f}, Accuracy: {accuracy:.4f}')

# Evaluate the model
# (This would involve using a separate validation set or performing cross-validation)
test_loss

53.72111463546753

## wandb

In [12]:
import wandb
import random

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    entity="jcrich",
    project="collaborative filter model",
    
    # track hyperparameters and run metadata
    config={
    "learning_rate": LEARNING_RATE,
    "architecture": "collaborative filter",
    "dataset": "imdb",
    "epochs": EPOCHS,
    }
)


# # simulate training
# epochs = 10
# offset = random.random() / 5
# for epoch in range(2, epochs):
#     acc = 1 - 2 ** -epoch - random.random() / epoch - offset
#     loss = 2 ** -epoch + random.random() / epoch + offset
    
#     # log metrics to wandb
#     wandb.log({"acc": acc, "loss": loss})
    
# # [optional] finish the wandb run, necessary in notebooks
# wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjcrich[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
# Initialize lists to keep track of losses and epochs
train_losses = []
validation_losses = []
epochs = []

for epoch in range(EPOCHS):
    model.train()
    total_epoch_loss = 0
    for batch_idx, (user_ids, movie_ids, ratings) in tqdm(enumerate(train_data_loader), total=len(train_data_loader), desc=f'Epoch {epoch}'):

        optimiser.zero_grad()
        predictions = model(user_ids, movie_ids)
        loss = criterion(predictions, ratings)

        loss.backward()
        optimiser.step()
        
        batch_loss = loss.item()
        total_epoch_loss += batch_loss
        
    # After all batches, calculate average loss for the epoch
    avg_epoch_loss = total_epoch_loss / len(train_data_loader)
    train_losses.append(avg_epoch_loss)
    epochs.append(epoch)  # Append the current epoch number
    
    wandb.log({"Training Loss": avg_epoch_loss, "Epoch": epoch})
    # print(f'Epoch: {epoch}, Average Training Loss: {avg_epoch_loss}')

    # Start validation phase
    model.eval()  # Set the model to evaluation mode
    total_validation_loss = 0
    with torch.no_grad():  # Turn off gradients for validation, saves memory and computations
        for batch_idx, (user_ids, movie_ids, ratings) in enumerate(validation_data_loader):
            predictions = model(user_ids, movie_ids)
            loss = criterion(predictions, ratings)
            total_validation_loss += loss.item()
    
    # Calculate and log validation loss after the epoch
    avg_validation_loss = total_validation_loss / len(validation_data_loader)
    validation_losses.append(avg_validation_loss)
    
    wandb.log({"Validation Loss": avg_validation_loss, "Epoch": epoch})
    # print(f'Epoch: {epoch}, Average Validation Loss: {avg_validation_loss}')    
    # After training and validation phases, log the losses with the epoch number
    wandb.log({"Epoch": epoch, "Training Loss": avg_epoch_loss, "Validation Loss": avg_validation_loss})

wandb.finish()

Epoch 0: 100%|████████████████████████████████| 28/28 [00:00<00:00, 1491.33it/s]
Epoch 1: 100%|████████████████████████████████| 28/28 [00:00<00:00, 1963.26it/s]
Epoch 2: 100%|████████████████████████████████| 28/28 [00:00<00:00, 1907.09it/s]
Epoch 3: 100%|████████████████████████████████| 28/28 [00:00<00:00, 1973.29it/s]
Epoch 4: 100%|████████████████████████████████| 28/28 [00:00<00:00, 1945.15it/s]
Epoch 5: 100%|████████████████████████████████| 28/28 [00:00<00:00, 1992.71it/s]
Epoch 6: 100%|████████████████████████████████| 28/28 [00:00<00:00, 2030.09it/s]
Epoch 7: 100%|████████████████████████████████| 28/28 [00:00<00:00, 2068.38it/s]
Epoch 8: 100%|████████████████████████████████| 28/28 [00:00<00:00, 2078.34it/s]
Epoch 9: 100%|████████████████████████████████| 28/28 [00:00<00:00, 2080.69it/s]
Epoch 10: 100%|███████████████████████████████| 28/28 [00:00<00:00, 1956.62it/s]
Epoch 11: 100%|███████████████████████████████| 28/28 [00:00<00:00, 1841.86it/s]
Epoch 12: 100%|█████████████

0,1
Epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
Training Loss,██▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Validation Loss,██▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂

0,1
Epoch,24.0
Training Loss,3.39253
Validation Loss,4.94398


In [14]:
#test data
model.eval()  # Switch to evaluation mode
test_loss = 0
correct_predictions = 0  # Fixed variable name for clarity
total_contexts = 0

with torch.no_grad():  # Disable gradient computation
    for user_ids, movie_ids, ratings in test_data_loader:
        predictions = model(user_ids,movie_ids)  # Generate predictions
        # Calculate and accumulate loss
        loss = criterion(predictions, ratings)
        test_loss += loss.item()
        
        # # Reshape predictions to match [batch_size, context_size, vocab_size]
        # predictions = predictions.view(-1, context_size, VOCAB_SIZE)
        
        # # Get top prediction for each context position
        # top_predictions = predictions.argmax(dim=2)
        
        # # Calculate correct predictions
        # correct_preds = (top_predictions == context).float().sum()
        # correct_predictions += correct_preds.item()  # Accumulate correct predictions
        
        # total_contexts += context.numel()  # Total number of context word positions evaluated

# Calculate final metrics
test_loss /= len(test_data_loader)  # Average loss per batch
# print('correct predictions = ',correct_predictions)
# print('out of  = ',total_contexts)
# accuracy = correct_predictions / total_contexts  # Compute accuracy

# print(f'Test set: Average loss: {test_loss:.4f}, Accuracy: {accuracy:.4f}')

# Evaluate the model
# (This would involve using a separate validation set or performing cross-validation)
test_loss

6.86813497543335

## Preference ranking

In [15]:
target_user_id = 140440102666832

In [16]:
e_uid = encoder.encode(target_user_id,encoder.user_to_idx)

In [17]:
def get_user_features(uid):

    for i in range(0,len(all_ds)):

        u,m,r = all_ds[i]
        # print(u)
        if u == uid:
            yield [m,r]
    

In [18]:
def predict_ratings(e_uid,e_mid,model):
    e_uid_tensor = torch.tensor(e_uid, dtype=torch.int64).unsqueeze(0)
    movie_eid_tensor = torch.tensor(e_mid,dtype=torch.int64).unsqueeze(0)
    model.eval()
    with torch.no_grad():
        user_rating = model(e_uid_tensor,movie_eid_tensor)
        return movie_eid_tensor,user_rating
        
    

In [19]:
#get user and features
seen_features = set(tuple(fe) for fe in get_user_features(e_uid)) # Get user features if needed
seen_movies = set(sf[0].item() for sf in seen_features)
all_movies = set([num for num in range(30)])
unseen_movies = all_movies - seen_movies
unseen_features = {tuple(predict_ratings(e_uid,um,model)) for um in unseen_movies}
all_features = seen_features | unseen_features

sorted_set = sorted(all_features, key=lambda x: x[1])
top_n = 10
recommendations = list(filter(lambda x: x[0].item() in unseen_movies,sorted_set))
n_recommendations = list(map(lambda x: x[0],recommendations[-1:top_n*-1:-1]))
n_recommendations


[tensor([29]),
 tensor([9]),
 tensor([15]),
 tensor([23]),
 tensor([21]),
 tensor([17]),
 tensor([26]),
 tensor([28]),
 tensor([19])]

# Matching

In [20]:
#all users, with all movie ratings.
def generate_all_ratings(model,num_users,num_movies):
    #build matrix

    #data already encoded....
    all_data = [['dummy',mnm,unm,'dummy','0/10'] for unm in range(0,num_users) for mnm in range(0,num_movies)]
    # print(all_data)
    # print([em for em in all_data])
    fake_encoder = Encoder([did[2] for did in all_data],[did[1] for did in all_data])
    all_ds = CFDataset(all_data,fake_encoder)
    
    data_loader = DataLoader(all_ds, batch_size=1)
    for uid,mid,ra in data_loader:
        # print(uid,mid)
        prediction = model(uid,mid)
        yield (uid.item(),mid.item(),prediction.item())



In [21]:
def create_ratings_tensor(data, num_users, num_movies):
    # Initialize an empty tensor to hold the ratings
    ratings_tensor = torch.zeros(num_users, num_movies)
    
    # Iterate over the data and fill the tensor
    for entry in data:
        user_id, movie_id, rating = entry
        # Convert rating to float
        rating = float(rating)
        ratings_tensor[user_id, movie_id] = rating
    
    return ratings_tensor

In [22]:
# Step 1: Generate Predictions
predictions = [ra for ra in generate_all_ratings(model,num_users,num_movies)]
# print(predictions)
ratings_tensor = create_ratings_tensor(predictions,num_users,num_movies)
print(ratings_tensor)
# predictions[0]

# ratings_tensor.shape

# ratings_tensor[0]

import torch.nn.functional as F

# Assuming predicted_ratings is your tensor of predicted ratings
# predicted_ratings.shape should be (num_users, num_movies)

# # Calculate cosine similarity
# user_similarities = F.cosine_similarity(ratings_tensor, ratings_tensor, dim=1)

# user_similarities

tensor([[ 3.3939,  8.1604,  5.4219,  ...,  5.1658,  6.8648,  9.8191],
        [ 4.7104,  9.4769,  6.7384,  ...,  6.4822,  8.1813, 11.1355],
        [ 4.5359,  9.3024,  6.5639,  ...,  6.3077,  8.0068, 10.9610],
        ...,
        [ 4.1149,  8.8814,  6.1429,  ...,  5.8867,  7.5858, 10.5400],
        [ 3.8317,  8.5981,  5.8597,  ...,  5.6035,  7.3026, 10.2568],
        [ 2.2451,  7.0116,  4.2731,  ...,  4.0169,  5.7160,  8.6702]])


## COSINE SIMILARITY

In [23]:
# Step 1: Normalize predicted ratings
normalized_ratings = F.normalize(ratings_tensor, p=2, dim=1)

# Step 2: Calculate cosine similarity
user_similarities = torch.matmul(normalized_ratings, normalized_ratings.T)

# Set diagonal elements to a large negative value to exclude self-similarity
user_similarities.fill_diagonal_(-float('inf'))

# You can optionally convert the similarities tensor to a numpy array for easier manipulation
user_similarities_np = user_similarities.numpy()

# Print or use the user similarities tensor
print(user_similarities)

tensor([[  -inf, 0.9993, 0.9995,  ..., 0.9998, 0.9999, 0.9990],
        [0.9993,   -inf, 1.0000,  ..., 0.9999, 0.9997, 0.9966],
        [0.9995, 1.0000,   -inf,  ..., 0.9999, 0.9998, 0.9969],
        ...,
        [0.9998, 0.9999, 0.9999,  ...,   -inf, 1.0000, 0.9977],
        [0.9999, 0.9997, 0.9998,  ..., 1.0000,   -inf, 0.9983],
        [0.9990, 0.9966, 0.9969,  ..., 0.9977, 0.9983,   -inf]])


### Matches

In [24]:
top_matches = {}
for i in range(len(user_similarities)):
    # Sort similarities for the current user i
    ranked_users = torch.argsort(torch.tensor(user_similarities[i]), descending=True)
    # Exclude self from top matches (optional)
    top_matches[i] = ranked_users[ranked_users != i]


  ranked_users = torch.argsort(torch.tensor(user_similarities[i]), descending=True)


In [25]:
top_matches

{0: tensor([39, 91, 45,  6, 31, 14, 61, 62, 58,  4, 89,  9, 38, 24, 46, 90, 27, 49,
         67, 41, 36, 53, 23, 81, 10, 82, 98, 17, 56, 87, 19, 25,  3, 65, 13, 33,
         43, 55, 48, 72, 86, 11, 66, 42, 79, 68, 85, 70, 97, 51, 69, 71, 59, 93,
         54, 37,  5, 95, 35, 88,  8, 50, 32,  7, 80, 16,  2, 26, 15, 74, 12, 20,
         75,  1, 83, 44, 77, 60, 47, 34, 78, 22, 18, 64, 21, 52, 63, 99, 28, 30,
         96, 57, 94, 76, 73, 84, 40, 29, 92]),
 1: tensor([44, 83, 75, 12, 77, 74, 15, 47, 34,  2, 16, 22,  7, 18, 21, 63, 32,  8,
         88, 95, 28, 37, 71, 30, 51, 97, 70, 96, 85, 68, 79, 42, 66, 11, 86, 48,
         57, 55, 94, 43, 76, 25, 19, 87, 56, 98, 82, 81, 23, 53, 36, 41, 49, 27,
         90, 46, 89,  4, 58, 61, 14,  6, 45, 91, 39, 84,  0, 31, 62,  9, 38, 24,
         67, 10, 17,  3, 65, 13, 33, 72, 69, 59, 93, 54,  5, 35, 50, 80, 26, 20,
         60, 78, 64, 52, 99, 73, 40, 29, 92]),
 2: tensor([16, 15,  7, 74, 12, 32, 75, 83,  1,  8, 88, 44, 77, 95, 37, 47, 34, 71,
      

In [26]:
encoder.decode(71,encoder.idx_to_user)

140440102673744

In [27]:
#decoded

decoded_matches = {encoder.decode(k,encoder.idx_to_user):[encoder.decode(ve,encoder.idx_to_user) for ve in v.tolist()] for k,v in top_matches.items()}
decoded_matches

{140440118970112: [140440118980864,
  140440102676816,
  140440115342416,
  140440118980096,
  140440122653424,
  140440115343232,
  140440102248624,
  140440102673696,
  140440418899120,
  140440102689696,
  140440115335552,
  140440115336320,
  140440102676528,
  140440115344768,
  140440115339392,
  140440116024304,
  140440116028384,
  140440102671104,
  140440116031456,
  140440115339248,
  140440102678784,
  140440115342320,
  140440115335024,
  140440115332816,
  140440115336656,
  140440115342176,
  140440115340304,
  140440418909248,
  140440118979904,
  140440115340064,
  140440115341504,
  140440102687680,
  140440102675568,
  140440102676672,
  140440115342032,
  140440115334400,
  140440122642048,
  140440118971216,
  140440115338240,
  140440102668752,
  140440116025072,
  140440115346880,
  140440102666832,
  140440418895568,
  140440102670336,
  140440118979808,
  140440102688160,
  140440115334640,
  140440115331424,
  140440102672208,
  140440118976496,
  140440102673

In [28]:
# Step 4: Select Top Matches
N = 5  # Number of top matches to select
for user, similar_users in decoded_matches.items():
    decoded_matches[user] = similar_users[:N]

decoded_matches

{140440118970112: [140440118980864,
  140440102676816,
  140440115342416,
  140440118980096,
  140440122653424],
 140440118980576: [140440115341648,
  140440115341936,
  140440102246368,
  140440115336272,
  140440111360720],
 140440115335120: [140440102677584,
  140440404457648,
  140440418893936,
  140440115340016,
  140440115336272],
 140440102675568: [140440102676672,
  140440115342032,
  140440115334400,
  140440418909248,
  140440102668752],
 140440102689696: [140440418899120,
  140440115335552,
  140440102248624,
  140440115343232,
  140440118980096],
 140440115334784: [140440102686480,
  140440102686912,
  140440115345968,
  140440102689792,
  140440115336560],
 140440118980096: [140440102676816,
  140440118980864,
  140440115342416,
  140440115343232,
  140440118970112],
 140440418893936: [140440102677584,
  140440115335120,
  140440115331664,
  140440115338096,
  140440102247376],
 140440115338096: [140440115331664,
  140440102247376,
  140440115332240,
  140440122639792,
  1

In [29]:
#decode....

## FINALLY FIND THE MATCHES....

In [30]:
target_user = 140440102666832



In [31]:
decoded_matches[target_user]

[140440418895568,
 140440116025072,
 140440115346880,
 140440115338240,
 140440118971216]

In [32]:
neighbour = 140440115331424

## Now see how they relate to the database...

In [33]:
reviews = db.get_table_values('Reviews')

SELECT * FROM Reviews


In [34]:
#TODO

In [35]:
def reviews_info(user_id,reviews):
    # print(db.get_table_values('Users'))
    info = {}
    info[user_id] = list(filter(lambda x :x[0] == user_id,db.get_table_values('Users')))[0][1]

    
    #TODO finish work...
    mov_tab = db.get_table_values('Movies')
    
    for review in reviews:
        mov_id = review[1]
        info[mov_id] = {'title':list(filter(lambda x :x[0] == mov_id,mov_tab))[0][1],'rating':review[4]}
        
    return info
    
    

In [36]:
target_reviews = list(filter(lambda rv: rv[2] == target_user,reviews))
neighbour_reviews = list(filter(lambda rv: rv[2] == neighbour,reviews))

In [37]:
tar_info = reviews_info(target_user,target_reviews)
neigh_info = reviews_info(neighbour,neighbour_reviews)

SELECT * FROM Users
SELECT * FROM Movies
SELECT * FROM Users
SELECT * FROM Movies


In [38]:
tar_info

{140440102666832: 'Ləman Lətifxanlı',
 2294629: {'title': 'Frozen', 'rating': '6/10'},
 6320628: {'title': 'Spider-Man: Far From Home', 'rating': '8/10'},
 120338: {'title': 'Titanic', 'rating': '9/10'},
 1074638: {'title': 'Skyfall', 'rating': '8/10'},
 1517268: {'title': 'Barbie', 'rating': '6/10'},
 4154796: {'title': 'Avengers: Endgame', 'rating': '10/10'},
 2820852: {'title': 'Furious 7', 'rating': '8/10'},
 110357: {'title': 'The Lion King', 'rating': '10/10'},
 2527336: {'title': 'Star Wars: The Last Jedi', 'rating': '6/10'}}

In [39]:
neigh_info

{140440115331424: 'Dainius Petrauskas',
 120338: {'title': 'Titanic', 'rating': '10/10'},
 4881806: {'title': 'Jurassic World: Fallen Kingdom', 'rating': '2/10'},
 4630562: {'title': 'The Fate of the Furious', 'rating': '9/10'},
 8041270: {'title': 'Jurassic World', 'rating': '3/10'},
 1630029: {'title': 'Avatar: The Way of Water', 'rating': '4/10'}}

# SAVE_MODEL

In [40]:
torch.save(model.state_dict(), 'model_weights.pth')