In [35]:
import pandas as pd
import urllib
import tempfile
import shutil
import zipfile
import os

os.chdir('/Users/joesh/neural_colab_filtering/')
archive_url = f'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
wd = os.getcwd()

with urllib.request.urlopen(archive_url) as url:
    with tempfile.NamedTemporaryFile(delete=True) as f:
        shutil.copyfileobj(url, f)
        with zipfile.ZipFile(f.name) as archive:
            archive.extractall(f'{wd}/datasets')


In [36]:
import torch
device = torch.device('cpu') if not torch.backends.mps.is_available() else torch.device('mps')
print(device)

mps


In [37]:
ratings = pd.read_csv('datasets/ml-1m/ratings.dat', sep='::', engine = 'python', 
                        header=None,
                        names=['user_id', 'movie_id', 'rating', 'timestamp'])

ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [38]:
import source.model as model
import source.data as data

import torch.nn as nn

# Create a dataset object

dataset = data.filmDataset( ratings['user_id'].values, ratings['movie_id'].values, 
                            ratings['rating'].values)

mod = model.NCF(ratings.user_id.nunique(), ratings.movie_id.nunique(), dropout_rate=0.2)

In [39]:
# Create a dataloader object

from torch.utils.data import DataLoader
def create_dataloader(dataset, batch_size):
    train_set_size = int(0.9 * len(dataset))
    test_set_size = len(dataset) - train_set_size

    trainset, testset = torch.utils.data.random_split(dataset, [train_set_size, test_set_size])

    train_set_size = int(0.8 * len(trainset))
    valid_set_size = len(trainset) - train_set_size

    train_set, valid_set = torch.utils.data.random_split(trainset, 
                       [train_set_size, valid_set_size])

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, drop_last= False)
    valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False, drop_last=False)
    test_loader = DataLoader(testset, batch_size=batch_size, shuffle=False, drop_last=False)
    return train_loader, valid_loader, test_loader

train_loader, valid_loader, test_loader = create_dataloader(dataset, 1024)

In [40]:
import json

def _get_config_file(model_path, model_name):
    #Name of the file for storing hyperparameter details
    return os.path.join(model_path, model_name+ ".config")

def _get_model_file(model_path, model_name):
    #Name of the file for storing network parameters
    return os.path.join(model_path, model_name+".tar")

def save_model(model, model_path, model_name):
    config_dict = model.config
    os.makedirs(model_path, exist_ok = True)
    config_file, model_file = _get_config_file(model_path, model_name), _get_model_file(model_path, model_name)
    with open(config_file, 'w') as f:
        json.dump(config_dict, f)
    torch.save(model.state_dict(), model_file)



checkpoint_path = os.path.join(os.getcwd(), "checkpoints")
mod = mod.to(device)
num_params = sum(p.numel() for p in mod.parameters())
print(f'Our model architecture: \n\n {mod} \n')

def load_model(model_path, model_name, network=None):
    model_file = _get_model_file(model_path, model_name)
    network.load_state_dict(torch.load(model_file, map_location=device))
    return network

Our model architecture: 

 NCF(
  (embedding_user): Embedding(6040, 32)
  (embedding_item): Embedding(3706, 32)
  (MLP): Sequential(
    (0): Linear(in_features=64, out_features=32, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=32, out_features=16, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=16, out_features=8, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
    (9): Linear(in_features=8, out_features=1, bias=True)
    (10): Sigmoid()
  )
  (dropout): Dropout(p=0.2, inplace=False)
) 



In [43]:
from tqdm.notebook import tqdm

def train_model(net, model_name, dataset, max_epoch=30, patience= 5, overwrite=False):
  file_exists  = os.path.isfile(_get_model_file(checkpoint_path, model_name))
  train_loader, valid_loader, test_loader = create_dataloader(dataset, 1024)  
  if file_exists and not overwrite:
    print("Model already exists. Skipping training")
  else:
    if file_exists:
      print("Overwriting existing model")
    
    optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
    loss_module = nn.MSELoss()
    
    val_score = []
    best_val_epoch = -1
    

    for epoch in range(max_epoch):
      net.train()
      running_loss = 0.0
      count = 0
      for users, film, ratings in tqdm(train_loader, desc = f"Epoch {epoch+1}", leave=False):
        count += 1
        users , film, ratings = users.to(device), film.to(device), ratings.to(device)
        ratings = ratings.unsqueeze(-1).type(torch.float32)
        optimizer.zero_grad() #clear any existing gradients
        preds = net(users, film)
        loss = loss_module(preds, ratings)
        running_loss += loss.item()

        loss.backward()
        optimizer.step()
      
      running_loss = running_loss/count
      val_loss = test_model(net, valid_loader)
      val_score.append(val_loss)
      print(f'Epoch {epoch+1:2d}: Training loss: {running_loss}, Validation loss {val_loss}')

      if len(val_score) == 1 or val_loss < val_score[best_val_epoch]:
        print("New best. Saving model")
        save_model(net, checkpoint_path, model_name)
        best_val_epoch = epoch
      elif best_val_epoch <= epoch - patience:
        print(f"Early stopping since model is not improving over last {patience} epochs")
        break
    
  load_model(checkpoint_path, model_name, net)
  test_acc = test_model(net, test_loader)
  print(f"Test loss: {test_acc}")
  return test_acc
      
def test_model(net, data_loader):
  loss_module = nn.MSELoss()
  net.eval()
  running_loss = 0.0
  count = 0
  for users, films, ratings in data_loader:
    count += 1
    users , films, ratings = users.to(device), films.to(device), ratings.to(device)
    with torch.no_grad():
      ratings = ratings.unsqueeze(-1).type(torch.float32)
      preds = net(users, films)
      loss = loss_module(preds, ratings)
      running_loss += loss.item()
  return running_loss/count


In [44]:
mod_name = "NCF_Recommender"
torch.seed()
train_model(mod, mod_name, dataset)


Model already exists. Skipping training
Test loss: 0.7587132374851071


0.7587132374851071

In [47]:
#mod.MLP
import gmf
gmf_mod = gmf.GMF(ratings.user_id.nunique(), ratings.movie_id.nunique())
gmf_mod = gmf_mod.to(device)
gmf_mod_name = "GMF_Recommender"
torch.seed()
train_model(gmf_mod, gmf_mod_name, dataset)


TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not list