In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchsummary import summary
%matplotlib inline


# Collabrative filtering using deep neural networks

Reference :- [Neural Collaborative Filtering](https://arxiv.org/abs/1708.05031)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [None]:
# !rm -rf '/content/movie_lens/ml-100k'

In [4]:
data_dir = "/content/movie_lens/"
ckpt_dir = './drive/MyDrive/Extra/DS/ckpt/'
tensorboard_dir = './drive/MyDrive/Extra/DS/training/'
if not os.path.isdir(data_dir):
    os.makedirs(data_dir)
if not os.path.isdir(ckpt_dir):
    os.makedirs(ckpt_dir)
if not os.path.isdir(tensorboard_dir):
    os.makedirs(tensorboard_dir)

In [5]:
writer = SummaryWriter(tensorboard_dir)
%load_ext tensorboard

In [6]:
!wget https://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip "/content/ml-100k.zip" -d "/content/movie_lens/"

--2022-04-17 13:53:31--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2022-04-17 13:53:31 (30.9 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  /content/ml-100k.zip
   creating: /content/movie_lens/ml-100k/
  inflating: /content/movie_lens/ml-100k/allbut.pl  
  inflating: /content/movie_lens/ml-100k/mku.sh  
  inflating: /content/movie_lens/ml-100k/README  
  inflating: /content/movie_lens/ml-100k/u.data  
  inflating: /content/movie_lens/ml-100k/u.genre  
  inflating: /content/movie_lens/ml-100k/u.info  
  inflating: /content/movie_lens/ml-100k/u.item  
  inflating: /content/movie_lens/ml-100k/u.occupation  
  inflating: /content/movie_lens/ml-100k/u.user  
  inflating: /content/movie_lens/ml

In [7]:
names = ['user_id', 'item_id', 'rating', 'timestamp']

df = pd.read_csv('/content/movie_lens/ml-100k/u.data','\t',names = names)
# df.rename(columns = {'user_id':'userId', 'item_id':'movieId'}, inplace = True)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
df

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [9]:
len(df.user_id.unique()), len(df.item_id.unique())

(943, 1682)

In [10]:
df.user_id = df.user_id.astype('category').cat.codes.values
df.item_id = df.item_id.astype('category').cat.codes.values

In [11]:
temp, test= train_test_split(df,test_size=0.1)
train, val = train_test_split(temp, test_size=0.23) # 0.23 x 0.9 = 0.2

In [12]:
len(train)

69300

In [13]:
len(train.user_id.unique())

943

In [14]:
class NCF(nn.Module):
  def __init__(self,args):
    super(NCF,self).__init__()
    self.no_users = args['n_users']
    self.no_movies = args['n_movies']
    self.no_latent_users = args['n_latent_variables_u']
    self.no_latent_movie = args['n_latent_variables_m']
    self.no_latent_mf = args['n_latent_variables_mf']

    self.movie_embedding = nn.Embedding(self.no_movies+1, self.no_latent_movie)
    self.movie_embedding_mf = nn.Embedding(self.no_movies+1, self.no_latent_mf)
    self.user_embedding = nn.Embedding(self.no_users+1, self.no_latent_users)
    self.user_embedding_mf = nn.Embedding(self.no_users+1, self.no_latent_mf)


    self.flatten = nn.Flatten()
    self.dropout = nn.Dropout(0.2)
    self.relu = nn.ReLU(inplace=True)
    self.bn1 = nn.BatchNorm1d(200)
    self.bn2 = nn.BatchNorm1d(100)
    

    self.fc1 = nn.Linear(self.no_latent_movie+self.no_latent_users,200)
    self.fc2 = nn.Linear(200,100)
    self.fc3 = nn.Linear(100,50)
    self.fc4 = nn.Linear(50,20)
    self.fc5 = nn.Linear(20,1)
    self.fc6 = nn.Linear(2,100)
    self.fc7 = nn.Linear(100,100)
    self.fc8 = nn.Linear(100,1)


  def forward(self,u,m):
    mx= self.movie_embedding(m)
    mx = self.flatten(mx)
    mx = self.dropout(mx)

    mxf= self.movie_embedding_mf(m)
    mxf = self.flatten(mxf)
    mxf = self.dropout(mxf)

    ux= self.user_embedding(u)
    ux = self.flatten(ux)
    ux = self.dropout(ux)

    uxf= self.user_embedding_mf(u)
    uxf = self.flatten(uxf)
    uxf = self.dropout(uxf)

    out = torch.cat([mx, ux], dim=1)
    out = self.dropout(out)
    out = self.fc1(out)
    out = self.bn1(out)
    out = self.dropout(out)
    out = self.fc2(out)
    out = self.bn2(out)
    out = self.dropout(out)
    out = self.fc3(out)
    out = self.fc4(out)
    out = self.relu(out)
    out = self.fc5(out)
    out = self.relu(out)

    out1 = torch.sum(mxf* uxf,dim = 1)
    out1 = torch.unsqueeze(out1,1)
    out = torch.cat([out1, out], dim=1)
    out = self.fc6(out)
    out = self.fc7(out)
    out = self.fc8(out)




    return out


In [26]:
class Load_MVLENS(Dataset):

  def __init__(self,df):
    

    x=np.array([np.array(df.user_id.values),np.array(df.item_id.values)])
    x = x.T
    y=np.array(df.rating.values)

    self.x_train=torch.tensor(x,dtype=torch.int32)
    self.y_train=torch.tensor(y,dtype=torch.float32)

  def __len__(self):
    return len(self.y_train)
  
  def __getitem__(self,idx):
    return self.x_train[idx],self.y_train[idx]

In [29]:
def train_NCF(model,num_epochs,batch_size,lr,save_interval):
  
  print("Training")
  train_dataset = Load_MVLENS(train)
  val_dataset = Load_MVLENS(val)
  train_loader=DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
  val_loader = DataLoader(val_dataset,batch_size=batch_size,shuffle=True)

  model.to(device)
  model = torch.nn.DataParallel(model).cuda()

  # criterion = RMSELoss(reduction = 'sum')
  criterion = nn.MSELoss(reduction = 'sum')
  optimizer = torch.optim.Adam(model.parameters(), lr=lr)
  
  for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    avg = []
    for batch_idx,(data,target) in enumerate(train_loader):
      U = data[:,0].to(device)
      M = data[:,1].to(device)
      U = torch.unsqueeze(U,1)
      M = torch.unsqueeze(M,1)
      target = target.float().view((len(target),1))
      target = target.to(device)
      optimizer.zero_grad()
      output = model(U,M)
      loss = criterion(output, target)
      avg.append(loss.item())

      loss.backward()
      optimizer.step()
    epoch_loss = np.mean(avg)
    
    if(epoch % save_interval == 0):
      state = {
                  'epoch': epoch,
                  'state_dict': model.module.state_dict(),
                  'optimizer': optimizer.state_dict()
                  # 'scheduler': scheduler.state_dict()
                  }
      torch.save(state, ckpt_dir +'epoch_'+str(epoch)+'_ckpt.t7')
    
    val_loss= val_NCF(model,val_loader)

    writer.add_scalar('Train loss', epoch_loss, epoch)
    writer.add_scalar('Val loss', val_loss, epoch)

    print("Epoch {} Train loss: {},  Val loss: {}".format(epoch,epoch_loss,val_loss))



In [30]:
def val_NCF(model,val_loader):
  batch_size = 64
  val_loss = 0.0
  # criterion = RMSELoss(reduction='sum')
  criterion = nn.MSELoss(reduction = 'sum')
  model.eval()
  avg = []
  with torch.no_grad():
    for batch_idx,(data,target) in enumerate(val_loader):
        U = data[:,0].to(device)
        M = data[:,1].to(device)
        U = torch.unsqueeze(U,1)
        M = torch.unsqueeze(M,1)
        target = target.float().view((len(target),1))
        target = target.to(device)
        output = model(U,M)
        loss = criterion(output, target)
        avg.append(loss.item())
        # val_loss+=loss.item()

  val_loss = np.mean(avg)
  return val_loss

In [31]:
args = {'n_users':len(df.user_id.unique()),'n_movies':len(df.item_id.unique()),'n_latent_variables_u':8,'n_latent_variables_m':10,'n_latent_variables_mf':3}
NCF_model = NCF(args)
train_NCF(NCF_model,num_epochs = 100,batch_size=64,lr = 1e-3,save_interval=10)

Training
Epoch 0 Train loss: 87.35583280878605,  Val loss: 80.13437345292833
Epoch 1 Train loss: 80.1442201707722,  Val loss: 76.046039863869
Epoch 2 Train loss: 77.39733747233974,  Val loss: 72.31630504278489
Epoch 3 Train loss: 74.63820350203157,  Val loss: 73.0882174173991
Epoch 4 Train loss: 72.25939038629787,  Val loss: 67.87162923224179
Epoch 5 Train loss: 70.00347452700854,  Val loss: 66.64395809173584
Epoch 6 Train loss: 68.45748622045403,  Val loss: 65.76105023607795
Epoch 7 Train loss: 67.20206766665697,  Val loss: 63.62404838609107
Epoch 8 Train loss: 66.10205698761804,  Val loss: 63.80223706327838
Epoch 9 Train loss: 65.01309643689946,  Val loss: 61.9218182210569
Epoch 10 Train loss: 64.25938850888916,  Val loss: 62.21819284815847
Epoch 11 Train loss: 63.673668288127864,  Val loss: 60.163164815784974
Epoch 12 Train loss: 62.82215253666167,  Val loss: 61.42904090881348
Epoch 13 Train loss: 62.463872326598164,  Val loss: 59.823846605088974
Epoch 14 Train loss: 62.328623379806

In [39]:
from sklearn.metrics import mean_absolute_error
x=np.array([np.array(test.user_id.values),np.array(test.item_id.values)])
x = x.T
x = torch.tensor(x,dtype=torch.int32)
U = x[:,0].to(device)
M = x[:,1].to(device)
U = torch.unsqueeze(U,1)
M = torch.unsqueeze(M,1)
pred = NCF_model(U, M).detach().cpu()
y_hat_2 = np.round(pred)
y_true = test.rating
print(np.sqrt(mean_absolute_error(y_true, y_hat_2)))
print(np.sqrt(mean_absolute_error(y_true, pred)))


0.8471717653463199
0.8681949942503375
