In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
from sklearn import model_selection, metrics, preprocessing

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
device

device(type='cuda')

In [None]:
# !gunzip /content/drive/MyDrive/CSE272/Movies_and_TV.json.gz

In [None]:
import pandas as pd
import json

data = []
with open('/content/drive/MyDrive/CSE272/Movies_and_TV.json', 'r') as file:
    for line in file:
        data.append(json.loads(line))

df = pd.DataFrame(data)

In [None]:
df.columns

Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'style',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'image'],
      dtype='object')

In [None]:
df.reviewerID.nunique()

3826085

In [None]:
df.reviewerID.value_counts()

3691354    4254
462849     2292
2085233    2175
3143039    2136
3726617    2046
           ... 
318334        1
1374689       1
2131582       1
871353        1
3377852       1
Name: reviewerID, Length: 3826085, dtype: int64

In [None]:
df.asin.nunique()

182032

In [None]:
df.overall.value_counts()

5.0    5491586
4.0    1498047
3.0     735907
1.0     633153
2.0     406875
Name: overall, dtype: int64

In [6]:
# load data frames to save time
df_train = pd.read_csv('/content/drive/MyDrive/CSE272/train_df.csv')
df_valid = pd.read_csv('/content/drive/MyDrive/CSE272/test_df.csv')

In [7]:
df = pd.concat([df_train, df_valid])

In [8]:
import os
import pandas as pd
from torchvision.io import read_image

class MoviesTVDataset(Dataset):
    def __init__(self, users, products, ratings):
        self.users = users
        self.products = products
        self.ratings = ratings

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        users = self.users[idx]
        products = self.products[idx]
        ratings = self.ratings[idx]

        return {
            "users": torch.tensor(users, dtype=torch.long),
            "products": torch.tensor(products, dtype=torch.long),
            "ratings": torch.tensor(ratings, dtype=torch.long),
        }

In [10]:
class NCFModel(nn.Module):
    def __init__(self, n_users, n_products):
        super().__init__()

        self.user_embed = nn.Embedding(n_users, 32)
        self.product_embed = nn.Embedding(n_products, 32)
        self.out = nn.Linear(64, 1)

    def forward(self, users, products, ratings=None):
        user_embeds = self.user_embed(users)
        product_embeds = self.product_embed(products)
        output = torch.cat([user_embeds, product_embeds], dim=1)

        output = self.out(output)
        return output

In [11]:
train_dataset = MoviesTVDataset(
    users=df_train.reviewerID.values,
    products=df_train.asin.values,
    ratings=df_train.overall.values
)

valid_dataset = MoviesTVDataset(
    users=df_valid.reviewerID.values,
    products=df_valid.asin.values,
    ratings=df_valid.overall.values
)


In [12]:
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=2048,
                          shuffle=True,
                          num_workers=2) 

validation_loader = DataLoader(dataset=valid_dataset,
                          batch_size=2048,
                          shuffle=True,
                          num_workers=2) 

dataiter = iter(train_loader)
dataloader_data = next(dataiter)
print(dataloader_data)

{'users': tensor([ 50948,   6864,  76001,  ..., 174339, 309367,   1190]), 'products': tensor([106867,  71993,  37519,  ...,  19105,  15680, 132042]), 'ratings': tensor([5, 3, 5,  ..., 4, 4, 3])}


In [14]:
model = NCFModel(
    n_users=df.reviewerID.nunique(),
    n_products=df.asin.nunique(),
).to(device)

loss_func = nn.MSELoss()

In [15]:
def train_epocs(model, epochs=1, lr=0.01, wd=0.0):
    epochs = 1
    total_loss = 0
    plot_steps, print_steps = 5000, 5000
    step_cnt = 0
    all_losses_list = [] 
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    for epoch_i in range(epochs):
         for i, train_data in enumerate(train_loader):
            output = model(train_data["users"].to(device), 
                        train_data["products"].to(device)
                        ) 
            
            rating = train_data["ratings"].view(len(output), -1).to(torch.float32)

            loss = loss_func(output, rating.to(device))
            total_loss = total_loss + loss.sum().item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            step_cnt = step_cnt + len(train_data["users"])
            

            if(step_cnt % plot_steps == 0):
                print(i)
                avg_loss = total_loss/(len(train_data["users"]) * plot_steps)
                print(f"epoch {epoch_i} loss at step: {step_cnt} is {avg_loss}")
                all_losses_list.append(avg_loss)
                total_loss = 0 

In [16]:
train_epocs(model)

624
epoch 0 loss at step: 1280000 is 0.00022704817415215075
1249
epoch 0 loss at step: 2560000 is 7.207709407666698e-05


In [18]:
from sklearn.metrics import mean_squared_error

model_output_list = []
target_rating_list = []

model.eval()

with torch.no_grad():
    for i, batched_data in enumerate(validation_loader): 
        model_output = model(batched_data['users'].to(device), 
                       batched_data["products"].to(device))
        
        model_output_list.append(model_output.sum().item() / len(batched_data['users']) )

        target_rating = batched_data["ratings"]
        
        target_rating_list.append(target_rating.sum().item() / len(batched_data['users']))

        # print(f"model_output: {model_output}, target_rating: {target_rating}")


# squared If True returns MSE value, if False returns RMSE value.
rms = mean_squared_error(target_rating_list, model_output_list, squared=False)
print(f"rms: {rms}")

rms: 0.0371497814427341


In [19]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(target_rating_list, model_output_list)
print(f"mae: {mae}")

mae: 0.031537923915674525
