Write a notebook that uses a neural network to predict the rating of movies based on this dataset. Use the following guidelines.

* Use 20% of data for validation

* Use the latest data for validation 

* Use a baseline model to compare to your neural network


In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

In [2]:
PATH = Path("netflix-prize-data/")
list(PATH.iterdir())

[PosixPath('netflix-prize-data/movie_titles.csv'),
 PosixPath('netflix-prize-data/combined_data_1.txt'),
 PosixPath('netflix-prize-data/probe.txt'),
 PosixPath('netflix-prize-data/combined_data_4.txt'),
 PosixPath('netflix-prize-data/combined_data_2.txt'),
 PosixPath('netflix-prize-data/combined_data_3.txt'),
 PosixPath('netflix-prize-data/qualifying.txt'),
 PosixPath('netflix-prize-data/README')]

### Preprocessing Data

In [3]:
df = pd.read_csv(PATH/"combined_data_1.txt", header=None, names=["Cust_id", "Rating", "Date"])
df1 = pd.read_csv(PATH/"combined_data_2.txt", header=None, names=["Cust_id", "Rating", "Date"])
df2 = pd.read_csv(PATH/"combined_data_3.txt", header=None, names=["Cust_id", "Rating", "Date"])
df3 = pd.read_csv(PATH/"combined_data_4.txt", header=None, names=["Cust_id", "Rating", "Date"])

In [4]:
print(df.shape)
print(df1.shape)
print(df2.shape)
print(df3.shape)

(24058263, 3)
(26982302, 3)
(22605786, 3)
(26851926, 3)


In [5]:
df = df.append(df1)
df = df.append(df2)
df = df.append(df3)

In [6]:
df.shape

(100498277, 3)

In [7]:
df.head()

Unnamed: 0,Cust_id,Rating,Date
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26


In [24]:
df = df.reset_index(drop=True)

In [25]:
movie_id_lines = df.index[df.Rating.isnull()].tolist()

In [35]:
movie_id = [0 for _ in range(df.shape[0])]

In [36]:
for i in range(len(movie_id_lines)):
    _id = i + 1
    if i == len(movie_id_lines) - 1:
        movie_id[movie_id_lines[i]:df.shape[0]] = [_id for _ in range(movie_id_lines[i], df.shape[0])]
    else:
        movie_id[movie_id_lines[i]:movie_id_lines[i + 1]] = [_id for _ in range(movie_id_lines[i], movie_id_lines[i + 1])]

In [44]:
df["movie_id"] = movie_id

In [48]:
df = df.drop(movie_id_lines, axis=0)

In [52]:
data = df.reset_index_index(drop=True)

In [53]:
data.to_csvv("rating_with_movieId.csv", index=False)

### Encoding data

In [2]:
data = pd.read_csv("rating_with_movieId.csv")

In [3]:
data.head()

Unnamed: 0,Cust_id,Rating,Date,movie_id
0,1488844,3.0,2005-09-06,1
1,822109,5.0,2005-05-13,1
2,885013,4.0,2005-10-19,1
3,30878,4.0,2005-12-26,1
4,823519,3.0,2004-05-03,1


In [4]:
msk = np.random.rand(len(data)) < 0.8
train = data[msk].copy()
val = data[~msk].copy()

In [5]:
def proc_col(col, train_col=None):
    if train_col is not None:
        uniq = train_col.unique()
    else:
        uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    return name2idx, np.array([name2idx.get(x, -1) for x in col]), len(uniq)

In [6]:
def encode_data(df, train=None):
    """ Encodes rating data with continous user and movie ids. 
    If train is provided, encodes df with the same encoding as train.
    """
    df = df.copy()
    for col_name in ["Cust_id", "movie_id"]:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(df[col_name], train_col)
        df[col_name] = col
        df = df[df[col_name] >= 0]
    return df

In [7]:
df_t_e = encode_data(train)
df_t_e = df_t_e.drop(["Date"], axis = 1)
df_v_e = encode_data(val, train)
df_v_e = df_v_e.drop(["Date"], axis = 1)

In [8]:
df_t_e.head()

Unnamed: 0,Cust_id,Rating,movie_id
0,0,3.0,0
1,1,5.0,0
2,2,4.0,0
3,3,4.0,0
6,4,4.0,0


In [9]:
df_v_e.head()

Unnamed: 0,Cust_id,Rating,movie_id
4,4030,3.0,0
5,64034,3.0,0
7,37234,3.0,0
8,64231,4.0,0
25,16586,4.0,0


### Matrix factorization model

In [10]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.user_emb.weight.data.uniform_(0, 0.05)
        self.item_emb.weight.data.uniform_(0, 0.05)
        
    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        return (u*v).sum(1)

In [11]:
print(df_t_e.Cust_id.nunique())
print(df_t_e.movie_id.nunique())

479831
17770


In [12]:
num_users = df_t_e.Cust_id.nunique()
num_item = df_t_e.movie_id.nunique()
emb_size = 3

user_emb = nn.Embedding(num_users, emb_size)
item_emb = nn.Embedding(num_item,emb_size)
users = torch.LongTensor(df_t_e.Cust_id.values)
items = torch.LongTensor(df_t_e.movie_id.values)

In [13]:
U = user_emb(users.long())
V = item_emb(items.long())

In [14]:
x_train = df_t_e.drop("Rating", axis=1)
x_val = df_v_e.drop("Rating", axis=1)

In [15]:
y_train = df_t_e.Rating.values.astype(np.float32)
y_val = df_v_e.Rating.values.astype(np.float32)

In [16]:
class WiDSDataset(Dataset):
    def __init__(self, x, y):
        x = x.copy()
        self.x = x.copy().values.astype(np.int64)
        self.y = y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return [self.x[idx], self.y[idx]]    

In [17]:
train_ds = WiDSDataset(x_train, y_train)
valid_ds = WiDSDataset(x_val, y_val)

In [18]:
batch_size = 100000
train_dl = DataLoader(train_ds, batch_size=batch_size)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

### Training MF model

In [19]:
model = MF(num_users, num_item, emb_size=100)

In [20]:
def get_optimizer(model, lr=0.01, wd=0.0):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optimizer

In [32]:
def train_model(model, optimizer, train_dl=train_dl, verbose=False):
    model.train()
    total = 0
    sum_loss = 0
    for i, (x, y) in enumerate(train_dl):
        batch = y.shape[0]
        y = y.unsqueeze(1)
        y_hat = model(x[:,0].long(), x[:,1].long())
        loss = F.mse_loss(y_hat, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total += batch
        sum_loss += batch * (loss.item())
        if verbose: 
            print(sum_loss/total)
    return sum_loss/total

In [33]:
def val_loss(model, valid_dl):
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    for i, (x, y) in enumerate(valid_dl):
        batch = y.shape[0]
        y = y.unsqueeze(1)
        y_hat = model(x[:,0].long(), x[:,1].long())
        loss = F.mse_loss(y_hat, y)
        sum_loss += batch*(loss.item())
        total += batch
        pred = (y_hat > 0).float()
        correct += (pred == y).float().sum().item()
    print("val loss", sum_loss/total, correct/total)
    return sum_loss/total, correct/total

In [23]:
from datetime import datetime

def train_loop(model, epochs, lr=0.01, wd=0.0):
    optim = get_optimizer(model, lr=lr, wd=wd)
    for i in range(epochs):
        loss = train_model(model, optim, train_dl)
        print("loss", loss)
        val_loss(model, valid_dl)
    

In [25]:
train_loop(model, epochs=10, lr=0.01, wd=1e-6)

loss 13.508707538109421
val loss 8.937237874704275 0.04341254602408631
loss 1.8797645634605231
val loss 2.9589232917114794 0.04471306551520599
loss 2.0839081967608384
val loss 2.820860647047008 0.04591967048794739
loss 2.9317543367186176
val loss 2.7767641143215065 0.04588776841784645
loss 2.5064940836931457
val loss 3.4542023994721114 0.04581938550939294
loss 2.169832572855896
val loss 3.1640713618849063 0.04572178806872532
loss 1.8943118824379728
val loss 2.4768497785566805 0.04571163514781956
loss 1.8812370140244883
val loss 4.486951320478401 0.04510703866172558
loss 1.9833495774250476
val loss 3.1369475800519337 0.04551713703556608
loss 2.135223205777707
val loss 3.6450721486002933 0.044268029148836846


### Neural Network Model

In [34]:
class CollabFNet(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100, n_hidden=10):
        super(CollabFNet, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_item, emb_size)
        self.lin1 = nn.Linear(emb_size*2, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(0.1)
        self.drop2 = nn.Dropout(0.0)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        x = F.relu(torch.cat([U, V], dim=1))
        x = self.drop1(x)
        x = F.relu(self.lin1(x))
        x = self.drop2(x)
        x = self.lin2(x)
        return x

In [35]:
model = CollabFNet(num_users, num_item, emb_size=100)

In [None]:
train_loop(model, epochs=1, lr=0.01, wd=1e-6)

loss 1.1351458821393203
val loss 6.185711726913018 0.045942564329205474
