In [39]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
os.listdir('./data/ml-latest-small/ml-latest-small')

['links.csv', 'movies.csv', 'ratings.csv', 'README.txt', 'tags.csv']

In [40]:
movies = pd.read_csv('./data/ml-latest-small/ml-latest-small/movies.csv')
ratings = pd.read_csv('./data/ml-latest-small/ml-latest-small/ratings.csv')
df = pd.merge(ratings, movies, on='movieId', how='inner')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
 4   title      100836 non-null  object 
 5   genres     100836 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 4.6+ MB


In [42]:
df[['userId', 'movieId']] = df[['userId', 'movieId']].astype(str)

In [43]:
n_movies = len(df['movieId'].unique())
n_users = len(df['userId'].unique())
n_movies, n_users

(9724, 610)

### Preprocessing

In [44]:

def data_process(data):
    # data = pd.read_csv(data_path)

    # Remove all rows where rating == 3
    data = data.drop(data[data['rating'] == 3].index)
    data['rating'] = data['rating'].apply(lambda x: 1 if x > 3 else 0)
    data = data.sort_values(by='timestamp', ascending=True)
    train,test = train_test_split(data,test_size= 0.2 )
    return train, test, data


def get_user_feature(data):
    # get user history and mean user rating
    data_group = df[df['rating'] == 1]
    data_group = data_group[['userId', 'movieId']].groupby('userId').agg(list).reset_index()
    data_group['user_hist'] = data_group['movieId'].apply(lambda x: '|'.join([str(i) for i in x]))
    data = pd.merge(data_group.drop('movieId', axis=1), data, on='userId')
    data_group = data[['userId', 'rating']].groupby('userId').agg('mean').reset_index()
    data_group.rename(columns={'rating': 'user_mean_rating'}, inplace=True)
    data = pd.merge(data_group, data, on='userId')
    return data


def get_item_feature(data):
    # get mean item rating
    data_group = data[['movieId', 'rating']].groupby('movieId').agg('mean').reset_index()
    data_group.rename(columns={'rating': 'item_mean_rating'}, inplace=True)
    data = pd.merge(data_group, data, on='movieId')
    return data


def get_var_feature(data, col):
    key2index = {}

    def split(x):
        key_ans = x.split('|')
        for key in key_ans:
            if key not in key2index:
                # Notice : input value 0 is a special "padding",\
                # so we do not use 0 to encode valid feature for sequence input
                key2index[key] = len(key2index) + 1
        return torch.Tensor(list(map(lambda x: key2index[x], key_ans)))

    var_feature = list(map(split, data[col].values))
    print(var_feature)
    var_feature_length = np.array(list(map(len, var_feature)))
    max_len = max(var_feature_length)
    # var_feature = torch.nn.utils.rnn.pad_sequence(var_feature, )
    var_feature = pad_sequences(var_feature, maxlen=max_len, padding='post', )
    return key2index, var_feature, max_len


def get_test_var_feature(data, col, key2index, max_len):
    print("user_hist_list: \n")

    def split(x):
        key_ans = x.split('|')
        for key in key_ans:
            if key not in key2index:
                # Notice : input value 0 is a special "padding",
                # so we do not use 0 to encode valid feature for sequence input
                key2index[key] = len(key2index) + 1
        return list(map(lambda x: key2index[x], key_ans))

    test_hist = list(map(split, data[col].values))
    test_hist = pad_sequences(test_hist, maxlen=max_len, padding='post')
    return test_hist

In [45]:
train, test, data = data_process(df)

train = get_user_feature(train)
train = get_item_feature(train)

test = get_user_feature(test)
test = get_item_feature(test)

In [46]:
train.head()

Unnamed: 0,movieId,item_mean_rating,userId,user_mean_rating,user_hist,rating,timestamp,title,genres
0,1,0.858586,1,0.970238,3176,1,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,0.858586,103,0.931507,64114|91671|132046,1,1431954238,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,1,0.858586,121,0.809524,44|272,1,847656180,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,1,0.858586,132,0.594595,4643|3264,0,1157921785,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,1,0.858586,135,0.810651,153|65|165|1917|1544|1372|1722|3248|4621|1694|...,1,1009691859,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

sparse_features = ['userId', 'movieId']  # , 'gender', 'age', 'occupation'
dense_features = ['user_mean_rating', 'item_mean_rating']
target = ['rating']

user_sparse_features, user_dense_features = ['userId'], ['user_mean_rating']   # , 'gender', 'age', 'occupation'
item_sparse_features, item_dense_features = ['movieId', ], ['item_mean_rating']

# 1.Label Encoding for sparse features, and process sequence features
for feat in sparse_features:
    lbe = LabelEncoder()
    lbe.fit(data[feat])
    train[feat] = lbe.transform(train[feat])
    test[feat] = lbe.transform(test[feat])

mms = MinMaxScaler(feature_range=(0, 1))  # does this make sense? Mean ratings are already [0,1]
mms.fit(train[dense_features])
mms.fit(test[dense_features])  # I don't think we should be fitting to test set
train[dense_features] = mms.transform(train[dense_features])
test[dense_features] = mms.transform(test[dense_features])

In [47]:
train.head()

Unnamed: 0,movieId,item_mean_rating,userId,user_mean_rating,user_hist,rating,timestamp,title,genres
0,1,0.858586,1,0.970238,3176,1,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,0.858586,103,0.931507,64114|91671|132046,1,1431954238,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,1,0.858586,121,0.809524,44|272,1,847656180,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,1,0.858586,132,0.594595,4643|3264,0,1157921785,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,1,0.858586,135,0.810651,153|65|165|1917|1544|1372|1722|3248|4621|1694|...,1,1009691859,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [None]:
# 2.preprocess the sequence feature

genres_key2index, train_genres_list, genres_maxlen = get_var_feature(train, 'genres')
user_key2index, train_user_hist, user_maxlen = get_var_feature(train, 'user_hist')



In [None]:
# user_sparse_features, user_dense_features = ['userId'], ['user_mean_rating']   # , 'gender', 'age', 'occupation'
# item_sparse_features, item_dense_features = ['movieId', ], ['item_mean_rating']

In [None]:
def create_embedding_matrix(feature_subset, embedding_dim=32, init_std=0.0001, linear=False, sparse=False, device='cpu'):

    feature_columns = feature_subset.columns
    embedding_dict = nn.ModuleDict({feat_name: nn.Embedding(feature_subset[feat_name].nunique(),
                                                                      embedding_dim if not linear else 1)
                                    for feat_name in feature_columns})

    for tensor in embedding_dict.values():
        nn.init.normal_(tensor.weight, mean=0, std=init_std)

    return embedding_dict.to(device)

In [None]:
user_embedding_dict = create_embedding_matrix(df[['userId']])

In [None]:
user_embedding_dict

ModuleDict(
  (userId): Embedding(610, 32)
)

### Dataset

In [48]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F

In [49]:
pd.options.mode.chained_assignment = None
from sklearn.preprocessing import LabelEncoder

class MovieLensDataset(Dataset):
    def __init__(self, df):
        self.df = df

        # encode the string idx for users and movies
        self.user_encoder = LabelEncoder()
        self.item_encoder = LabelEncoder()
        self.df['userId'] = self.user_encoder.fit_transform(self.df['userId'])
        self.df['movieId'] = self.item_encoder.fit_transform(self.df['movieId'])
        self.N_USERS = len(self.user_encoder.classes_)
        self.N_MOVIES = len(self.item_encoder.classes_)

        # Convert data to PyTorch tensors
        self.users = torch.tensor(self.df['userId'].values)
        self.movies = torch.tensor(self.df['movieId'].values)
        self.ratings = torch.tensor(self.df['rating'].values)

        # binarize values, if rating is over 3.5 stars = 1, otherwise = 0
        # self.ratings = torch.where(self.ratings > 3.5, 1.0, 0.0)


    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return self.users[idx], self.movies[idx], self.ratings[idx]

In [66]:
ds = MovieLensDataset(train)
ds[:3] 
# (tensor([ 0,  5, 15]), tensor([0, 0, 0]), tensor([1, 1, 1]))


(tensor([ 0,  5, 15]), tensor([0, 0, 0]), tensor([1, 1, 1]))

### Model

In [None]:
class StringLookup(nn.Module):
    def __init__(self, vocabulary, num_oov_indices=1, mask_token=None):
        super().__init__()

        '''
         initializes the lookup table with an embedding matrix, 
         where each word in the vocabulary is assigned a unique index. 
         We also fill the weights with 1s for the tokens in the vocabulary, 
         so they will be "looked up" properly during forward propagation. 
         The OOV indices are initialized to 0.
        '''

        self.num_oov_indices = num_oov_indices
        self.mask_token = mask_token
        word_2_idx = {w: i for i, w in enumerate(vocabulary)}

        # Create the vocabulary lookup table
        self.lookup_table = nn.Embedding(len(vocabulary) + num_oov_indices, 1, padding_idx=None)
        self.lookup_table.weight.data.fill_(0)
        for i, token in enumerate(vocabulary):
            self.lookup_table.weight.data[i] = i + num_oov_indices

    def forward(self, inputs):
        '''
        performs the actual lookup operation. 
        We first apply the mask if the mask_token argument is provided. 
        Then we add the number of OOV indices to the input values so 
        that they correspond to the correct index in the embedding matrix. 
        Finally, we lookup the indices in the embedding matrix and return the outputs.
        '''

        if self.mask_token is not None:
            mask = inputs != self.mask_token
            inputs = inputs.masked_fill(~mask, 0)

        # Add the number of OOV indices to the input values
        inputs += len(self.lookup_table.weight)

        # Lookup the indices in the vocabulary table
        outputs = self.lookup_table(inputs)

        return outputs.squeeze()

In [83]:


class SimpleTwoTower(nn.Module):
    
    def __init__(self, n_items, n_users, embedding_size, ln=None):
        super(SimpleTwoTower, self).__init__()


        # self.userid_embedding = tf.keras.Sequential([
        #     tf.keras.layers.StringLookup(
        #         vocabulary=unique_user_ids, mask_token=None),
        #     tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32),
        # ])

        # self.user_id_embedding = nn.Sequential([
            
        #     nn.Embedding(len(unique_user_ids), 5, padding_idx=0),
        #     nn.Embedding(num_embeddings=(n_users + 1), embedding_dim=embedding_size)
        # ])

        # self.timestamp_embedding = tf.keras.Sequential([
        #     tf.keras.layers.Discretization(timestamp_buckets.tolist()),
        #     tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
        # ])
        # self.normalized_timestamp = tf.keras.layers.Normalization(
        #     axis=None
        # )

        # self.normalized_timestamp.adapt(timestamps)

        # --------------------------------------------------------------
        
        # self.ln = ln
        self.item_emb = nn.Embedding(num_embeddings=n_items, embedding_dim=embedding_size)  # self.ln[0]
        # We add +1 additional embedding to account for unknown tokens.
        self.user_emb = nn.Embedding(num_embeddings=n_users + 1, embedding_dim=embedding_size)
       
        
        self.item_layers = [] #nn.ModuleList()
        self.user_layers = [] #nn.ModuleList()
        
        # for i, n in enumerate(ln[0:-1]):
        #     m = int(ln[i+1])
        self.item_layers.append(nn.Linear(embedding_size, embedding_size, bias=True))  # n, m
        self.item_layers.append(nn.ReLU())
        
        self.user_layers.append(nn.Linear(embedding_size, embedding_size, bias=True))
        self.user_layers.append(nn.ReLU())   # is this ReLU needed???
            
            
        self.item_layers = nn.Sequential(*self.item_layers)
        self.user_layers = nn.Sequential(*self.user_layers)
        
        self.dot = torch.matmul
        self.sigmoid = nn.Sigmoid()

    def forward(self, users, items):

        # Take the input dictionary, pass it through each input layer,
        # and concatenate the result
        '''
        return tf.concat([
            self.user_embedding(inputs["user_id"]),
            self.timestamp_embedding(inputs["timestamp"]),
            tf.reshape(self.normalized_timestamp(inputs["timestamp"]), (-1, 1)),
        ], axis=1)
        '''

        
        item_emb = self.item_emb(items) # [B, embed_size]
        user_emb = self.user_emb(users) # [B, embed_size]
        print(user_emb.shape, item_emb.shape)
        
        item_emb = self.item_layers(item_emb) # [B, embed_size]
        user_emb = self.user_layers(user_emb) # [B, embed_size]

        print(user_emb.shape, item_emb.shape)
        dp = self.dot(user_emb, item_emb.T) # [B, B]
        print(dp)
        dp = dp.sum(dim=1).squeeze() # [B]

        return self.sigmoid(dp)

In [84]:
tt = SimpleTwoTower(n_items=len(ds.item_encoder.classes_),
                    n_users=len(ds.user_encoder.classes_),
                    embedding_size=8)

train_dataloader = DataLoader(ds, batch_size=4, shuffle=True)

In [85]:
sample = next(iter(train_dataloader))
sample

[tensor([171, 140, 316,  11]),
 tensor([2831, 4654, 6150, 5125]),
 tensor([1, 1, 0, 1])]

In [86]:
users, items, ratings = sample
tt(users, items)

torch.Size([4, 8]) torch.Size([4, 8])
torch.Size([4, 8]) torch.Size([4, 8])
tensor([[0.0109, 0.0000, 0.0000, 0.0492],
        [0.0536, 0.0486, 0.0355, 0.0228],
        [0.2337, 0.0481, 0.1612, 0.6567],
        [0.0000, 0.0000, 0.0000, 0.0000]], grad_fn=<MmBackward0>)


tensor([0.5150, 0.5400, 0.7502, 0.5000], grad_fn=<SigmoidBackward0>)

### Define Training / Validation

In [None]:
from tqdm import tqdm
import torch
# import horovod.torch as hvd
# from sparkdl import HorovodRunner


def save_checkpoint(model, epoch):
    save_path = os.path.join(SAVE_MODEL_DIR, f"Epoch{str(epoch)}_checkpoint.pt")
    torch.save(model.state_dict(), save_path)


def train_one_epoch(model, criterion, optimizer, scheduler, 
                    train_dataloader, steps_per_epoch, epoch, 
                    device, batch_size):

    model.train()  # Set model to training mode

    # statistics
    running_loss = 0.0
    running_corrects = 0

    # Iterate over the data for one epoch.
    for step in tqdm(range(steps_per_epoch)):

        users, movies, ratings = next(iter(train_dataloader))


        # Track history in training
        with torch.set_grad_enabled(True):
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward
            logits = model(users, movies)
            
            preds = torch.round(logits)
            loss = criterion(logits, ratings)

            # backward + optimize
            loss.backward()
            optimizer.step()

    # statistics
    running_loss += loss.item() * len(ratings)
    running_corrects += torch.sum(preds == ratings)

    scheduler.step()

    save_checkpoint(model, epoch)

    epoch_loss = running_loss / (steps_per_epoch * batch_size)
    epoch_acc = running_corrects.double() / (steps_per_epoch * batch_size)

    print('Train Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))
    return epoch_loss, epoch_acc


def evaluate(model, criterion, val_dataloader, validation_steps, device, batch_size,
             metric_agg_fn=None):
    model.eval()  # Set model to evaluate mode

    # statistics
    running_loss = 0.0
    running_corrects = 0

    # Iterate over all the validation data.
    for step in tqdm(range(validation_steps)):

        users, movies, ratings = next(iter(val_dataloader))



        # Do not track history in evaluation to save memory
        with torch.set_grad_enabled(False):
            # forward
            outputs = model(users, movies)
            preds = torch.round(outputs)
            loss = criterion(outputs, ratings)

        # statistics
        running_loss += loss.item()
        running_corrects += torch.sum(preds == ratings.data)

    # The losses are averaged across observations for each minibatch.
    epoch_loss = running_loss / validation_steps
    epoch_acc = running_corrects.double() / (validation_steps * batch_size)

    # metric_agg_fn is used in the distributed training to aggregate the metrics on all workers
    if metric_agg_fn is not None:
        epoch_loss = metric_agg_fn(epoch_loss, 'avg_loss')
        epoch_acc = metric_agg_fn(epoch_acc, 'avg_acc')

    print('Validation Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))
    return epoch_loss, epoch_acc


### Run 

In [None]:
# Set h-params

LR = 1e-3
NUM_EPOCHS = 100
BATCH_SIZE = 16

if torch.cuda.is_available():
    #     torch.cuda.set_device(hvd.local_rank())
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

criterion = torch.nn.BCEWithLogitsLoss()

STEP_SIZE = 20
GAMMA = 0.1

SAVE_MODEL_DIR = '/content/sample_data'

In [None]:

#   hvd.init()  # Initialize Horovod.



train_df, val_df = train_test_split(df, test_size=0.2, random_state=0, stratify=df[['rating']])

train_ds = MovieLensDataset(train_df)
val_ds = MovieLensDataset(val_df)

train_dataloader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=True)

model = SimpleTwoTower(n_items=train_ds.N_USERS, n_users=train_ds.N_MOVIES, embedding_size=64)
model = model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = LR, momentum=0.9) 
exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)


steps_per_epoch = len(train_dataloader)

validation_steps = max(1, len(val_dataloader))


for epoch in range(NUM_EPOCHS):
    print('Epoch {}/{}'.format(epoch + 1, NUM_EPOCHS))
    print('-' * 10)


    # train_dataloader_iter, val_dataloader_iter = load_train_val_iterators()

    train_loss, train_acc = train_one_epoch(model, criterion, optimizer, exp_lr_scheduler, 
                                            train_dataloader, steps_per_epoch, epoch, 
                                            device, batch_size=BATCH_SIZE)
    
    val_loss, val_acc = evaluate(model, criterion, 
                                 val_dataloader, validation_steps, 
                                 device, metric_agg_fn=None, batch_size=BATCH_SIZE)

print(val_loss)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['movieId'] = self.df['movieId'].map(self.movie_id_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['userId'] = self.df['userId'].map(self.user_id_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['movieId'] = self.df['movieId'].map(self.movie_id_map)
A value is trying t

Epoch 1/100
----------


100%|██████████| 5042/5042 [03:03<00:00, 27.48it/s]


Train Loss: 0.0002 Acc: 0.0001


100%|██████████| 1261/1261 [00:03<00:00, 334.28it/s]


Validation Loss: 0.9272 Acc: 0.3861
Epoch 2/100
----------


100%|██████████| 5042/5042 [02:59<00:00, 28.07it/s]


Train Loss: 0.0001 Acc: 0.0001


100%|██████████| 1261/1261 [00:02<00:00, 478.39it/s]


Validation Loss: 0.9274 Acc: 0.3859
Epoch 3/100
----------


  8%|▊         | 413/5042 [00:15<02:58, 25.96it/s]


KeyboardInterrupt: ignored