In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import torch 
from torch import nn, Tensor
import torch.nn.functional as F
import matplotlib.pyplot as plt
from models.VanillaTransformer import VanillaTransformer, generate_square_subsequent_mask 
import pdb


## MovieLens 1M Preprocessing

The first dataset that will be used to train and test recommendation systems is the MovieLens 1M dataset. Fields for the dataset must be convereted into vectors in order to be properly inputed into a Transformer model. Thus, ennumerating non-numerical variables is essential for the second part, input embeddings. 

#### Users

In [3]:
users = pd.read_csv("data/users.dat", sep="::", header=None, engine="python", encoding="latin-1")
users.columns = ["UserID", "Gender", "Age", "Occupation", "Zip-code"]

### Male is True
users['Gender'] = users['Gender'].apply(lambda sex: sex == 'M')
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,False,1,10,48067
1,2,True,56,16,70072
2,3,True,25,15,55117
3,4,True,45,7,2460
4,5,True,25,20,55455


In [4]:
print("Number of users x features:", users.shape)

Number of users x features: (6040, 5)


#### Movies

In [5]:
movies = pd.read_csv("data/movies.dat", sep="::", names=["MovieID", "Movie", "Genre"], encoding='latin-1')
movies['Genre'] = movies['Genre'].copy().str.split("|")


new_movies = movies.explode('Genre')
ennumerate_genre = pd.crosstab(new_movies.Movie, new_movies.Genre).reset_index()
movie_indicies = movies[['MovieID', "Movie"]].copy()
enumerated_movies = movie_indicies.merge(ennumerate_genre, on='Movie')
enumerated_movies.head()

Unnamed: 0,MovieID,Movie,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Ratings

In [6]:
ratings = pd.read_csv("data/ratings.dat", sep="::", names=['UserID', 'MovieID', 'Rating', 'time'],encoding='latin-1')
ratings['interaction'] = np.ones(len(ratings))
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,time,interaction
0,1,1193,5,978300760,1.0
1,1,661,3,978302109,1.0
2,1,914,3,978301968,1.0
3,1,3408,4,978300275,1.0
4,1,2355,5,978824291,1.0


#### Combined

In [7]:
ratings_plus_movie = ratings.merge(enumerated_movies, on='MovieID')
movies_combined = users.merge(ratings_plus_movie, on='UserID').copy()

### Can get rid of Movie probably
movies_combined.reset_index(inplace=True)
movies_combined.head()

Unnamed: 0,index,UserID,Gender,Age,Occupation,Zip-code,MovieID,Rating,time,interaction,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,1,False,1,10,48067,1193,5,978300760,1.0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,False,1,10,48067,661,3,978302109,1.0,...,0,0,0,1,0,0,0,0,0,0
2,2,1,False,1,10,48067,914,3,978301968,1.0,...,0,0,0,1,0,1,0,0,0,0
3,3,1,False,1,10,48067,3408,4,978300275,1.0,...,0,0,0,0,0,0,0,0,0,0
4,4,1,False,1,10,48067,2355,5,978824291,1.0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
len(movies_combined.MovieID.unique())

3706

In [9]:
movies_combined.MovieID.unique()

array([1193,  661,  914, ..., 2845, 3607, 2909])

In [10]:
### Must redefine vocabulary

enum_movies = enumerate(movies_combined.MovieID.unique())
new_movie_id = dict((old_id, new_id + 1) for new_id, old_id in enum_movies)
movies_combined.replace({'MovieID' : new_movie_id}, inplace=True)

In [11]:
len(movies_combined.MovieID.unique())

3706

## Train-Test Split

Following from the train test split process of BERT4Rec, each movie watched by a single user will be grouped and ordered to become the historical session for that particular user. For each user, we will take out the most recent movie that was watched and save it for the testing set, and we will remove the second most recent movie to be used for validation set. 

After, we will try the traditional train-test split by randomly selecting users for the training and testing groups

In [12]:
set_ranges = movies_combined.sort_values(by='time').groupby('UserID').agg({'MovieID' : list, 'index': list}).reset_index(drop=True)

set_ranges['len'] = set_ranges['MovieID'].str.len()

### Limiting max_len to 512, minimum is already 20, think about truncating 
### Instead of getting rid of users with long sequences
# set_ranges = set_ranges.loc[set_ranges.len < 512].copy()
#371 above 512

above_max_seq_len = set_ranges.loc[set_ranges.len >= 512].copy()
above_max_seq_len['MovieID'] = above_max_seq_len['MovieID'].str[:512]
above_max_seq_len['index'] = above_max_seq_len['index'].str[:512]

set_ranges.iloc[np.array(above_max_seq_len.index)] = above_max_seq_len
set_ranges['len'] = set_ranges['MovieID'].str.len()
set_ranges.shape

(6040, 3)

In [13]:
set_ranges['val'] = set_ranges['MovieID'].str[-2]
set_ranges['test'] = set_ranges['MovieID'].str[-1]
set_ranges['train'] = set_ranges['MovieID'].str[:-2]

# Use index to get the correct rows in the original dataset to extract the appropriate features from the user-item pair
set_ranges['val_index'] = set_ranges['index'].str[-2]
set_ranges['test_index'] = set_ranges['index'].str[-1]
set_ranges['train_index'] = set_ranges['index'].str[:-2]

train_test_split_raw = set_ranges.drop(columns=['MovieID', 'index']).copy()

## Input Embeddings

After the process of getting the data into the proper format, it is time to process the fields so that the Transformer model can be properly trained.

Should the larger movie history sequences be seperated out or completely cut?

In [14]:
new_list = np.array(train_test_split_raw.train)
tensor_lst = [Tensor(lst) for lst in new_list]
pad_tensors = [F.pad(tensor, pad=(0, 512-len(tensor))) for tensor in tensor_lst]
ready_to_embed = torch.stack(pad_tensors)
# ready_to_embed = torch.einsum("ij -> ji", almost_ready_to_embed)
ready_to_embed.shape

torch.Size([6040, 512])

In [15]:
val_ready = Tensor(train_test_split_raw.val).unsqueeze(1)
evaluation_input = torch.cat((ready_to_embed, val_ready), dim=1)
evaluation_input.shape

torch.Size([6040, 513])

In [16]:
training_movies = list(train_test_split_raw.train)
training_movies = np.unique([item for sublist in training_movies for item in sublist])

training_indicies = list(train_test_split_raw.train_index)
training_indicies = [item for sublist in training_indicies for item in sublist]

training_data_all = movies_combined.iloc[training_movies].copy()

###

##### Breakdown input data into appropriate batches

In [33]:
"""Keep 200 as default batch size.
During training, move in steps of batch_size.
[batch_size, full_seq_len]"""
batch_size = 1000
def get_batch(src: Tensor, i: int, batch_size: int = 1000):
    seq_len = min(batch_size, src.shape[0] - i) 

    data = src[i:i+seq_len, :(src.shape[1] - 1)]
    
    # Target is the same as data but shifted over an index
    target = src[i+1:i+1+seq_len, 1:src.shape[1]].reshape(-1)
    return data, target

def get_batch_next_item(src: Tensor, i: int, batch_size: int = 1000):
    seq_len = min(batch_size, src.shape[0] - i) 

    data = src[i:i+seq_len, :(src.shape[1] - 1)]
    
    # Target is the same as data but shifted over an index
    target = src[i+1:i+1+seq_len, src.shape[1] - 1].reshape(-1)
    return data, target 


In [34]:
get_batch_next_item(evaluation_input, 5000, 200)[0]


tensor([[1859.,   65., 3131.,  ...,    0.,    0.,    0.],
        [1372.,  435.,   49.,  ...,    0.,    0.,    0.],
        [  45.,   40., 1048.,  ...,    0.,    0.,    0.],
        ...,
        [2674.,   45.,   23.,  ...,    0.,    0.,    0.],
        [2963.,  936.,  247.,  ...,    0.,    0.,    0.],
        [ 511.,  715.,  589.,  ...,    0.,    0.,    0.]])

### Initialization of Parameters

In [35]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


ntokens = len(movies_combined.MovieID.unique()) # size of vocabulary
emsize = 200  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # number of heads in nn.MultiheadAttention
dropout = 0.2  # dropout probability
model = VanillaTransformer(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)

In [36]:
########## Redo MovieIDs, i.e. each movieID in [1, 3668] ################
ntokens
##############################################################

3706

## Training 

In [30]:
import copy
import time
import math

epoch = 1

### Figure out loss function 
criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

# What does this do?
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

def train(model: nn.Module) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    
    # What does this stand for
    log_interval = 200
    
    start_time = time.time()
    
    # Change bptt
    src_mask = generate_square_subsequent_mask(511).to(device)
    # Ensures that the model masks the future tokens, different from masking padded tokens


    ### Since batch does not evenly divide len of data, don't we need to have an edge case for last smaller batch
    num_batches = len(ready_to_embed) // batch_size
    for batch, i in enumerate(range(0, ready_to_embed.size(0) - 1, batch_size)):
        data, targets = get_batch(ready_to_embed, i)
        
        # Ensures that the model masks the padded tokens
        src_pad_mask = (data == 0).bool()       
        ### How do we deal with the target masking?
       
        curr_batch_size = data.size(0)
        # For src_mask fix
        if curr_batch_size != batch_size:  # only on last batch
            src_mask = src_mask[:curr_batch_size, :curr_batch_size]
        
        # ###############Bug fix code####################
        data = data.type(torch.LongTensor)
        # src_mask = src_mask.type(torch.LongTensor)
    

        # data = data.to(device)
        # src_mask = src_mask.to(device)
        # ############################################

        output = model(data.t(), src_mask, src_pad_mask)
        # pdb.set_trace()

        
        ### Figure out shape of output.view(-1, ntokens)
        loss = criterion(output.view(-1, ntokens), targets.type(torch.LongTensor))

        # What does this do?
        optimizer.zero_grad()
        
        # Computes gradients
        loss.backward()

        #What does this do?
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)

        #What does this do?
        optimizer.step()

        total_loss += loss.item()
        print(f"Loss for batch {batch}: {total_loss}")
        
        if batch % log_interval == 0 and batch > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

In [31]:
def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    ### What does turning on eval mode do? 
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    src_mask = generate_square_subsequent_mask(511).to(device)
    with torch.no_grad():
        for i in range(0, eval_data.size(0) - 1, batch_size):
            ### Change get batch for eval
            data, targets = get_batch_next_item(eval_data, i)
            src_pad_mask = (data == 0).bool()

            curr_batch_size = data.size(0)
            if curr_batch_size != batch_size:
                src_mask = src_mask[:curr_batch_size, :curr_batch_size]
            
            output = model(data, src_mask, src_pad_mask)
            output_flat = output.view(-1, ntokens)
            total_loss += batch_size * criterion(output_flat, targets).item()
    return total_loss / (len(eval_data) - 1)

In [32]:
best_val_loss = float('inf')
epochs = 1
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model)
    val_loss = evaluate(model, evaluation_input)
    val_ppl = math.exp(val_loss)
    elapsed = time.time() - epoch_start_time
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
          f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)

    scheduler.step()


Loss for batch 0: 21.320785522460938


KeyboardInterrupt: 

In [139]:
generate_square_subsequent_mask(512).dtype

torch.float32

In [None]:
"""RuntimeError: Expected tensor for argument #1 'indices' to have one of the follow
ing scalar types: Long, Int; but got torch.FloatTensor instead (while checking 
arguments for embedding)"""

"""src_mask.dtype == torch.int64"""

"""expected scalar type Long but found Float"""