In [3]:
#Import Required libraries
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import torch

from typing import List
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm.notebook import tqdm
from torchsummary import summary
from torch.nn.utils.rnn import pad_sequence

In [4]:
#!pip install torchsummary

# Data Engineering Pipeline Functions

#### add_target_df(df: pd.DataFrame), takes in a pandas DataFrame and performs several operations on it.

The function adds a new column called 'label' to the DataFrame, which contains the next city ID for each row. 

The function then removes the last row for each trip and filters out any trips that have less than 3 bookings. 

The resulting DataFrame is then sorted in descending order based on the number of cities visited in each trip. 

The final processed DataFrame is returned by the function.


#### pad_tensor(tensors: List, num_features: int), takes in a list of tensors and an integer representing the number of features for each tensor. 

The function converts the data in the list to tensors and adds front padding to make each tensor the same length. 

The padded tensors are then stacked together and returned

#### sequence_padding(batch), takes in a batch of data and performs padding on it. 

The function separates the features and labels of the data, pads each according to the sequence with the highest length, and returns the padded features and labels.




In [43]:
def add_target_df(df: pd.DataFrame):
    '''
    adds target city ['label'] to the dataset for each row (next city ID) also remove the last row for each trip
    filter trips shorter than 3 bookings
    sort the data with trip city count in descending order
    '''
    df_segment = []
    df['label'] = df['city_id'].shift(-1)
    idx_label = np.unique(df.index)
    
    for idx in tqdm(idx_label):
        temp_data = df.loc[idx].head(-1)
        if type(temp_data)==pd.DataFrame and len(temp_data)>=3:
            df_segment.append(temp_data)
    
    df_segment = sorted(df_segment, key=lambda x: len(x), reverse=True)
    processed_df = pd.concat(df_segment)

    
    return processed_df


def pad_tensor(tensors: List, num_features: int) -> torch.Tensor:
    """
    Convert data to tensors and add front padding to make each tensor same length
    """
    lenth = [len(tensor) for tensor in tensors]
    max_size = max(length)
    pad_tensor = []
    for q in tensors:
        new_tensor = torch.zeros(max_size, num_features)
        new_tensor[max_size-len(q):] = q
        pad_tensor.append(new_tensor)
    
    return torch.stack(pad_tensor,dim=0).long()


def sequence_padding(batch):
    '''
    pad each batch according to the sequence 
    with the highest length
    '''
    features = [data[0] for data in batch]
    features = pad_tensor(features, 2)
    labels = [data[1].unsqueeze(dim=1) for data in batch]
    labels = pad_tensor(labels, 1)
    
    return features, labels

# Import Data

The code below reads in three dataset provided with the challenge, train_set.csv, test_set.csv, and ground_truth.csv into pandas DataFrames.<p>


<li> The first line reads in the train_set.csv file and creates a DataFrame called df_train. The DataFrame is indexed by the 'utrip_id' column, which is converted to an integer data type.
<li>The second line reads in the test_set.csv file and creates a DataFrame called df_test. The DataFrame is indexed by the 'utrip_id' column, which is converted to an integer data type.
<li>The third line reads in the ground_truth.csv file and creates a DataFrame called df_ground_truth. The DataFrame is indexed by the 'utrip_id' column.
<li>The next line performs a merge operation on the df_test DataFrame and the df_ground_truth DataFrame, joining them on the 'utrip_id' column. This creates a new DataFrame that contains all the columns from both the df_test and df_ground_truth DataFrames.

    
We then update the index of df_test DataFrame to the 'utrip_id' column, which is converted to an integer data type.
    

We further create a new column called 'city_id' in the df_test DataFrame. This column is filled with the values from the 'city_id_x' column if it is not equal to 0, otherwise it is filled with the values from the 'city_id_y' column.
    

Finally, the last line of code keeps only the columns
    
    'user_id','checkin','checkout','device_class','affiliate_id','booker_country', 'city_id' from df_test DataFrame 
    
and discards the other columns.



In [7]:
df_train = pd.read_csv('train_set.csv').set_index('utrip_id')
df_train.index = df_train.index.astype(int)
df_test = pd.read_csv('test_set.csv').set_index('utrip_id')
df_ground_truth = pd.read_csv('ground_truth.csv').set_index('utrip_id')
df_test = df_test.merge(df_ground_truth, left_on='utrip_id', right_on='utrip_id')
df_test.index = df_test.index.astype(int)
df_test['city_id'] = np.where(df_test['city_id_x'] == 0, df_test['city_id_y'], df_test['city_id_x'])
df_test = df_test[['user_id','checkin','checkout','device_class','affiliate_id','booker_country', 'city_id']]

Next we preprocess the train and test datasets

In [8]:
df_train = add_target_df(df_train)

  0%|          | 0/217684 [00:00<?, ?it/s]

In [9]:
df_test['city_id'] = df_test['city_id'].astype(int)

In [10]:
df_test = add_target_df(df_test)

  0%|          | 0/70661 [00:00<?, ?it/s]

# Process Data

Here we will perform categorical encoding on the data

In [11]:
df_train['label'] = df_train['label'].astype(int)
df_test['label'] = df_test['label'].astype(int)

In [12]:
utrip_distinct_train = df_train.index.unique()
utrip_train_map = {utrip:i for i, utrip in enumerate(utrip_distinct_train)}
df_train = df_train.rename(index=utrip_train_map)

utrip_distinct_test = df_test.index.unique()
utrip_test_map = {utrip:i for i, utrip in enumerate(utrip_distinct_test)}
df_test = df_test.rename(index=utrip_test_map)

This code is renaming the index of the DataFrames df_train and df_test, which are indexed by the 'utrip_id' column.<br><br>


<li>The first line of code creates a variable called utrip_distinct_train which contains an array of unique values of the index of the df_train DataFrame.
    
<li>The second line of code creates a dictionary called utrip_train_map where the keys are the unique values of the index of the df_train DataFrame and the values are integers starting from 0. This creates a mapping of the old index values to new integer values.
<li>The third line of code uses the rename method to update the index of the df_train DataFrame to the new integer values, using the utrip_train_map dictionary.
<li>The next three lines of code repeat the same process for the df_test DataFrame, creating a variable called utrip_distinct_test which contains an array of unique values of the index of the df_test DataFrame. <br>
    


    
The code then creates a dictionary called utrip_test_map where the keys are the unique values of the index of the df_test DataFrame and the values are integers starting from 0. The last line of code uses the rename method to update the index of the df_test DataFrame to the new integer values, using the utrip_test_map dictionary.

The end result is that both DataFrames df_train and df_test have their index columns updated to integers starting from 0 and this allows to use the index as a unique identifier without the need of a extra column.

In [13]:
df_train.head()

Unnamed: 0_level_0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,label
utrip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,3635431,2016-04-01,2016-04-02,47319,mobile,9924,Gondal,Gondal,36063
0,3635431,2016-04-02,2016-04-03,36063,mobile,9924,Gondal,Gondal,36063
0,3635431,2016-04-03,2016-04-04,36063,mobile,384,Gondal,Gondal,36063
0,3635431,2016-04-04,2016-04-05,36063,mobile,9924,Gondal,Gondal,3109
0,3635431,2016-04-05,2016-04-06,3109,mobile,9924,Gondal,Gondal,3109


In [14]:
df_test.head()

Unnamed: 0_level_0,user_id,checkin,checkout,device_class,affiliate_id,booker_country,city_id,label
utrip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,4152137,2016-08-12,2016-08-13,mobile,359,The Devilfire Empire,62394,27189
0,4152137,2016-08-13,2016-08-14,mobile,359,The Devilfire Empire,27189,59713
0,4152137,2016-08-14,2016-08-15,mobile,359,The Devilfire Empire,59713,39027
0,4152137,2016-08-15,2016-08-16,mobile,359,The Devilfire Empire,39027,45361
0,4152137,2016-08-16,2016-08-17,mobile,359,The Devilfire Empire,45361,47439


We follow similar coding steps for encoding other features

In [15]:
distinct_affiliate_id = np.unique(df_train.affiliate_id)
affiliate_id_map = {aff_id:i+1 for i, aff_id in enumerate(distinct_affiliate_id)}

distinct_checkin_id = np.unique(df_train.checkin)
checkin_map = {checkin:i+1 for i, checkin in enumerate(distinct_checkin_id)}

distinct_device_class = np.unique(df_train.device_class)
device_map = {device:i+1 for i, device in enumerate(distinct_device_class)}

distinct_city_id = np.unique([df_train.city_id,df_train.label])
city_id_map = {city_id:i+1 for i, city_id in enumerate(distinct_city_id)}

distinct_booker_country = np.unique(df_train.booker_country)
booker_country_map = {booker_country:i+1 for i, booker_country in enumerate(distinct_booker_country)}

distinct_hotel_country = np.unique(df_train.hotel_country)
hotel_country_map = {hotel_country:i+1 for i, hotel_country in enumerate(distinct_hotel_country)}

In [16]:
df_train['checkin'] = df_train['checkin'].map(lambda x: checkin_map.get(x))
df_train['affiliate_id'] = df_train['affiliate_id'].map(lambda x: affiliate_id_map.get(x))
df_train['city_id'] = df_train['city_id'].map(lambda x: city_id_map.get(x))
df_train['label'] = df_train['label'].map(lambda x: city_id_map.get(x))
df_train['booker_country'] = df_train['booker_country'].map(lambda x: booker_country_map.get(x))

In [17]:
df_test['checkin'] = df_test['checkin'].map(lambda x: checkin_map.get(x, 0))
df_test['affiliate_id'] = df_test['affiliate_id'].map(lambda x: affiliate_id_map.get(x, 0))
df_test['city_id'] = df_test['city_id'].map(lambda x: city_id_map.get(x, 0))
df_test['label'] = df_test['label'].map(lambda x: city_id_map.get(x, 0))
df_test['booker_country'] = df_test['booker_country'].map(lambda x: booker_country_map.get(x, 0))

# Create dataset for Models

Below we a custom dataset class called 'Booking_Dataset' that inherits from the Pytorch 'Dataset' class. This class is used to create a dataset object that can be used to train a machine learning model.



The class has three main methods:



__init__(self, df): This method is called when a new dataset object is created and takes in a DataFrame as an input. The DataFrame is stored as an instance variable 'self.df' which can be accessed by other methods in the class.



__len__(self): This method returns the number of rows in the DataFrame. It returns the maximum value of the index of the DataFrame.



__getitem__(self, idx): This method is called when an item is accessed from the dataset, it takes an index as an input and returns a tuple of features and label. 

<li>The method uses the 'loc' property of the DataFrame to access the row at the given index. 
<li>It then extracts the 'city_id' and 'label' columns and converts them into tensors using the 'torch.tensor()' function. 
<li>It also extracts the 'affiliate_id' column and converts it into a tensor. 
<li>It then concatenates the 'city_id' and 'affiliate_id' columns into a single tensor called 'features' and returns this tensor and the 'label' tensor as a tuple.

<br><br>
This class allows to use the DataFrame as a dataset object and Pytorch can use this to efficiently load and process the data, which can be used to train and evaluate a machine learning model.<br>

In [18]:
class Booking_Dataset(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __len__(self):
        return max(self.df.index)
    
    def __getitem__(self, idx):
        row = self.df.loc[idx]
        city_id = row['city_id'].values
        label_id = row['label'].values
        affiliate_id = row['affiliate_id']
        
        city_id = torch.tensor(row['city_id'].values).view(-1, 1)
        affiliate_id = torch.tensor(row['affiliate_id'].values).view(-1, 1)
        features = torch.cat((city_id, affiliate_id), 1)
        features = features.view(-1, 2)
        
        label_id = torch.tensor(row['label'].values).float()
        
        return features, label_id

# Dataloaders

In [19]:
train_ds = Booking_Dataset(df_train)
train_dl = DataLoader(train_ds, batch_size = 8, shuffle = False, collate_fn=sequence_padding)

In [20]:
test_ds = Booking_Dataset(df_test)
test_dl = DataLoader(test_ds, batch_size = 16, shuffle = False, collate_fn=sequence_padding)

In [21]:
features, label = next(iter(train_dl))

# Models

##  Model 1: Generic RNN

Define Model

In [73]:
# Vanilla RNN using nn.RNN
class Generic_RNN(nn.Module):
    def __init__(self, emb_size, hidden_size, output_size):
        super(Generic_RNN, self).__init__()
        
        self.city_emb = nn.Embedding(len(city_id_map)+1, emb_size, padding_idx=0)
        self.affiliate_emb = nn.Embedding(len(affiliate_id_map)+1, emb_size, padding_idx=0)
        
        self.rnn = nn.RNN(emb_size*2, hidden_size, batch_first=True)
        
        self.g = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        city_emb = self.city_emb(x[:,:,0])
        affiliate_emb = self.affiliate_emb(x[:,:,1])
        x = torch.cat((city_emb, affiliate_emb), dim=2)
        out, hidden = self.rnn(x)
        out = self.g(out)

        return out, hidden

This code defines a Pytorch neural network model called 'Generic_RNN' which inherits from the 'nn.Module' class. The model is a variation of a Vanilla RNN, a type of Recurrent Neural Network (RNN) that uses a single layer of recurrent units.

The class has the following methods:

__init__(self, emb_size, hidden_size, output_size): This method is called when a new model object is created. It takes three parameters as input: 'emb_size' which is the size of the embedding layer, 'hidden_size' which is the number of hidden units in the RNN layer, and 'output_size' which is the number of output units. It creates several layers of the model:

Two embedding layers called 'city_emb' and 'affiliate_emb' which are used to map the categorical variables to a continuous space. 

The embedding layers take the length of the city_id_map and affiliate_id_map dictionaries as input and have emb_size as output size.

An RNN layer called 'rnn' which applies the recurrent operation on the input and has hidden_size number of hidden units.

A Linear layer called 'g' which applies a linear transformation on the output of the RNN layer and has output_size number of output units.

<b>forward(self, x):</b> This method defines the forward pass of the model.<br>
It takes an input 'x' and applies the various layers of the model on it. <br>
The input 'x' is first passed through the 'city_emb' and 'affiliate_emb' layers to get the embeddings of the city and affiliate ID. <br>
These embeddings are then concatenated and passed to the RNN layer. <br>
The output of the RNN layer is passed through the Linear layer 'g' to get the final output. <br>
The final output and the hidden state of the RNN layer are returned as a tuple. <br>

In [61]:
#Here we create an instance of the 'Generic_RNN' model with the parameters emb_size = 24, 
#hidden_size = 100, and output_size = 39879 

g_rnn = Generic_RNN(24, 100, 39879)

In [63]:
print("Model's state_dict:")
for param_tensor in g_rnn.state_dict():
    print(param_tensor, "\t", g_rnn.state_dict()[param_tensor].size())

# Print optimizer's state_dict
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])

Model's state_dict:
city_emb.weight 	 torch.Size([39879, 24])
affiliate_emb.weight 	 torch.Size([3089, 24])
rnn.weight_ih_l0 	 torch.Size([100, 48])
rnn.weight_hh_l0 	 torch.Size([100, 100])
rnn.bias_ih_l0 	 torch.Size([100])
rnn.bias_hh_l0 	 torch.Size([100])
g.weight 	 torch.Size([39879, 100])
g.bias 	 torch.Size([39879])
Optimizer's state_dict:
state 	 {0: {'step': tensor(143241.), 'exp_avg': tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 5.6052e-45,  5.6052e-45, -5.6052e-45,  ...,  5.6052e-45,
         -5.6052e-45,  5.6052e-45],
        [ 5.6052e-45, -5.6052e-45, -5.6052e-45,  ...,  5.6052e-45,
         -5.6052e-45, -5.6052e-45],
        ...,
        [ 5.6052e-45, -5.6052e-45,  5.6052e-45,  ..., -5.6052e-45,
          5.6052e-45,  5.6052e-45],
        [-7.3981e-30,  4.8001e-30,  8.3404e-30,  ...,  8.6445e-30,
          3.0629e-30,  4.4107e-30],
        [ 5.6052e-45, -5.6052e-45, -5.6052e-45,  ...,  5.6052e-45,
      

In [24]:
# This code allows us to run the model on GPU if available. 
device_no = 0
if torch.cuda.is_available():
    print("set GPU")
    device = torch.device(device_no)
else:
    print("set CPU")
    device = torch.device('cpu')

g_rnn.to(device)

set CPU


Vanilla_RNN(
  (city_emb): Embedding(39879, 24, padding_idx=0)
  (affiliate_emb): Embedding(3089, 24, padding_idx=0)
  (rnn): RNN(48, 100, batch_first=True)
  (g): Linear(in_features=100, out_features=39879, bias=True)
)

### Train & Test

In [25]:
def iterate_dataloader(dataloader: DataLoader, update_params: bool, 
                       loss_func: nn.CrossEntropyLoss, model: nn.Module, 
                       backprop: bool, optimizer) -> float:
    
    if update_params:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    correct_pred = 0.0
    total_pred = 0
    for x, y in tqdm(dataloader):
        x, y = x.to(device), y.to(device)
        y_pred, _ = model(x)
        # remove the first two elements of the sequence
        if y_pred.shape[1]>=3:
            y_pred = y_pred[:,2:,:]
            y = y[:,2:,:]
            y_pred = y_pred.transpose(1,2)
            loss = loss_func(y_pred, y.squeeze(2))
            total_loss += loss.item()
        else:
            continue
        
        if backprop:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
    avg_loss = total_loss / len(dataloader)
    print("loss:", avg_loss)
    
    return avg_loss

def accuracy_at_4(model, dataloader, num_points, k=4, is_train=True) -> float:
    model.eval()
    total_correct = 0
    for x, y in tqdm(dataloader):
        x, y = x.to(device), y.to(device)
        output, hidden = model(x)
        if is_train:
            output = output[:,2:,:]
              y_ = y[:, 2:,: ]
        else:
            output = output[:,-1:,:]
            y_ = y[:, -1:, :]
        result = torch.topk(output, 4)
        idx = result.indices
        matches = (torch.eq(idx, y_).sum(2)>0).sum(1)>0
        correct = matches.sum().item()
        total_correct += correct
    
    print(total_correct/num_points)
    return (total_correct/num_points)


<b>The iterate_dataloader</b><br>(dataloader: DataLoader, update_params: bool, loss_func: nn.CrossEntropyLoss, model: nn.Module, backprop: bool, optimizer) function takes several inputs:<br><br>


<li>dataloader: a Pytorch dataloader object that loads the data to be used for training or evaluation
<li>update_params: a boolean value that indicates whether the model parameters should be updated during the iteration
<li>loss_func: the loss function to be used for the model<br>


<li>model: the Pytorch model object<br>
<li>backprop: a boolean value that indicates whether backpropagation should be performed to update the model parameters<br>
<li>optimizer: the optimizer to be used for updating the model parameters<br><br>
The function applies the model on the data loaded by the dataloader, and calculates the loss using the specified loss function. <br>If update_params is True, the model is in training mode, otherwise, it is in evaluation mode. If backprop is True, the gradients are calculated and the optimizer is used to update the model parameters. The function returns the average loss of the model on the data.
<br><br><br>
<b>The accuracy_at_4</b>(model, dataloader, num_points, k=4, is_train=True) function takes several inputs:<br><br>

<li>model: the Pytorch model object
<li>dataloader: a Pytorch dataloader object that loads the data to be used for evaluation
<li>num_points: the number of points in the data
<li>k: the number of top predictions to consider for accuracy calculation
<li>is_train: a boolean value that indicates whether the data is from the training set or not<br><br>

In [26]:
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(g_rnn.parameters(), lr = 0.001)

In [27]:
for _ in range(10):
    iterate_dataloader(train_dl, True, loss_func, g_rnn, True, optimizer)
    accuracy_at_4(g_rnn, test_dl, len(test_ds),is_train=False)

  0%|          | 0/27101 [00:00<?, ?it/s]

loss: 5.892186418762375


  0%|          | 0/4417 [00:00<?, ?it/s]

0.42360600056609116


  0%|          | 0/27101 [00:00<?, ?it/s]

loss: 5.238481573121873


  0%|          | 0/4417 [00:00<?, ?it/s]

0.44415510897254457


  0%|          | 0/27101 [00:00<?, ?it/s]

loss: 4.989924242775061


  0%|          | 0/4417 [00:00<?, ?it/s]

0.45217945089159356


  0%|          | 0/27101 [00:00<?, ?it/s]

loss: 4.798499932828316


  0%|          | 0/4417 [00:00<?, ?it/s]

0.45547693178601756


  0%|          | 0/27101 [00:00<?, ?it/s]

loss: 4.650818367901026


  0%|          | 0/4417 [00:00<?, ?it/s]

0.45782621001981316


  0%|          | 0/27101 [00:00<?, ?it/s]

loss: 4.533479962561043


  0%|          | 0/4417 [00:00<?, ?it/s]

0.4575573167279932


  0%|          | 0/27101 [00:00<?, ?it/s]

loss: 4.4395927967360915


  0%|          | 0/4417 [00:00<?, ?it/s]

0.45857628078120577


  0%|          | 0/27101 [00:00<?, ?it/s]

loss: 4.362986407813619


  0%|          | 0/4417 [00:00<?, ?it/s]

0.4582790829323521


  0%|          | 0/27101 [00:00<?, ?it/s]

loss: 4.302632561924581


  0%|          | 0/4417 [00:00<?, ?it/s]

0.4579677328049816


  0%|          | 0/27101 [00:00<?, ?it/s]

loss: 4.251097248153121


  0%|          | 0/4417 [00:00<?, ?it/s]

0.4576846872346448


In [28]:
accuracy_at_4(g_rnn, train_dl, len(train_ds), is_train=True)

  0%|          | 0/27101 [00:00<?, ?it/s]

0.667559950738689


0.667559950738689

In [29]:
accuracy_at_4(g_rnn, test_dl, len(test_ds), is_train=False)

  0%|          | 0/4417 [00:00<?, ?it/s]

0.4576846872346448


0.4576846872346448

## Model 2: Gated Recurrent Unit

Define Model

In [30]:
class GRU_NN(nn.Module):
    def __init__(self, emb_size, hidden_size, output_size):
        super(GRU_NN, self).__init__()
        
        self.city_emb = nn.Embedding(len(city_id_map)+1, emb_size, padding_idx=0)
        self.affiliate_emb = nn.Embedding(len(affiliate_id_map)+1, emb_size, padding_idx=0)
        
        self.rnn = nn.GRU(emb_size*2, hidden_size, batch_first=True)
        
        self.g = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        city_emb = self.city_emb(x[:,:,0])
        affiliate_emb = self.affiliate_emb(x[:,:,1])
        x = torch.cat((city_emb, affiliate_emb), dim=2)
        out, hidden = self.rnn(x)
        out = self.g(out)

        return out, hidden

This code defines a Pytorch neural network model called 'GRU_NN' which inherits from the 'nn.Module' class. The model is a variation of a Gated Recurrent Unit (GRU) type of Recurrent Neural Network (RNN) that uses a gating mechanism to control the flow of information in the recurrent units.<br>

The class has the following methods:<br>

__init__(self, emb_size, hidden_size, output_size): This method is called when a new model object is created. It takes three parameters as input: 'emb_size' which is the size of the embedding layer, 'hidden_size' which is the number of hidden units in the RNN layer, and 'output_size' which is the number of output units. It creates several layers of the model:

<li>Two embedding layers called 'city_emb' and 'affiliate_emb' which are used to map the categorical variables to a continuous space. The embedding layers take the length of the city_id_map and affiliate_id_map dictionaries as input and have emb_size as output size.
<li>An GRU layer called 'rnn' which applies the recurrent operation on the input and has hidden_size number of hidden units.
<li>A Linear layer called 'g' which applies a linear transformation on the output of the RNN layer and has output_size number of output units.<br><br>
<b>forward(self, x):</b> This method defines the forward pass of the model. <li>It takes an input 'x' and applies the various layers of the model on it. <li>The input 'x' is first passed through the 'city_emb' and 'affiliate_emb' layers to get the embeddings of the city and affiliate ID. <li>These embeddings are then concatenated and passed to the GRU layer. <li>The output of the GRU layer is passed through the Linear layer 'g' to get the final output of the model. <li>The method returns the final output and the hidden state of the GRU layer.

<br><br>It's worth noting that the model uses the embedding layers to map the categorical variables 'city_id' and 'affiliate_id' to a continuous space, which is then passed through the GRU layer to capture the temporal dependencies between the bookings. The output of the GRU layer is then passed through the Linear layer to produce the final output, which could be a probability distribution over the possible next city_id for each booking.<br>

In [31]:
gru_rnn = GRU_NN(24, 100, 39879)

In [32]:
device_no = 0
if torch.cuda.is_available():
    print("set GPU")
    device = torch.device(device_no)
else:
    print("set CPU")
    device = torch.device('cpu')

gru_rnn.to(device)

set CPU


GRU_RNN(
  (city_emb): Embedding(39879, 24, padding_idx=0)
  (affiliate_emb): Embedding(3089, 24, padding_idx=0)
  (rnn): GRU(48, 100, batch_first=True)
  (g): Linear(in_features=100, out_features=39879, bias=True)
)

### Train & Test

In [33]:
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(gru_rnn.parameters(), lr = 0.001)

In [34]:
for _ in range(5):
    iterate_dataloader(train_dl, True, loss_func, gru_rnn, True, optimizer)
    accuracy_at_4(gru_rnn, test_dl, len(test_ds), is_train=False)

  0%|          | 0/27101 [00:00<?, ?it/s]

loss: 5.746227929670957


  0%|          | 0/4417 [00:00<?, ?it/s]

0.4476931786017549


  0%|          | 0/27101 [00:00<?, ?it/s]

loss: 5.0916784125224925


  0%|          | 0/4417 [00:00<?, ?it/s]

0.46702519105575996


  0%|          | 0/27101 [00:00<?, ?it/s]

loss: 4.840670724336672


  0%|          | 0/4417 [00:00<?, ?it/s]

0.47226153410699123


  0%|          | 0/27101 [00:00<?, ?it/s]

loss: 4.650630122422952


  0%|          | 0/4417 [00:00<?, ?it/s]

0.4745542032267195


  0%|          | 0/27101 [00:00<?, ?it/s]

loss: 4.504822073842822


  0%|          | 0/4417 [00:00<?, ?it/s]

0.4733795641098217


In [45]:
accuracy_at_4(gru_rnn, train_dl, len(train_ds), is_train=True)

  0%|          | 0/27101 [00:00<?, ?it/s]

0.6717619221136238

## Model 3: Long Short Term Memory

Define Model

In [35]:
class LSTM_NN(nn.Module):
    def __init__(self, emb_size, hidden_size):
        super(LSTM_NN, self).__init__()
        self.hidden_size = hidden_size
        self.city_emb = nn.Embedding(len(city_id_map)+1, emb_size, padding_idx=0)
        self.affiliate_emb = nn.Embedding(len(affiliate_id_map)+1, emb_size, padding_idx=0)
        self.lstm = nn.LSTM(emb_size*2, hidden_size, batch_first=True)
        self.g = nn.Linear(hidden_size, len(city_id_map)+1)
        
    def forward(self, x):
        city_emb = self.city_emb(x[:,:,0])
        affiliate_emb = self.affiliate_emb(x[:,:,1])
        x = torch.cat((city_emb, affiliate_emb), dim=2)
        out, hidden = self.lstm(x)
        out = self.g(out) 
        return out, hidden

This code defines a Pytorch neural network model called 'LSTM_NN' which inherits from the 'nn.Module' class. The model is a variation of a Long Short-Term Memory (LSTM) type of Recurrent Neural Network (RNN) that uses a gating mechanism to control the flow of information in the recurrent units.

The class has the following methods:

__init__(self, emb_size, hidden_size): This method is called when a new model object is created. It takes two parameters as input: 'emb_size' which is the size of the embedding layer, 'hidden_size' which is the number of hidden units in the LSTM layer. It creates several layers of the model:<br><br>

<li>Two embedding layers called 'city_emb' and 'affiliate_emb' which are used to map the categorical variables to a continuous space. The embedding layers take the length of the city_id_map and affiliate_id_map dictionaries as input and have emb_size as output size.
<li>An LSTM layer called 'lstm' which applies the recurrent operation on the input and has hidden_size number of hidden units.
<li>A Linear layer called 'g' which applies a linear transformation on the output of the LSTM layer and has len(city_id_map) + 1 number of output units.<br><br>
    <b>forward(self, x):</b> This method defines the forward pass of the model. It takes an input 'x' and applies the various layers of the model on it. The input 'x' is first passed through the 'city_emb' and 'affiliate_emb' layers to get the embeddings of the city and affiliate ID. These embeddings are then concatenated and passed to the LSTM layer. The output of the LSTM layer is passed through the Linear layer 'g' to get the final output of the model. The method returns the final output and the hidden state of the LSTM layer.

Just like GRU model, this model also uses the embedding layers to map the categorical variables 'city_id' and 'affiliate_id' to a continuous space, which is then passed through the LSTM layer to capture the temporal dependencies between the bookings.<br>
    
The output of the LSTM layer is then passed through the Linear layer to produce the final output, which could be a probability distribution over the possible next city_id for each booking. The LSTM layer allows the model to keep track of the long-term dependencies of the input sequence, which is useful in this case where the order of bookings is important.

The LSTM layer also has a hidden state that captures the context of the previous hidden state along with the current input, allowing the model to remember information over a longer period of time. This can be useful in cases where the context of the previous bookings is important to predict the next booking.

In [36]:
lstm = LSTM_NN(50, 100)
lstm.to(device)

Model_LSTM(
  (city_emb): Embedding(39879, 50, padding_idx=0)
  (affiliate_emb): Embedding(3089, 50, padding_idx=0)
  (lstm): LSTM(100, 100, batch_first=True)
  (g): Linear(in_features=100, out_features=39879, bias=True)
)

### Train & Test

In [37]:
loss_fun = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(lstm.parameters(), lr = 0.01)

In [40]:
def single_pass(model, dataloader, optimizer, lossFun, backwards=True, print_loss=False):

    if backwards == True:
        model.train()
    else:
        model.eval()
    
    total_loss = 0.0
    for x, y in tqdm(dataloader):
        x, y = x.to(device), y.to(device)
        output, hidden = model(x)
        if output.shape[1] >= 3:
            output = output[:, 2:, :]
            y_ = y[:, 2:, :]
            output = output.transpose(1,2)
            loss = lossFun(output, y_.squeeze(2))
            total_loss += loss.item()
        else:
            continue
        
        if backwards == True:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    avg_loss = total_loss / len(dataloader)
    
    if print_loss == True:
        print(avg_loss)
    
    return avg_loss

The function "single_pass" takes several inputs including the model, the dataloader, the optimizer, the loss function, and two flags "backwards" and "print_loss".

The function performs a single pass over the data in the dataloader, computing the model's output and comparing it to the true labels using the loss function. If the flag "backwards" is set to True, the model's parameters are updated using the optimizer and gradients computed from the loss.

If the flag "print_loss" is set to True, the average loss over the entire dataset is printed. At the end, the average loss is returned by the function.

In this function it is also specifically handling the case where the output shape is less than 3 by skipping that iteration.





In [38]:
def single_pass_acc_at_4(model, dataloader, num_points, k=4, is_train=True):
    model.eval()
    total_correct = 0
    for x, y in tqdm(dataloader):
        x, y = x.to(device), y.to(device)
        output, hidden = model(x)
        if is_train:
            output = output[:,2:,:]
            y_ = y[:, 2:,: ]
        else:
            output = output[:,-1:,:]
            y_ = y[:, -1:, :]
        result = torch.topk(output, 4)
        idx = result.indices
        matches = (torch.eq(idx, y_).sum(2)>0).sum(1)>0
        correct = matches.sum().item()
        total_correct += correct

    return (total_correct/num_points)

The function performs a single pass over the data in the dataloader, computing the model's output. It sets the model to evaluation mode. Then, it selects the last output or the output after the second element of the sequence depending on the value of is_train flag. Then, it uses the torch.topk() function to select the top-k predictions from the output. It then compares the selected predictions to the true labels and counts the number of correct predictions. At the end, the function returns the accuracy of the model at top-k predictions.

The k parameter is the number of top predictions to select and compare with the true labels, with a default value of 4. If the is_train flag is set to True, the function considers the output after the second element of the sequence as the true output, otherwise considers the last output of the sequence as the true output.

In [42]:
num_epochs = 3
for epoch in range(num_epochs):
  print(f"Epoch {epoch+1}")
  train_loss = single_pass(lstm, train_dl, optimizer, loss_fun)
  print(f"Train Loss: {train_loss:.4f}")
  test_loss = single_pass(lstm, test_dl, optimizer, loss_fun, backwards=False)
  print(f"Test Loss: {test_loss:.4f}")
  train_acc = single_pass_acc_at_4(lstm, train_dl, len(train_ds), k=4)
  print(f"Train Accuracy: {train_acc:.4f}")
  test_acc = single_pass_acc_at_4(lstm, test_dl, len(test_ds), k=4, is_train=False)
  print(f"Test Accuracy: {test_acc:.4f}")

Epoch 1


  0%|          | 0/27101 [00:00<?, ?it/s]

Train Loss: 5.6945


  0%|          | 0/4417 [00:00<?, ?it/s]

Test Loss: 6.3938


  0%|          | 0/27101 [00:00<?, ?it/s]

Train Accuracy: 0.5199


  0%|          | 0/4417 [00:00<?, ?it/s]

Test Accuracy: 0.3978
Epoch 2


  0%|          | 0/27101 [00:00<?, ?it/s]

Train Loss: 5.7832


  0%|          | 0/4417 [00:00<?, ?it/s]

Test Loss: 6.4467


  0%|          | 0/27101 [00:00<?, ?it/s]

Train Accuracy: 0.5215


  0%|          | 0/4417 [00:00<?, ?it/s]

Test Accuracy: 0.3993
Epoch 3


  0%|          | 0/27101 [00:00<?, ?it/s]

Train Loss: 5.7763


  0%|          | 0/4417 [00:00<?, ?it/s]

Test Loss: 6.5403


  0%|          | 0/27101 [00:00<?, ?it/s]

Train Accuracy: 0.5245


  0%|          | 0/4417 [00:00<?, ?it/s]

Test Accuracy: 0.4008


# Performance Comparison

The table below compares the Training and Test Accuracy@4 metric for the models:

| Model | Training Accuracy | Test Accuracy
|---|---|---|
| Generic RNN |0.66  | 0.46| 
|GRU |0.67 |0.47 |
|LSTM |  0.52 | 0.40 | 