In [1]:
# Set up the process environment for this notebook.
import os
# Select the environment name; presumably picked up by the YAML config that
# get_conf() loads through EnvYAML — TODO confirm against the config file.
os.environ["ENV_NM"] = "dev"
# Relative move to the parent directory. NOTE: this is not idempotent — running
# the cell again without restarting the kernel moves up one more level each time.
os.chdir('../')

In [4]:
# import statements
from util.util_lib import *
import util.util_cnst as cnst
from CookingDataset import CookingDataset 
from CookingBatchSampler import CookingBatchSampler
from CookingCollator import CookingCollator

In [5]:
# read config file
def get_conf():
    """Load the YAML configuration and return the section for the active environment.

    The file named by ``cnst.CONFIG_FL`` is parsed with ``EnvYAML``; the value
    stored under its ``env_nm`` key is then used as the key of the section
    that is returned.
    """
    full_conf = EnvYAML(cnst.CONFIG_FL)
    env_nm = full_conf['env_nm']
    return full_conf[env_nm]

In [6]:
def read_vocab(conf):
    """Load the serialized vocabulary and print a few entries from it.

    Args:
        conf: configuration mapping; ``conf['vocab']`` is the path of the
            saved vocabulary object.

    Returns:
        The object that ``torch.load`` returns for that path.

    Raises:
        KeyError: if ``"<UNK>"`` or ``"<PAD>"`` is missing from the vocabulary.
    """
    print(f"vocab read STARTED:- {conf['vocab']}")
    # NOTE(review): torch.load unpickles the file, so only point this at
    # vocabulary files that come from a trusted source.
    vocab = torch.load(conf['vocab'])
    print(f"vocab read FINISHED:- {conf['vocab']}")

    # Fetch each lookup table once instead of re-requesting it for every print.
    stoi = vocab.get_stoi()
    itos = vocab.get_itos()

    print("unk:- ", stoi["<UNK>"])
    print("pad:- ", stoi["<PAD>"])
    # "<SOS>" is not present in the vocabulary, so it is not reported here.
    print("The first word in vocab is ", itos[0])
    print("The second word in vocab is ", itos[1])
    print("The third word in vocab is ", itos[2])
    print("The last word in vocab is ", itos[-1])

    return vocab

In [7]:
def load_label(conf):
    """Load the pickled label encoder from disk.

    Args:
        conf: configuration mapping; ``conf['lbl_enc']`` is the path of the
            pickle file to read.

    Returns:
        The object stored in that pickle file.
    """
    print(f"lebel encode load STARTED:- {conf['lbl_enc']}")
    # NOTE(review): pickle.load can execute arbitrary code; only load encoder
    # files that come from a trusted source.
    with open(conf['lbl_enc'], 'rb') as encoder_fh:
        label_encoder = pickle.load(encoder_fh)
    print(f"lebel encode load FINISHED:- {conf['lbl_enc']}")

    return label_encoder

In [8]:
def gen_dataloader(data_set, le, vocab, ind, conf):
    """Build the ``DataLoader`` for one data split.

    Args:
        data_set: dataset handed to both ``CookingBatchSampler`` and the
            ``DataLoader``.
        le: label encoder handed to ``CookingCollator``.
        vocab: vocabulary handed to ``CookingCollator``.
        ind: split indicator; must be ``"train"``, ``"valid"`` or ``"test"``.
        conf: configuration mapping forwarded to the sampler and the collator.

    Returns:
        A ``DataLoader`` over ``data_set`` that takes its batches from a
        ``CookingBatchSampler`` and collates them with a ``CookingCollator``.

    Raises:
        ValueError: if ``ind`` is not one of the supported split names.
    """
    supported_splits = ("train", "valid", "test")
    # Fail early and explicitly; without this check an unknown split would
    # only surface as an unbound-variable error at the return statement.
    if ind not in supported_splits:
        raise ValueError(
            f"unsupported split indicator {ind!r}; expected one of {supported_splits}"
        )

    data_sampler = CookingBatchSampler(data_set, ind, conf)
    cooking_collator = CookingCollator(vocab, le, ind, conf)

    return DataLoader(data_set,
                      batch_sampler=data_sampler,
                      collate_fn=cooking_collator)

In [9]:
def save_dataloader(dataloader_obj, path):
    """Serialize ``dataloader_obj`` to ``path`` with ``torch.save``.

    Progress is reported on stdout before and after the write.

    Args:
        dataloader_obj: object to serialize.
        path: destination passed to ``torch.save``.

    Returns:
        None.
    """
    print(f"torch dataloader STARTED:- {path}")
    torch.save(dataloader_obj, path)
    print(f"torch dataloader FINISHED:- {path}")

In [10]:
%%time
def _prepare_split(split, le, vocab, conf):
    """Build, preview and save the dataloader for one split.

    Args:
        split: split name ("train", "valid" or "test"); also used to look up
            the ``<split>_dataloader`` file name in ``conf['data']``.
        le: label encoder forwarded to ``gen_dataloader``.
        vocab: vocabulary forwarded to ``gen_dataloader``.
        conf: configuration mapping.

    Returns:
        The ``(dataset, dataloader)`` pair created for the split.
    """
    # converting torch Dataset
    dataset = CookingDataset(split, conf)

    # converting to torch dataloader
    dataloader = gen_dataloader(dataset, le, vocab, split, conf)

    # Print the first batch produced by the dataloader.
    print(f"{split.capitalize()} Data =====> ")
    print(next(iter(dataloader)))

    # The target path is the configured folder and file name concatenated
    # as-is (no separator is inserted).
    dataloader_path = conf['data']['data_fl_path'] + conf['data'][f'{split}_dataloader']
    save_dataloader(dataloader, dataloader_path)

    return dataset, dataloader


if __name__ == "__main__":
    try:
        # reading configuration
        conf = get_conf()

        # loading the vocab object
        vocab = read_vocab(conf)

        # loading the label encoder
        le = load_label(conf)

        # One call per split; the results stay bound to the same top-level
        # names as before so that later cells can keep using them.
        train_dataset, train_dataloader = _prepare_split("train", le, vocab, conf)
        valid_dataset, valid_dataloader = _prepare_split("valid", le, vocab, conf)
        test_dataset, test_dataloader = _prepare_split("test", le, vocab, conf)

    except Exception:
        # Report the failure as text instead of letting the exception propagate.
        print(traceback.format_exc())

vocab read STARTED:- data/vocab/vocab_obj.pth
vocab read FINISHED:- data/vocab/vocab_obj.pth
unk:-  0
pad:-  1
The first word in vocab is  <UNK>
The second word in vocab is  <PAD>
The third word in vocab is  pepper
The last word in vocab is  zero
lebel encode load STARTED:- data/encoder/label_encoder.pkl
lebel encode load FINISHED:- data/encoder/label_encoder.pkl
data loaded STARTED:- data/train/train_preprocess.csv
data loaded FINISHED:- data/train/train_preprocess.csv
data_df shape:- (31819, 4)
Train Data =====> 
{'text_list': [['raspberries', 'sugar'], ['chicken', 'chicken breasts'], ['sour cream', 'heavy cream'], ['longgrain rice', 'cold water'], ['coconut', 'water'], ['coarse salt', 'polenta', 'extravirgin olive oil'], ['pepper', 'fresh chevre', 'asiago'], ['coarse salt', 'water', 'pork shoulder butt']], 'text_to_vocab_list': tensor([[[ 457,    0,    0],
         [   9,    0,    0],
         [   0,    0,    0]],

        [[  12,    0,    0],
         [  12,   88,    0],
         [

In [None]:
# https://hussainwali.medium.com/transforming-your-text-data-with-pytorch-12ec1b1c9ae6
# https://hussainwali.medium.com/using-fasttext-embeddings-in-pytorch-boosting-neural-network-performance-fe017c39c7c3
# https://github.com/pytorch/text/blob/master/examples/legacy_tutorial/migration_tutorial.ipynb
# https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71
# https://medium.com/geekculture/pytorch-datasets-dataloader-samplers-and-the-collat-fn-bbfc7c527cf1
# https://discuss.pytorch.org/t/batching-tensor-with-different-size/133147

In [None]:
class Classifier(nn.Module):
    """Bidirectional-LSTM text classifier with a sigmoid output.

    Pipeline: embedding -> packed bidirectional LSTM -> sum of the per-step
    LSTM outputs -> linear layer -> sigmoid.
    """

    def __init__(self, vocab_size, *, embedding_dim=2, hidden_dim=32,
                 num_layers=1, output_dim=1, padding_idx=0, **kwargs):
        """Create the layers.

        Args:
            vocab_size: number of rows of the embedding table.
            embedding_dim: size of each embedding vector.
            hidden_dim: hidden size of the LSTM, per direction.
            num_layers: number of stacked LSTM layers.
            output_dim: number of output units of the final linear layer.
            padding_idx: token id whose embedding is kept at zero.
                NOTE(review): the default of 0 is kept for backward
                compatibility, but the vocabulary printed earlier in this
                notebook reports "<PAD>" at index 1 — confirm which id the
                collator actually pads with.
            **kwargs: forwarded unchanged to ``nn.Module.__init__``.
        """
        super().__init__(**kwargs)

        # variables
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bidirectional = True
        self.batch_first = True
        self.output_dim = output_dim

        # embedding layer
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=self.embedding_dim,
                                      padding_idx=padding_idx)

        # lstm layer
        self.lstm = nn.LSTM(input_size=self.embedding_dim,
                            hidden_size=self.hidden_dim,
                            num_layers=self.num_layers,
                            bidirectional=self.bidirectional,
                            batch_first=self.batch_first)

        # dense layer / linear layer: a bidirectional LSTM emits the forward
        # and backward hidden states concatenated, hence the factor.
        num_directions = 2 if self.bidirectional else 1
        self.fc = nn.Linear(self.hidden_dim * num_directions, self.output_dim)

        # activation function
        self.act = nn.Sigmoid()

    def forward(self, txt, txt_len):
        """Run the classifier on one padded batch.

        Args:
            txt: padded token ids, expected as [batch_size, seq_len], where
                seq_len is the longest row of the batch.
            txt_len: unpadded length of every row, [batch_size]. The rows do
                not have to be sorted by length.

        Returns:
            Tensor of shape [batch_size, output_dim] holding the sigmoid
            activations.
        """
        # Step 1: embed the token ids -> [batch_size, seq_len, embedding_dim]
        embed_txt = self.embedding(txt)

        # Step 2.1: pack the embeddings so the LSTM skips the padded steps.
        # The lengths must live on the CPU. enforce_sorted=False lets PyTorch
        # sort the batch internally and restore the original row order on
        # unpacking, so the targets never need to be re-ordered.
        packed_embed = nn.utils.rnn.pack_padded_sequence(
            embed_txt,
            txt_len.cpu(),
            batch_first=self.batch_first,
            enforce_sorted=False,
        )

        # Step 2.2: run the LSTM on the packed input; the final hidden and
        # cell states are not used.
        packed_out, _ = self.lstm(packed_embed)

        # Step 2.3: unpack to a zero-padded tensor
        # -> [batch_size, longest length in batch, 2 * hidden_dim]
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=self.batch_first
        )

        # Step 3: sum the per-step outputs over the sequence axis; padded
        # steps are zero and therefore do not contribute.
        summed_out = lstm_out.sum(dim=1)

        # Step 4: linear layer -> [batch_size, output_dim]
        fc_out = self.fc(summed_out)

        # Step 5: sigmoid activation -> [batch_size, output_dim]
        return self.act(fc_out)