In [1]:
# Set up the OS environment for this notebook session.
import os
# NOTE(review): presumably selects the "dev" profile when the config is loaded — confirm where ENV_NM is read.
os.environ["ENV_NM"] = "dev"
# Relative change of working directory to the parent folder; re-running this cell
# in the same kernel moves up one more level each time.
os.chdir('../')

In [48]:
# import statements
from util.util_lib import *
import util.util_cnst as cnst

In [49]:
# read config file
def get_conf():
    """Load the YAML configuration and return the active environment's section.

    The file named by ``cnst.CONFIG_FL`` is parsed with ``EnvYAML``. Its
    top-level ``env_nm`` entry holds the key of the sub-section to return.

    Returns:
        The configuration section stored under the key given by ``env_nm``.
    """
    full_conf = EnvYAML(cnst.CONFIG_FL)
    active_env = full_conf['env_nm']
    return full_conf[active_env]

In [50]:
def read_train_data(conf):
    """Read the pre-processed training CSV described by ``conf`` into a DataFrame.

    Args:
        conf: Mapping with a ``'data'`` section providing ``'data_fl_path'``
            (directory prefix) and ``'train_preprocess_fl_nm'`` (file name).
            The two strings are concatenated as-is, so the prefix must already
            end with a path separator.

    Returns:
        pandas.DataFrame: The content of the CSV file.
    """
    data_section = conf['data']
    csv_path = data_section['data_fl_path'] + data_section['train_preprocess_fl_nm']

    print(f"data loaded STARTED:- {csv_path}")
    train_frame = pd.read_csv(csv_path)
    print(f"data loaded FINISHED:- {csv_path}")
    print(f"data_df shape:- {train_frame.shape}")

    return train_frame

In [51]:
# Module-level tokenizer shared by every call to yield_tokens.
tokenizer = get_tokenizer('basic_english')


def yield_tokens(data_iter):
    """Yield one tokenized entry at a time from the ``ingredients_processed`` column.

    Each cell of that column is expected to hold the string form of a Python
    literal sequence (it is parsed with ``ast.literal_eval``). Every element
    of the parsed sequence is passed through the module-level ``tokenizer``
    and the result is yielded.

    Args:
        data_iter: DataFrame exposing ``iterrows()`` and an
            ``ingredients_processed`` column.

    Yields:
        Whatever ``tokenizer`` returns for each element of each row's sequence.
    """
    for _, row in data_iter.iterrows():
        # The column stores the sequence as text, so parse it back first.
        entries = ast.literal_eval(row['ingredients_processed'])
        yield from map(tokenizer, entries)

In [52]:
def get_vocab(train_data, max_tokens=20000):
    """Build the vocabulary from the ``ingredients_processed`` column.

    The vocabulary is created with ``build_vocab_from_iterator`` over the
    tokens produced by ``yield_tokens``. ``<UNK>`` and ``<PAD>`` are registered
    as special tokens, and ``<UNK>`` is set as the default index so that
    out-of-vocabulary lookups resolve to it. A short summary of the result is
    printed.

    Args:
        train_data: DataFrame containing an ``ingredients_processed`` column.
        max_tokens: Upper bound on the vocabulary size, forwarded to
            ``build_vocab_from_iterator``. Defaults to 20000, the value that
            was previously hard-coded.

    Returns:
        The vocabulary object with its default index set to ``<UNK>``.
    """
    ingredients_df = train_data[['ingredients_processed']].copy()
    vocab = build_vocab_from_iterator(
        yield_tokens(ingredients_df),
        specials=['<UNK>', '<PAD>'],
        max_tokens=max_tokens)
    vocab.set_default_index(vocab['<UNK>'])

    # Fetch the index-to-token list once; get_itos() builds a new list per call.
    itos = vocab.get_itos()

    print("unk:- ", vocab['<UNK>'])
    print("pad:- ", vocab['<PAD>'])
    for ordinal, position in (("first", 0), ("second", 1), ("third", 2)):
        # Guard the lookup: with no real tokens the vocabulary only holds the
        # two specials, and index 2 would raise IndexError.
        if position < len(itos):
            print(f"The {ordinal} word in vocab is ", itos[position])
    print("The last word in vocab is ", itos[-1])
    print(f"vocab_size:- {len(vocab)}")
    return vocab

In [53]:
def gen_vocab_wt(vocab, conf):
    """Create the embedding weight matrix for ``vocab`` from pretrained vectors.

    Row ``i`` of the result corresponds to token ``vocab.get_itos()[i]``.
    Tokens found in the pretrained vectors get the pretrained embedding.
    Every other token gets a vector drawn with ``torch.rand`` from a generator
    seeded with 42, so repeated runs produce the same matrix.

    Args:
        vocab: Vocabulary providing ``__len__`` and ``get_itos()``.
        conf: Mapping with a ``'vocab'`` section providing
            ``'embed_vector_nm'``, ``'embed_vector_url'`` and ``'embed_size'``.

    Returns:
        torch.Tensor: Matrix of shape ``(len(vocab), embed_size)``.
    """
    vocab_conf = conf['vocab']
    fasttext = torch_text_vocab.Vectors(name=vocab_conf['embed_vector_nm'],
                                        url=vocab_conf['embed_vector_url'])

    embedding_size = vocab_conf['embed_size']
    weight = torch.zeros(len(vocab), embedding_size)

    # Dedicated, seeded generator for tokens missing from the pretrained set.
    # Vectors are drawn one row at a time, in vocabulary order, so the random
    # stream is consumed in a fixed sequence.
    generator = torch.Generator()
    generator.manual_seed(42)

    for row, word in enumerate(vocab.get_itos()):
        pretrained_idx = fasttext.stoi.get(word)
        if pretrained_idx is None:
            weight[row] = torch.rand(embedding_size, generator=generator)
        else:
            weight[row] = torch.Tensor(fasttext.vectors[pretrained_idx])

    return weight

In [54]:
def save_vocab(vocab_obj, conf):
    """Serialize the vocabulary object to the path given in the configuration.

    Args:
        vocab_obj: Object to persist with ``torch.save``.
        conf: Mapping whose ``'vocab'`` section provides ``'vocab_path'``.

    Returns:
        None.
    """
    target_path = conf['vocab']['vocab_path']
    print(f"vocab save STARTED:- {target_path}")
    torch.save(vocab_obj, target_path)
    print(f"vocab save FINISHED:- {target_path}")

In [57]:
def save_vocab_wt(vocab_wt_obj, conf):
    """Serialize the vocabulary weight object to the path given in the configuration.

    Args:
        vocab_wt_obj: Object to persist with ``torch.save``.
        conf: Mapping whose ``'vocab'`` section provides ``'vocab_wt_path'``.

    Returns:
        None.
    """
    target_path = conf['vocab']['vocab_wt_path']
    print(f"vocab weight save STARTED:- {target_path}")
    torch.save(vocab_wt_obj, target_path)
    print(f"vocab weight save FINISHED:- {target_path}")

In [58]:
if __name__ == "__main__":
    # Pipeline driver: config -> training data -> vocabulary -> embedding weights.
    # The intermediate results stay as module-level names.
    try:
        # 1. Environment-specific configuration.
        conf = get_conf()

        # 2. Pre-processed training data.
        train_df = read_train_data(conf)

        # 3. Vocabulary: build it, then write it to disk.
        vocab = get_vocab(train_df)
        save_vocab(vocab, conf)

        # 4. Embedding weights for the vocabulary: build them, then write them to disk.
        vocab_wt = gen_vocab_wt(vocab, conf)
        save_vocab_wt(vocab_wt, conf)

    except Exception:
        # Failures are reported as a printed traceback and are not re-raised.
        print(traceback.format_exc())

data loaded STARTED:- data/train/train_preprocess.csv
data loaded FINISHED:- data/train/train_preprocess.csv
data_df shape:- (31819, 4)
unk:-  0
pad:-  1
The first word in vocab is  <UNK>
The second word in vocab is  <PAD>
The third word in vocab is  pepper
The last word in vocab is  zero
vocab_size:- 2989
vocab save STARTED:- data/vocab/vocab_obj.pth
vocab save FINISHED:- data/vocab/vocab_obj.pth
vocab weight save STARTED:- data/vocab/vocab_wt_obj.pth
vocab weight save FINISHED:- data/vocab/vocab_wt_obj.pth


In [None]:
# https://github.com/pytorch/text/blob/master/examples/legacy_tutorial/migration_tutorial.ipynb
# https://hussainwali.medium.com/using-fasttext-embeddings-in-pytorch-boosting-neural-network-performance-fe017c39c7c3#
# https://fasttext.cc/docs/en/english-vectors.html