# Environment setup

In [1]:
%%capture
!pip install transformers[sentencepiece]
!pip install huggingface_hub
!pip install accelerator
!pip install datasets
!pip install peft

# Init

In [13]:
from transformers import AutoTokenizer,AutoModel

# Hub checkpoint shared by the tokenizer and the base encoder.
checkpoint='intfloat/multilingual-e5-small'

tokenizer=AutoTokenizer.from_pretrained(checkpoint)

# Base model; the LoRA adapter is attached to it in the training section.
pretrained=AutoModel.from_pretrained(checkpoint,low_cpu_mem_usage=True)

In [14]:
# Sanity check: display how the tokenizer splits a short Vietnamese sentence.
# NOTE(review): the numbers in the trailing comment were left by the author and
# are presumably the vocabulary ids of the three tokens — confirm.
tokenizer.tokenize('tôi đi học')
# 2259, 7813, 352

['▁tôi', '▁đi', '▁học']

In [15]:
# Sanity check: encode one sentence pair and print the shape of the pooled output.
tmp=tokenizer(
    'Học sinh đang học',
    'Cậu ấy rất thích học hóa',
    return_tensors='pt',
)
output=pretrained(**tmp)
print(output.pooler_output.shape)

torch.Size([1, 384])


# Tokenize dataset

In [16]:
# from datasets import load_dataset
# from transformers import DataCollatorWithPadding

# raw_dataset = load_dataset("VictorJuiz/Keyword_Doc_intfloat_multilingual_e5")

In [17]:
# %%time
# Reference only (commented out, not executed): tokenization recipe that maps the
# raw dataset's 'query' and 'document' columns into paired features.
# For every tokenizer output key, each row stores [query_values, document_values].
# NOTE(review): presumably this is how the pre-tokenized dataset loaded in the
# "Prepare" section was produced — confirm against the hub repository.
# from transformers import DataCollatorWithPadding

# max_length=256
# def tokenize_function(examples):
#     query=tokenizer(examples['query'], padding='longest', truncation=True, max_length=max_length)
#     document=tokenizer(examples['document'], padding='longest', truncation=True, max_length=max_length)
#     merge={}
#     merge['label']=examples['label']
#     for key in query.keys():
#         merge[key]=[]
#         for i in range(len(query[key])):
#             merge[key].append([query[key][i],document[key][i]])

#     return merge

# tokenized_dataset=raw_dataset.map(tokenize_function,
#                                   batched=True,
#                                   batch_size=1024,
#                                   remove_columns=raw_dataset['train'].column_names)

# # dataset=tokenized_dataset['train'].train_test_split(train_size=0.95,seed=42)
# # dataset['validation']=dataset.pop('test')
# # tokenized_dataset.save_to_disk('.')

# data_collator=DataCollatorWithPadding(tokenizer=tokenizer,return_tensors="pt")

In [18]:
# from huggingface_hub import notebook_login
# # Do not paste an access token into the notebook; notebook_login() prompts for it.
# # The token that was previously written on this line must be treated as leaked and revoked.
# notebook_login()

In [19]:
# tokenized_dataset.push_to_hub("Tokenized_Keyword_Doc_intfloat_multilingual")

# Prepare

In [20]:
from datasets import load_dataset
from transformers import DataCollatorWithPadding

# Already-tokenized dataset fetched from the Hugging Face Hub.
dataset = load_dataset("VictorJuiz/Tokenized_Keyword_Doc_intfloat_multilingual_e5")

# Collator that pads a batch and returns PyTorch tensors; used by get_dataloader.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="pt",
)

# Display the splits and their features.
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 820064
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 13530
    })
})

In [21]:
import torch
from torch.utils.data import DataLoader

def get_dataloader(batch_size,device):
    """Build the training and validation ``DataLoader`` objects.

    Reads the notebook-level ``dataset`` (splits ``'train'`` and
    ``'validation'``) and ``data_collator``. For every feature other than
    ``'label'``, a row is indexed as ``row[key][0]`` for the query side and
    ``row[key][1]`` for the document side.

    Args:
        batch_size: Number of query/document pairs per batch.
        device: Device the collated tensors are moved to.

    Returns:
        Tuple ``(train_dataloader, val_dataloader)``. Each batch is a dict with
        keys ``'query'`` and ``'document'`` (dicts of collated tensors) and
        ``'label'`` (a float tensor with one entry per pair).
    """
    def process_batch(batch):
        """Collate a list of rows into query tensors, document tensors and labels."""
        feature_keys=[key for key in batch[0].keys() if key!='label']

        # Index 0 of every feature is the query, index 1 the document.
        query={key:[sample[key][0] for sample in batch] for key in feature_keys}
        document={key:[sample[key][1] for sample in batch] for key in feature_keys}

        # The two sides go through the collator independently.
        query={k:v.to(device) for k,v in data_collator(query).items()}
        document={k:v.to(device) for k,v in data_collator(document).items()}
        label=[float(sample['label']) for sample in batch]

        return {'query': query,
                'document': document,
                'label': torch.tensor(label).to(device)}

    train_dataloader = DataLoader(
        dataset['train'],
        shuffle=True,
        collate_fn=process_batch,
        drop_last=True,
        batch_size=batch_size)

    # Keep the final partial batch: dropping it would silently exclude
    # validation rows from the reported validation loss.
    val_dataloader = DataLoader(
        dataset['validation'],
        collate_fn=process_batch,
        drop_last=False,
        batch_size=batch_size)

    return train_dataloader,val_dataloader

# train_dataloader,val_dataloader=get_dataloader(128,'cuda')
# for batch in train_dataloader:
#     print(batch)
#     break

# Single device training

In [22]:
import torch
from peft import LoraConfig,get_peft_model

# LoRA adapter settings: rank 8, alpha 8, rank-stabilised scaling, dropout 0.1,
# applied to the modules listed in target_modules.
config=LoraConfig(r=8,
                  lora_alpha=8,
                  use_rslora=True,
                  bias='none',
                  lora_dropout=0.1,
                  target_modules=['word_embeddings','query','key','value','dense'])

model=get_peft_model(pretrained,config)
# NOTE(review): the whole model, adapter weights included, is cast to float16 and
# then trained with plain SGD — confirm that half-precision training is intended.
model = model.to(torch.float16)
# model=torch.compile(model) causes an error: it conflicts with accelerate
# print_trainable_parameters() prints the summary itself and returns None,
# so wrapping it in print() only added a stray "None" line to the output.
model.print_trainable_parameters()

trainable params: 2,673,064 || all params: 120,326,824 || trainable%: 2.2215
None


In [23]:
import gc
from tqdm.auto import tqdm
from accelerate import Accelerator


num_epochs=100
batch_size=96
# Gradients are accumulated over 2 batches before each optimizer update.
accelerator = Accelerator(gradient_accumulation_steps=2)

train_dataloader,val_dataloader=get_dataloader(batch_size, accelerator.device)
# NOTE(review): CosineEmbeddingLoss expects targets of 1 (similar) or -1 (dissimilar)
# — confirm the dataset's 'label' column uses that encoding.
loss_fn = torch.nn.CosineEmbeddingLoss(margin=0.5)
optimizer = torch.optim.SGD(model.parameters(), lr=5e-5)
# scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=1e-6, max_lr=1e-4,step_size_up=300,mode="triangular2")
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=150, gamma=0.85)

model,optimizer,scheduler,train_dataloader,val_dataloader=accelerator.prepare(model,optimizer,scheduler,train_dataloader,val_dataloader)

gc.collect()
for epoch in tqdm(range(num_epochs)):
    # ---- training pass ----
    model.train()
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch in progress_bar:
        with accelerator.accumulate(model):
            # Query and document are embedded by the same model.
            query_embedding=model(**batch['query']).pooler_output
            document_embedding=model(**batch['document']).pooler_output
            loss = loss_fn(query_embedding,document_embedding, batch['label'])

            accelerator.backward(loss)
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

            progress_bar.set_postfix(train_loss=loss.item())

    # ---- validation pass ----
    model.eval()
    val_loss = 0.0
    progress_bar = tqdm(val_dataloader,desc=f"Epoch {epoch+1}/{num_epochs}")

    with torch.no_grad():
        # Count batches explicitly: the running mean is the summed loss divided by
        # the number of batches seen so far, not by anything involving batch_size.
        for step,batch in enumerate(progress_bar,start=1):
            query_embedding=model(**batch['query']).pooler_output
            document_embedding=model(**batch['document']).pooler_output

            loss = loss_fn(query_embedding,document_embedding, batch['label'])
            val_loss += loss.item()
            progress_bar.set_postfix(val_loss=val_loss/step)

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 1/100:   0%|          | 0/8542 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Push to hub

In [None]:
from huggingface_hub import notebook_login

# Authenticate interactively; never store an access token in the notebook.
# The token that was previously embedded in this cell must be treated as
# leaked and revoked.
notebook_login()

In [None]:
# Upload `model` to the Hugging Face Hub repository named below.
# NOTE(review): the repository name ends in "smal" — possibly a typo for "small";
# confirm before renaming, since an existing repo may already use this name.
model.push_to_hub("lora_fp16_intfloat_multilingual-e5-smal")

# Inference

In [None]:
import torch
import torch.nn.functional as F

query='query: Attention Is All You Need'
document='document: The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 Englishto-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.'

# `device` was never defined anywhere in the notebook; take it from the model
# so the inputs are placed on the same device as the weights.
device=next(model.parameters()).device

# Switch off dropout and gradient tracking: if training was interrupted the
# model may still be in train mode, which would make the score non-deterministic.
model.eval()
with torch.no_grad():
    # Both texts are encoded the same way, passing the full tokenizer output.
    tokenized_query={k:v.to(device) for k,v in tokenizer(query,return_tensors='pt').items()}
    query_embedding=model(**tokenized_query).pooler_output

    tokenized_document={k:v.to(device) for k,v in tokenizer(document,return_tensors='pt').items()}
    document_embedding=model(**tokenized_document).pooler_output

print(F.cosine_similarity(query_embedding,document_embedding))