1. Install transformers, datasets, annoy(for collecting embeddings).
2. Mount Google Drive

In [1]:
!pip install --no-cache-dir transformers sentencepiece
!pip install datasets
!pip install annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import os
from google.colab import drive
drive.mount('/content/drive')

dir = 'drive/MyDrive'

print(os.listdir(dir))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['Colab Notebooks', 'Примечание к форме №1.doc', 'Правила заполнения Формы №1.doc', 'Бланк заявления.doc', 'ML', 'Новый документ.gdoc', 'Kufar', 'saves']


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
import datasets

1. Load Quora Question Pairs dataset
2. Load deberta-v3 tokenizer
3. Tokenize dataset and create data loader for PyTorch

In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = datasets.load_dataset('SetFit/qqp')
checkpoint = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast = False)



  0%|          | 0/3 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


----------------------Train code starts here----------------------

In [6]:
def tokenize_dataset(raw_dataset):

    def tokenize_function(example):
        return tokenizer(example["text1"], example["text2"], truncation=True)


    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

    tokenized_datasets = tokenized_datasets.remove_columns(["text1", "text2", "idx", 'label_text'])
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
    tokenized_datasets.set_format("torch")
    tokenized_datasets["train"].column_names

    return tokenized_datasets

In [7]:
from torch.utils.data import DataLoader


def make_dataloader(dataset):
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_dataloader = DataLoader(
        dataset, shuffle=True, batch_size=8, collate_fn=data_collator
    )

    return train_dataloader 

In [None]:
tokenized_datasets = tokenize_dataset(raw_datasets)
train_dataloader = make_dataloader(tokenized_datasets['train'])

In [15]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67])}

1. Load fine-tuned deberta-v3 model. Link for model.
2. https://drive.google.com/drive/folders/1tWaXKEAmE0lb3-_SxhbsVrUw-nm7ZGc4?usp=sharing
3. Use Gpu if torch cuda is available.

In [8]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(dir + '/saves/models', num_labels=2)
#model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [9]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [None]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

1. Train model

In [None]:
from transformers import get_scheduler
from tqdm.auto import tqdm


def train_model(num_epochs=3):
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )
    print(num_training_steps)

    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)


train_model(num_epochs=3)

In [20]:
batch = {k: v.to(device) for k, v in batch.items()}
outputs = model(**batch)

for pred in torch.softmax(outputs.logits.cpu().detach(), dim=1).data.numpy():
  print("%.2f" % (pred[0] * 100), '% ', "%.2f" % (pred[1] * 100), "% ", sep='')

98.85% 1.15% 
99.99% 0.01% 
99.99% 0.01% 
98.51% 1.49% 
98.82% 1.18% 
97.43% 2.57% 
59.54% 40.46% 
3.48% 96.52% 


----------------find simmilar question with brute forse code----------------

1 Create dataset with pairs: question from dataset and input question

2 Model predicts all pairs and finds most simmilar

3 To find top 5 most simmilar pairs, it needs > 1 hour.

In [21]:
from copy import deepcopy


def change(dataset, question):

    def change_dataset_1(example):
        example['text1'] = question
        return example
    
    def change_dataset_2(example):
        example['text2'] = question
        return example
    
    dataset_1 = dataset.map(change_dataset_1)
    dataset_2 = dataset.map(change_dataset_2)

    changed_dataset = datasets.DatasetDict({
        'train': datasets.concatenate_datasets([dataset_1['train'], dataset_2['train']]),
        'test': datasets.concatenate_datasets([dataset_1['test'], dataset_2['test']]),
        'validation': datasets.concatenate_datasets([dataset_1['validation'], dataset_2['validation']])
    })

    return changed_dataset

In [22]:
def find_top_questions(question):
    ds = change(raw_datasets, question)
    tokenized_ds = tokenize_dataset(ds)
    train_dataloader = make_dataloader(tokenized_ds['train'])

    ans = []

    model.eval()
    print(len(train_dataloader))
    for idx, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)

        for i, pred in enumerate(torch.softmax(outputs.logits.cpu().detach(), dim=1).data.numpy()):
            ans.append(pred[1])

    output = []

    for val, idx in sorted([(val, idx) for idx, val in enumerate(ans)], reverse=True)[:5]:
        output.append(ds['train'][idx]['text2'] if idx < 363846 else ds['train'][idx]['text1'])

    return output

In [None]:
question = "Is Java good for machine learning?" 

print(find_top_questions(question))

----------------find simmilar question with embeddings code----------------

1. Create dataset, tokenize it.
2. Make sequence embeddings using dataset.
3. Use Annoy library to create data structure to find top_k most simmilar sequences using embeddings.
4. Give model these top_k sequences and predict top 5.
5. Much faster than brute force

In [10]:
'''Create embeddings from sentences'''

from transformers import AutoTokenizer, AutoModel
import torch


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Load model from HuggingFace Hub
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer_emb = AutoTokenizer.from_pretrained(MODEL_NAME)
model_emb = AutoModel.from_pretrained(MODEL_NAME)


In [11]:
def get_sentences(dataset, features):
    sentences = {}
    count_id = 0
    for idx in range(len(dataset)):
        for feature in features:
            if dataset[idx][feature] not in sentences:
                sentences[dataset[idx][feature]] = count_id
            count_id += 1
    
    return sentences

In [12]:
def tokenize_sententences(sentences_to_emb, batch_size=8):
    def token(row):
        row = tokenizer_emb(row, padding='max_length', max_length=42, \
                            truncation=True, return_tensors='pt')
        return row

    sentences_to_emb_tokenized = [token(row) for row in sentences_to_emb]
    
    return sentences_to_emb_tokenized

In [13]:
def batch(sentences_to_emb_tokenized, batch_size=8):
    tokenized = {}
    names = ['input_ids', 'token_type_ids', 'attention_mask']

    for name in names:
        tokenized[name] = [torch.tensor([sentences_to_emb_tokenized[i+j] \
                          [name].tolist()[0] for j in range(batch_size)]) \
                          for i in range(0, len(sentences_to_emb_tokenized) \
                          - batch_size + 1, batch_size)]

    batched = [{
            name: tokenized[name][i] for name in names
        } for i in range(len(tokenized[names[0]]))]

    return batched

In [14]:
def get_sentence_embed(encoded_input):
    with torch.no_grad():
        model_output = model_emb(**encoded_input)
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    return sentence_embeddings

In [None]:
def make_embeddings(batched_tokenized):
    num_batches = 10 ** 2
    # num_batches = len(batched_tokenized)

    embeddings = []
    for i in range(num_batches):
        embeddings += get_sentence_embed(batched_tokenized[i]).tolist()

    return embeddings

def load_embeddings():
    # embeddings can be found here:
    # https://drive.google.com/drive/folders/1tWaXKEAmE0lb3-_SxhbsVrUw-nm7ZGc4?usp=sharing
    embeddings = torch.load(dir + '/saves/embeddings/embeddings.pt')
    
    return embeddings

In [16]:
from annoy import AnnoyIndex
import random


def get_top_k(embeddings, n_trees=100):
    emb_len = len(embeddings[0])
    t = AnnoyIndex(emb_len, 'angular')
    for i in range(len(embeddings)):
        t.add_item(i, embeddings[i])

    t.build(n_trees, n_jobs=-1)

    return t

In [17]:
def prepare():
    sentences = get_sentences(raw_datasets['train'], ['text1', 'text2'])
    sentences_to_emb = [txt for txt in sentences]
    sentences_ids = [sentences[txt] for txt in sentences]

    sentences_to_emb_tokenized = tokenize_sententences(sentences_to_emb)

    batched_tokenized = batch(sentences_to_emb_tokenized)

    # embeddings = make_embeddings(batched_tokenized)
    embeddings = load_embeddings()

    annoy_tree = get_top_k(embeddings, n_trees=100)

    return annoy_tree, sentences_to_emb, sentences_ids

In [18]:
annoy_tree, sentences_to_emb, sentences_ids = prepare()

In [23]:
def find_top_k_emb(question):

    def tokenize_seq(sequence):
        seq = [sequence]
        ans = tokenize_sententences(seq)
        ans = get_sentence_embed(ans[0])

        return ans[0].tolist()

    tok_question = tokenize_seq(question)

    top_k = annoy_tree.get_nns_by_vector(tok_question, 1000)

    potential_texts = [sentences_to_emb[i] for i in top_k][:100]

    data_loader = [tokenizer(text, question, truncation=True) for text in potential_texts]
    data_loader = make_dataloader(data_loader)

    ans = []

    model.eval()
    for idx, batch in enumerate(data_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)

        if (idx + 1) % 1000 == 0:
            print(idx)

        for i, pred in enumerate(torch.softmax(outputs.logits.cpu().detach(), dim=1).data.numpy()):
            ans.append(pred[1])

    ans = sorted([(ans[i], i) for i in range(len(ans))], reverse=True)[:5]

    output = []
    for i in range(len(ans)):
        output.append(sentences_to_emb[top_k[ans[i][1]]])

    return output

This block finds top 5 similar questions to inputed one.

In [24]:
question = "How can i use R for machine learning?"

top_k = find_top_k_emb(question)
top_k

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


['How can Python be used in machine learning?',
 'What are some computer vision libraries for R?',
 'I studied R and practicing on Dataset from UCI ML Repository? But need a platform where I can find solution (R code) along with dataset.Any Website?',
 'How should I go about learning Machine Learning?',
 'How can one use Python to replace R?']

In [25]:
from time import time

start = time()

for i in range(100):
    find_top_k_emb(sentences_to_emb[i])

print((time() - start) / 100, '- seconds to compute 1 question. Model runs on GPU')

0.4136928868293762 - seconds to compute 1 question. Model runs on GPU
