### Question search engine

In [1]:
#!pip install pytorch_lightning

In [2]:
#!pip install sentencepiece

In [3]:
#!pip install transformers[sentencepiece]

In [4]:
%pip install --upgrade transformers datasets accelerate deepspeed
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
import datasets

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deepspeed
  Downloading deepspeed-0.11.1.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━

### Load data and model

In [5]:
qqp = datasets.load_dataset('SetFit/qqp')
print('\n')
print("Sample[0]:", qqp['train'][0])
print("Sample[3]:", qqp['train'][3])

Downloading readme:   0%|          | 0.00/313 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/70.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.83M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/76.0M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]



Sample[0]: {'text1': 'How is the life of a math student? Could you describe your own experiences?', 'text2': 'Which level of prepration is enough for the exam jlpt5?', 'label': 0, 'idx': 0, 'label_text': 'not duplicate'}
Sample[3]: {'text1': 'What can one do after MBBS?', 'text2': 'What do i do after my MBBS ?', 'label': 1, 'idx': 3, 'label_text': 'duplicate'}


In [None]:
model_name = "gchhablani/bert-base-cased-finetuned-qqp"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/890 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

### Tokenize the data

In [None]:
MAX_LENGTH = 128
def preprocess_function(examples):
    result = tokenizer(
        examples['text1'], examples['text2'],
        padding='max_length', max_length=MAX_LENGTH, truncation=True
    )
    result['label'] = examples['label']
    return result

qqp_preprocessed = qqp.map(preprocess_function, batched=True)

Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Map:   0%|          | 0/40430 [00:00<?, ? examples/s]

Map:   0%|          | 0/390965 [00:00<?, ? examples/s]

In [None]:
print(repr(qqp_preprocessed['train'][0]['input_ids'])[:100], "...")

[101, 1731, 1110, 1103, 1297, 1104, 170, 12523, 2377, 136, 7426, 1128, 5594, 1240, 1319, 5758, 136,  ...


### Evaluation

We randomly chose a model trained on QQP - but is it any good?

One way to measure this is with validation accuracy - which is what you will implement next.

In [None]:
val_set = qqp_preprocessed['validation']
val_loader = torch.utils.data.DataLoader(
    val_set, batch_size=1, shuffle=False, collate_fn=transformers.default_data_collator
)

In [None]:
for batch in val_loader:
     break  
print("Sample batch:", batch)

with torch.no_grad():
  predicted = model(
      input_ids=batch['input_ids'],
      attention_mask=batch['attention_mask'],
      token_type_ids=batch['token_type_ids']
  )

print('\nPrediction (probs):', torch.softmax(predicted.logits, dim=1).data.numpy())

Sample batch: {'labels': tensor([0]), 'idx': tensor([0]), 'input_ids': tensor([[  101,  2009,  1132,  2170,   118,  4038,  1177,  2712,   136,   102,
          2009,  1132,  1117, 10224,  4724,  1177,  2712,   136,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,   

We will measure the validation accuracy of our model.

In [6]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [7]:
from tqdm.auto import tqdm

def validation_accuracy(model, loader, batch_size=256, device=device):
  with torch.no_grad():
    model = model.to(device)
    summ, countt = 0, 0
    for batch in tqdm(loader):
        target = batch['labels'].numpy()
        countt += len(target)
        predicted = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            token_type_ids=batch['token_type_ids'].to(device)
        )
        probs = torch.softmax(predicted.logits, dim=1)
        predicted = probs.argmax(dim=-1).data.cpu().detach().numpy()
        summ += sum(predicted == target)
    return summ / countt

In [None]:
batch_size = 256
val_loader = torch.utils.data.DataLoader(
        val_set, batch_size=batch_size, shuffle=False, collate_fn=transformers.default_data_collator, num_workers=2
    )

accuracy = validation_accuracy(model, val_loader)

  0%|          | 0/158 [00:00<?, ?it/s]

In [None]:
accuracy

0.9083848627256987

Неплохое качество

In [None]:
assert 0.9 < accuracy < 0.91

In [None]:
del val_loader

### Train the model 

Fine-tune our own model. We use [DeBERTa-v3](https://huggingface.co/microsoft/deberta-v3-base).

In [8]:
from pytorch_lightning import LightningModule
import numpy as np

# Сама модель
class DeBERTav3Tunning(LightningModule):
    def __init__(self, name = "microsoft/deberta-v3-base"):
        super().__init__()
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(name, use_fast=False)
        self.bert = transformers.AutoModel.from_pretrained(name)
        params = 0
        for _ in self.bert.parameters():
            params += 1
        for i, param in enumerate(self.bert.parameters()):   # все слои берта, кроме последних 90, зафризим, иначе долго все будет обучаться
            if i < params - 90:
                param.requires_grad = False

        self.head = nn.Sequential(   # приделаем еще голову для файн тюна
            nn.Linear(768, 128),
            nn.ReLU(),
            nn.Linear(128, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

        self.criterion = nn.BCELoss()

    def forward(self, batch):
        bert_output = self.bert(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            token_type_ids=batch['token_type_ids'].to(device)
        )['last_hidden_state']
        bert_output = torch.max(bert_output, dim=1).values

        return self.head(bert_output).squeeze(1)

    def training_step(self, batch, *args):
        pred = self.forward(batch)
        loss = self.criterion(pred, batch['labels'].to(device) + 0.0)
        self.log("training_loss", loss.item())
        return loss

    def validation_step(self, batch, *args):
      with torch.no_grad():
        pred = self.forward(batch)
        predicted = (pred > 0.5).cpu().detach().numpy()
        target = batch['labels'].cpu().detach().numpy()
        ans = np.mean(predicted.reshape(-1) == target.reshape(-1))
        self.log("validation_acc", ans)
        return ans

    def configure_optimizers(self):
        optimizer = torch.optim.Adam([{"params": self.bert.parameters(), "lr": 3e-5},  # сделаем маленький шаг для берта и большой для головы
                              {"params": self.head.parameters()}], lr=1e-3)
        return optimizer

In [9]:
import warnings
warnings.filterwarnings("ignore")
transformers.logging.set_verbosity_error()

In [10]:
model = DeBERTav3Tunning().to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

In [11]:
# Тут надо уже немного по-другому токенизировать данные
MAX_LENGTH = 128
def preprocess_function(examples):
    result = model.tokenizer(
        examples['text1'], examples['text2'],
        padding='max_length', max_length=MAX_LENGTH, truncation=True
    )
    result['label'] = examples['label']
    return result

qqp_preprocessed = qqp.map(preprocess_function, batched=True)

Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Map:   0%|          | 0/40430 [00:00<?, ? examples/s]

Map:   0%|          | 0/390965 [00:00<?, ? examples/s]

In [12]:
train_loader = torch.utils.data.DataLoader(qqp_preprocessed['train'], batch_size=32, collate_fn=transformers.default_data_collator)
valid_loader = torch.utils.data.DataLoader(qqp_preprocessed['validation'], batch_size=128, collate_fn=transformers.default_data_collator)

In [13]:
from tqdm.auto import tqdm

def validation_accuracy_loader(model, validation_loader, batch_size=256):
  with torch.no_grad():
    summ, cnt = 0, 0
    for batch in tqdm(validation_loader):
        for key in batch.keys():
            batch[key] = batch[key].to(device)
        pred = model(batch)
        predicted = (pred > 0.5).cpu().detach().numpy()
        target = batch['labels'].cpu().detach().numpy()
        summ += sum(predicted == target)
        cnt += len(target)
    return summ / cnt

Посмотрим на качество до обучения, оно должно быть не очень большим

In [None]:
validation_accuracy_loader(model, valid_loader)

  0%|          | 0/316 [00:00<?, ?it/s]

0.6318080633193174

Как и ожидалось, чуть лучше рандома

Попробуем потренить по одной эпохе, чтобы не ждать долго много эпох (так как есть предположение, что хорошего качества можно достигнуть после небольшого числа эпох)

In [None]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

trainer = Trainer(callbacks = [ModelCheckpoint(dirpath="deberta_unfreeze_epoch0/",
                                               save_top_k=3,
                                               monitor="validation_acc")],
                  max_epochs=1,
                  accelerator="auto",
                  gradient_clip_val=0.1)

trainer.fit(model, train_loader, valid_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type           | Params
---------------------------------------------
0 | bert      | DebertaV2Model | 183 M 
1 | head      | Sequential     | 102 K 
2 | criterion | BCELoss        | 0     
---------------------------------------------
40.7 M    Trainable params
143 M     Non-trainable params
183 M     Total params
735.737   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


In [None]:
from google.colab import files

torch.save(model.state_dict(), 'Deberta_fixed_epoch0')
files.download('Deberta_fixed_epoch0')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
model = model.to(device)
validation_accuracy_loader(model, valid_loader)

  0%|          | 0/316 [00:00<?, ?it/s]

0.9011377689834281

Ого, уже после первой эпохи очень хорошее качество вышло, обучим еще одну на всякий случай

In [14]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
# Подгружаем чекпоинт
model = DeBERTav3Tunning().to(device)
model.load_state_dict(torch.load('drive/My Drive/Deberta_fixed_epoch0'))

<All keys matched successfully>

In [None]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

trainer = Trainer(callbacks = [ModelCheckpoint(dirpath="deberta_unfreeze_epoch1/",
                                               save_top_k=3,
                                               monitor="validation_acc")],
                  max_epochs=1,
                  accelerator="auto",
                  gradient_clip_val=0.1)

trainer.fit(model, train_loader, valid_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type           | Params
---------------------------------------------
0 | bert      | DebertaV2Model | 183 M 
1 | head      | Sequential     | 102 K 
2 | criterion | BCELoss        | 0     
---------------------------------------------
40.7 M    Trainable params
143 M     Non-trainable params
183 M     Total params
735.737   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


In [None]:
from google.colab import files

torch.save(model.state_dict(), 'Deberta_fixed_epoch1')
files.download('Deberta_fixed_epoch1')

In [None]:
model = model.to(device)
validation_accuracy_loader(model, valid_loader)

  0%|          | 0/316 [00:00<?, ?it/s]

0.9078819358562124

Стало лучше, но accuracy уже не так быстро растет. В целом, получили качество, сравнимое с той начальной моделью. Думаю, после еще одной эпохи можно ту модель и побить, но уж больно долго ждать :)

### Try the full pipeline

Finally, it is time to use our model to find duplicate questions.
We implement a function that takes a question and finds top-5 potential duplicates in the training set. For now, it is fine if our function is slow, as long as it yields correct results.

In [None]:
# Как выглядят данные
qqp['train'][0]

{'text1': 'How is the life of a math student? Could you describe your own experiences?',
 'text2': 'Which level of prepration is enough for the exam jlpt5?',
 'label': 0,
 'idx': 0,
 'label_text': 'not duplicate'}

In [16]:
def potential_duplicates(model, query, topk = 7):
  with torch.no_grad():
    model.eval()

    def tokenize_with_query(examples):
        result = model.tokenizer(
            examples['text1'], [query for _ in range(len(examples['text1']))],
            padding='max_length', max_length=MAX_LENGTH, truncation=True
        )
        return result

    qqp_preprocessed = qqp.map(tokenize_with_query, batched=True)
    batch_size=256
    train_loader = torch.utils.data.DataLoader(qqp_preprocessed['train'], batch_size=256, collate_fn=transformers.default_data_collator, shuffle=False)
    results = []
    for i, batch in enumerate(tqdm(train_loader)):
        output = model(batch)
        for j in range(output.shape[0]):
            results.append((output[j].item(), i * batch_size + j))
    results = sorted(results)[::-1][:topk]
    answer = []
    for score, pos in results:
        answer.append((qqp['train'][pos]['text1'], 'score: ' + str(score)))
    return answer

In [None]:
potential_duplicates(model, 'Which level of preparation is enough for the exam?')

  0%|          | 0/1422 [00:00<?, ?it/s]

[('How can I become a RTO officer? What is the procedure to become a RTO officer?',
  'score: 0.7032522559165955'),
 ('What is height og SSC CGL preparation what is height of SSC CGL preparation?',
  'score: 0.48323410749435425'),
 ('What is digital marketing? What is good Way Learning for digital Marketing? Any good website.',
  'score: 0.4514493942260742'),
 ("What should be my backup plan for IAS? What's YOUR backup plan for UPSC?",
  'score: 0.43846362829208374'),
 ('*>||<* 1800><251><4919 *>||<* Cisco Router@@Tech Support Phone Number? Cisco Router Tech Support Number Cisco Router?',
  'score: 0.4100422263145447'),
 ("How can I become a UFC fighter? What's the right direction I have to follow to become a real MMA UFC fighter?",
  'score: 0.3530273735523224'),
 ('What should I do before an exam?', 'score: 0.328325092792511')]

In [None]:
potential_duplicates(model, 'How to solve NLP homeworks quickly')

Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Map:   0%|          | 0/40430 [00:00<?, ? examples/s]

Map:   0%|          | 0/390965 [00:00<?, ? examples/s]

  0%|          | 0/1422 [00:00<?, ?it/s]

[('Is it possible to make a ghost appear at school in front of you? How can you make a ghost appear in front of you during class or in school?',
  'score: 0.5781528949737549'),
 ('*>||<* 1800><251><4919 *>||<* Cisco Router@@Tech Support Phone Number? Cisco Router Tech Support Number Cisco Router?',
  'score: 0.5700299143791199'),
 ('How do I protect a business idea from being stolen from VC? How do I protect the idea from being copied?',
  'score: 0.4015324115753174'),
 ('What is pyramid scheme? How to identify pyramid scheme?',
  'score: 0.3838697373867035'),
 ('Why do my dogs fight with each other? I have two lovely pedigreed dogs but sometimes they fight with each other. Why do they fight with each other?',
  'score: 0.3837631344795227'),
 ('Since more and more dark energy appears does this mean that it is infinite or that the potential dark energy that can be created is infinite?',
  'score: 0.36210429668426514'),
 ('Since more and more dark energy appears does this mean that it is

In [None]:
potential_duplicates(model, 'Do you speak english?')

Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Map:   0%|          | 0/40430 [00:00<?, ? examples/s]

Map:   0%|          | 0/390965 [00:00<?, ? examples/s]

  0%|          | 0/1422 [00:00<?, ?it/s]

[('What is 1-800–251–4919?))}} //Belkin Router Number Belkin Router Techincal Support Phone Number?',
  'score: 0.9645280838012695'),
 ('What is 1-800–251–4919?))}} //Belkin Router Number Belkin Router Techincal Support Phone Number?',
  'score: 0.9645280838012695'),
 ('What is 1-800–251–4919?))}} //Belkin Router Number Belkin Router Techincal Support Phone Number?',
  'score: 0.9645280838012695'),
 ('What is 1-800–251–4919?))}} //Belkin Router Number Belkin Router Techincal Support Phone Number?',
  'score: 0.9645280838012695'),
 ('Is there any way to contact Dropbox support dropbox customer service phone number?',
  'score: 0.954854428768158'),
 ('*>||<* 1800><251><4919 *>||<* Cisco Router@@Tech Support Phone Number? Cisco Router Tech Support Number Cisco Router?',
  'score: 0.9421745538711548'),
 ('Are there people who had successful long distance relationships? Can you tell me about your successful experience with long distance relationship?',
  'score: 0.9246458411216736')]

In [None]:
potential_duplicates(model, 'How to get a high mark for this homework?')

Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Map:   0%|          | 0/40430 [00:00<?, ? examples/s]

Map:   0%|          | 0/390965 [00:00<?, ? examples/s]

  0%|          | 0/1422 [00:00<?, ?it/s]

[('Is it possible to make a ghost appear at school in front of you? How can you make a ghost appear in front of you during class or in school?',
  'score: 0.417463481426239'),
 ('How can I create an app similar to Uber/Ola on a small scale? How can I create an app similar to Uber/Ola on a small scale?',
  'score: 0.38508933782577515'),
 ('What is digital marketing? What is good Way Learning for digital Marketing? Any good website.',
  'score: 0.3107476532459259'),
 ('Why do my dogs fight with each other? I have two lovely pedigreed dogs but sometimes they fight with each other. Why do they fight with each other?',
  'score: 0.2550905644893646'),
 ('How do I protect a business idea from being stolen from VC? How do I protect the idea from being copied?',
  'score: 0.22825607657432556'),
 ('What is height og SSC CGL preparation what is height of SSC CGL preparation?',
  'score: 0.22494187951087952'),
 ('*>||<* 1800><251><4919 *>||<* Cisco Router@@Tech Support Phone Number? Cisco Router T

In [17]:
potential_duplicates(model, 'When the Hedgehog started to work in YSDA?')

Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Map:   0%|          | 0/40430 [00:00<?, ? examples/s]

Map:   0%|          | 0/390965 [00:00<?, ? examples/s]

  0%|          | 0/1422 [00:00<?, ?it/s]

[('Why do my dogs fight with each other? I have two lovely pedigreed dogs but sometimes they fight with each other. Why do they fight with each other?',
  'score: 0.7089277505874634'),
 ('How do I protect a business idea from being stolen from VC? How do I protect the idea from being copied?',
  'score: 0.4392576217651367'),
 ('What is pyramid scheme? How to identify pyramid scheme?',
  'score: 0.4331546127796173'),
 ('If universe expands and vacuum energy is created with it (with no limit),is there infinite potential energy/infinite vacuum energy that can be created?',
  'score: 0.4132094085216522'),
 ('If universe expands and vacuum energy is created with it (with no limit),is there infinite potential energy/infinite vacuum energy that can be created?',
  'score: 0.4132094085216522'),
 ('If universe expands and vacuum energy is created with it (with no limit),is there infinite potential energy/infinite vacuum energy that can be created?',
  'score: 0.4132094085216522'),
 ('If univers

Now we will try to find a way to run the function faster than just passing over all questions in a loop. For isntance, we can form a short-list of potential candidates using a cheaper method, and then run our tranformer on that short list. 

Можно воспользоваться предобученными эмбеддингами и быстрым поиском соседей в качестве шорт-листа кандидатов для дальнейшего засовывания в трансформер

In [None]:
!pip install nearpy

Collecting nearpy
  Downloading NearPy-1.0.0-py2.py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.4/64.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitarray (from nearpy)
  Downloading bitarray-2.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (286 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.5/286.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bitarray, nearpy
Successfully installed bitarray-2.8.2 nearpy-1.0.0


In [None]:
import gensim.downloader as api
embeddings = api.load('glove-twitter-100')



In [None]:
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()

# Эмбеддинги для целой фразы
def get_phrase_embedding(phrase):
    tokens = tokenizer.tokenize(phrase)
    answer = np.zeros(100)
    for token in tokens:
        if token in embeddings:
            answer += embeddings[token]
    return answer / (len(tokens) + 1e-5)

In [None]:
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from nearpy.distances import CosineDistance
from nearpy.filters import NearestFilter
import numpy as np

class KNNPhrases:
    def __init__(self, data):
        self.data = data
        self.data_vec = np.array([get_phrase_embedding(phrase) for phrase in data])
        self.rbp = RandomBinaryProjections('default', 10)
        self.engine = Engine(self.data_vec.shape[1], lshashes=[self.rbp], distance=CosineDistance(), vector_filters=[NearestFilter(300)])
        for index in range(self.data_vec.shape[0]):
            self.engine.store_vector(self.data_vec[index], data[index])

    def find_neighbours(self, query, k=10):
        vector = get_phrase_embedding(query)
        neighb = self.engine.neighbours(vector)[:k]
        return [cur[1] for cur in neighb]

In [None]:
knn = KNNPhrases(qqp['train']['text1'])

In [None]:
def potential_duplicates_with_preinference(model, query, topk = 7):
  with torch.no_grad():
    model.eval()
    result_texts = knn.find_neighbours(query, 200) # будем оставлять 200 кандидатов

    def tokenize_with_query(examples):
        result = model.tokenizer(
            examples, query,
            padding='max_length', max_length=MAX_LENGTH, truncation=True
        )
        return result

    qqp_preprocessed = [tokenize_with_query(cur_text) for cur_text in result_texts]
    batch_size=32
    train_loader = torch.utils.data.DataLoader(qqp_preprocessed, batch_size=32, collate_fn=transformers.default_data_collator, shuffle=False)
    results = []
    for i, batch in enumerate(tqdm(train_loader)):
        output = model(batch)
        for j in range(output.shape[0]):
            results.append((output[j].item(), i * batch_size + j))
    results = sorted(results)[::-1][:topk]
    answer = []
    for score, pos in results:
        answer.append((qqp['train'][pos]['text1'], 'score: ' + str(score)))
    return answer

In [None]:
potential_duplicates_with_preinference(model, 'Which level of preparation is enough for the exam?')

  0%|          | 0/7 [00:00<?, ?it/s]

[("How will Donald Trump's presidency affect international students?",
  'score: 0.18433359265327454'),
 ('What is the National nanotechnology initiative?',
  'score: 0.1102752685546875'),
 ('Why do most Bollywood movies contain too many sex scenes? Is it because the Bollywood audience are so fond of sex? Are they always horny?',
  'score: 0.07212664186954498'),
 ('? to be deleted', 'score: 0.05765345320105553'),
 ('What are the best books on cosmology?', 'score: 0.04378563538193703'),
 ('Where can I learn to invest in stocks?', 'score: 0.031871676445007324'),
 ('Can you grow a tree in zero gravity?', 'score: 0.030380714684724808')]

Как приятно не ждать час

In [None]:
potential_duplicates_with_preinference(model, 'How to solve NLP homeworks quickly')

  0%|          | 0/7 [00:00<?, ?it/s]

[('What is the best free VPN?', 'score: 0.015922440215945244'),
 ('Do you need a passport to go to Jamaica from the United States?',
  'score: 0.0129385469481349'),
 ('Is there a correlation between Trump supporters and IQ?',
  'score: 0.0018967223586514592'),
 ('How do you convert 16 into a fraction?', 'score: 0.0014135742094367743'),
 ('How do I install Windows 10 on a specific hard drive?',
  'score: 0.0007538457866758108'),
 ('Is it safe to travel to Italy now?', 'score: 0.0005956431850790977'),
 ('When should you lose your virginity?', 'score: 0.0004803922201972455')]

('When should you lose your virginity?', 'score: 0.0004803922201972455') - почему так смешно)))

In [None]:
potential_duplicates_with_preinference(model, 'Do you speak english?')

  0%|          | 0/7 [00:00<?, ?it/s]

[("How will Donald Trump's presidency affect international students?",
  'score: 0.22766469419002533'),
 ('What is the responsibility of SAP ERP key user?',
  'score: 0.1943996697664261'),
 ('Do you need a passport to go to Jamaica from the United States?',
  'score: 0.19022955000400543'),
 ('What will the people who have Black Money in Swiss Bank do after the demonetisation of ₹1000 & ₹500 note?',
  'score: 0.11323118209838867'),
 ('How is the life of a math student? Could you describe your own experiences?',
  'score: 0.09380163997411728'),
 ("What will be Hillary Clinton's policy towards India if she becomes president?",
  'score: 0.07610737532377243'),
 ('What is the best self help book you have read? Why? How did it change your life?',
  'score: 0.06801753491163254')]

In [None]:
potential_duplicates_with_preinference(model, 'How to get a high mark for this homework?')

  0%|          | 0/7 [00:00<?, ?it/s]

[('I am in the second year of my CSE and I want to crack GATE 2017. How do I start my preparation? What topics should I be more concentrated on?',
  'score: 0.0006082686595618725'),
 ("What's the best way to spend a long weekend?",
  'score: 0.0004553330654744059'),
 ('If Trump were elected, would he pardon Edward Snowden?',
  'score: 0.0003124581999145448'),
 ('How do obtain telegram groups link?', 'score: 0.0002995592076331377'),
 ('Do we need smaller states?', 'score: 9.167198732029647e-05'),
 ('What does Americans think of Vietnamese people?',
  'score: 7.837523298803717e-05'),
 ('What are some neurogaming startups?', 'score: 6.732662586728111e-05')]

In [None]:
potential_duplicates_with_preinference(model, 'When the Hedgehog started to work in YSDA?')

  0%|          | 0/7 [00:00<?, ?it/s]

[('Export clothing from India?', 'score: 0.00295375008136034'),
 ('When/how did you realize were not straight?',
  'score: 0.0028831581585109234'),
 ('If a die is rolled, what is the probability that the number is greater than 4?',
  'score: 0.0014093662612140179'),
 ('How do I lose weight fast?', 'score: 0.0006207233527675271'),
 ('How do I control my horny emotions?', 'score: 0.0005585408653132617'),
 ('Which are the best books to learn C++?', 'score: 0.0004639343824237585'),
 ('How do you convert 16 into a fraction?', 'score: 0.00033959400025196373')]