# **1. Dependencies**


In [1]:
!pip install sentence-transformers scikit-learn numpy bitsandbytes-cuda111 datasets
!pip install torch==1.10.1+cu111 torchvision==0.11.2+cu111 torchaudio==0.10.1 -f https://download.pytorch.org/whl/cu111/torch_stable.html

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in links: https://download.pytorch.org/whl/cu111/torch_stable.html


In [2]:
import json
from sklearn import preprocessing as pre
import numpy as np
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, models, InputExample
from sentence_transformers import losses, evaluation
from datasets import load_dataset
import random



# **2. Dataset**

## **2.1. sickr-sts**:

- link: https://huggingface.co/datasets/mteb/sickr-sts/tree/main


In [3]:
# Load the sickr-sts test file: JSON Lines, one example (dict) per line.
data_path = "test.jsonl"
with open(data_path, 'r', encoding='utf-8') as f:
    # Stream the file line by line and skip blank lines (e.g. a trailing empty
    # line at the end of the file), which json.loads would reject.
    data = [json.loads(line) for line in f if line.strip()]

# Show 10 first examples in data file
print("sickr-sts dataset examples:")
for x in data[:10]:
    print(x)
print()

# Rescale the labels to [0, 1].
# MinMaxScaler maps the smallest *observed* score to 0 and the largest to 1,
# so the source range is whatever the data contains, not a fixed [0, 5].
raw_scores = np.array([x["score"] for x in data])
print(f"Original distribution of label: [{raw_scores.min()}, {raw_scores.max()}]")
norm_scores = raw_scores.reshape(-1, 1)  # the scaler expects a 2-D (n_samples, 1) column
norm_scores = pre.MinMaxScaler().fit_transform(norm_scores)
norm_scores = norm_scores.astype(np.float32)
# Use the array's own min/max so scalars are printed; the builtin min()/max()
# on this 2-D array would print one-element arrays such as `[0.]`.
print(f"Normalize distribution of label: [{norm_scores.min()}, {norm_scores.max()}]")

# Attach each normalised score to its example under the "norm_score" key.
for example, norm_score in zip(data, norm_scores[:, 0]):
    example["norm_score"] = norm_score

# Column-wise view of the test set: parallel lists of first sentences,
# second sentences and normalised scores.
test_examples = {
    "sentences1": [example["sentence1"] for example in data],
    "sentences2": [example["sentence2"] for example in data],
    "scores": [example["norm_score"] for example in data],
}

sickr-sts dataset examples:
{'sentence1': 'A group of kids is playing in a yard and an old man is standing in the background', 'sentence2': 'A group of boys in a yard is playing and a man is standing in the background', 'score': 4.5}
{'sentence1': 'A group of children is playing in the house and there is no man standing in the background', 'sentence2': 'A group of kids is playing in a yard and an old man is standing in the background', 'score': 3.2}
{'sentence1': 'The young boys are playing outdoors and the man is smiling nearby', 'sentence2': 'The kids are playing outdoors near a man with a smile', 'score': 4.7}
{'sentence1': 'The kids are playing outdoors near a man with a smile', 'sentence2': 'A group of kids is playing in a yard and an old man is standing in the background', 'score': 3.4}
{'sentence1': 'The young boys are playing outdoors and the man is smiling nearby', 'sentence2': 'A group of kids is playing in a yard and an old man is standing in the background', 'score': 3.7}
{

## **2.2. QQP_triplets**:

- link: https://huggingface.co/datasets/embedding-data/QQP_triplets


In [4]:
# Quick look at the QQP_triplets dataset: its size, the types of a record,
# and the contents of the first training example.
dataset_id = "QQP_triplets"
qqp_triplets_dataset = load_dataset(dataset_id)

train_split = qqp_triplets_dataset['train']
first_example = train_split[0]

print(f"- The {dataset_id} dataset has {train_split.num_rows} examples.")
print(f"- Each example is a {type(first_example)} with a {type(first_example['set'])} as value.")
print("- Examples look like this:")
for field, content in first_example["set"].items():
    heading = field.upper()
    if type(content) is list:
        # List-valued fields are printed as a heading followed by one bullet per item.
        print(f"   {heading}")
        for item in content:
            print(f'\t- {item}')
    else:
        # Scalar fields fit on a single "NAME: value" line.
        print(f"   {heading}: {content}")

- The QQP_triplets dataset has 101762 examples.
- Each example is a <class 'dict'> with a <class 'dict'> as value.
- Examples look like this:
   QUERY: Why in India do we not have one on one political debate as in USA?
   POS
	- Why cant we have a public debate between politicians in India like the one in US?
   NEG
	- Can people on Quora stop India Pakistan debate? We are sick and tired seeing this everyday in bulk?
	- Why do politicians, instead of having a decent debate on issues going in and around the world, end up fighting always?
	- Can educated politicians make a difference in India?
	- What are some unusual aspects about politics and government in India?
	- What is debate?
	- Why does civic public communication and discourse seem so hollow in modern India?
	- What is a Parliamentary debate?
	- Why do we always have two candidates at the U.S. presidential debate. yet the ballot has about 7 candidates? Isn't that a misrepresentation of democracy?
	- Why is civic public communica

In [5]:
# Preprocess data
# `train_examples` and `n_examples` are only initialised here; nothing else in this cell reads them.
train_examples = []
train_data = qqp_triplets_dataset['train']['set']
n_examples = qqp_triplets_dataset['train'].num_rows

# Flatten every record into [query, positive, negative] triplets: one triplet
# for each (positive, negative) combination of that record.
data_set = [
    [example['query'], pos_sen, neg_sen]
    for example in train_data
    for pos_sen in example['pos']
    for neg_sen in example['neg']
]

In [6]:
# Create train dataloader & evaluation data
SPLIT_SEED = 42  # fixed seed so the train/eval split is the same on every run
r_train = 0.8    # fraction of the triplets used for training

# Shuffle a copy with a dedicated, seeded generator: `data_set` is left
# untouched, so re-running this cell always produces the same split.
shuffled_set = list(data_set)
random.Random(SPLIT_SEED).shuffle(shuffled_set)

num_train = int(len(shuffled_set) * r_train)
train_samples = shuffled_set[:num_train]
eval_samples = shuffled_set[num_train:]
# NOTE(review): the split is made over individual triplets, so triplets that
# share the same query can end up on both sides of the split. Consider
# splitting by query if the evaluation must be free of such overlap.

train_examples = [InputExample(texts=x) for x in train_samples]
# Only the split above is seeded; the DataLoader still reshuffles batches on its own.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=256)

# Column-wise view of the held-out triplets: parallel lists of anchors,
# positives and negatives.
eval_examples = {
    "anchors": [sample[0] for sample in eval_samples],
    "positives": [sample[1] for sample in eval_samples],
    "negatives": [sample[2] for sample in eval_samples],
}


# **3. Model**


In [4]:
# Assemble the sentence encoder from two modules: a transformer backbone
# followed by a pooling layer sized to the backbone's word-embedding dimension.
# NOTE(review): the model id carries no hub namespace — presumably it resolves
# to a local directory or is expanded by the library; confirm.
word_embedding_model = models.Transformer('all-MiniLM-L6-v2', max_seq_length=256)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(word_embedding_dimension=embedding_dim)
encoder_modules = [word_embedding_model, pooling_model]
model = SentenceTransformer(modules=encoder_modules)

# **4. Training**

In [8]:
# Triplet objective for the model, plus an evaluator built from the held-out
# anchor / positive / negative lists.
train_loss = losses.TripletLoss(model=model)
evaluator = evaluation.TripletEvaluator(
    anchors=eval_examples["anchors"],
    positives=eval_examples["positives"],
    negatives=eval_examples["negatives"],
)

In [9]:
# Fine-tune the model on the triplet dataloader with the triplet loss.
# The evaluator and output path are handed to fit() together with an
# evaluation interval of 500 steps and 100 warm-up steps.
train_objective = (train_dataloader, train_loss)
model.fit(
    train_objectives=[train_objective],
    epochs=2,
    warmup_steps=100,
    evaluator=evaluator,
    evaluation_steps=500,
    output_path="qqp_triplets_model",
)

Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8726 [00:00<?, ?it/s]

KeyboardInterrupt: 

# **5. Testing**

In [5]:
# Build the evaluator for the sickr-sts test pairs (sentences1, sentences2, scores).
# No loss is defined here: evaluation does not need one, and rebinding
# `train_loss` in this cell would clobber the TripletLoss used by the training cell.
evaluator = evaluation.EmbeddingSimilarityEvaluator(**test_examples)


In [11]:
# Run the evaluator on the current in-memory `model` and keep the returned score.
# `output_path` is passed through to the evaluator — presumably the directory
# where it writes its results file; confirm against the installed
# sentence-transformers version (and that the directory exists).
result = evaluator(model, output_path="qqp_triplets_model")

In [12]:
# Display the score returned by the evaluator.
result

0.7758332298648364