In [1]:
from core.embeddings import EmbeddingSearcher
import ir_datasets
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader


  from tqdm.autonotebook import tqdm, trange


### Load Variables
#### not: current model all-MiniLM-L6-v2

In [2]:
wikiR = "wikir/en1k/training"
antique = "antique/test/non-offensive"
antique_train = "antique/train/split200-train"

choosenDataSet = antique
dataset = ir_datasets.load(choosenDataSet)

embedding = EmbeddingSearcher()



In [3]:
# Iterate through each document add and normalize 

for index,doc in enumerate(dataset.docs_iter()):
    embedding.add_document(doc_id=doc.doc_id, text=doc.text,)



### Prepare Trainning Dataset

In [4]:
train_examples = []
train_dataset = ir_datasets.load(antique_train)

# Iterate over the dataset and prepare InputExamples
for query in train_dataset.queries_iter():
    query_id = query.query_id
    query_text = query.text
    for qrel in train_dataset.qrels_iter():
        if qrel.query_id == query_id:
            doc = None
            for id, text in embedding.documents:
                if (id == qrel.doc_id):
                    doc = text
                    break

            label = 0
            if (qrel.relevance > 2):
                label = 1

            train_examples.append(InputExample(
                texts=[query_text, doc], label=float(label)))

### Train Model on top of the current model all-MiniLM-L6-v2

In [5]:
# The trained model is now saved to the 'output_path' directory
train_dataloader = DataLoader(
            train_examples, shuffle=True, batch_size=16)

        # Define the loss function
train_loss = losses.CosineSimilarityLoss(embedding.model)

        # Train the model
embedding.model.fit(train_objectives=[(train_dataloader, train_loss)],
                       epochs=1,  # may need to increase this for better performance
                       output_path='./output/trained_model')

 32%|███▏      | 500/1577 [26:15<1:05:37,  3.66s/it]

{'loss': 0.253, 'grad_norm': 1.6056740283966064, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.32}


 63%|██████▎   | 1000/1577 [52:07<23:42,  2.46s/it] 

{'loss': 0.1916, 'grad_norm': 1.083077311515808, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.63}


 95%|█████████▌| 1500/1577 [1:21:19<04:51,  3.79s/it]

{'loss': 0.1625, 'grad_norm': 0.7994086146354675, 'learning_rate': 3e-06, 'epoch': 0.95}


100%|██████████| 1577/1577 [1:25:45<00:00,  3.26s/it]


{'train_runtime': 5145.1173, 'train_samples_per_second': 4.903, 'train_steps_per_second': 0.307, 'train_loss': 0.20018289054430355, 'epoch': 1.0}


                                                                             

### Build Documents Embeddings using the trained model

In [6]:
embedding.build_documents_embeddings()
embedding.save("antique_embed.pkl")