# Calculate embedding vectors for mixed data

In [7]:
!pip install pathvalidate
import json
from functools import reduce
# conda activate instructor
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import trange, tqdm
from time import sleep
import statistics
import pickle
from tqdm.autonotebook import tqdm
from pathvalidate import sanitize_filename

from IPython.display import display
import time

def flat_map(f, xs):
    """Apply ``f`` to every item of ``xs`` and concatenate the results with ``+``.

    Args:
        f: Callable returning a concatenable value (list, str, tuple, ...).
        xs: Iterable of inputs for ``f``.

    Returns:
        The ``+``-concatenation of all ``f(x)`` results, or an empty list when
        ``xs`` yields nothing (a bare ``reduce`` without an initial value would
        raise ``TypeError`` in that case).
    """
    mapped = list(map(f, xs))
    if not mapped:
        return []
    return reduce(lambda a, b: a + b, mapped)

import json
import IPython.core.formatters

class JsonDumpTryingFormatter(
    IPython.core.formatters.PlainTextFormatter
):
    """Plain-text formatter that renders objects as indented JSON when possible.

    Anything ``json.dumps`` cannot serialize falls back to the stock
    ``PlainTextFormatter`` output.
    """

    def __call__(self, obj):
        """Return ``obj`` as 2-space-indented JSON, or the default plain-text form."""
        try:
            return json.dumps(
                obj,
                indent=2,
                default=self._json_default
            )
        except (TypeError, ValueError):
            # TypeError: unsupported value or key type.
            # ValueError: json.dumps detected a circular reference.
            return super().__call__(obj)

    def _json_default(self, obj):
        """Serialize sets as lists; reject every other non-JSON type.

        Raises:
            TypeError: for any object that is not a ``set``, which makes
                ``__call__`` fall back to the plain-text formatter.
        """
        if isinstance(obj, set):
            return list(obj)
        raise TypeError(f"Unsupported type {type(obj)}")

import IPython

# Look up the running shell and its formatter registry once.
_ipy = IPython.get_ipython()
_formatters = _ipy.display_formatter.formatters

# Plain-text output: try JSON first, fall back to the default repr.
_formatters["text/plain"] = JsonDumpTryingFormatter()

# JSON output: pass dicts and lists through unchanged.
_json_formatter = _formatters["application/json"]
_json_formatter.for_type(dict, lambda obj: obj)
# for_type returns the previously registered function; the trailing semicolon
# keeps that return value out of the cell output.
_json_formatter.for_type(list, lambda obj: obj);

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




<function __main__.<lambda>(obj)>

In [None]:
# Load test data from the DBRepo test instance
#!pip install dbrepo
import os
from getpass import getpass

from dbrepo.RestClient import RestClient

# Credentials are read from the environment (with an interactive fallback) so
# they are never stored in the notebook or its outputs.
DBREPO_ENDPOINT = "https://test.dbrepo.tuwien.ac.at"
dbrepo_username = os.environ.get("DBREPO_USERNAME") or input("DBRepo username: ")
dbrepo_password = os.environ.get("DBREPO_PASSWORD") or getpass("DBRepo password: ")
client = RestClient(endpoint=DBREPO_ENDPOINT, username=dbrepo_username, password=dbrepo_password)

table_data = client.get_table_data(
    database_id="e31b2788-e1da-4a4a-9892-d2b5a1216df6",
    table_id="41c10fad-83e7-4df3-a188-eb872e05646a",
    size=1000,
)

# Column name -> column values (presumably table_data is a DataFrame-like
# object; verify against the dbrepo client).
data = dict(table_data)

cyber = []
krimisi = []
innotech = []

# Record key -> source column, in the order of the documented record layout.
RECORD_FIELDS = {
    "CC_filename": "cc_filename",
    "content": "content",
    "BH_category": "bh_category",
    "CC_normalized_url": "cc_normalized_url",
    "is_relevant": "is_relevant",
}

# One dict per table row.
all_data = [
    {key: data[column][i] for key, column in RECORD_FIELDS.items()}
    for i in range(len(data['bh_category']))
]

# Local test data (alternative to the DBRepo download):
# with open("savedata.json") as file:
#     data = json.loads(file.read())
# all_data = data

# Small sample: the first N irrelevant records followed by the first N relevant ones.
SAMPLES_PER_CLASS = 10

irr = [x for x in all_data if not x['is_relevant']][:SAMPLES_PER_CLASS]

rel = [x for x in all_data if x['is_relevant']][:SAMPLES_PER_CLASS]

test_data = irr + rel





### Data format
```json
[
    {
        "BH_category":[],
        "CC_filename":"",
        "CC_normalized_url":"",
        "is_relevant":false,
        "content":""
    }
]
```

In [None]:
# Compute category and content embeddings for the test records and pickle them

def generate_embeddings(model_name, long_passage=False, records=None):
    """Embed the category queries and every record with one model, then pickle the result.

    Args:
        model_name: Name or local path passed to ``SentenceTransformer``; its
            sanitized form is also used as the output file name.
        long_passage: When true, the category queries use the long
            "Represent this sentence for searching relevant passages: ..."
            wording and the output file ends in ``.pickle_long_passage``
            instead of ``.pickle``.
        records: Sequence of dicts with ``content``, ``CC_normalized_url`` and
            ``is_relevant`` keys. Defaults to the notebook-level ``test_data``.

    Returns:
        None. A list with one dict per record is pickled into ``model_data/``.
    """
    import os  # local import: `os` is not imported at the top of the notebook

    if records is None:
        records = test_data

    # NOTE(review): trust_remote_code=True lets the model repository run its
    # own Python code; only use it with models you trust.
    model = SentenceTransformer(model_name,device="cpu",trust_remote_code=True)

    if long_passage:
        queries = ["Represent this sentence for searching relevant passages: Articles or news about crises, military or security.",
                   "Represent this sentence for searching relevant passages: Articles or news about new innovations and technology.",
                   "Represent this sentence for searching relevant passages: Articles or news about cyber security."]
    else:
        queries = ["crises, military or security","new innovations or technology","cyber security"]
    embeds = model.encode(queries)

    # Query order above is krimisi, innotech, cyber; the cache is keyed in the
    # order the categories are stored on each record.
    embedding_cache = {
        "cyber":embeds[2],
        "krimisi":embeds[0],
        "innotech":embeds[1]
    }
    category_names = list(embedding_cache)

    all_embeddings = []
    for rec in tqdm(records, total=len(records), desc="Processing records"):
        content_embedding = model.encode([rec["content"]])[0]
        all_embeddings.append({
            "model":model_name,
            # Every record gets all three categories, independent of its own
            # BH_category; fresh lists per record so entries never share state.
            "categories":list(category_names),
            "categories_embedding":[embedding_cache[name] for name in category_names],
            "url":rec["CC_normalized_url"],
            "content_embedding":content_embedding,
            "is_relevant":rec["is_relevant"]
        })

    # Save for further processing; create the target directory first so the
    # encoding work is not lost to a missing folder.
    output_dir = "model_data"
    os.makedirs(output_dir, exist_ok=True)
    suffix = ".pickle_long_passage" if long_passage else ".pickle"
    output_path = f"{output_dir}/{sanitize_filename(model_name)}{suffix}"

    with open(output_path,"wb") as file:
        pickle.dump(all_embeddings,file)


In [15]:
# Other model candidates, kept for reference:
#   SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2", device="cpu")
#   SentenceTransformer("sentence-transformers/multi-qa-mpnet-base-cos-v1", device="cuda")
#   model_name = "BAAI/bge-large-en-v1.5"

# Locally fine-tuned encoder used for this run.
model_name = "./custom/test_encoder_only_base_bge-large-en-v1.5"

# Short category queries (long_passage disabled).
generate_embeddings(model_name, long_passage=False)

2025-04-28 23:50:33,503 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: ./custom/test_encoder_only_base_bge-large-en-v1.5


Batches: 100%|██████████| 1/1 [00:00<00:00, 11.98it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.66it/s]t/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.68it/s],  2.62it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.68it/s],  2.63it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.71it/s],  2.64it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.66it/s],  2.66it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.73it/s],  2.64it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.75it/s],  2.66it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.66it/s],  2.68it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.73it/s],  2.66it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.62it/s],  2.67it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.69it/s]3,  2.64it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.71it/s]3,  2.65it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.68it/s]3,  2.66it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.72it/s]2,  2.65it/s]
Batches: 10