In [2]:
import os

import numpy as np
from collections import defaultdict
from scipy.sparse import csr_matrix
from pymilvus import MilvusClient
from langchain_core.messages import AIMessage, HumanMessage
from tqdm import tqdm

In [None]:
from transformers import AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import torch

# Milvus client backed by the local file "milvus_demo.db".
mc = MilvusClient("milvus_demo.db")

# Sentence-embedding model wrapped by embed_texts().
model_name = "BAAI/bge-large-en-v1.5"

# Keep using GPU 3 when it exists, but do not assume it does:
# torch.cuda.is_available() is already True on a single-GPU machine, where
# "cuda:3" would be an invalid device ordinal. Fall back to GPU 0, then CPU.
PREFERRED_GPU_INDEX = 3
if torch.cuda.is_available():
    gpu_index = (
        PREFERRED_GPU_INDEX
        if torch.cuda.device_count() > PREFERRED_GPU_INDEX
        else 0
    )
    DEVICE = torch.device(f"cuda:{gpu_index}")
else:
    DEVICE = torch.device("cpu")
encoder = SentenceTransformer(model_name, device=DEVICE)

# Causal language model; device placement is delegated to device_map="auto".
base_model = "meta-llama/Llama-3.2-1B-Instruct"
llm = AutoModelForCausalLM.from_pretrained(base_model, device_map="auto")

def embed_texts(batch_texts):
    """Encode text with the module-level ``encoder`` and return float32 NumPy data.

    Args:
        batch_texts: Input forwarded unchanged to ``encoder.encode``.

    Returns:
        The normalized embeddings, moved to the CPU and converted to a
        NumPy array of dtype ``np.float32``.
    """
    encode_options = {
        "convert_to_tensor": True,
        "normalize_embeddings": True,
        "show_progress_bar": False,
    }
    tensor_embeddings = encoder.encode(batch_texts, **encode_options)
    # Bring the tensor back to host memory before converting it to NumPy.
    return tensor_embeddings.cpu().numpy().astype(np.float32)


  from pkg_resources import DistributionNotFound, get_distribution
Some parameters are on the meta device because they were offloaded to the disk.


In [None]:
import ast
import pandas as pd

def convert_row_to_triplets(row):
    """Turn one job-posting record into a passage plus knowledge-graph triplets.

    Args:
        row: Mapping-like record (for example a pandas Series or a dict) with
            the keys ``job_title``, ``category`` and ``job_description``. The
            skills are read from ``job_skill_set`` as a stringified Python
            list; the misspelled key ``job_sill_set`` that this function used
            to read is still accepted as a fallback.

    Returns:
        dict: ``{"passage": <job description>, "triplets": [...]}`` where each
        triplet is a ``[subject, predicate, object]`` list. Every skill yields
        a "requires skill" and a "provides expertise in" triplet, followed by
        one "is categorized under" and one "is described as" triplet.

    Raises:
        KeyError: If ``job_title``, ``category`` or ``job_description`` is
            missing from ``row``.
    """
    job_title = row["job_title"]
    category = row["category"]
    description = row["job_description"]

    # Look the skills up explicitly. Previously a bare ``except`` hid the
    # KeyError caused by the misspelled column name, so skills were silently
    # dropped whenever the data used the correctly spelled column.
    raw_skills = None
    for column in ("job_skill_set", "job_sill_set"):
        try:
            raw_skills = row[column]
        except KeyError:
            continue
        break

    # Only swallow the errors literal_eval is documented to raise for bad
    # input (this also covers a missing/NaN value); anything else propagates.
    try:
        skills = ast.literal_eval(raw_skills)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        skills = []

    # A literal that is not a collection (e.g. a bare string) must not be
    # iterated element by element / character by character.
    if not isinstance(skills, (list, tuple, set)):
        skills = []

    triplets = []
    for skill in skills:
        triplets.append([job_title, "requires skill", skill])
        triplets.append([job_title, "provides expertise in", skill])

    triplets.append([job_title, "is categorized under", category])

    # Keep only the first 200 characters of the description in the graph.
    triplets.append([job_title, "is described as", description[:200] + "..."])

    return {
        "passage": description,
        "triplets": triplets
    }

# Read the job postings, then convert every row into its passage/triplet record.
df = pd.read_csv("data/all_job_post.csv")
nano_dataset = list(
    map(
        convert_row_to_triplets,
        (record for _, record in df.iterrows()),
    )
)

In [None]:
from collections import defaultdict

# Adjacency maps of the graph, keyed by integer ids:
#   entity id   -> ids of the relations that mention that entity
#   relation id -> ids of the passages the relation was taken from
entityid_2_relationids = defaultdict(list)
relationid_2_passageids = defaultdict(list)

# Text tables and their reverse lookups; an item's id is its list position.
entities = []
entity2id = {}
relations = []
relation2id = {}
passages = []

# Edges already recorded. The same triplet recurs across many postings, and
# without this check the same id would be appended to a list again and again.
_seen_entity_relation = set()
_seen_relation_passage = set()

for passage_id, dataset_info in enumerate(nano_dataset):
    passage, triplets = dataset_info["passage"], dataset_info["triplets"]
    passages.append(passage)

    # The predicate gets a real name: binding it to `_` at notebook top level
    # overwrites the `_` last-result variable that IPython maintains.
    for subj, predicate, obj in triplets:
        # Register subject first, then object, so ids follow first appearance.
        for entity in (subj, obj):
            if entity not in entity2id:
                entity2id[entity] = len(entities)
                entities.append(entity)
        subj_id = entity2id[subj]
        obj_id = entity2id[obj]

        # A relation is identified by the space-joined text of its triplet.
        relation = " ".join([subj, predicate, obj])
        if relation not in relation2id:
            relation2id[relation] = len(relations)
            relations.append(relation)
        relation_id = relation2id[relation]

        # Link both endpoints to the relation once (also covers subj == obj).
        for entity_id in (subj_id, obj_id):
            entity_edge = (entity_id, relation_id)
            if entity_edge not in _seen_entity_relation:
                _seen_entity_relation.add(entity_edge)
                entityid_2_relationids[entity_id].append(relation_id)

        # Link the relation to the passage it came from, once per passage.
        passage_edge = (relation_id, passage_id)
        if passage_edge not in _seen_relation_passage:
            _seen_relation_passage.add(passage_edge)
            relationid_2_passageids[relation_id].append(passage_id)
    

In [None]:
# Size of the vectors produced by `encoder`; create_milvus_collection() passes
# it as the `dimension` of every collection it creates.
embedding_dim = encoder.get_sentence_embedding_dimension()

def create_milvus_collection(collection_name: str):
    """Create an empty Milvus collection, replacing any existing one of that name.

    Args:
        collection_name: Name of the collection to (re)create. Its vector
            dimension is taken from the module-level ``embedding_dim``.
    """
    already_exists = mc.has_collection(collection_name=collection_name)
    if already_exists:
        # Drop the old collection so the new one starts out empty.
        mc.drop_collection(collection_name=collection_name)

    mc.create_collection(collection_name=collection_name, dimension=embedding_dim)

# One collection per kind of text that gets embedded.
entity_col_name, relation_col_name, passage_col_name = (
    "entity_collection",
    "relation_collection",
    "passage_collection",
)

# (Re)create the three collections in the same order as before.
for _col_name in (entity_col_name, relation_col_name, passage_col_name):
    create_milvus_collection(_col_name)

In [None]:
def milvus_insert(
    collection_name: str,
    text_list: list[str],
    batch_size: int = 512,
):
    """Embed ``text_list`` in batches and insert the rows into a Milvus collection.

    Each inserted row has the fields ``id`` (the text's position in
    ``text_list``), ``text`` and ``vector`` (the embedding returned by
    ``embed_texts``).

    Args:
        collection_name: Target collection; it must already exist.
        text_list: Texts to embed and store.
        batch_size: Number of texts embedded and inserted per round trip.
            This used to be tied to ``embedding_dim``, which is unrelated to
            batching; the inserted rows are the same for any batch size.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    for start in tqdm(
        range(0, len(text_list), batch_size),
        # f-string so the progress bar shows the actual collection name.
        desc=f"Inserting {collection_name}",
    ):
        batch_texts = text_list[start : start + batch_size]
        batch_embeddings = embed_texts(batch_texts)

        # Ids are derived from the real batch length, so the final (short)
        # batch never produces ids beyond the end of text_list.
        batch_data = [
            {
                "id": start + offset,
                "text": text,
                "vector": vector,
            }
            for offset, (text, vector) in enumerate(zip(batch_texts, batch_embeddings))
        ]
        mc.insert(
            collection_name=collection_name,
            data=batch_data,
        )


# Fill each collection with its texts: relations first, then entities, then passages.
for _col_name, _texts in (
    (relation_col_name, relations),
    (entity_col_name, entities),
    (passage_col_name, passages),
):
    milvus_insert(collection_name=_col_name, text_list=_texts)

In [None]:
# Question the retrieval demo is built around.
query = "What skills does a Engineer need ?"

# Entities mentioned in the query; hard-coded here instead of being extracted.
query_ner_list = ["IT"]

# One embedding per query entity.
query_ner_embeddings = [
    embed_texts(query_ner) for query_ner in query_ner_list
]

# Number of nearest neighbours requested per query vector.
top_k = 3

# The call previously ended at `collection_name=` with no value, which is a
# syntax error. It now searches the entity collection with the query-entity
# embeddings and returns the stored "text" field with each hit.
entity_search_res = mc.search(
    collection_name=entity_col_name,
    data=query_ner_embeddings,
    limit=top_k,
    output_fields=["text"],
)