In [1]:
import os

import numpy as np
from collections import defaultdict
from scipy.sparse import csr_matrix
from pymilvus import MilvusClient
from langchain_core.messages import AIMessage, HumanMessage
from tqdm import tqdm

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
import torch

mc = MilvusClient("milvus_demo.db")

model_name = "BAAI/bge-large-en-v1.5"
DEVICE = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
encoder = SentenceTransformer(model_name, device=DEVICE)

base_model = "meta-llama/Llama-3.2-1B-Instruct"
llm = AutoModelForCausalLM.from_pretrained(base_model, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(base_model)

def embed_texts(batch_texts):
    embeddings = encoder.encode(
        batch_texts,
        convert_to_tensor=True,
        normalize_embeddings=True,
        show_progress_bar=False
    )
    batch_embeddings = embeddings.cpu().numpy().astype(np.float32)
    return batch_embeddings


  from .autonotebook import tqdm as notebook_tqdm
  from pkg_resources import DistributionNotFound, get_distribution
Some parameters are on the meta device because they were offloaded to the disk.


In [3]:
import ast
import pandas as pd

def convert_row_to_triplets(row):
    job_title = row["job_title"]
    category = row["category"]
    description = row["job_description"]

    try:
        skills = ast.literal_eval(row["job_sill_set"])
    except:
        skills = []

    triplets = []
    for skill in skills:
        triplets.append([job_title, "requires skill", skill])
        triplets.append([job_title, "provides expertise in", skill])

    triplets.append([job_title, "is categorized under", category])

    triplets.append([job_title, "is described as", description[:200] + "..."])

    return {
        "passage": description,
        "triplets": triplets
    }

df = pd.read_csv("data/all_job_post.csv")
nano_dataset = [convert_row_to_triplets(row) for _, row in df.iterrows()]

In [4]:
from collections import defaultdict

entityid_2_relationids = defaultdict(list)
relationid_2_passageids = defaultdict(list)

entities = []
entity2id = {}
relations = []
relation2id = {}
passages = []

for passage_id, dataset_info in enumerate(nano_dataset):
    passage, triplets = dataset_info["passage"], dataset_info["triplets"]
    passages.append(passage)

    for subj, _, obj in triplets:
        if subj not in entity2id:
            entity2id[subj] = len(entities)
            entities.append(subj)
        subj_id = entity2id[subj]

        if obj not in entity2id:
            entity2id[obj] = len(entities)
            entities.append(obj)
        obj_id = entity2id[obj]

        relation = " ".join([subj, _, obj])
        if relation not in relation2id:
            relation2id[relation] = len(relations)
            relations.append(relation)
        relation_id = relation2id[relation]

        entityid_2_relationids[subj_id].append(relation_id)
        entityid_2_relationids[obj_id].append(relation_id)  
        relationid_2_passageids[relation_id].append(passage_id)      
    

In [5]:
embedding_dim = encoder.get_sentence_embedding_dimension()

def create_milvus_collection(collection_name: str):
    if mc.has_collection(collection_name=collection_name):
        mc.drop_collection(collection_name=collection_name)
    mc.create_collection(
        collection_name=collection_name,
        dimension=embedding_dim
    )

entity_col_name = "entity_collection"
relation_col_name = "relation_collection"
passage_col_name = "passage_collection"

create_milvus_collection(entity_col_name)
create_milvus_collection(relation_col_name)
create_milvus_collection(passage_col_name)

In [6]:
def milvus_insert(
    collection_name: str,
    text_list: list[str],
):
    batch_size = embedding_dim
    for start in tqdm(range(0, len(text_list), batch_size), desc="Inserting {collection_name}"):
        end = start + batch_size
        batch_texts = text_list[start : end]
        batch_embeddings = embed_texts(batch_texts)

        batch_ids = list(range(start,end))
        batch_data = [
            {
                "id": id_,
                "text": text,
                "vector": vector,
            }
            for id_, text, vector in zip(batch_ids, batch_texts, batch_embeddings)
        ]
        mc.insert(
            collection_name=collection_name,
            data=batch_data,
        )


milvus_insert(
    collection_name=relation_col_name,
    text_list=relations,
)

milvus_insert(
    collection_name=entity_col_name,
    text_list=entities,
)

milvus_insert(
    collection_name=passage_col_name,
    text_list=passages,
)

Inserting {collection_name}: 100%|██████████| 2/2 [02:39<00:00, 79.82s/it]
Inserting {collection_name}: 100%|██████████| 2/2 [02:11<00:00, 65.77s/it]
Inserting {collection_name}: 100%|██████████| 2/2 [16:11<00:00, 485.81s/it]


In [8]:
query = "What skills does an Engineer need ?"

query_ner_list = ["Engineer"]

query_ner_embeddings = [
    embed_texts(query_ner) for query_ner in query_ner_list
]

top_k = 3

entity_search_res = mc.search(
    collection_name=entity_col_name,
    data=query_ner_embeddings,
    anns_field="vector",
    search_params={"metric_type":"COSINE","params": {"nprobe":10}},
    limit=top_k,
    output_fields=["id", "text"]
)

query_embedding = embed_texts([query])

relation_search_res = mc.search(
    collection_name=relation_col_name,
    data=query_embedding,
    anns_field="vector",
    search_params={"metric_type":"COSINE","params": {"nprobe":10}},
    limit=top_k,
    output_fields=["id", "text"]
)

In [9]:
for i, hits in enumerate(entity_search_res):
    print(f"Query Entity: {query_ner_list[i]}")
    for hit in hits:
        print(f"{hit.id}, Score: {hit.score:.4f}, {hit.entity.get('text')}")

print("Relation search results")
for hit in relation_search_res[0]:
    print(f"{hit.id}, Score: {hit.score:.4f}, {hit.entity.get('text')}")

Query Entity: Engineer
868, Score: 0.7559, Sales Engineer
392, Score: 0.7442, Information Technology Engineer
1009, Score: 0.7119, Technical Sales and Proposal Engineer 
Relation search results
325, Score: 0.6050, Information Technology Infrastructure Engineer is described as JOB TITLE- IT INFRASTRUCTURE ENGINEERLOCATION- DEVENS, MADIRECT CLIENT
"MUST HAVE LIST:
10+ YEARS OF EXPERIENCE WITH THE FOLLOWING:• KNOWLEDGE OF DATA CENTER OPERATION AND IT INFRASTRUCTURE.• KNOWLEDG...
383, Score: 0.5971, Information Technology Operations Engineer is described as MUST HAVES:
5+ YEARS OF EXPERIENCE WITHIN THE ITSM SPACE SPECIFICALLY HANDLING PRODUCTION READINESS ISSUES AND AREASHANDS ONE EXPERIENCE CREATING PRODUCTION READINESS REVIEWS (PRR) OR CHECKLISTS TO DO...
391, Score: 0.5899, Information Technology Engineer is described as IT ENGINEER - BOCA RATON, FL - ONSITE 
SUMMARY:THE IT ENGINEER WILL BE RESPONSIBLE FOR MANAGING AND PROVIDING DIRECT HANDS-ON SUPPORT TO ALL OF OUR LOCATIONS. THE INDIV

In [None]:
# col=["entity_collection", "relation_collection", "passage_collection"]
# for name in col:
#     if mc.has_collection(collection_name=name):
#         mc.drop_collection(collection_name=name)

In [None]:
top_relation_hits = relation_search_res[0]
context_passages = [hit.entity.get("text") for hit in top_relation_hits]
context_text = "\n\n".join(context_passages)

entity_names = [hit.entity.get("text") for hits in entity_search_res for hit in hits]
context_text = "Related roles: " + ", ".join(entity_names) + "\n\n" + context_text

In [None]:
def generate_text(prompt, max_new_tokens=256, temperature=0.7):
    inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)

    with torch.no_grad():
        outputs = llm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result    

In [None]:
prompt = """You are an AI assistant analyzing job data.
Summarize the following information into:
1. A concise summary
2. A bullet list of skills

Context:
Engineers are responsible for managing and providing direct hands-on support...
"""

response = generate_text(prompt)
print(response)