# ✅ LLM Job Classification System - FAISS Version (Offline)

In [None]:
import pandas as pd
import numpy as np
import os
import faiss
from sentence_transformers import SentenceTransformer
from ctransformers import AutoModelForCausalLM

## Step 1: Load Models from Local Disk

In [None]:
# Load the pre-downloaded Sentence Transformer embedding model from local disk.
embedder = SentenceTransformer('D:/NLP/all-MiniLM-L6-v2/')

# Load the pre-downloaded Mistral-7B-Instruct GGUF weights.
# ctransformers documents the model location as the first positional argument
# of `from_pretrained` (`model_path_or_repo_id`, which may be a model file);
# there is no `model_path` keyword, so the .gguf path is passed positionally
# rather than through `model_path=` / `model_file=`.
llm = AutoModelForCausalLM.from_pretrained(
    "D:/NLP/LLM/mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    model_type="mistral",
    gpu_layers=0,  # offload no layers to a GPU, i.e. run on the CPU only
)

print("✅ Models loaded successfully.")

## Step 2: Prepare SOC Codes and Data Labels

In [None]:
# Synthetic SOC reference table. Each row below is (code, title, description)
# and is expanded into a dict with the keys the rest of the notebook reads.
soc_data = [
    dict(soc_code=code, title=title, description=description)
    for code, title, description in (
        ("2111", "Biological Scientists", "Research and analysis of biological data and processes."),
        ("2122", "Mechanical Engineers", "Design and development of mechanical systems."),
        ("2425", "Data Analysts", "Data analysis, pattern recognition, and reporting."),
        ("4112", "Data Entry Clerks", "Inputting, updating and managing data records."),
        ("3421", "Data Scientists", "Advanced data modeling, machine learning, and AI research."),
    )
]

# Synthetic data-label table. Each row below is (label, description).
data_labels = [
    dict(label=label, description=description)
    for label, description in (
        ("Data Entry", "Typing, inputting, and administrative handling of data."),
        ("Database Management", "Managing SQL servers, database optimization, data warehousing."),
        ("Data Analytics", "Analyzing datasets, pattern finding, report generation."),
        ("Data Science", "Predictive modeling, machine learning, and AI development."),
    )
]

print("✅ SOC and Label metadata prepared.")

## Step 3: Build FAISS Indexes

In [None]:
# SOC codes: embed every description, then load the vectors into a flat L2 index.
documents_soc = [record["description"] for record in soc_data]
embeddings_soc = embedder.encode(documents_soc, show_progress_bar=True)
soc_index = faiss.IndexFlatL2(embeddings_soc.shape[1])
soc_index.add(np.array(embeddings_soc))

# Data labels: same two steps, kept in a separate index.
documents_labels = [record["description"] for record in data_labels]
embeddings_labels = embedder.encode(documents_labels, show_progress_bar=True)
label_index = faiss.IndexFlatL2(embeddings_labels.shape[1])
label_index.add(np.array(embeddings_labels))

# The indexes hold vectors only. Vectors were added in list order, so these
# aliases give the record that belongs to each index position.
soc_metadata, label_metadata = soc_data, data_labels

print("✅ FAISS indexes created.")

## Step 4: Job Classification Function

In [None]:
def _nearest_entry(index, metadata, query):
    """Return the entry of ``metadata`` whose vector in ``index`` is closest to ``query``.

    Raises:
        RuntimeError: if the search yields no position that is valid for
            ``metadata`` (for example, the index is empty).
    """
    _, neighbours = index.search(query, k=1)
    position = int(neighbours[0][0])
    # FAISS reports a missing neighbour as -1. Used directly as a list index
    # that would silently select the LAST metadata entry, so check the range.
    if not 0 <= position < len(metadata):
        raise RuntimeError(
            f"Nearest-neighbour search returned position {position}, "
            f"which is not valid for {len(metadata)} metadata entries."
        )
    return metadata[position]


def classify_job(job_description):
    """Classify a job description against the SOC and data-label indexes.

    The description is embedded once with the module-level ``embedder``; the
    same vector is used to look up the nearest SOC entry and the nearest data
    label. The module-level ``llm`` is then asked for a free-text statement of
    the job's main function.

    Args:
        job_description: Text of the job posting. Must be a non-blank string.

    Returns:
        A dict with the keys ``predicted_soc_code``, ``predicted_soc_title``,
        ``predicted_data_label`` and ``llm_generated_summary`` (the value
        returned by ``llm``, unmodified).

    Raises:
        ValueError: if ``job_description`` is not a string or is blank.
        RuntimeError: if either index cannot supply a valid nearest neighbour.
    """
    # A blank description would still be embedded and matched to *something*,
    # yielding a confident-looking but meaningless classification.
    if not isinstance(job_description, str) or not job_description.strip():
        raise ValueError("job_description must be a non-empty string")

    # Embed once; FAISS works on float32 matrices of shape (n_queries, dim).
    query = np.asarray(embedder.encode([job_description]), dtype=np.float32)

    soc_result = _nearest_entry(soc_index, soc_metadata, query)
    label_result = _nearest_entry(label_index, label_metadata, query)

    # Optional: get an LLM-generated summary of the role.
    prompt = f"""
You are a job classification assistant. Given the following job description, classify the main function of the job:

{job_description}

Main function:
"""
    llm_response = llm(prompt, stream=False)

    return {
        "predicted_soc_code": soc_result["soc_code"],
        "predicted_soc_title": soc_result["title"],
        "predicted_data_label": label_result["label"],
        "llm_generated_summary": llm_response
    }

## Step 5: Example Usage

In [None]:
# Demo posting: a role about data warehouses and SQL database optimisation.
sample_job_description = "We are looking for someone to manage large-scale data warehouses and optimize SQL databases. Experience in database management is essential."

# Run the sample through the classifier.
result = classify_job(sample_job_description)

# Report every returned field as "name: value", one per line.
print("\n\n===== Classification Result =====")
for field in result:
    print(f"{field}: {result[field]}")

print("\n✅ Job classification completed successfully.")