# ✅ LLM Job Classification System - FAISS Version (Offline)

In [None]:
import pandas as pd
import numpy as np
import os
import faiss
from sentence_transformers import SentenceTransformer
from ctransformers import AutoModelForCausalLM

## Step 1: Load Models from Local Disk

In [None]:
# Local locations of the pre-downloaded models (offline setup).
EMBEDDER_DIR = 'D:/NLP/all-MiniLM-L6-v2/'
LLM_MODEL_FILE = "D:/NLP/LLM/mistral-7b-instruct-v0.2.Q4_K_M.gguf"

# Load pre-downloaded Sentence Transformer model
embedder = SentenceTransformer(EMBEDDER_DIR)

# Fail fast with a readable message if the GGUF file is missing, instead of
# relying on whatever error the loader raises for an unknown path.
if not os.path.isfile(LLM_MODEL_FILE):
    raise FileNotFoundError(f"LLM model file not found: {LLM_MODEL_FILE}")

# Load pre-downloaded Mistral-7B-Instruct LLM.
# ctransformers expects the model location as its first positional argument
# (`model_path_or_repo_id`); there is no `model_path` keyword, so the GGUF
# file path is passed directly rather than via `model_path` + `model_file`.
llm = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_FILE,
    model_type="mistral",
    gpu_layers=0,  # Use CPU fully
)

print("✅ Models loaded successfully.")

## Step 2: Prepare SOC Codes and Data Labels

In [None]:
# Synthetic SOC reference data, written as (code, title, description) rows and
# expanded into one dict per occupation.
_SOC_FIELDS = ("soc_code", "title", "description")
_SOC_ROWS = (
    ("2111", "Biological Scientists", "Research and analysis of biological data and processes."),
    ("2122", "Mechanical Engineers", "Design and development of mechanical systems."),
    ("2425", "Data Analysts", "Data analysis, pattern recognition, and reporting."),
    ("4112", "Data Entry Clerks", "Inputting, updating and managing data records."),
    ("3421", "Data Scientists", "Advanced data modeling, machine learning, and AI research."),
)
soc_data = [dict(zip(_SOC_FIELDS, row)) for row in _SOC_ROWS]

# Synthetic data labels, written as (label, description) rows.
_LABEL_FIELDS = ("label", "description")
_LABEL_ROWS = (
    ("Data Entry", "Typing, inputting, and administrative handling of data."),
    ("Database Management", "Managing SQL servers, database optimization, data warehousing."),
    ("Data Analytics", "Analyzing datasets, pattern finding, report generation."),
    ("Data Science", "Predictive modeling, machine learning, and AI development."),
)
data_labels = [dict(zip(_LABEL_FIELDS, row)) for row in _LABEL_ROWS]

print("✅ SOC and Label metadata prepared.")

## Step 3: Build FAISS Indexes

In [None]:
def _flat_l2_index(vectors):
    """Create a FAISS ``IndexFlatL2`` sized to *vectors* and add them to it."""
    flat = faiss.IndexFlatL2(vectors.shape[1])
    flat.add(np.array(vectors))
    return flat


# Only the free-text descriptions are embedded; codes, titles and labels are
# recovered afterwards from the metadata lists by row position.
documents_soc = [row["description"] for row in soc_data]
documents_labels = [row["description"] for row in data_labels]

embeddings_soc = embedder.encode(documents_soc, show_progress_bar=True)
embeddings_labels = embedder.encode(documents_labels, show_progress_bar=True)

# One index per taxonomy
soc_index = _flat_l2_index(embeddings_soc)
label_index = _flat_l2_index(embeddings_labels)

# Row i of each index was built from entry i of the matching metadata list
soc_metadata = soc_data
label_metadata = data_labels

print("✅ FAISS indexes created.")

## Step 4: Job Classification Function

In [None]:
import time

def _nearest_entry(index, metadata, query):
    """Return ``(distance, entry)`` for the single nearest neighbour of *query*.

    ``entry`` is ``None`` when the index reports no neighbour (FAISS uses the
    position ``-1`` for that), so a negative position is never used to index
    *metadata* from the end.
    """
    distances, positions = index.search(query, k=1)
    position = int(positions[0][0])
    entry = metadata[position] if position >= 0 else None
    return distances[0][0], entry


def classify_job(job_description, threshold=0.5, use_llm=False):
    """Classify a job description against the SOC and data-label indexes.

    The description is embedded once and looked up in the module-level
    ``soc_index`` and ``label_index``. A match is accepted only when its
    distance is at most ``threshold``; otherwise the prediction is "Unknown".

    Args:
        job_description: Free-text job description to classify.
        threshold: Largest index distance still accepted as a match. The
            distances are whatever the FAISS indexes report (``IndexFlatL2``
            reports squared L2 distances).
        use_llm: When true, ask ``llm`` for a textual explanation of the
            prediction. The call is skipped when neither lookup produced a
            match, because there is no category to explain.

    Returns:
        dict with the keys ``predicted_soc_code``, ``predicted_soc_title``,
        ``predicted_data_label``, ``soc_distance``, ``label_distance``,
        ``llm_generated_summary`` and ``prediction_time_seconds``.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()

    # Embed the incoming job description once and reuse it for both lookups.
    # FAISS works on float32 matrices; asarray avoids a copy when it already is one.
    query = np.asarray(embedder.encode([job_description]), dtype="float32")

    # Find closest SOC code and closest data label
    soc_distance, soc_result = _nearest_entry(soc_index, soc_metadata, query)
    label_distance, label_result = _nearest_entry(label_index, label_metadata, query)

    # Apply thresholds
    soc_matched = soc_result is not None and soc_distance <= threshold
    label_matched = label_result is not None and label_distance <= threshold

    predicted_soc_code = soc_result["soc_code"] if soc_matched else "Unknown"
    predicted_soc_title = soc_result["title"] if soc_matched else "Unknown"
    predicted_data_label = label_result["label"] if label_matched else "Unknown"

    # Optional LLM explanation
    if not use_llm:
        llm_generated_summary = "LLM explanation disabled."
    elif not (soc_matched or label_matched):
        # Asking the model to justify category 'Unknown' / SOC 'Unknown' only
        # costs inference time and yields a made-up rationale.
        llm_generated_summary = "LLM explanation skipped: no confident match found."
    else:
        prompt = f"""
You are a job classification assistant.
Given the following job description, explain why it belongs to the category '{predicted_data_label}' and SOC code '{predicted_soc_code}'.

Job Description:
{job_description}

Explanation:"""
        llm_generated_summary = llm(prompt, stream=False)

    elapsed_time = time.perf_counter() - start_time

    return {
        "predicted_soc_code": predicted_soc_code,
        "predicted_soc_title": predicted_soc_title,
        "predicted_data_label": predicted_data_label,
        "soc_distance": soc_distance,
        "label_distance": label_distance,
        "llm_generated_summary": llm_generated_summary,
        "prediction_time_seconds": elapsed_time
    }

## Step 5: Example Usage

In [None]:
# Sample inputs: four postings that correspond to the data labels defined
# earlier, one unrelated role, and one noise string.
test_cases = [
    # Data entry
    "Looking for a data entry clerk to input customer orders and update internal systems.",
    # Analytics
    "We need an analyst to generate business intelligence dashboards and perform statistical modelling.",
    # Database work
    "Develop and optimize PostgreSQL databases for enterprise clients.",
    # AI research
    "Researcher required to work on advanced AI models for speech and language processing.",
    # Unrelated occupation
    "Warehouse staff needed to manage inventory and shipping.",
    # Noise
    "asdasdqwe123@# gibberish text not related to anything",
]



In [None]:
# Classify every sample with the LLM explanation switched on and print each
# field of the result on its own line, followed by a separator rule.
separator = "-" * 80
for job_text in test_cases:
    print("📝 Job Description:", job_text, "🔍 Prediction:", sep="\n")
    prediction = classify_job(job_text, use_llm=True)  # set False to disable LLM
    for field, value in prediction.items():
        print(f"{field}: {value}")
    print(separator)

In [None]:
import pandas as pd

# Sample job descriptions scored in this cell
test_cases = [
    "Looking for a data entry clerk to input customer orders and update internal systems.",
    "We need an analyst to generate business intelligence dashboards and perform statistical modelling.",
    "Develop and optimize PostgreSQL databases for enterprise clients.",
    "Researcher required to work on advanced AI models for speech and language processing.",
    "Warehouse staff needed to manage inventory and shipping.",
    "asdasdqwe123@# gibberish text not related to anything",
]

# Score each sample without the LLM and tag the rows "Job 1", "Job 2", ...
# so they can be used as axis labels when charting.
results = [
    {**classify_job(text, threshold=0.6, use_llm=False), "index": f"Job {n}"}
    for n, text in enumerate(test_cases, start=1)
]

df_confidence = pd.DataFrame(results)

In [None]:
import matplotlib.pyplot as plt

# Grouped bar chart: for every job, the SOC distance and the label distance
# side by side, with the 0.6 acceptance threshold drawn as a dashed line.
bar_width = 0.35
soc_x = list(range(len(df_confidence)))
label_x = [x + bar_width for x in soc_x]
tick_x = [x + bar_width / 2 for x in soc_x]  # midway between the two bars of a pair

fig, ax = plt.subplots(figsize=(12, 6))

# Plot both distances
ax.bar(soc_x, df_confidence['soc_distance'], bar_width, label='SOC Distance', color='steelblue')
ax.bar(label_x, df_confidence['label_distance'], bar_width, label='Label Distance', color='darkorange')

ax.axhline(0.6, color='red', linestyle='--', label='Threshold (0.6)')

ax.set_xlabel('Job Description')
ax.set_ylabel('Distance (lower = better match)')
ax.set_title('Prediction Confidence by Distance to Nearest Match')
ax.set_xticks(tick_x)
ax.set_xticklabels(df_confidence['index'])
ax.legend()
fig.tight_layout()
plt.show()