In [1]:
!pip install transformers langchain scikit-learn

Collecting langchain
  Downloading langchain-0.3.7-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.15 (from langchain)
  Downloading langchain_core-0.3.18-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.2-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.143-py3-none-any.whl.metadata (13 kB)
Collecting packaging>=20.0 (from transformers)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting requests-toolbelt<2.0.0,>=1.0.0 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl.metadata (14 kB)
Downloading langchain-0.3.7-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading langchain_core-0.3.18-py3-none-any.whl 

In [2]:
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, pipeline
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Example employee data
employee_data = [
    {"id": 1, "info": "John loves pets, has a golden retriever named Max, and enjoys volunteering at the local animal shelter on weekends. He visited Bali last summer and stayed at the Serenity Resort. During his trip, he took a surfing lesson, tried local Balinese cuisine, and attended a cultural dance event. John works as a software engineer in the finance department and has been with the company since 2018. He attended Stanford University, where he studied Computer Science. John’s hobbies include hiking, reading science fiction novels, and playing the guitar. He has a younger sister, Emma, who lives in Boston. Recently, he went on a business trip to Tokyo and enjoyed visiting the Tokyo Tower and trying authentic sushi."},
    {"id": 2, "info": "Sarah is from NY, visited family in Canada recently"}
]

# Load embedding model (distilbert-base-uncased)
embedding_model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name)

# Function to embed text data
def embed_text(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = embedding_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Create embeddings for employee data
embeddings = np.vstack([embed_text(entry["info"]) for entry in employee_data])

# Index embeddings using NearestNeighbors
index = NearestNeighbors(n_neighbors=1, metric="cosine").fit(embeddings)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [3]:
import os

# Replace 'YOUR_HUGGINGFACE_TOKEN' with the token you generated
os.environ["HUGGINGFACE_TOKEN"] = "your token here"

In [4]:
!pip install langchain-community

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting langchain-community
  Downloading langchain_community-0.3.7-py3-none-any.whl.metadata (2.9 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Downloading langchain_community-0.3.7-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Downloading pydantic_settings-2.6.1-py3-none-any.whl (28 kB)
Installing collected packages: httpx-sse, pydantic-settings, langchain-community
Successfully installed httpx-sse-0.4.0 langchain-community-0.3.7 pydantic-settings-2.6.1


In [5]:
# Load generation model (Llama 2 or similar) directly for text generation
generation_model_name = "meta-llama/Llama-2-7b-chat-hf"  # Substitute with accessible model if needed
gen_tokenizer = AutoTokenizer.from_pretrained(generation_model_name, use_auth_token=os.environ["HUGGINGFACE_TOKEN"])
gen_model = AutoModelForCausalLM.from_pretrained(generation_model_name, use_auth_token=os.environ["HUGGINGFACE_TOKEN"])
generation_pipeline = pipeline("text-generation", model=gen_model, tokenizer=gen_tokenizer)

# Retrieval function
def retrieve_context(query, top_k=1):
    query_embedding = embed_text(query)
    distances, indices = index.kneighbors(query_embedding, n_neighbors=top_k)
    return [employee_data[idx]['info'] for idx in indices[0]]

# Function to generate a response
def generate_response(query):
    # Retrieve relevant context
    context = retrieve_context(query, top_k=1)[0]
    
    # Combine context and query for input
    prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
    
    # Generate answer with generation pipeline
    result = generation_pipeline(prompt, max_length=300, do_sample=True)
    
    # Debug print to inspect the result structure
    print("Result from generation pipeline:", result)
    
    # Check the output structure and access text accordingly
    if isinstance(result, list) and "generated_text" in result[0]:
        return result[0]["generated_text"]
    elif isinstance(result, list):
        return result[0]
    else:
        return result





tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [6]:
query = "Where did John go on his last vacation?"
response = generate_response(query)
print("Response:", response)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Result from generation pipeline: [{'generated_text': 'Context: John loves pets, has a golden retriever named Max, and enjoys volunteering at the local animal shelter on weekends. He visited Bali last summer and stayed at the Serenity Resort. During his trip, he took a surfing lesson, tried local Balinese cuisine, and attended a cultural dance event. John works as a software engineer in the finance department and has been with the company since 2018. He attended Stanford University, where he studied Computer Science. John’s hobbies include hiking, reading science fiction novels, and playing the guitar. He has a younger sister, Emma, who lives in Boston. Recently, he went on a business trip to Tokyo and enjoyed visiting the Tokyo Tower and trying authentic sushi.\nQuestion: Where did John go on his last vacation?\nAnswer: According to the passage, John went on his last vacation to Bali.'}]
Response: Context: John loves pets, has a golden retriever named Max, and enjoys volunteering at th

In [7]:
!pip install sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting sentence-transformers
  Downloading sentence_transformers-3.3.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.0-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.7/268.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.3.0


In [8]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load Sentence Transformer model
embedding_model_name = "hkunlp/instructor-large"  # Or use another model optimized for semantic similarity
sentence_model = SentenceTransformer(embedding_model_name)

# Updated function to calculate risk score with sentence embeddings
def calculate_risk_score(expected_answer, user_answer):
    # Generate embeddings for both expected and user-provided answers
    expected_embedding = sentence_model.encode([expected_answer])
    user_embedding = sentence_model.encode([user_answer])
    
    # Calculate cosine similarity using sentence embeddings
    similarity_score = cosine_similarity(expected_embedding, user_embedding)[0][0]
    
    # Risk score is inversely related to similarity
    risk_score = (1 - similarity_score) * 100
    return risk_score

# Example query and LLM-generated answer
query = "Where did John go on his last vacation?"
expected_answer = "He went to Bali"  # or generate_response(query) if using dynamic generation

# User-provided answer
user_answer = "Some place in Indonesia, Bali I guess"  # Testing with a correct answer

# Calculate risk score based on similarity
risk_score = calculate_risk_score(expected_answer, user_answer)

# Decision based on threshold
threshold = 45  # Define a threshold; if risk score is higher, flag as suspicious
access_decision = "Access Granted" if risk_score < threshold else "Access Denied"

# Output results
print("User Answer:", user_answer)
print("Risk Score:", risk_score)
print("Access Decision:", access_decision)


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.41k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

User Answer: Some place in Indonesia, Bali I guess
Risk Score: 7.182079553604126
Access Decision: Access Granted


In [9]:
# User-provided answer
user_answer = "He went to Bali"  # correct answer

# Calculate risk score based on similarity
risk_score = calculate_risk_score(expected_answer, user_answer)

# Decision based on threshold
threshold = 45  # Define a threshold; if risk score is higher, flag as suspicious
access_decision = "Access Granted" if risk_score < threshold else "Access Denied"

# Output results
print("User Answer:", user_answer)
print("Risk Score:", risk_score)
print("Access Decision:", access_decision)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

User Answer: He went to Bali
Risk Score: 0.0
Access Decision: Access Granted


In [10]:
import hashlib
import time
import json

# Example employee data
employee_data = [
    {"id": 1, "info": "John loves pets, has a golden retriever named Max, and enjoys volunteering at the local animal shelter on weekends. He visited Bali last summer and stayed at the Serenity Resort. During his trip, he took a surfing lesson, tried local Balinese cuisine, and attended a cultural dance event. John works as a software engineer in the finance department and has been with the company since 2018. He attended Stanford University, where he studied Computer Science. John’s hobbies include hiking, reading science fiction novels, and playing the guitar. He has a younger sister, Emma, who lives in Boston. Recently, he went on a business trip to Tokyo and enjoyed visiting the Tokyo Tower and trying authentic sushi."},
    {"id": 2, "info": "Sarah is from NY, visited family in Canada recently"}
]

# Blockchain structure to store hashes of employee data
class Block:
    def __init__(self, index, data_hash, previous_hash):
        self.index = index
        self.timestamp = time.time()
        self.data_hash = data_hash  # Hash of the employee data
        self.previous_hash = previous_hash  # Link to the previous block's hash
        self.hash = self.compute_hash()  # Hash of the current block

    def compute_hash(self):
        block_string = json.dumps({
            "index": self.index,
            "timestamp": self.timestamp,
            "data_hash": self.data_hash,
            "previous_hash": self.previous_hash
        }, sort_keys=True).encode()
        return hashlib.sha256(block_string).hexdigest()

class Blockchain:
    def __init__(self):
        self.chain = []
        self.create_genesis_block()

    def create_genesis_block(self):
        # Create the first block in the blockchain
        genesis_block = Block(0, "0", "0")
        self.chain.append(genesis_block)

    def add_block(self, data_hash):
        previous_hash = self.chain[-1].hash
        new_block = Block(len(self.chain), data_hash, previous_hash)
        self.chain.append(new_block)

    def is_chain_valid(self):
        # Check if the blockchain is valid
        for i in range(1, len(self.chain)):
            current_block = self.chain[i]
            previous_block = self.chain[i - 1]
            # Check if current block's hash is valid
            if current_block.hash != current_block.compute_hash():
                return False
            # Check if the block links are consistent
            if current_block.previous_hash != previous_block.hash:
                return False
        return True

# Function to hash employee data
def hash_employee_data(employee):
    employee_string = json.dumps(employee, sort_keys=True).encode()
    return hashlib.sha256(employee_string).hexdigest()

# Initialize blockchain
blockchain = Blockchain()

# Add employee data to blockchain as hashed entries
for employee in employee_data:
    data_hash = hash_employee_data(employee)
    blockchain.add_block(data_hash)

# Check blockchain integrity function
def check_integrity(new_data):
    """Verify the data against stored blockchain hashes"""
    data_hash = hash_employee_data(new_data)
    # Check if hash matches any existing block in the blockchain
    for block in blockchain.chain:
        if block.data_hash == data_hash:
            print("Data integrity confirmed.")
            return True
    print("Data has been tampered with or is unverified.")
    return False

# Example: Checking integrity of data
tampered_data = {"id": 1, "info": "John loves cats, has a golden retriever named Max, and enjoys volunteering at the local animal shelter on weekends. He visited Bali last summer and stayed at the Serenity Resort. During his trip, he took a surfing lesson, tried local Balinese cuisine, and attended a cultural dance event. John works as a software engineer in the finance department and has been with the company since 2018. He attended Stanford University, where he studied Computer Science. John’s hobbies include hiking, reading science fiction novels, and playing the guitar. He has a younger sister, Emma, who lives in Boston. Recently, he went on a business trip to Tokyo and enjoyed visiting the Tokyo Tower and trying authentic sushi."}  # Tampered entry
check_integrity(tampered_data)  # Should print warning about tampering

# Example: Checking integrity of original data
check_integrity(employee_data[0])  # Should confirm integrity


Data has been tampered with or is unverified.
Data integrity confirmed.


True

In [11]:
# Define function to dynamically generate questions
def generate_dynamic_question(employee_info, risk_level):
    # Define prompt based on risk level and employee info
    prompt = f"Based on the following information about an employee: '{employee_info}', generate a {risk_level}-risk question for verification purposes. The answer must be present in employee info."
    
    # Generate question using the pipeline
    result = generation_pipeline(prompt, max_length=300, do_sample=True)
    
    # Retrieve generated text
    if isinstance(result, list) and "generated_text" in result[0]:
        question = result[0]["generated_text"].strip()
    else:
        question = result[0] if isinstance(result, list) else result

    return question

# Function to determine question difficulty based on risk score
def get_question_difficulty(risk_score):
    if risk_score < 10:
        return "low"
    elif risk_score < 15:
        return "medium"
    else:
        return "high"

# Example response generation function integrating with risk score
def generate_risk_aware_question(user_answer, expected_answer, employee_info):
    # Calculate risk score based on similarity
    expected_embedding = sentence_model.encode([expected_answer])
    user_embedding = sentence_model.encode([user_answer])
    
    # Calculate cosine similarity using sentence embeddings
    similarity_score = cosine_similarity(expected_embedding, user_embedding)[0][0]
    
    # Risk score is inversely related to similarity
    risk_score = (1 - similarity_score) * 100

    # Select question difficulty based on risk score
    difficulty = get_question_difficulty(risk_score)
    
    # Dynamically generate question based on employee data and risk level
    question = generate_dynamic_question(employee_info, difficulty)

    # Output for testing
    print(f"Risk Score: {risk_score:.2f} - Question Difficulty: {difficulty.capitalize()}")
    print(f"Next Question: {question}")

    return question, risk_score

# Example Usage
query = "Where did John go on his last vacation?"
expected_answer = "Bali"
user_answer = "He went to Paris"  # Incorrect answer, triggering higher risk
employee_info = "John loves pets, has a golden retriever named Max, and enjoys volunteering at the local animal shelter on weekends. He visited Bali last summer and stayed at the Serenity Resort. During his trip, he took a surfing lesson, tried local Balinese cuisine, and attended a cultural dance event. John works as a software engineer in the finance department and has been with the company since 2018. He attended Stanford University, where he studied Computer Science. John’s hobbies include hiking, reading science fiction novels, and playing the guitar. He has a younger sister, Emma, who lives in Boston. Recently, he went on a business trip to Tokyo and enjoyed visiting the Tokyo Tower and trying authentic sushi."

next_question, current_risk_score = generate_risk_aware_question(user_answer, expected_answer, employee_info)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Risk Score: 19.56 - Question Difficulty: High
Next Question: Based on the following information about an employee: 'John loves pets, has a golden retriever named Max, and enjoys volunteering at the local animal shelter on weekends. He visited Bali last summer and stayed at the Serenity Resort. During his trip, he took a surfing lesson, tried local Balinese cuisine, and attended a cultural dance event. John works as a software engineer in the finance department and has been with the company since 2018. He attended Stanford University, where he studied Computer Science. John’s hobbies include hiking, reading science fiction novels, and playing the guitar. He has a younger sister, Emma, who lives in Boston. Recently, he went on a business trip to Tokyo and enjoyed visiting the Tokyo Tower and trying authentic sushi.', generate a high-risk question for verification purposes. The answer must be present in employee info.
What is the name of the local animal shelter where John volunteers on w

In [15]:
def agentic_rag(query, risk_threshold=15):
    """Retrieves context, generates dynamic questions, and refines based on user responses autonomously."""
    
    # Initial retrieval context
    context = retrieve_context(query, top_k=1)[0]
    history = []  # To store question/response history

    # Generate initial response
    prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
    initial_response = generation_pipeline(prompt, max_length=300, do_sample=True)[0]["generated_text"]
    response = initial_response  # Initialize response with initial response

    # Simulate user answer and calculate risk
    user_answer = "Simulated User Answer"
    risk_score = calculate_risk_score(initial_response, user_answer)

    # Dynamic adjustment of query flow based on risk score
    while risk_score > risk_threshold:
        # Adjust query difficulty for high-risk score
        refined_prompt = f"Context: {context}\nGiven higher risk detected, please verify:\n{query}\nAnswer:"
        response = generation_pipeline(refined_prompt, max_length=300, do_sample=True)[0]["generated_text"]
        
        # Log and re-evaluate
        history.append((query, response))
        user_answer = "Bali"  # Simulate hacker answer
        risk_score = calculate_risk_score(response, user_answer)

    return response, risk_score, history

# Example Query Execution with Agentic RAG
query = "Where did John go on his last vacation?"
response, final_risk_score, conversation_history = agentic_rag(query)
print("Final Response:", response)
print("Final Risk Score:", final_risk_score)
print("Conversation History:", conversation_history)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Final Response: Context: John loves pets, has a golden retriever named Max, and enjoys volunteering at the local animal shelter on weekends. He visited Bali last summer and stayed at the Serenity Resort. During his trip, he took a surfing lesson, tried local Balinese cuisine, and attended a cultural dance event. John works as a software engineer in the finance department and has been with the company since 2018. He attended Stanford University, where he studied Computer Science. John’s hobbies include hiking, reading science fiction novels, and playing the guitar. He has a younger sister, Emma, who lives in Boston. Recently, he went on a business trip to Tokyo and enjoyed visiting the Tokyo Tower and trying authentic sushi.
Given higher risk detected, please verify:
Where did John go on his last vacation?
Answer: Bali.
Final Risk Score: 14.834016561508179
Conversation History: [('Where did John go on his last vacation?', 'Context: John loves pets, has a golden retriever named Max, and 