In [None]:
!pip install transformers torch faiss-cpu chromadb pandas sentence-transformers




In [None]:
import pandas as pd

# Read the internal-documentation CSV and replace every missing cell with "N/A"
file_path = "Company_Internal_Documentation_Dataset.csv"
df = pd.read_csv(file_path).fillna("N/A")

# Normalise the free-text columns: cast to str, trim surrounding whitespace and
# lowercase. Note that this also lowercases the "N/A" placeholder to "n/a" in
# these columns.
text_columns = [
    "Department",
    "Role",
    "Task/Process",
    "Tools Used",
    "Best Practice",
    "Common Issue & Solution",
    "Contact Person",
]
df[text_columns] = df[text_columns].apply(
    lambda column: column.astype(str).str.strip().str.lower()
)

print("Data loaded and processed successfully!")


Data loaded and processed successfully!


In [None]:

import faiss
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; later cells reuse it to encode user queries
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Build one "Label: value, Label: value, ..." string per row. This combined
# text is what gets embedded for retrieval.
labelled_columns = [
    ("Department", "Department"),
    ("Role", "Role"),
    ("Task", "Task/Process"),
    ("Tools", "Tools Used"),
    ("Best Practice", "Best Practice"),
    ("Common Issues", "Common Issue & Solution"),
    ("Contact Person", "Contact Person"),
]
retrieval_text = None
for label, column in labelled_columns:
    labelled_value = label + ": " + df[column]
    if retrieval_text is None:
        retrieval_text = labelled_value
    else:
        retrieval_text = retrieval_text + ", " + labelled_value
df["retrieval_text"] = retrieval_text

# Encode every row's combined text into a NumPy array of embeddings
corpus = df["retrieval_text"].tolist()
embeddings = embedding_model.encode(corpus, convert_to_numpy=True)

# Flat L2 index whose dimensionality matches the embedding width
dimension = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embeddings)

# Persist the index and the DataFrame (as a plain dict) so they can be reloaded
faiss.write_index(faiss_index, "company_qa_faiss.index")
with open("company_qa_metadata.pkl", "wb") as metadata_file:
    pickle.dump(df.to_dict(), metadata_file)

print("FAISS index and metadata saved successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FAISS index and metadata saved successfully!


In [None]:
# Reload the persisted FAISS index from disk
faiss_index = faiss.read_index("company_qa_faiss.index")

# Reload the pickled metadata dict and rebuild the DataFrame from it.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook wrote itself.
with open("company_qa_metadata.pkl", "rb") as metadata_file:
    metadata = pickle.load(metadata_file)
df = pd.DataFrame.from_dict(metadata)





def retrieve_context(query, top_k=3):
    """Return up to ``top_k`` documentation snippets relevant to ``query``.

    The query is embedded with the module-level ``embedding_model`` and the
    ``top_k * 3`` nearest rows are requested from ``faiss_index`` (over-fetching
    leaves room for the filtering below). Each hit is looked up in ``df``:

    * if the query mentions "contact", the hit contributes a short
      ``Role / contact`` line (identical lines are emitted only once);
    * otherwise the hit contributes its full record, but only when its role
      name occurs in the query and that role has not been emitted yet.

    Args:
        query: The user's question.
        top_k: Maximum number of snippets to return.

    Returns:
        The snippets joined by blank lines, or the string
        ``"No relevant results found."`` when no hit qualifies.
    """
    query_lower = query.lower()
    # The former extra test for "who should I contact" could never match a
    # lowercased query (capital "I") and was already implied by this check.
    wants_contact = "contact" in query_lower

    query_vector = embedding_model.encode([query])
    _, indices = faiss_index.search(query_vector, top_k * 3)

    retrieved_docs = []
    seen_roles = set()

    for match_idx in indices[0]:
        # FAISS pads the result with -1 when the index holds fewer vectors than
        # requested; df.iloc[-1] would silently return the last row instead.
        if match_idx < 0:
            continue

        row = df.iloc[int(match_idx)]
        role = row["Role"]

        # The "N/A" placeholder may have been lowercased during preprocessing,
        # so compare case-insensitively.
        contact_person = row["Contact Person"]
        if str(contact_person).strip().lower() == "n/a":
            contact_info = "No assigned contact"
        else:
            contact_info = f"Contact Person: {contact_person}"

        if wants_contact:
            entry = f"Role: {row['Role']}, {contact_info}"
            # Skip exact repeats so duplicates do not use up the top_k budget.
            if entry not in retrieved_docs:
                retrieved_docs.append(entry)
        elif str(role).lower() in query_lower and role not in seen_roles:
            retrieved_docs.append(
                f"Department: {row['Department']}, Role: {row['Role']}, "
                f"Task: {row['Task/Process']}, Tools: {row['Tools Used']}, "
                f"Best Practice: {row['Best Practice']}, Issues: {row['Common Issue & Solution']}, "
                f"{contact_info}"
            )
            seen_roles.add(role)

        if len(retrieved_docs) >= top_k:
            break

    return "\n\n".join(retrieved_docs) if retrieved_docs else "No relevant results found."




# Status message confirming this cell ran and retrieve_context is now defined
print("Retrieval function is ready!")



Retrieval function is ready!


In [None]:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Hugging Face checkpoint to load
model_name = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# With a GPU, let the library place the weights ("auto"); otherwise pin
# everything to the CPU.
# NOTE(review): float16 is requested on the CPU path as well - confirm that is
# intended for the torch version in use.
device_map = "auto" if torch.cuda.is_available() else {"": "cpu"}
phi_model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map=device_map
)

# Save the tokenizer and the model locally
tokenizer.save_pretrained("models/phi-2")
phi_model.save_pretrained("models/phi-2")

print("Phi-2 Model loaded successfully!")



tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Downloading model (this may take several minutes)...


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Saving model locally...
  adding: phi-2/ (stored 0%)
  adding: phi-2/added_tokens.json (deflated 84%)
  adding: phi-2/config.json (deflated 47%)
  adding: phi-2/model-00002-of-00002.safetensors (deflated 8%)
  adding: phi-2/vocab.json (deflated 59%)
  adding: phi-2/tokenizer.json (deflated 82%)
  adding: phi-2/tokenizer_config.json (deflated 94%)
  adding: phi-2/model.safetensors.index.json (deflated 96%)
  adding: phi-2/generation_config.json (deflated 24%)
  adding: phi-2/special_tokens_map.json (deflated 75%)
  adding: phi-2/model-00001-of-00002.safetensors (deflated 8%)
  adding: phi-2/merges.txt (deflated 53%)


In [None]:
def generate_response(query, top_k=3):
    """Answer ``query`` with Phi-2, grounded in retrieved company documentation.

    Context is fetched with ``retrieve_context``; when nothing relevant is
    found a fixed fallback sentence is returned without calling the model.
    Otherwise a prompt is built from the context and the question, the
    module-level ``phi_model`` generates a continuation, and only the newly
    generated text is decoded and cleaned up.

    Args:
        query: The user's question.
        top_k: Maximum number of context snippets to retrieve.

    Returns:
        The cleaned answer text, or the fallback sentence when no context was
        found.
    """
    context = retrieve_context(query, top_k)

    if "No relevant results found." in context:
        return "I couldn't find specific data related to your request in the company documentation."

    prompt = f"""
    You are an AI assistant designed to help new employees.
    Answer in a natural, human-like way, without using code or programming syntax.
    Do NOT format your response as a class, function, or structured data.
    Simply provide a professional, conversational answer.

    CONTEXT:
    {context}

    QUESTION: {query}

    ANSWER:
    """

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
    # Follow the model's own device instead of hard-coding "cuda".
    inputs = inputs.to(phi_model.device)
    prompt_length = inputs["input_ids"].shape[1]

    outputs = phi_model.generate(
        input_ids=inputs["input_ids"],
        # Pass the mask explicitly: the pad token may be the EOS token, so it
        # cannot be inferred reliably from the ids alone.
        attention_mask=inputs["attention_mask"],
        max_length=1024,  # counts the prompt tokens as well as the generated ones
        num_return_sequences=1,
        temperature=0.5,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode only the newly generated tokens. Removing the prompt by string
    # replacement breaks whenever decoding does not reproduce it verbatim.
    generated_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    # Cut the answer at the first marker that signals the model drifted into a
    # new Q&A turn or code. '"""' is included because the model tends to close
    # the prompt like a docstring, which leaked into answers.
    # NOTE(review): "return" also truncates ordinary sentences containing that
    # word - confirm this is acceptable.
    unwanted_phrases = ["QUESTION:", "ANSWER:", "return", '"""']
    for phrase in unwanted_phrases:
        if phrase in response:
            response = response.split(phrase)[0].strip()

    # Drop the context if the model merely echoed it at the start of the answer
    if response.lower().startswith(context.lower()):
        response = response[len(context):].strip()

    return response

In [None]:
# Example question sent through the full retrieve-then-generate pipeline
query = "What are my tasks as a content writer, what tools i can use?"
response = generate_response(query)

# Show the generated answer
print("AI Response:\n", response)



AI Response:
 As a content writer, your tasks include creating engaging and informative content for our marketing campaigns. You can use tools like MailChimp to manage and send out newsletters and other promotional materials. It is important to ensure proper documentation before execution to avoid any issues like incorrect permissions leading to access denial. If you encounter any issues, you can contact John Doe for assistance.
    """
