In [44]:
import os
from langchain_groq import ChatGroq
from langchain.document_loaders import PyMuPDFLoader, WebBaseLoader
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
import json


In [45]:
import os
from getpass import getpass

# Never store API keys in the notebook itself: anything written here ends up in
# version control and in every shared copy of the file. Take the key from the
# environment and, if it is not set, ask for it interactively without echoing.
# NOTE(review): a key was previously hardcoded in this cell; treat it as leaked
# and revoke it with the provider.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

### TASK 1: SOURCE DISCOVERY ###
### Load resume document

In [46]:
# Knowledge sources for the chatbot:
#   pdf_files - local PDF documents (the CV)
#   web_links - web pages fetched over HTTP (the LinkedIn profile)
pdf_files = [
    "CV_Khin_Yadanar_Hlaing.pdf",
]
web_links = [
    "https://www.linkedin.com/in/kyhlaing/",
]

In [47]:
# Accumulates every loaded document (PDF pages here, web pages in the next cell)
documents = []

# Load PDF documents
for pdf_file in pdf_files:
    if not os.path.exists(pdf_file):
        # Do not skip silently: a missing or mistyped path would otherwise only
        # show up much later as an empty or incomplete index.
        print(f"Warning: PDF not found, skipping: {pdf_file}")
        continue
    pdf_loader = PyMuPDFLoader(pdf_file)
    documents.extend(pdf_loader.load())


In [48]:
# Fetch each web source and append its pages to the shared document list
for link in web_links:
    web_pages = WebBaseLoader(link).load()
    documents.extend(web_pages)


In [49]:
# Break the loaded documents into overlapping chunks for embedding:
# chunks of up to 500 units with a 50-unit overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)
docs = text_splitter.split_documents(documents)

In [50]:
# Generate embeddings using SentenceTransformer
if not docs:
    # Fail early with a clear message: building a FAISS index from zero chunks
    # is expected to fail with a much less helpful error (TODO confirm exact
    # exception against the installed LangChain version).
    raise ValueError(
        "No document chunks to index; check that the PDF and web sources were loaded."
    )
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# Embed every chunk and store the vectors in an in-memory FAISS index
vectorstore = FAISS.from_documents(docs, embeddings)

In [51]:
# Wrap the vector store in a retriever (default search settings)
retriever = vectorstore.as_retriever()

# Prompt used by the QA chain; {context} and {question} are filled in per query
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=(
        "You are an AI assistant. Answer the following question based on the provided context."
        "\n\nContext: {context}"
        "\n\nQuestion: {question}"
        "\n\nAnswer:"
    ),
)


In [52]:
# Chat model served by Groq that generates the answers.
# No API key is passed here; presumably the client picks up GROQ_API_KEY from
# the environment set earlier in the notebook - verify against langchain_groq.
llm = ChatGroq(
    model="llama-3.1-8b-instant",
    max_retries=2,     # retry failed requests up to two times
    timeout=None,      # no explicit request timeout configured
    max_tokens=None,   # no explicit cap on response length configured
    temperature=0,     # lowest sampling temperature
)

In [53]:
# Assemble the retrieval-augmented QA chain from the retriever, LLM and prompt.
# return_source_documents=True is set so the retrieved chunks come back with
# each answer and can be inspected later.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type_kwargs={"prompt": prompt_template},
    return_source_documents=True,
    retriever=retriever,
)

### TASK 2: ANALYSIS AND PROBLEM SOLVING ###

In [54]:
# Human-readable labels for the two models used in this pipeline
retriever_model = "FAISS (all-MiniLM-L6-v2)"
generator_model = "Groq LLaMA-3.1-8B"

# Report which models are in use
print(
    f"Retriever Model: {retriever_model}",
    f"Generator Model: {generator_model}",
    sep="\n",
)

# Analyze potential issues (hallucinations, unrelated responses)
def analyze_response(question, response):
    """Print a question, its generated answer and the metadata of the retrieved sources.

    Args:
        question: The question text that was sent to the QA chain.
        response: Dict returned by the chain. Must contain ``'result'``; may
            contain ``'source_documents'``, a list of objects exposing
            ``.metadata``.

    Returns:
        None. The summary is written to stdout.

    Raises:
        KeyError: If ``response`` has no ``'result'`` entry.
    """
    # 'source_documents' may be absent (e.g. a chain built without
    # return_source_documents=True); fall back to an empty list rather than
    # raising KeyError.
    source_docs = response.get("source_documents") or []
    sources = [doc.metadata for doc in source_docs]
    print(f"Q: {question}\nA: {response['result']}\nSources: {sources}")

Retriever Model: FAISS (all-MiniLM-L6-v2)
Generator Model: Groq LLaMA-3.1-8B


### TASK 3: CHATBOT DEVELOPMENT & JSON OUTPUT ###

In [55]:
# Questions put to the chatbot, grouped by theme; order is preserved because
# the answers are saved in the same order.
questions = (
    # Background facts: age, education, work history, current role
    [
        "How old am I?",
        "What is my highest level of education?",
        "What major or field of study did I pursue during my education?",
        "How many years of work experience do I have?",
        "What type of work or industry have I been involved in?",
        "Can you describe your current role or job responsibilities?",
    ]
    # Beliefs about technology, society and culture
    + [
        "What are your core beliefs regarding the role of technology in shaping society?",
        "How do you think cultural values should influence technological advancements?",
    ]
    # Master's studies: challenges and goals
    + [
        "As a master’s student, what is the most challenging aspect of your studies so far?",
        "What specific research interests or academic goals do you hope to achieve during your time as a master’s student?",
    ]
)

In [56]:
# Store responses in JSON format
qa_responses = []
for question in questions:
    # One chain call per question; the chain returns the answer under 'result'
    response = qa_chain.invoke({"query": question})
    qa_responses.append({"question": question, "answer": response['result']})
    analyze_response(question, response)

# Save responses as JSON
# Write UTF-8 explicitly (the platform default encoding varies) and keep
# non-ASCII characters as-is so that text such as the curly apostrophes in the
# questions stays readable in the file instead of becoming \uXXXX escapes.
with open("chatbot_responses.json", "w", encoding="utf-8") as json_file:
    json.dump(qa_responses, json_file, indent=4, ensure_ascii=False)

print("Responses saved to chatbot_responses.json")

Q: How old am I?
A: You are 29 years old, as you were born on August 5, 1996.
Sources: [{'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-03-15T00:31:18+07:00', 'source': 'CV_Khin_Yadanar_Hlaing.pdf', 'file_path': 'CV_Khin_Yadanar_Hlaing.pdf', 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'python-docx', 'subject': '', 'keywords': '', 'moddate': '2025-03-15T00:31:18+07:00', 'trapped': '', 'modDate': "D:20250315003118+07'00'", 'creationDate': "D:20250315003118+07'00'", 'page': 0}, {'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-03-15T00:31:18+07:00', 'source': 'CV_Khin_Yadanar_Hlaing.pdf', 'file_path': 'CV_Khin_Yadanar_Hlaing.pdf', 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'python-docx', 'subject': '', 'keywords': '', 'moddate': '2025-03-15T00:31:18+07:00', 'trapped': '', 'modDate': "D:20250315003118+07'00'", 'creationDate': "D:20250315003118+07'00'", 'page': 0}, 