# Install Required Libraries

In [2]:
!pip install langchain-core langgraph>0.2.27
!pip install -qU langchain-openai
!pip install langchain-groq
!pip install pymysql
!pip install cryptography
!pip install langchain_community
!pip install faiss-cpu
!pip install boto3
!pip install rank_bm25

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m415.9/415.9 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.4/567.4 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain 0.3.12 requires async-timeout<5.0.0,>=4.0.0; python_version < "3.11", but you have async-timeout 5.0.1 which is incompatible.[0m[31m
[0mCollecting langchain-groq
  Downloading langchain_groq-0.2.5-py3-none-any.whl.metadata (2.6 kB)
Collecting groq<1,>=0.4.1 (from langchain-groq)
  Downloading groq-0.19.0-py3-none-any.whl.metadata (15 kB)
Downloading langchain_groq-0.2.5-py3-none-any.whl (15 kB)
Downloading groq-0.19.0-py3-none-any.whl (1

# Import Libraries

In [None]:
# Utilities
import os
from typing_extensions import Annotated, TypedDict
from typing import Sequence, Literal, Optional, Dict, Any
from io import StringIO  # read file from s3
import pandas as pd
# LangChain
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
# S3 Object AWS
import boto3
# Document and Splitter
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Huggingface Embeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
# Hybrid search to enhance semantic search
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
# Vector Store
from langchain_community.vectorstores import FAISS
# QNA
from langchain.chains import RetrievalQA
# GPU
import torch
# kaggle secrets key
from kaggle_secrets import UserSecretsClient

# Credentials and LLM setup

In [40]:
# Copy the Groq API key, the AWS credentials and the S3 bucket name from
# Kaggle secrets into environment variables so later cells can read them.
user_secrets = UserSecretsClient()
for secret_name in (
    "AWS_ACCESS_KEY_ID",
    "AWS_SECRET_ACCESS_KEY",
    "GROQ_API_KEY",
    "S3_BUCKET_JOB",
):
    os.environ[secret_name] = user_secrets.get_secret(secret_name)

In [41]:
# Chat model (served through Groq) used by the chains in the cells below
llm = ChatGroq(
    model_name="mistral-saba-24b",
    temperature=0,
    max_tokens=2048,  # upper bound on the length of each response
    streaming=True,  # request a streamed response
    max_retries=2,  # retry setting handed to the ChatGroq client
)

In [42]:
# Test the LLM connection with a minimal prompt -> model chain
system = "You are a helpful assistant."  # text of the system message
human = "{text}"  # human message template; {text} is supplied at invoke time
prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human)])

# Pipe the prompt into the chat model defined above
chain = prompt | llm
# Last expression of the cell, so the returned message is shown as the cell output
chain.invoke({"text": "Where is Ho Chi Minh CIty located."})

AIMessage(content='Ho Chi Minh City, formerly known as Saigon, is located in the southern region of Vietnam. It is the largest city in the country and serves as its economic hub. Here are some geographical details:\n\n- **Country**: Vietnam\n- **Region**: Southern Vietnam\n- **Coordinates**: Approximately 10.8503° N, 106.6880° E\n- **Nearest Major Landmarks**: The city is situated along the Saigon River, which flows into the Mekong River Delta.\n\nHo Chi Minh City is known for its vibrant culture, bustling markets, and historical sites, including the Reunification Palace and the War Remnants Museum. It is a major economic center and a popular destination for both business and tourism.', additional_kwargs={}, response_metadata={'finish_reason': 'stop'}, id='run-74ea9d77-f1c3-42ef-a5df-e7c0939aa810-0', usage_metadata={'input_tokens': 20, 'output_tokens': 158, 'total_tokens': 178})

In [9]:
# Read the job data file from S3.
# The client is created without explicit credentials; the AWS_* environment
# variables are set in an earlier cell.
s3_client = boto3.client("s3")
# Fetch "job_data.csv" from the bucket named by the S3_BUCKET_JOB environment variable
response = s3_client.get_object(Bucket=os.environ["S3_BUCKET_JOB"], Key = "job_data.csv")
# Read the whole object body and decode it as UTF-8 text
content = response["Body"].read().decode("utf-8")

In [10]:
# Parse the downloaded CSV text into a DataFrame
# (StringIO wraps the string in a file-like object for read_csv)
df = pd.read_csv(StringIO(content))

In [11]:
# Preview the first rows of the job postings
df.head()

Unnamed: 0,job_title,company_name,job_location,job_time_posted,job_applicants_applied,job_role,job_details
0,Data Engineer Intern,OPSWAT,"Ho Chi Minh City, Vietnam",2 days ago,Over 100 applicants,"On-site,Internship,0 of 10 skills match Requir...",The Position\nWe are looking for an eager and ...
1,Data Engineer Intern,ActiveFence,"Hanoi, Hanoi, Vietnam",5 days ago,70 people clicked apply,"Hybrid,Internship",About the Role:\nWe are seeking a motivated an...
2,"Software Intern (Fullstack), Digital Business",VNG Corporation,"Phường Chí Minh, Hai Duong, Vietnam",2 days ago,95 people clicked apply,"On-site,Full-time",Mô tả công việc\nAssist in full-stack web deve...
3,"Technical Intern, SLT Data Application Engineer",Ampere,Ho Chi Minh City Metropolitan Area,2 weeks ago,41 people clicked apply,"On-site,Internship",Description\nInvent the future with us.\nRecog...
4,Software Engineer Intern - QA,ShopBack,Ho Chi Minh City Metropolitan Area,Reposted 19 hours ago,Over 100 people clicked apply,"Hybrid,Internship",Our Journey\nShopBack started as a spark of in...


In [12]:
# Preview the last rows of the job postings
df.tail()

Unnamed: 0,job_title,company_name,job_location,job_time_posted,job_applicants_applied,job_role,job_details
16,Software Engineer Intern - Frontend,ShopBack,Ho Chi Minh City Metropolitan Area,2 weeks ago,Over 100 people clicked apply,"Hybrid,Internship",Our Journey\nShopBack started as a spark of in...
17,Game Designer ( 6 - months internship),Gameloft,"Hanoi, Hanoi, Vietnam",2 weeks ago,19 applicants,"On-site,Full-time",Join The Game!\nLeader in the development and ...
18,"DevOps Engineer Intern, KMS Healthcare",KMS Healthcare,"Đà Nang, Da Nang City, Vietnam",2 days ago,3 applicants,"Hybrid,Internship",Company Description\nKMS Technology\nwas estab...
19,"Technical Intern, BMC Software Engineer",Ampere,Ho Chi Minh City Metropolitan Area,Reposted 1 week ago,Over 100 people clicked apply,"On-site,Internship",Description\nInvent the future with us.\nRecog...
20,Junior AI Software Engineer,AvePoint,"Hanoi, Hanoi, Vietnam",Reposted 5 days ago,52 applicants,"On-site,Full-time",Responsibilities\nDevelopment Support:\nAssist...


In [13]:
# Confirm that the loaded object is a pandas DataFrame
type(df)

pandas.core.frame.DataFrame

# Analyze the data

In [14]:
# List all column names (features) of the data
df.columns.to_list()

['job_title',
 'company_name',
 'job_location',
 'job_time_posted',
 'job_applicants_applied',
 'job_role',
 'job_details']

In [15]:
# Show per-column non-null counts and dtypes to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   job_title               21 non-null     object
 1   company_name            21 non-null     object
 2   job_location            21 non-null     object
 3   job_time_posted         21 non-null     object
 4   job_applicants_applied  21 non-null     object
 5   job_role                21 non-null     object
 6   job_details             20 non-null     object
dtypes: object(7)
memory usage: 1.3+ KB


In [16]:
# Preview the first rows again (same call as the earlier preview cell)
df.head()

Unnamed: 0,job_title,company_name,job_location,job_time_posted,job_applicants_applied,job_role,job_details
0,Data Engineer Intern,OPSWAT,"Ho Chi Minh City, Vietnam",2 days ago,Over 100 applicants,"On-site,Internship,0 of 10 skills match Requir...",The Position\nWe are looking for an eager and ...
1,Data Engineer Intern,ActiveFence,"Hanoi, Hanoi, Vietnam",5 days ago,70 people clicked apply,"Hybrid,Internship",About the Role:\nWe are seeking a motivated an...
2,"Software Intern (Fullstack), Digital Business",VNG Corporation,"Phường Chí Minh, Hai Duong, Vietnam",2 days ago,95 people clicked apply,"On-site,Full-time",Mô tả công việc\nAssist in full-stack web deve...
3,"Technical Intern, SLT Data Application Engineer",Ampere,Ho Chi Minh City Metropolitan Area,2 weeks ago,41 people clicked apply,"On-site,Internship",Description\nInvent the future with us.\nRecog...
4,Software Engineer Intern - QA,ShopBack,Ho Chi Minh City Metropolitan Area,Reposted 19 hours ago,Over 100 people clicked apply,"Hybrid,Internship",Our Journey\nShopBack started as a spark of in...


# Loading Documents

In [43]:
# Convert each job posting (one DataFrame row) into a LangChain Document.

# (column name, label used in the document text), in the order they are written
_JOB_FIELDS = (
    ("job_title", "Job Title"),
    ("company_name", "Company Name"),
    ("job_location", "Job Location"),
    ("job_time_posted", "Time of the job that is posted in Linkedin"),
    ("job_applicants_applied", "Applicants of Job that is applied"),
    ("job_role", "Role of the job"),
    (
        "job_details",
        "Details of the job: it include: qualification, requirement, beneficial and something like that",
    ),
)


def _field_text(value) -> str:
    """Return ``value`` as text, replacing a missing value (NaN/None) with a placeholder.

    Without this, a missing cell would be interpolated into the document as
    the literal text "nan".
    """
    return str(value) if pd.notna(value) else "Not provided"


def _page_content(row) -> str:
    """Build the text of one job posting as ``Label: value`` fields.

    Fields are separated by one blank line. Joining the parts explicitly keeps
    source-code indentation out of the document text.
    """
    return "\n\n".join(
        f"{label}: {_field_text(row[column])}" for column, label in _JOB_FIELDS
    )


documents = [
    Document(
        page_content=_page_content(row),
        # Metadata carried alongside the text of each posting
        metadata={
            "job_title": row["job_title"],
            "company_name": row["company_name"],
            "job_location": row["job_location"],
        },
    )
    for _, row in df.iterrows()
]

# Preprocess the text with a splitter and load it into the embedding store

In [44]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [45]:
# Break the documents into overlapping chunks; add_start_index is enabled
# on the splitter
splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=100,
    chunk_size=1000,
)
split_docs = splitter.split_documents(documents)

# Sentence-transformers embedding model, placed on the selected device
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs=dict(device=device),
    encode_kwargs=dict(batch_size=32, normalize_embeddings=True),
)

# Embed the chunks and index them in a FAISS vector store
vector_db = FAISS.from_documents(
    documents=split_docs,
    embedding=embeddings,
    distance_strategy="COSINE",
)

# Persist the index to the "faiss_index" folder
vector_db.save_local(folder_path="faiss_index")

In [None]:
# Reload the FAISS index from the same relative folder it was saved to
# ("faiss_index"), rather than a hardcoded absolute Kaggle path, so the
# notebook does not depend on the working directory being /kaggle/working.
vector_db = FAISS.load_local(
    "faiss_index",
    embeddings,
    # LangChain requires explicit permission to deserialize a saved FAISS
    # index. Only enable this for index files created by this notebook.
    allow_dangerous_deserialization=True,
)
# Semantic (vector) retriever returning the top 5 chunks
vector_retriever = vector_db.as_retriever(search_kwargs={"k": 5})

# Keyword (BM25) retriever over the same chunks, also returning the top 5
bm25_retriever = BM25Retriever.from_documents(split_docs)
bm25_retriever.k = 5

# Hybrid search: combine keyword search and semantic search
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, vector_retriever],
    weights=[0.3, 0.7],  # weight semantic search higher than keyword search
)

In [63]:
# Search for similar job postings.
# This queries the FAISS store directly (not the ensemble retriever) and
# asks for the 2 closest chunks.
query = "Recommend for me about job opening at OPSWAT"
retrieved_jobs = vector_db.similarity_search(query, k=2)

# Each result is a chunk of a posting; print its text
for job in retrieved_jobs:
    print(job.page_content)  # Print retrieved job postings

Job Title: Data Engineer Intern
        
Company Name: OPSWAT
        
Job Location: Ho Chi Minh City, Vietnam
        
Time of the job that is posted in Linkedin: 2 days ago
        
Applicants of Job that is applied: Over 100 applicants
        
Role of the job: On-site,Internship,0 of 10 skills match Required skills are missing from your profile,0 of 10 skills match
        
Details of the job: it include: qualification, requirement, beneficial and something like that: The Position
We are looking for an eager and motivated Data Engineer Intern to join our team. In this role, you will assist in building and maintaining data pipelines, supporting data integration tasks, and learning how to leverage cloud and database technologies. This internship provides hands-on experience in data engineering, and you will have the opportunity to work with experienced professionals and grow your skills in data technologies.
What You Will be Doing
Job Title: DevOps Engineer Intern, KMS Healthcare
   

# Compare Semantic Search and Hybrid Search

In [47]:
# Without Hybrid Search
# Create a retriever from the vector database (semantic search only, top 5 chunks)
retriever = vector_db.as_retriever(search_kwargs={"k": 5})

# Set up the RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)

# Ask a job-related question.
# Chain.run() is deprecated; invoke() returns a dict with the answer under "result".
response = qa_chain.invoke(
    {"query": "Company available job about Data Engineer or Data Engineer Intern?"}
)["result"]
print(response)

Based on the provided context, the following companies have job postings related to Data Engineer or Data Engineer Intern:

1. **OPSWAT** - Data Engineer Intern
2. **ActiveFence** - Data Engineer Intern


In [48]:
# With Hybrid Search
# Set up the RetrievalQA chain on top of the BM25 + vector ensemble retriever
qa_chain = RetrievalQA.from_chain_type(llm, retriever=ensemble_retriever)

# Ask a job-related question.
# Chain.run() is deprecated; invoke() returns a dict with the answer under "result".
response = qa_chain.invoke(
    {"query": "Company available job about Data Engineer or Data Engineer Intern?"}
)["result"]
print(response)

Based on the provided context, the following companies have job postings related to Data Engineer or Data Engineer Intern:

1. **OPSWAT** - Data Engineer Intern
   - Location: Ho Chi Minh City, Vietnam
   - Posted: 2 days ago

2. **ActiveFence** - Data Engineer Intern
   - Location: Hanoi, Hanoi, Vietnam
   - Posted: 5 days ago


# Q&A Enhancement

In [49]:
# Create a custom prompt template that tells the model to answer only from
# the retrieved context
template = """Answer the question based only on the following context:

{context}

Question: {question}
Answer: """

PROMPT = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

# Set up a more detailed chain: "stuff" the retrieved chunks into the prompt
# and return the source documents together with the answer
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=ensemble_retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
    verbose=True  # Add this to see processing details
)

# Run the query with the full response.
# Calling the chain object directly is deprecated; invoke() returns the same
# dict, with the "result" and "source_documents" keys.
result = qa_chain.invoke({"query": "Tell me about job openings at OPSWAT"})
print("Answer:", result["result"])
print("\nSource documents used:")
for i, doc in enumerate(result["source_documents"]):
    print(f"\nDocument {i+1}:")
    print(f"Content: {doc.page_content[:150]}...")  # first 150 characters only
    print(f"Metadata: {doc.metadata}")



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Answer: Based on the provided context, there is one job opening at OPSWAT:

**Job Title:** Data Engineer Intern

**Company Name:** OPSWAT

**Job Location:** Ho Chi Minh City, Vietnam

**Time of the job that is posted on LinkedIn:** 2 days ago

**Applicants of Job that is applied:** Over 100 applicants

**Role of the job:** On-site, Internship

**Details of the job:**
- **The Position:** OPSWAT is looking for an eager and motivated Data Engineer Intern to join their team. In this role, you will assist in building and maintaining data pipelines, supporting data integration tasks, and learning how to leverage cloud and database technologies. This internship provides hands-on experience in data engineering, and you will have the opportunity to work with experienced professionals and grow your skills in data technologies.

Source documents used:

Document 1:
Content: Job Title: Data Engineer Intern
        
Company Nam

In [33]:
# Run the query with the full response.
# Calling the chain object directly is deprecated; invoke() returns the same
# dict, with the "result" and "source_documents" keys.
result = qa_chain.invoke({"query": "Tell me about job openings for Data Engineer role"})
print("Answer:", result["result"])
print("\nSource documents used:")
for i, doc in enumerate(result["source_documents"]):
    print(f"\nDocument {i+1}:")
    print(f"Content: {doc.page_content[:150]}...")  # first 150 characters only
    print(f"Metadata: {doc.metadata}")



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Answer: Based on the provided context, there are two job openings for the Data Engineer role:

1. **Data Engineer Intern at OPSWAT**
   - **Company Name:** OPSWAT
   - **Job Location:** Ho Chi Minh City, Vietnam
   - **Posted:** 2 days ago
   - **Applicants:** Over 100 applicants
   - **Role:** On-site, Internship
   - **Responsibilities:**
     - Assist in building, testing, and optimizing simple data pipelines using tools like Azure Data Factory.
     - Manage and integrate data from different sources (SQL and NoSQL databases).
     - Write basic SQL queries for data extraction, transformation, and loading (ETL).
     - Support data engineers in automating processes and writing Python scripts for data manipulation.
     - Monitor data quality and assist with resolving data issues.
     - Collaborate with team members to ensure proper documentation of processes.
     - Learn about data modeling and best practices

In [34]:
# Run the query with the full response.
# Calling the chain object directly is deprecated; invoke() returns the same
# dict, with the "result" and "source_documents" keys.
result = qa_chain.invoke({"query": "Tell me about job openings for Data field"})
print("Answer:", result["result"])
print("\nSource documents used:")
for i, doc in enumerate(result["source_documents"]):
    print(f"\nDocument {i+1}:")
    print(f"Content: {doc.page_content[:150]}...")  # first 150 characters only
    print(f"Metadata: {doc.metadata}")



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Answer: Based on the provided context, there are three job openings in the data field:

1. **Data Engineer Intern at OPSWAT**
   - **Location:** Ho Chi Minh City, Vietnam
   - **Posted:** 2 days ago
   - **Applicants:** Over 100 applicants
   - **Role:** On-site, Internship
   - **Responsibilities:**
     - Assist in building, testing, and optimizing data pipelines using tools like Azure Data Factory.
     - Manage and integrate data from different sources (SQL and NoSQL databases).
     - Write basic SQL queries for ETL processes.
     - Support data engineers in automating processes and writing Python scripts for data manipulation.
     - Monitor data quality and assist with resolving data issues.
     - Collaborate with team members to ensure proper documentation of processes.
     - Learn about data modeling and best practices for data engineering.
   - **Qualifications:**
     - Currently pursuing a degree in