# Install Resource Library

In [None]:
!pip install langchain-core langgraph>0.2.27
!pip install -qU langchain-openai
!pip install langchain-groq
!pip install pymysql
!pip install cryptography
!pip install langchain_community
!pip install faiss-cpu
!pip install boto3

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.4/55.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m415.9/415.9 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.4/567.4 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain 0.3.12 requires async-timeout<5.0.0,>=4.0.0; python_version < "3.11", but you have async-timeout 5.0.1 which is incompatible.[0m[31m
[0mCollecting langchain-groq
  Downloading langchain_groq-0.2.5-py3-none-any.whl.metadata (2.6 kB)
Collecting groq<1,>=0.4.1 (from langchain-groq)
  Downloading groq-0.19.0-py3-none-any.whl.metadata (15 kB)
Downloading langchain_groq-0.2.5-py3-none-any.whl (15 kB)
Downloading groq-0.19.0-py3-none-a

# Import Library

In [None]:
# Utilization
import os
from typing_extensions import Annotated, TypedDict
from typing import Sequence, Literal, Optional, Dict, Any
from io import StringIO
import pandas as pd
# LangChain
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
# S3 Object AWS
import boto3
# Document and Splitter
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Huggingface Embeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
# Vector Store
from langchain_community.vectorstores import FAISS
# QNA
from langchain.chains import RetrievalQA
# kaggle secret key
from kaggle_secrets import UserSecretsClient

# LangSmith set up to track process

In [None]:
# Define API KEY Groq
user_secrets = UserSecretsClient()
os.environ["AWS_ACCESS_KEY_ID"] = user_secrets.get_secret("AWS_ACCESS_KEY_ID")
os.environ["AWS_SECRET_ACCESS_KEY"] = user_secrets.get_secret("AWS_SECRET_ACCESS_KEY")
os.environ["GROQ_API_KEY"]= user_secrets.get_secret("GROQ_API_KEY")
os.environ["S3_BUCKET_JOB"] = user_secrets.get_secret("S3_BUCKET_JOB")

In [9]:
# Define model chatbot
llm = ChatGroq(temperature=0, 
               model_name="mistral-saba-24b",
              max_tokens=2048,
              streaming=True,
              max_retries=2)

In [None]:
# test connection
system = "You are a helpful assistant."
human = "{text}"
prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human)])

chain = prompt | llm
chain.invoke({"text": "Where is Ho Chi Minh CIty located."})

AIMessage(content='Ho Chi Minh City, formerly known as Saigon, is located in the southern region of Vietnam. It is the largest city in the country and serves as its economic hub. Here are some geographical details:\n\n- **Country**: Vietnam\n- **Region**: Southern Vietnam\n- **Coordinates**: Approximately 10.8503° N, 106.6880° E\n- **Nearest Major Landmarks**: The city is situated along the Saigon River, which flows into the Mekong River Delta.\n\nHo Chi Minh City is known for its vibrant culture, bustling markets, and historical sites, including the Reunification Palace and the War Remnants Museum. It is a major economic center and a popular destination for both business and tourism.', additional_kwargs={}, response_metadata={'finish_reason': 'stop'}, id='run-a09e49fc-315b-4ae5-ab61-a84436ed2187-0', usage_metadata={'input_tokens': 20, 'output_tokens': 158, 'total_tokens': 178})

In [19]:
# Read file from s3 
s3_client = boto3.client("s3")
response = s3_client.get_object(Bucket=os.environ["S3_BUCKET_JOB"], Key = "job_data.csv")
content = response["Body"].read().decode("utf-8")

In [None]:
# convert content into Data Frame
df = pd.read_csv(StringIO(content))

In [26]:
df.head()

Unnamed: 0,job_title,company_name,job_location,job_time_posted,job_applicants_applied,job_role,job_details
0,Data Engineer Intern,OPSWAT,"Ho Chi Minh City, Vietnam",2 days ago,Over 100 applicants,"On-site,Internship,0 of 10 skills match Requir...",The Position\nWe are looking for an eager and ...
1,Data Engineer Intern,ActiveFence,"Hanoi, Hanoi, Vietnam",5 days ago,70 people clicked apply,"Hybrid,Internship",About the Role:\nWe are seeking a motivated an...
2,"Software Intern (Fullstack), Digital Business",VNG Corporation,"Phường Chí Minh, Hai Duong, Vietnam",2 days ago,95 people clicked apply,"On-site,Full-time",Mô tả công việc\nAssist in full-stack web deve...
3,"Technical Intern, SLT Data Application Engineer",Ampere,Ho Chi Minh City Metropolitan Area,2 weeks ago,41 people clicked apply,"On-site,Internship",Description\nInvent the future with us.\nRecog...
4,Software Engineer Intern - QA,ShopBack,Ho Chi Minh City Metropolitan Area,Reposted 19 hours ago,Over 100 people clicked apply,"Hybrid,Internship",Our Journey\nShopBack started as a spark of in...


In [27]:
df.tail()

Unnamed: 0,job_title,company_name,job_location,job_time_posted,job_applicants_applied,job_role,job_details
16,Software Engineer Intern - Frontend,ShopBack,Ho Chi Minh City Metropolitan Area,2 weeks ago,Over 100 people clicked apply,"Hybrid,Internship",Our Journey\nShopBack started as a spark of in...
17,Game Designer ( 6 - months internship),Gameloft,"Hanoi, Hanoi, Vietnam",2 weeks ago,19 applicants,"On-site,Full-time",Join The Game!\nLeader in the development and ...
18,"DevOps Engineer Intern, KMS Healthcare",KMS Healthcare,"Đà Nang, Da Nang City, Vietnam",2 days ago,3 applicants,"Hybrid,Internship",Company Description\nKMS Technology\nwas estab...
19,"Technical Intern, BMC Software Engineer",Ampere,Ho Chi Minh City Metropolitan Area,Reposted 1 week ago,Over 100 people clicked apply,"On-site,Internship",Description\nInvent the future with us.\nRecog...
20,Junior AI Software Engineer,AvePoint,"Hanoi, Hanoi, Vietnam",Reposted 5 days ago,52 applicants,"On-site,Full-time",Responsibilities\nDevelopment Support:\nAssist...


In [28]:
# check type of data
type(df)

pandas.core.frame.DataFrame

# Analyze the data

In [30]:
# check all features of data
df.columns.to_list()

['job_title',
 'company_name',
 'job_location',
 'job_time_posted',
 'job_applicants_applied',
 'job_role',
 'job_details']

In [31]:
# check null data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   job_title               21 non-null     object
 1   company_name            21 non-null     object
 2   job_location            21 non-null     object
 3   job_time_posted         21 non-null     object
 4   job_applicants_applied  21 non-null     object
 5   job_role                21 non-null     object
 6   job_details             20 non-null     object
dtypes: object(7)
memory usage: 1.3+ KB


In [48]:
df.head()

Unnamed: 0,job_title,company_name,job_location,job_time_posted,job_applicants_applied,job_role,job_details
0,Data Engineer Intern,OPSWAT,"Ho Chi Minh City, Vietnam",2 days ago,Over 100 applicants,"On-site,Internship,0 of 10 skills match Requir...",The Position\nWe are looking for an eager and ...
1,Data Engineer Intern,ActiveFence,"Hanoi, Hanoi, Vietnam",5 days ago,70 people clicked apply,"Hybrid,Internship",About the Role:\nWe are seeking a motivated an...
2,"Software Intern (Fullstack), Digital Business",VNG Corporation,"Phường Chí Minh, Hai Duong, Vietnam",2 days ago,95 people clicked apply,"On-site,Full-time",Mô tả công việc\nAssist in full-stack web deve...
3,"Technical Intern, SLT Data Application Engineer",Ampere,Ho Chi Minh City Metropolitan Area,2 weeks ago,41 people clicked apply,"On-site,Internship",Description\nInvent the future with us.\nRecog...
4,Software Engineer Intern - QA,ShopBack,Ho Chi Minh City Metropolitan Area,Reposted 19 hours ago,Over 100 people clicked apply,"Hybrid,Internship",Our Journey\nShopBack started as a spark of in...


# Loading Documents

In [61]:
# Convert into documents
documents = [
    Document(
        page_content=f'''Job Title: {row["job_title"]}
        \nCompany Name: {row["company_name"]}
        \nJob Location: {row["job_location"]}
        \nTime of the job that is posted in Linkedin: {row["job_time_posted"]}
        \nApplicants of Job that is applied: {row["job_applicants_applied"]}
        \nRole of the job: {row["job_role"]}
        \nDetails of the job: it include: qualification, requirement, beneficial and something like that: {row["job_details"]}''',
        metadata={
            "job_title": row["job_title"],
            "company_name": row["company_name"]    
        },
    )
    for _, row in df.iterrows()
]

# Processing text with Splitter and Load it into Embedding

In [62]:
# Split long text
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
split_docs = splitter.split_documents(documents)

# Embedding model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Store documents in FAISS
vector_db = FAISS.from_documents(split_docs, embeddings)

# Save to disk
vector_db.save_local("faiss_index")

# Retriever 

In [63]:
# Search for similar job postings
query = "Recommend for me about job opening at OPSWAT"
retrieved_jobs = vector_db.similarity_search(query, k=2)

for job in retrieved_jobs:
    print(job.page_content)  # Print retrieved job postings

Job Title: Data Engineer Intern
        
Company Name: OPSWAT
        
Job Location: Ho Chi Minh City, Vietnam
        
Time of the job that is posted in Linkedin: 2 days ago
        
Applicants of Job that is applied: Over 100 applicants
        
Role of the job: On-site,Internship,0 of 10 skills match Required skills are missing from your profile,0 of 10 skills match
        
Details of the job: it include: qualification, requirement, beneficial and something like that: The Position
We are looking for an eager and motivated Data Engineer Intern to join our team. In this role, you will assist in building and maintaining data pipelines, supporting data integration tasks, and learning how to leverage cloud and database technologies. This internship provides hands-on experience in data engineering, and you will have the opportunity to work with experienced professionals and grow your skills in data technologies.
What You Will be Doing
Job Title: DevOps Engineer Intern, KMS Healthcare
   

# QNA

In [67]:
# Create retriever from vector database
retriever = vector_db.as_retriever(search_kwargs={"k": 5})

# Set up RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)

# Ask a job-related question
response = qa_chain.run("Company available job about Data Engineer or Data Engineer Intern?")
print(response)

Based on the provided context, the following companies have job postings related to Data Engineer or Data Engineer Intern:

1. **OPSWAT** - Data Engineer Intern
2. **ActiveFence** - Data Engineer Intern


In [68]:
response = qa_chain.run("OPSWAT")
print(response)

I don't know.
