## RAG demo
### Match resume with a list of JD (from Adzuna)

In [75]:
from langchain_community.document_loaders import SeleniumURLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_ollama import OllamaEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain_community.document_loaders import Docx2txtLoader
from dotenv import load_dotenv
import os

import requests

### Initiate vector store model and llm model

In [None]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = InMemoryVectorStore(embeddings)


No sentence-transformers model found with name sentence-transformers/all-MiniLM-L6-v2. Creating a new one with mean pooling.


In [33]:
model = OllamaLLM(model="llama3.2")

### Job query - Adzuna

In [76]:
load_dotenv()  # Loads variables from .env into environment

app_id = os.getenv("app_id")
app_key = os.getenv("app_key")

In [78]:

# Define the endpoint and query parameters
url = 'http://api.adzuna.com/v1/api/jobs/gb/search/2'

params = {
    'app_id': app_id,
    'app_key': app_key,
    'results_per_page': 20,
    'what': 'data scientist',
    'content-type': 'application/json'
}

# Make the GET request
response = requests.get(url, params=params)

# initiate langchain documents
documents = []

# Check if it worked
if response.status_code == 200:
    data = response.json()  # Parse JSON response
    for job in data.get('results', []):
        content = f"Title: {job.get('title')}\n\nDescription: {job.get('description')}"
        metadata = {
            "id": job.get("id"),
            "company": job.get("company", {}).get("display_name"),
            "location": job.get("location", {}).get("display_name"),
            "salary_min": job.get("salary_min"),
            "salary_max": job.get("salary_max"),
            "url": job.get("redirect_url")
        }

        documents.append(Document(page_content=content, metadata=metadata))

    # documents now contains LangChain Document objects
    print(f"Loaded {len(documents)} job listings as LangChain Documents.")
else:
    print("Error:", response.status_code, response.text)

Loaded 20 job listings as LangChain Documents.


In [79]:
documents

[Document(metadata={'id': '5123040031', 'company': 'Aviva', 'location': 'Norwich, Norfolk', 'salary_min': 59776.69, 'salary_max': 59776.69, 'url': 'https://www.adzuna.co.uk/jobs/land/ad/5123040031?se=JqAONdwY8BGlD81SPiZXLA&utm_medium=api&utm_source=ad3ef867&v=E0BEAF007B584B3D4E6DDE94799DF5042F848491'}, page_content="Title: Lead Data Scientist\n\nDescription: Lead Data Scientist Salary: London £60,000 - £75,000 / National £50,000 - £65,000 Are you excited by innovation and keen to deliver meaningful change? Would you like the chance to help shape a new Data Science team, working together to uncover opportunity and solve real business problems in an exciting growing area. If yes, then you may be the perfect fit for this opportunity to make your mark as a Lead Data Scientist within Aviva's new Global Corporate & Specialty (GCS) Data and AI Team. The i…"),
 Document(metadata={'id': '5124103180', 'company': 'GMA Consulting', 'location': 'Bristol, South West England', 'salary_min': 100000, '

### Chunk the text: no need since it's short

In [None]:
# def split_text(documents):
#     text_splitter = RecursiveCharacterTextSplitter(
#         chunk_size=1000,
#         chunk_overlap=200,
#         add_start_index=True
#     )
#     data = text_splitter.split_documents(documents)

#     return data


### Index docs

In [26]:
vector_store.add_documents(documents=documents)

['c185421b-db3e-4a3e-a7dc-9cec5036d371',
 '46a521a5-6eba-4138-a21b-22cdd1348aab',
 'f4a4113d-85e8-44f5-b898-a603e4debc7c',
 '436674a8-6c6b-40a5-8f3c-90a56008b333',
 '3840768c-ec57-4fc6-b852-b17596bfcddd',
 'fa990f38-25f8-4f3a-a773-389a57936498',
 '61e7a073-9651-4fd4-a9c4-99a3f2ca5643',
 '845c61b2-7c74-4f41-adc9-10b6232c061b',
 '2fcb16b1-1d48-472f-b84c-4267759b6236',
 'b34ed0a0-6d25-4054-8ddb-68a0b8f52705',
 '1a55663d-477e-495b-a217-7360919e006a',
 'a15380c2-b8df-412d-9c4a-485137a65d85',
 'f0c710e4-bd07-46ee-8e72-d1c8e865aee3',
 '51fbc6e4-753c-451e-ad7c-350faf4cf119',
 '21e902cd-bb2d-4fe5-af8e-af5fd6e96c21',
 '5453b51d-35b1-4659-a4bf-ecdd62fcc597',
 '3b48bae9-9cd9-4333-8d28-74933fe6f132',
 'eace9265-c324-4308-9376-e4ad7e93fd04',
 'a9560158-ddb0-474c-90bb-fdf125ec6154',
 '279e8a17-589e-4e79-b033-d8f47e99d451']

### Query and retrieval

In [53]:
template = """
You are an assistant for job matching tasks. Use the following pieces of retrieved context (job description, job title) to find the job that suits the query (resume or cv) most. If you don't know the answer, just say that you don't know. 
Question: {question} 
Context: {context} 
Answer:
"""

### Load and index resume

In [60]:
# load the doc
loader = Docx2txtLoader("resume.docx")

query = loader.load()

query

[Document(metadata={'source': 'resume.docx'}, page_content='YIJIN BAO\n\n\t\t+1-773-490-7200\n\n\t\tChicago, Illinois\n\n\t\tkay062@uchicago.edu, LinkedIn | GitHub\n\n\n\n\n\neducation\n\n\tHarris School of Public Policy, University of Chicago\tChicago, IL\n\n\tData Analytics Specialization of Master of Public Policy (MPP), Graduate with Honor\tJune 2024\n\nData Science: ETL pipeline, NLP, Deep Learning, Machine Learning, Computer Vision\n\nData Analytics: data collection, management, cleaning, manipulation, sanity check, analysis, visualization, GIS\n\nPublic Policy analysis: modeling and research for transportation policy, healthcare policy, urban policy\n\nBusiness data product: dashboard design, interactive chatbot, database management, automated workflows\n\n\n\n\tShanghai International Studies University\tShanghai, China\n\n\tDual Bachelor of Laws and English, Outstanding Undergraduate Thesis Honor\tJuly 2022\n\nInternational Law; International Economics Law; International Financ

In [68]:
# Assume query is a LangChain Document
if isinstance(query[0], Document):
    query_text = query[0].page_content
else:
    query_text = query[0]  # If it's already a string

### Invoke answer

In [72]:
# Retrieval a list of documents based on similarity search
retrieve_documents = vector_store.similarity_search(query_text)
retrieve_documents[0]

context = "\n\n".join([doc.page_content for doc in retrieve_documents])

In [74]:
prompt = ChatPromptTemplate.from_template(template)
chain = prompt | model
chain.invoke({"question": query_text, "context": context})

"Based on the provided context (resume or CV), I found a potential match for YIJIN BAO's skills:\n\n**Job Title: Data Scientist**\n\n**Location:** Farnborough, Hampshire\n\nThis job title aligns with YIJIN BAO's skills in Machine Learning, Data Visualization, Python, R, SQL, and other areas mentioned on their resume. Additionally, the role involves working with data integration, AI, and system optimization, which also matches YIJIN BAO's experience as a Data Science fellow at Illinois Department of Employment Safety.\n\nPlease note that without more information about YIJIN BAO's specific interests or preferences, it's difficult to confirm if this job is the best fit."