In [1]:
# Set up the ChatGroq LLM: model name, API key, and temperature=0 to minimize sampling randomness

from langchain_groq import ChatGroq

# Read the Groq API key from the environment instead of embedding it in the
# notebook: a key stored in a cell is exposed to everyone who can read the file.
# NOTE(review): the key that used to be hardcoded here must be treated as
# leaked and revoked.
import os

GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError(
        "Set the GROQ_API_KEY environment variable before running this notebook."
    )

llm = ChatGroq(
    model="llama-3.3-70b-versatile",
    groq_api_key=GROQ_API_KEY,
    temperature=0,  # lowest sampling temperature, to keep replies as repeatable as possible
)

In [2]:
# Uses Playwright to fetch HTML content from a webpage.
# - Launches headless browser, sets User-Agent, and opens URL
# - Waits for page to load and returns full HTML
# - Wraps HTML in a LangChain Document with source metadata

from playwright.async_api import async_playwright
from langchain_core.documents import Document

async def fetch_html(url, timeout=60000):
    """Fetch the HTML of ``url`` using a headless Chromium browser.

    Args:
        url: Address of the page to open.
        timeout: Maximum navigation time in milliseconds, forwarded to
            ``page.goto``. Defaults to 60000, the value that was previously
            hardcoded.

    Returns:
        The markup returned by ``page.content()`` after the page has loaded.

    Raises:
        Any error raised while opening or loading the page (for example a
        navigation timeout). The browser is closed before the error propagates.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            # Send a desktop Chrome User-Agent header with every request
            await page.set_extra_http_headers({
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
            })
            await page.goto(url, timeout=timeout)
            await page.wait_for_load_state("domcontentloaded")
            return await page.content()
        finally:
            # Always release the browser, including when navigation fails
            await browser.close()

html = await fetch_html("https://openai.com/careers/data-scientist/") # Input URL 
doc = Document(page_content=html, metadata={"source": "openai.com"})

In [3]:
# Parse HTML, extract visible text, wrap in LangChain Document, and preview

from bs4 import BeautifulSoup

# Parse the fetched markup
soup = BeautifulSoup(markup=html, features="html.parser")

# Join the page's text fragments with newlines, stripping whitespace from each
text = soup.get_text("\n", strip=True)

# Rebind `doc` to a Document that carries the extracted text
doc = Document(metadata={"source": "openai.com"}, page_content=text)

# Preview at most the first 5000 characters of the extracted text
print(text[:5000])

Data Scientist | OpenAI
Skip to main content
Log in
Switch to
ChatGPT
(opens in a new window)
Sora
(opens in a new window)
API Platform
(opens in a new window)
Home
About Us
Our Charter
Careers
Brand Guidelines
Research
Back to main menu
Research Index
Research Overview
Research Residency
Latest Advancements
OpenAI o3 and o4-mini
GPT-4.5
OpenAI o1
GPT-4o
Sora
Safety
Back to main menu
Safety Approach
Security & Privacy
For Business
Back to main menu
Business Overview
Solutions
Contact Sales
ChatGPT
Back to main menu
Explore ChatGPT
Team
Enterprise
Education
Pricing
Download
Sora
Back to main menu
Sora Overview
Features
Pricing
Help Center
(opens in a new window)
Sora Log in
(opens in a new window)
API Platform
Back to main menu
Platform Overview
Pricing
API Log in
(opens in a new window)
Documentation
(opens in a new window)
Developer Forum
(opens in a new window)
Stories
Company
Back to main menu
About Us
Our Charter
Careers
Brand Guidelines
News
Log in
OpenAI
Careers
Data Scientist
Da

In [4]:
from langchain_core.prompts import PromptTemplate

# Prompt that asks the model to turn scraped careers-page text into JSON with
# the keys role, location, experience, skills and description.
prompt_extract = PromptTemplate.from_template(
         """
        ### SCRAPED TEXT FROM WEBSITE:
        {page_data}
        ### INSTRUCTION:
        The scraped text is from the career's page of a website.
        Your job is to extract the job postings and return them in JSON format containing the 
        following keys: `role`, `location`, `experience`, `skills` and `description`.
        Only return the valid JSON.
        ### VALID JSON (NO PREAMBLE):    
        """
)

# Feed the rendered prompt straight into the LLM
chain_extract = prompt_extract | llm

# Pass the page text rather than the Document object: formatting a Document
# into the template inserts its string form (wrapper field and metadata)
# instead of just the scraped text.
res = chain_extract.invoke(input={'page_data': doc.page_content})

In [5]:
from langchain_core.output_parsers import JsonOutputParser

# Parse the model's reply into Python objects
json_parser = JsonOutputParser()
json_res = json_parser.parse(res.content)

# Later cells read the posting from `job`, so bind it here to keep the notebook
# runnable from a fresh kernel. The prompt asks for "job postings", so the
# reply may be a single object or a list; normalize to one dict.
if isinstance(json_res, list):
    if not json_res:
        raise ValueError("No job postings were extracted from the page.")
    job = json_res[0]
else:
    job = json_res

json_res

{'role': 'Data Scientist',
 'location': 'San Francisco',
 'experience': '7+ years',
 'skills': ['Python',
  'SQL',
  'Tableau',
  'Predictive modeling',
  'Causal inference'],
 'description': 'Design, implement, and own critical models, reporting, and analysis to support high-stakes decision-making. Partner with Finance leadership and teams across Research, Applied and Go-To-Market to deliver actionable insights.'}

In [6]:
import pandas as pd

# Load the portfolio table and print its first five rows as plain text
df = pd.read_csv(filepath_or_buffer="company_portfolio.csv")
print(df.head(n=5))

                           Techstack                                  Links
0            React, Node.js, MongoDB    https://example.com/react-portfolio
1           Angular,.NET, SQL Server  https://example.com/angular-portfolio
2  Vue.js, Ruby on Rails, PostgreSQL      https://example.com/vue-portfolio
3              Python, Django, MySQL   https://example.com/python-portfolio
4          Java, Spring Boot, Oracle     https://example.com/java-portfolio


In [7]:
import uuid
import chromadb

# Persistent Chroma client rooted at the 'vectorstore' path
client = chromadb.PersistentClient('vectorstore')
collection = client.get_or_create_collection(name="portfolio")

# Seed the collection only while it is empty, so re-running the cell does not
# insert the portfolio twice. All rows go in with a single `add` call instead
# of one call per row; an empty frame is skipped entirely.
if not collection.count() and not df.empty:
    collection.add(
        documents=df["Techstack"].tolist(),
        metadatas=[{"links": link} for link in df["Links"]],
        ids=[str(uuid.uuid4()) for _ in range(len(df))],  # fresh random id per row
    )

In [9]:
# Look up the two closest portfolio entries for each skill and keep only the
# metadata of the hits (an empty list if the result has no 'metadatas' key).
links = collection.query(
    n_results=2,
    query_texts=job['skills'],
).get('metadatas', [])
links

[[{'links': 'https://example.com/ml-python-portfolio'},
  {'links': 'https://example.com/python-portfolio'}],
 [{'links': 'https://example.com/magento-portfolio'},
  {'links': 'https://example.com/wordpress-portfolio'}],
 [{'links': 'https://example.com/flutter-portfolio'},
  {'links': 'https://example.com/android-tv-portfolio'}],
 [{'links': 'https://example.com/ml-python-portfolio'},
  {'links': 'https://example.com/magento-portfolio'}],
 [{'links': 'https://example.com/ml-python-portfolio'},
  {'links': 'https://example.com/android-tv-portfolio'}]]

In [12]:
# Show the job posting dict used by the portfolio query and the email prompt
job

{'role': 'Data Scientist',
 'location': 'San Francisco',
 'experience': '7+ years',
 'skills': ['Python',
  'SQL',
  'Tableau',
  'Predictive modeling',
  'Causal inference'],
 'description': 'Design, implement, and own critical models, reporting, and analysis to support high-stakes decision-making. Partner with Finance leadership and teams across Research, Applied and Go-To-Market to deliver actionable insights.'}

In [11]:
# Show the skill list that is sent to the vector store as query texts
job['skills']

['Python', 'SQL', 'Tableau', 'Predictive modeling', 'Causal inference']

In [13]:
# Cold-email prompt: filled with the job posting text and the portfolio links
prompt_email = PromptTemplate.from_template(
    """
        ### JOB DESCRIPTION:
        {job_description}
        
        ### INSTRUCTION:
        You are Karthik Mohan, a business development executive at AtliQ. AtliQ is an AI & Software Consulting company dedicated to facilitating
        the seamless integration of business processes through automated tools. 
        Over our experience, we have empowered numerous enterprises with tailored solutions, fostering scalability, 
        process optimization, cost reduction, and heightened overall efficiency. 
        Your job is to write a cold email to the client regarding the job mentioned above describing the capability of AtliQ 
        in fulfilling their needs.
        Also add the most relevant ones from the following links to showcase Atliq's portfolio: {link_list}
        Remember you are Mohan, BDE at AtliQ. 
        Do not provide a preamble.
        ### EMAIL (NO PREAMBLE):
        
        """
)

# Compose prompt and model, then generate the email from the posting (as a
# string) and the link metadata returned by the vector store query.
chain_email = prompt_email | llm
res = chain_email.invoke(input=dict(job_description=str(job), link_list=links))
print(res.content)

Subject: Expert Data Science Solutions for Informed Decision-Making

Dear Hiring Manager,

I came across the Data Scientist role at your esteemed organization in San Francisco, and I was impressed by the job description. As a business development executive at AtliQ, an AI & Software Consulting company, I believe our expertise can help you design, implement, and own critical models, reporting, and analysis to support high-stakes decision-making.

With 7+ years of experience in mind, our team of experts can leverage Python, SQL, Tableau, Predictive modeling, and Causal inference to deliver actionable insights that drive business growth. Our capabilities in data science and analytics can empower your Finance leadership and teams across Research, Applied, and Go-To-Market to make informed decisions.

Our portfolio showcases our proficiency in machine learning and Python, which can be viewed at https://example.com/ml-python-portfolio and https://example.com/python-portfolio. These examples 