## Imports

In [4]:
import dotenv
from pydantic import BaseModel
import datetime
from google.genai import types, Client
from pymongo import MongoClient
from pymongo.errors import BulkWriteError
import os

# Load variables from a local .env file (if present) into the environment, then
# read the settings used throughout this notebook. os.getenv returns None for a
# variable that is not set, so a missing value only surfaces where it is used.
dotenv.load_dotenv()
SERPAPI_KEY = os.getenv('SERPAPI_KEY')  # sent as "api_key" by get_jobs
GEMINI_API_KEY= os.getenv('GEMINI_API_KEY')  # used to build gemini_client
MONGO_URI = os.getenv('MONGO_URI')  # connection string for MongoClient
DATABASE_NAME = os.getenv('DATABASE_NAME')  # database holding the jobs collection
COLLECTION_NAME = os.getenv('COLLECTION_NAME')  # collection the jobs are inserted into

## Models

In [5]:
class ApplyOption(BaseModel):
    """One way to apply for a job: a label and the URL it points to."""
    title: str  # label of the apply option
    link: str  # URL of the apply option

class JobHilight(BaseModel):
    """A titled group of highlight lines attached to a job posting."""
    title: str  # heading of the highlight group
    items: list[str]  # the individual lines listed under that heading

class Job(BaseModel):
    """A job posting as built by job_factory and stored in the MongoDB collection."""
    job_id: str  # identifier taken from the search response; the collection has a unique index on it
    title: str
    company: str  # filled from the response's 'company_name'
    location: str
    posted_date: datetime.date  # today's date minus the posting age found in the response
    description: str  # text that add_embedding_to_jobs embeds
    job_highlights: list[JobHilight]
    schedule_type: str  # empty string when the response does not provide one
    thumbnail: str | None
    apply: ApplyOption  # first entry of the response's 'apply_options'
    via: str
    job_vector: list[float] | None = None  # embedding values; None until add_embedding_to_jobs fills it

## Factory and Embedding

In [6]:
# Shared Gemini client, used below to compute embeddings for jobs and for the resume.
gemini_client = Client(api_key=GEMINI_API_KEY)

In [7]:
def _days_since_posted(posted_at: str | None) -> int:
    """Convert a relative age such as "3 days ago" into a whole number of days."""
    import re  # local import so this cell does not depend on the imports cell

    if not posted_at:
        return 0

    # First number in the text, an optional "+" (as in "30+ days ago"), then the unit word.
    match = re.search(r'(\d+)\+?\s*([^\W\d_]*)', posted_at)
    if match is None:
        # No number at all (e.g. "Just posted"): treat as posted today.
        return 0

    amount = int(match.group(1))
    unit = match.group(2).lower()

    # Anything shorter than a day still means "today".
    if unit.startswith(('second', 'minute', 'hour')):
        return 0
    if unit.startswith('week'):
        return amount * 7
    if unit.startswith('month'):
        return amount * 30  # approximation
    if unit.startswith('year'):
        return amount * 365  # approximation
    # "day(s)" and any unrecognised unit: read the number as days.
    return amount


def job_factory(response: dict) -> Job:
    """Build a `Job` from one raw job entry of the search response.

    Args:
        response: A single element of the `jobs_results` list returned by
            `get_jobs`. The keys 'job_id', 'title', 'company_name', 'location',
            'description', 'apply_options' and 'via' are required;
            'detected_extensions', 'job_highlights' and 'thumbnail' are optional.

    Returns:
        The populated `Job`. `posted_date` is today's date minus the age parsed
        from `detected_extensions['posted_at']`, or today when no age is given.

    Raises:
        KeyError: If a required key is missing.
        IndexError: If 'apply_options' is an empty list.
        pydantic.ValidationError: If a value does not fit the `Job` model.
    """
    extensions = response.get('detected_extensions') or {}
    delta_days = _days_since_posted(extensions.get('posted_at'))

    job_obj = Job(
        job_id=response['job_id'],
        title=response['title'],
        company=response['company_name'],
        location=response['location'],
        posted_date=datetime.date.today() - datetime.timedelta(days=delta_days),
        description=response['description'],
        job_highlights=[JobHilight(**jobhilight) for jobhilight in response.get('job_highlights', [])],
        schedule_type=extensions.get('schedule_type', ''),
        thumbnail=response.get('thumbnail', None),
        # Only the first apply option is kept.
        apply=ApplyOption(**response["apply_options"][0]),
        via=response['via']
    )
    return job_obj

In [8]:
def job_highlights_to_text(job_highlights: list[JobHilight]) -> str:
    """Flatten job highlights into a single newline-separated string.

    Each highlight contributes its title on one line followed by one line per
    item; highlights are joined with a newline.

    Args:
        job_highlights: The highlight groups to flatten.

    Returns:
        The combined text, or an empty string for an empty list.
    """
    return "\n".join(
        highlight.title + "\n" + "\n".join(highlight.items)
        for highlight in job_highlights
    )

def add_embedding_to_jobs(jobs: list[Job]) -> None:
    """Compute an embedding for every job and store it in `job.job_vector`.

    All job descriptions are sent to the embedding model in a single request
    and the returned vectors are assigned back to the jobs in order. The jobs
    are modified in place.

    Args:
        jobs: The jobs to embed. An empty list is a no-op (no request is made).

    Raises:
        ValueError: If the number of embeddings returned does not match the
            number of jobs; no job is modified in that case.
    """
    if not jobs:
        return

    # For now only the description is embedded; job_highlights_to_text could
    # provide the highlights as an alternative source text.
    text_list = [job.description for job in jobs]

    result = gemini_client.models.embed_content(
        model="text-embedding-004",
        contents=text_list,
        config=types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY")
    )

    embeddings = result.embeddings or []
    if len(embeddings) != len(jobs):
        raise ValueError(
            f"Expected {len(jobs)} embeddings but received {len(embeddings)}"
        )

    # Pair jobs with embeddings positionally, without mutating the response.
    for job, embed in zip(jobs, embeddings):
        job.job_vector = embed.values

## Querying Jobs

In [6]:
from serpapi import GoogleSearch


def get_jobs(
    query: str, location: str = "Brazil", next_page_token: str | None = None
) -> tuple[list[dict], str | None]:
    """Fetch one page of Google Jobs results through SerpApi.

    Args:
        query: The search text.
        location: The location to search in.
        next_page_token: Token of the page to fetch, as returned by a previous
            call; None fetches the first page.

    Returns:
        A tuple `(jobs_results, next_page_token)`. `jobs_results` is the
        response's 'jobs_results' list (empty when absent). The token is the
        one for the following page, or None when the response does not
        provide one.
    """
    params = {
        "engine": "google_jobs",
        "q": query,
        "location": location,
        "api_key": SERPAPI_KEY,
    }
    # Only send the token when there is one.
    if next_page_token is not None:
        params["next_page_token"] = next_page_token

    search = GoogleSearch(params)
    results = search.get_dict()

    # Take the token from this response only. Echoing the caller's token back
    # when pagination is absent would make the caller fetch the same page again.
    pagination = results.get("serpapi_pagination") or {}
    following_page_token = pagination.get("next_page_token", None)

    jobs_results = results.get("jobs_results", [])
    return jobs_results, following_page_token


In [13]:
# Fetch the first page of results for a sample query (location defaults to "Brazil").
jobs_response, next_page_token = get_jobs("Embedded Systems Engineer")

In [17]:
# Inspect the description of one returned job; index 5 requires at least six results on the page.
jobs_response[5]["description"]

'About Nimble: Nimble is a cutting-edge company at the forefront of innovation in video and IoT devices.\n\nJob Brief:\n\nThe firmware team at Nimble has an immediate opening for a Senior Embedded Firmware Engineer. We are looking for an expert in streaming video within complex channel environments. As part of this role, you will be working on implementing everything from RTOS to device drivers, applications, communication protocols, and test features in our video and IoT devices.\n\nThis position requires adaptability to a fast-paced environment, where your contributions will make a significant impact on our products.\nKey Responsibilities\n• Design and develop advanced firmware solutions for video and IoT devices.\n• Implement efficient and scalable software architecture.\n• Collaborate with cross-functional teams to integrate firmware components.\n• Develop and maintain device drivers, applications, and communication protocols.\n• Conduct thorough testing and debugging of firmware.\

# Sending to MongoDB

In [14]:
# Connect to MongoDB and select the database and collection named in the environment.
client = MongoClient(MONGO_URI)
db = client[DATABASE_NAME]
collection = db[COLLECTION_NAME]

# Create a unique index on the job_id field so the same job cannot be stored twice;
# inserting a duplicate job_id then fails with a duplicate-key write error.
collection.create_index("job_id", unique=True)

'job_id_1'

# Batch Transactions

In [9]:
import json

In [18]:
def scrapping_trasanction(query: str, location: str | None = "Brazil", max_page: int = 1000) -> None:
    """Fetch, embed and store every result page for one search query.

    For each page: fetch the raw jobs with `get_jobs`, convert them with
    `job_factory`, attach embeddings with `add_embedding_to_jobs` and insert
    them into `collection`. Pagination stops when there is no next page token,
    when the token does not advance, when a page cannot be fetched, or after
    `max_page` pages.

    Errors are reported with `print` and never propagate: a job that cannot be
    parsed is skipped, a page that cannot be embedded or inserted is skipped,
    and duplicate-key errors (job already stored) are ignored silently.

    Args:
        query: The search text.
        location: The location passed to `get_jobs`.
        max_page: Upper bound on the number of pages fetched.
    """
    next_page_token = None
    for _ in range(max_page):
        previous_token = next_page_token

        try:
            jobs_results, next_page_token = get_jobs(
                query, location=location, next_page_token=next_page_token
            )
        except Exception as e:
            # Without a response there is no token to continue with; stop
            # rather than hammering the API with the same request.
            print(f"Error: {e}")
            break

        # Parse job by job so one malformed entry does not discard the page.
        parsed_objects = []
        for raw_job in jobs_results:
            try:
                job = job_factory(raw_job)
            except Exception as e:
                print(f"Error: {e}")
                continue
            if job.description is not None:
                parsed_objects.append(job)

        # insert_many rejects an empty list, so only proceed with actual jobs.
        if parsed_objects:
            try:
                add_embedding_to_jobs(parsed_objects)
                collection.insert_many(
                    [job.model_dump(mode="json") for job in parsed_objects],
                    ordered=False
                )
            except BulkWriteError as bulkexcep:
                # Code 11000 is a duplicate key: the job is already stored.
                # Report the failure only if any write error is something else.
                write_errors = bulkexcep.details.get("writeErrors", [])
                if any(error.get("code") != 11000 for error in write_errors):
                    print(f"Error: {bulkexcep.details}")
            except Exception as e:
                print(f"Error: {e}")

        # Evaluated on every iteration, including after a failed insert, so
        # the last page always ends the loop. A token that did not advance
        # would fetch the same page again, so it ends the loop as well.
        if not next_page_token or next_page_token == previous_token:
            break

In [19]:
# Load the search queries from the "queries" key of query-jobs.json.
# The context manager closes the file as soon as it has been parsed.
with open("query-jobs.json") as queries_file:
    queries_list = json.load(queries_file)["queries"]

In [None]:
# Scrape and store the jobs for every configured query, printing each query as it starts.
for query in queries_list:
    print(query)
    scrapping_trasanction(query, "Brazil", 1000)

# Fetching the jobs from the database

In [16]:
from pymongo.operations import SearchIndexModel

In [17]:
# Definition of the vector search index over the stored job embeddings.
search_index_model = SearchIndexModel(
  definition={
    "fields": [
      {
        "type": "vector",
        "path": "job_vector",  # the field written by add_embedding_to_jobs
        # NOTE(review): must equal the length of the vectors produced by the
        # embedding model used above — confirm 768 for "text-embedding-004".
        "numDimensions": 768,
        # NOTE(review): dotProduct presumably expects unit-length vectors — confirm
        # that the stored embeddings are normalized.
        "similarity": "dotProduct",
        "quantization": "scalar"
      }
    ]
  },
  name="vector_index",  # the name referenced by the $vectorSearch stage below
  type="vectorSearch"
)

In [52]:
# Create the vector search index only if no search index named 'vector_index'
# exists on the collection yet, so re-running this cell is safe.
indexes = list(collection.list_search_indexes())
indexes_names = [index['name'] for index in indexes]
if 'vector_index' not in indexes_names:
    result = collection.create_search_index(model=search_index_model)
    print(result)
else:
    print('vector_index already exists')

vector_index already exists


In [82]:
# Read the sample resume whose embedding is used as the search query below.
# NOTE(review): no encoding is given, so the platform default is used — confirm
# the file's encoding matches it.
with open("test_files/test_resume.txt", "r") as f:
    test_resume = f.read()

In [83]:
# Embed the resume with the same model and task type used for the job
# descriptions, so the query vector and the stored vectors are comparable.
result = gemini_client.models.embed_content(
        model="text-embedding-004",
        contents=[test_resume],
        config=types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY")
)

# A single text was sent, so the first embedding is the resume's vector.
vector_resume = result.embeddings[0].values

In [84]:
# Aggregation pipeline: find the 20 stored jobs whose vectors are closest to the
# resume vector, then keep only their title and description.
pipeline = [
  {
    '$vectorSearch': {
        # NOTE(review): exact search is requested and numCandidates is left
        # commented out below — confirm against the $vectorSearch documentation
        # that numCandidates does not apply when 'exact' is True.
        'exact': True,
        'index': 'vector_index', 
        'path': 'job_vector', 
        'queryVector': vector_resume,
        # 'numCandidates': 700, 
        'limit': 20
    }
  }, {
    '$project': {
        '_id': 0, 
        'title': 1, 
        'description': 1, 
        # The similarity score is currently not included in the output:
        #   'score': {
        #     '$meta': 'vectorSearchScore'
        #   }
    }
  }
]

In [85]:
# Run the vector search pipeline against the jobs collection.
search_result = collection.aggregate(pipeline)

In [86]:
# Materialise the results into a list so they can be iterated more than once.
# Re-running this cell alone rebuilds the list from the existing list.
search_result = list(search_result)

In [87]:
# Print the title of each matched job, in the order returned by the search.
for i in search_result:
    print(i['title'])

Software Engineer I Jobs
Software Engineer
Software Engineer (Aircraft Systems) - TRU Simulation
Research Software Engineer
Embedded Software Engineer
Firmware Engineer
Senior Software Developer - Embedded Systems
Java Full Stack Developer
Android Auto Expert for Real-Time Embedded Systems
Senior Software Engineer for Android Set-Top Box Platform
Java Software Engineer (Senior)
Senior Embedded Systems Developer for High-Precision Machines
Software Engineer - (Profile Core Banking)
Sr Software Engineer
Embedded Software Developer
Hardware & Firmware Solutions Engineer
Full Stack Software Engineer, Senior Advisor
Senior Developer - Android Expertise
Senior Embedded Firmware Software Development Lead
Developing Innovative Android Solutions
