In [None]:
from utils import (
    load_config,
    make_clickable,
    truncate_string,
)
from processing_utils import (
    post_processing,
    get_top_jobs,
)
from jobspy import scrape_jobs
import pandas as pd

from itertools import product
import os

# Everything tunable lives in config.yaml; the cells below read it via `config`.
config = load_config("config.yaml")
default_location = config["default_location"]
filename = config["filename"]

# Run date as dd/mm/YYYY text; it is stored per row and compared against the
# CSV's "rundate" column to decide whether a scrape already happened today.
today = f"{pd.Timestamp.now():%d/%m/%Y}"

# One search per combination of title x remote flag x location x distance.
jobs_dict_list = [
    dict(zip(("job_title", "is_remote", "location", "distance"), combination))
    for combination in product(
        config["search_job_titles"],
        config["remote_list"],
        config["search_locations"],
        config["distances"],
    )
]

# Combined title list (desired first, then acceptable), stored back on config.
config["job_titles"] = config["desired_job_titles"] + config["acceptable_job_titles"]

In [12]:
# if remote job, set location to default_location
for job in jobs_dict_list:
    if job["is_remote"]:
        job["location"] = default_location

# Remove duplicates: after the override above, remote searches that differed
# only by location are identical. dict.fromkeys keeps the first occurrence of
# each search and preserves the original order, so the scrape order is the
# same on every kernel run (a set would iterate in an arbitrary order).
jobs_dict_list = [
    dict(items)
    for items in dict.fromkeys(tuple(job.items()) for job in jobs_dict_list)
]

In [13]:
def run_jobs_search(jobs_dict_list):
    """Scrape every search in ``jobs_dict_list`` and return one combined frame.

    Parameters
    ----------
    jobs_dict_list : list of dict
        One dict per search. Each must provide the keys ``job_title``,
        ``is_remote``, ``location`` and ``distance``.

    Returns
    -------
    pandas.DataFrame
        All scraped rows, unique on ``id`` and re-indexed 0..n-1, without the
        ``company_logo`` column, with ``date_posted`` converted to datetime
        and a ``rundate`` column set to the notebook-level ``today`` string.

    Raises
    ------
    ValueError
        If ``jobs_dict_list`` is empty.
    """
    if not jobs_dict_list:
        # pd.concat([]) would otherwise fail later with the much less helpful
        # "No objects to concatenate".
        raise ValueError(
            "jobs_dict_list is empty: there is nothing to search for"
        )

    jobs_dfs = []

    for job_dict in jobs_dict_list:
        print(job_dict)

        # Google gets a free-text query instead of structured parameters.
        if job_dict["is_remote"]:
            google_search_term = job_dict["job_title"] + " remote UK"
        else:
            google_search_term = (
                job_dict["job_title"] + " near " + job_dict["location"]
            )

        jobs = scrape_jobs(
            site_name=["indeed", "glassdoor", "google", "linkedin"],
            search_term=job_dict["job_title"],
            google_search_term=google_search_term,
            location=job_dict["location"],
            distance=job_dict["distance"],
            results_wanted=200,
            hours_old=24 * 7,  # only postings from the last week
            country_indeed="UK",
            is_remote=job_dict["is_remote"],
            # gets more info such as description, direct job url (slower)
            linkedin_fetch_description=True,
            verbose=False,
        )

        jobs_dfs.append(jobs)

    # Each scrape carries its own index, so build a fresh one instead of
    # keeping repeated labels in the combined frame.
    jobs_df = pd.concat(jobs_dfs, ignore_index=True)

    # The same posting can be returned by several searches; keep the first.
    jobs_df = jobs_df.drop_duplicates(subset=["id"], ignore_index=True)
    # errors="ignore": do not fail if no scrape returned a logo column.
    jobs_df = jobs_df.drop(columns=["company_logo"], errors="ignore")
    # NOTE(review): assumes date_posted holds date objects or dd/mm/YYYY
    # strings -- confirm against what scrape_jobs actually returns.
    jobs_df["date_posted"] = pd.to_datetime(
        jobs_df["date_posted"], format="%d/%m/%Y"
    )
    jobs_df["rundate"] = str(today)
    return jobs_df

In [14]:
# Scrape at most once per day. The CSV at `filename` accumulates every run,
# and its "rundate" column records which days have already been fetched.
if not os.path.exists(filename):
    # First run ever: scrape and create the store.
    jobs_df = run_jobs_search(jobs_dict_list)
    jobs_df.to_csv(filename, index=False)
else:
    existing_jobs_df = pd.read_csv(filename)
    unique_dates = existing_jobs_df["rundate"].unique()
    if today in unique_dates:
        print("Jobs already fetched today")
        jobs_df = existing_jobs_df
    else:
        print("Fetching jobs")
        # ignore_index=True gives the combined frame unique row labels, the
        # same as the frame read_csv returns on the cached path; without it
        # the stored and fresh rows would share index labels.
        # NOTE(review): postings seen on several days are kept once per run
        # (no de-duplication on "id" across runs) -- confirm this is intended.
        jobs_df = pd.concat(
            [existing_jobs_df, run_jobs_search(jobs_dict_list)],
            ignore_index=True,
        )
        jobs_df.to_csv(filename, index=False)

Jobs already fetched today


In [15]:
# Both helpers come from processing_utils. Note that jobs_df is overwritten
# with the processed frame, so re-running this cell feeds post_processing its
# own output.
jobs_df = post_processing(jobs_df, config)

# NOTE(review): the meaning of threshold=2 is defined in get_top_jobs --
# presumably a minimum score; confirm in processing_utils.
(top_jobs, data_science_jobs) = get_top_jobs(
    jobs_df,
    config["columns_list"],
    threshold=2,
)

In [16]:
# Notebook display settings: show every column, up to 100 rows, and allow
# wide (1000-character) text output.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.width = 1000

In [18]:
# Per-column display formatters, keyed by the space-separated column names
# used for display (underscores in the names below are turned into spaces).
mapping = {
    column.replace("_", " "): formatter
    for column, formatter in (
        ("job url", make_clickable),
        ("job_url_direct", make_clickable),
        ("description", truncate_string),
        ("title", truncate_string),
        ("location", truncate_string),
        ("job_function", truncate_string),
        ("company_industry", truncate_string),
        ("company_url", make_clickable),
        ("company_description", truncate_string),
    )
}

# Show human-friendly headers: "job_url_direct" -> "job url direct".
top_jobs.columns = top_jobs.columns.str.replace("_", " ")

# Apply the formatters; the bare last line renders the styled table.
styled_df = top_jobs.style.format(mapping)
styled_df

Unnamed: 0,site,job url,job url direct,title,company,industry,location,date posted,salary source,is remote,job level,job function,emails,description,company industry,company url,company addresses,company num employees,company revenue,company description,score,salary,days per week
821,linkedin,Click,Click,Senior Data Scientist,Microsoft,ecom,"England, United Kingdom",06/12/2024,,True,not applicable,Engineering and Information Te...,,"As a Senior Data Scientist, yo...",Software Development,Click,,,,,5.5,,
808,linkedin,Click,,Lead Data Scientist,Gravitas Recruitment Group (Global) Ltd,food,"London Area, United Kingdom",02/12/2024,,True,mid-senior level,Supply Chain and Information T...,,**Founding / Lead Data Scienti...,Data Infrastructure and Analyt...,Click,,,,,5.0,95000.0,2.0
652,linkedin,Click,,Data Science Manager,Omnis Partners,marketing,"London Area, United Kingdom",02/12/2024,,True,director,"Analyst, Consulting, and Marke...",,**DATA SCIENCE LEAD / MANAGER*...,"Marketing Services, Business C...",Click,,,,,5.0,90000.0,
1143,indeed,Click,Click,Data Scientist,Unipart Technologies Group,manufacturing,"Leeds, ENG, GB",04/12/2024,,True,,,,Job Advert  Instrumentel i...,,Click,,51 to 200,,07/12/2024,5.0,,
314,indeed,Click,Click,Data Scientist,Unipart Technologies Group,manufacturing,"Leeds, ENG, GB",04/12/2024,,True,,,,Job Advert  Instrumentel i...,,Click,,51 to 200,,Unipart Technologies Group bus...,5.0,,
825,linkedin,Click,,Senior Data Scientist,Harnham,telecom,"London Area, United Kingdom",04/12/2024,,True,mid-senior level,Analyst,,**Senior Data Scientist** ...,Marketing Services and Telecom...,Click,,,,,4.5,90000.0,
905,google,Click,,"Senior Data Science Manager, B...",Monzo,banking,Harrow,,,True,,,tech-hiring@monzo.com,ðŸš€ Weâ€™re on a mission to m...,,,,,,07/12/2024,4.0,110000.0,
66,google,Click,,"Senior Data Science Manager, C...",Monzo,banking,Cardiff,13/11/2024,,True,,,,??London | UK remote | ?? Â£11...,,,,,,,4.0,110000.0,
72,google,Click,,"Senior Data Science Manager, B...",Monzo,banking,Harrow,,,True,,,tech-hiring@monzo.com,ðŸš€ Weâ€™re on a mission to m...,,,,,,,4.0,110000.0,
73,google,Click,,"Senior Data Science Manager, O...",Monzo,banking,Harrow,,,True,,,tech-hiring@monzo.com,ðŸš€ Weâ€™re on a mission to m...,,,,,,,4.0,1000.0,
