In [1]:
from utils import (
    load_config,
    make_clickable,
    truncate_string,
)
from processing_utils import (
    post_processing,
    get_top_jobs,
)
from jobspy import scrape_jobs
import pandas as pd

from itertools import product
import os

config = load_config("config.yaml")
default_location = config["default_location"]

jobs_dict_list = [
    {
        "job_title": job_title,
        "is_remote": is_remote,
        "location": location,
        "distance": distance,
    }
    for job_title, is_remote, location, distance in product(
        config["search_job_titles"],
        config["remote_list"],
        config["search_locations"],
        config["distances"],
    )
]

config["job_titles"] = (
    config["desired_job_titles"] + config["acceptable_job_titles"]
)

today = pd.Timestamp.now().strftime("%d/%m/%Y")
filename = config["filename"]

In [2]:
# if remote job, set location to default_location
for job in jobs_dict_list:
    if job["is_remote"]:
        job["location"] = default_location

# remove duplicates
jobs_dict_list = [dict(t) for t in {tuple(d.items()) for d in jobs_dict_list}]

In [None]:
def run_jobs_search(jobs_dict_list):
    jobs_dfs = []

    for job_dict in jobs_dict_list:
        print(job_dict)

        google_search_term = (
            job_dict["job_title"] + " near " + job_dict["location"]
            if not job_dict["is_remote"]
            else job_dict["job_title"] + " remote UK"
        )

        jobs = scrape_jobs(
            site_name=["indeed", "glassdoor", "google", "linkedin"],
            search_term=job_dict["job_title"],
            google_search_term=google_search_term,
            location=job_dict["location"],
            distance=job_dict["distance"],
            results_wanted=50,
            hours_old=24 * 7,
            country_indeed="UK",
            is_remote=job_dict["is_remote"],
            linkedin_fetch_description=True,  # gets more info such as description, direct job url (slower),
            verbose=False,
        )

        jobs_dfs.append(jobs)

    jobs_df = pd.concat(jobs_dfs)

    jobs_df = jobs_df.drop_duplicates(subset=["id"])
    jobs_df = jobs_df.drop(columns=["company_logo"])
    jobs_df["date_posted"] = pd.to_datetime(
        jobs_df["date_posted"], format="%d/%m/%Y"
    )
    jobs_df["rundate"] = str(today)
    return jobs_df

In [None]:
if not os.path.exists(filename):
    jobs_df = run_jobs_search(jobs_dict_list)
    jobs_df.to_csv(filename, index=False)
else:
    existing_jobs_df = pd.read_csv(filename)
    unique_dates = existing_jobs_df["rundate"].unique()
    if today in unique_dates:
        print("Jobs already fetched today")
        jobs_df = existing_jobs_df
    else:
        print("Fetching jobs")
        jobs_df = pd.concat(
            [existing_jobs_df, run_jobs_search(jobs_dict_list)]
        )
        jobs_df.to_csv(filename, index=False)

{'job_title': 'Data Science Manager', 'is_remote': False, 'location': 'Sheffield, UK', 'distance': 200}


2024-12-11 20:17:20,990 - ERROR - JobSpy:LinkedIn - LinkedIn: HTTPSConnectionPool(host='www.linkedin.com', port=443): Max retries exceeded with url: /jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data+Science+Manager&location=Sheffield%2C+UK&distance=200&pageNum=0&start=0&f_TPR=r604800 (Caused by ResponseError('too many 429 error responses'))
2024-12-11 20:17:21,005 - INFO - JobSpy:Linkedin - finished scraping


{'job_title': 'Data Science Manager', 'is_remote': False, 'location': 'Leeds, UK', 'distance': 200}


2024-12-11 20:30:42,287 - ERROR - JobSpy:LinkedIn - LinkedIn: HTTPSConnectionPool(host='www.linkedin.com', port=443): Max retries exceeded with url: /jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data+Science+Manager&location=Leeds%2C+UK&distance=200&pageNum=0&start=146&f_TPR=r604800 (Caused by ResponseError('too many 429 error responses'))


KeyboardInterrupt: 

In [30]:
jobs_df = post_processing(jobs_df, config)
top_jobs, data_science_jobs = get_top_jobs(
    jobs_df, config["columns_list"], threshold=2
)

In [31]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)
pd.set_option("display.width", 1000)

In [32]:
mapping = {
    "job url": make_clickable,
    "job_url_direct": make_clickable,
    "description": truncate_string,
    "title": truncate_string,
    "location": truncate_string,
    "job_function": truncate_string,
    "company_industry": truncate_string,
    "company_url": make_clickable,
    "company_description": truncate_string,
}

top_jobs.columns = top_jobs.columns.str.replace("_", " ")
mapping = {key.replace("_", " "): value for key, value in mapping.items()}
styled_df = top_jobs.style.format(mapping)
styled_df

Unnamed: 0,site,job url,job url direct,title,company,industry,location,date posted,salary source,is remote,job level,job function,emails,description,company industry,company url,company addresses,company num employees,company revenue,company description,score,salary,days per week
821,linkedin,Click,Click,Senior Data Scientist,Microsoft,ecom,"England, United Kingdom",06/12/2024,,True,not applicable,Engineering and Information Te...,,"As a Senior Data Scientist, yo...",Software Development,Click,,,,,5.5,,
808,linkedin,Click,,Lead Data Scientist,Gravitas Recruitment Group (Global) Ltd,food,"London Area, United Kingdom",02/12/2024,,True,mid-senior level,Supply Chain and Information T...,,**Founding / Lead Data Scienti...,Data Infrastructure and Analyt...,Click,,,,,5.0,95000.0,2.0
652,linkedin,Click,,Data Science Manager,Omnis Partners,marketing,"London Area, United Kingdom",02/12/2024,,True,director,"Analyst, Consulting, and Marke...",,**DATA SCIENCE LEAD / MANAGER*...,"Marketing Services, Business C...",Click,,,,,5.0,90000.0,
1143,indeed,Click,Click,Data Scientist,Unipart Technologies Group,manufacturing,"Leeds, ENG, GB",04/12/2024,,True,,,,Job Advert  Instrumentel i...,,Click,,51 to 200,,07/12/2024,5.0,,
2625,linkedin,Click,,Lead Data Scientist,Gravitas Recruitment Group (Global) Ltd,food,"London Area, United Kingdom",2024-12-09 00:00:00,,True,mid-senior level,Supply Chain and Information T...,,**Founding / Lead Data Scienti...,Data Infrastructure and Analyt...,Click,,,,,5.0,95000.0,2.0
314,indeed,Click,Click,Data Scientist,Unipart Technologies Group,manufacturing,"Leeds, ENG, GB",04/12/2024,,True,,,,Job Advert  Instrumentel i...,,Click,,51 to 200,,Unipart Technologies Group bus...,5.0,,
2318,linkedin,Click,,"Senior Data Scientist, People ...",Bank of England,,"Leeds, England, UK",NaT,,True,,,,,,Click,,,,,4.5,,
825,linkedin,Click,,Senior Data Scientist,Harnham,telecom,"London Area, United Kingdom",04/12/2024,,True,mid-senior level,Analyst,,**Senior Data Scientist** ...,Marketing Services and Telecom...,Click,,,,,4.5,90000.0,
2291,google,Click,,Senior Data Scientist (Remote),MarkJames Search,ecom,Sheffield,2024-12-05 00:00:00,,True,,,,Senior Data Scientist Remote ...,,,,,,,4.5,90000.0,
2246,google,Click,,Senior Data Scientist - Consul...,Graduate Recruitment Bureau,consulting,London,2024-11-21 00:00:00,,True,,,,Join an organisation full of e...,,,,,,,4.5,,
