In [None]:
# Accuracy scripts for Jooble job postings, written with assistance from Gemini AI.

In [75]:
import pandas as pd
import re
import string

# Read the two CSV inputs: the ground-truth file and the Gemini output that
# is evaluated against it.
gt_df = pd.read_csv(filepath_or_buffer='Jooble Ground Truth.csv')
gemini_df = pd.read_csv(filepath_or_buffer='../data/jooble_jobs_gemini.csv')

In [77]:
def extract_job_url_clean(url):
    """Return ``url`` with its query string removed.

    Everything from the first ``?`` onwards is dropped so that the same
    posting reached with different query parameters yields the same key.

    Parameters
    ----------
    url : str or missing value
        Raw job URL. Non-string input (``None``, ``NaN`` from an empty CSV
        cell, ...) is treated as "no URL".

    Returns
    -------
    str or None
        The URL up to, but excluding, the first ``?``; ``None`` when ``url``
        is not a string.
    """
    # An empty CSV cell arrives as a float NaN, which has no ``.split``.
    # NOTE(review): rows that get None here share a missing key; consider
    # dropping them before using the result as a join key.
    if not isinstance(url, str):
        return None
    return url.split("?", 1)[0]

In [78]:
# Derive the join key ('clean_url') from the raw URL and drop the raw column.
# The guard makes this cell safe to re-run: once 'job_url' has been removed
# and 'clean_url' exists, the frame is left as it is instead of raising.
for df in (gt_df, gemini_df):
    if 'job_url' in df.columns:
        df['clean_url'] = df['job_url'].apply(extract_job_url_clean)
        del df['job_url']
    elif 'clean_url' not in df.columns:
        # Neither the source column nor the derived one is present.
        raise KeyError("job_url")

In [79]:
def clean_tags(tag_str):
    """Normalise a tag list into a canonical comma-separated string.

    The input is split on commas and newlines; each piece is stripped and
    lower-cased, blank pieces are discarded, and the remainder is sorted and
    re-joined with ``,``. A missing value yields the empty string.
    """
    if pd.isna(tag_str):
        return ""

    normalised = []
    for piece in re.split(r'[\n,]', str(tag_str)):
        piece = piece.strip()
        if piece:
            normalised.append(piece.lower())

    normalised.sort()
    return ",".join(normalised)

def clean_salary(val):
    """Parse a salary cell into a float.

    ``$`` signs and thousands separators (``,``) are removed before parsing.

    Parameters
    ----------
    val : str, number or missing value
        Raw salary value.

    Returns
    -------
    float or None
        The parsed amount, or ``None`` when the value is missing, empty, or
        not a plain number after clean-up.
    """
    if pd.isna(val) or val == "":
        return None
    clean_val = re.sub(r'[$,]', '', str(val))
    try:
        return float(clean_val)
    except ValueError:
        # Only "not a number" is an expected failure; a bare ``except`` would
        # also swallow KeyboardInterrupt/SystemExit.
        return None

def remove_punctuations(text):
    """Strip every character listed in ``string.punctuation`` from ``text``.

    Parameters
    ----------
    text : str or missing value
        Text to clean. Non-string input (e.g. ``NaN`` for an empty
        description) is returned unchanged instead of raising.

    Returns
    -------
    str or the original non-string value
        ``text`` without ASCII punctuation. Typographic characters that are
        not in ``string.punctuation`` (curly quotes, dashes, ...) are kept.
    """
    if not isinstance(text, str):
        return text
    # One translate pass instead of one ``replace`` per punctuation character.
    return text.translate(str.maketrans('', '', string.punctuation))

# Data Cleaning
# Both frames get the same normalisation so the later field-by-field
# comparison ignores case, surrounding whitespace and ASCII punctuation.
for df in [gt_df, gemini_df]:
    df['job_title'] = df['job_title'].str.strip().str.lower()
    df['company_name'] = df['company_name'].str.strip().str.lower()
    df['location'] = df['location'].str.strip().str.lower()
    df['salary'] = df['salary'].apply(clean_salary)
    # Lower-case BEFORE replacing so 'Per Hour' / 'PER HOUR' also become
    # 'hourly'; regex=False makes the literal match explicit regardless of
    # the pandas version's default.
    df['salary_type'] = (
        df['salary_type']
        .str.strip()
        .str.lower()
        .str.replace('per hour', 'hourly', regex=False)
    )
    df['job_tags'] = df['job_tags'].apply(clean_tags)
    # na_action='ignore' leaves missing descriptions as NaN instead of
    # handing a float to remove_punctuations.
    df['job_description'] = (
        df['job_description']
        .str.strip()
        .str.lower()
        .map(remove_punctuations, na_action='ignore')
    )

# Merge and Compare
# Inner join on the cleaned URL. Only the first Gemini row per URL is kept,
# so each ground-truth row is paired with at most one Gemini row.
merged = gt_df.merge(
    gemini_df.drop_duplicates(subset=['clean_url']),
    how='inner',
    on='clean_url',
    suffixes=('_gt', '_gemini'),
)


In [80]:
import numpy as np
import re

def _normalise_description(value):
    """Return ``value`` as whitespace-collapsed text; missing values become ''."""
    if pd.isna(value):
        return ""
    return re.sub(r'\s+', ' ', str(value).strip())


def check_match_fixed(row):
    """Compare the ground-truth and Gemini values of one merged row.

    Parameters
    ----------
    row : mapping (e.g. a ``pd.Series`` row of the merged frame)
        Must provide ``<field>_gt`` and ``<field>_gemini`` for every compared
        field, plus ``job_description_gt`` / ``job_description_gemini``.

    Returns
    -------
    pd.Series
        One boolean per field, ``job_description`` and ``all_match`` (True
        only when every other entry is True).

    Notes
    -----
    * Exact-match fields: two missing values count as a match; a missing
      value never matches a present one.
    * Descriptions match when, after collapsing whitespace, one text
      contains the other. A missing or empty description only matches
      another missing/empty one. Without that rule an empty string (a
      substring of everything) or a stringified NaN ("nan" occurs inside
      "finance", "maintenance", ...) would be scored as a match.
    """
    fields = ['job_title', 'company_name', 'location', 'salary', 'salary_type', 'job_tags']
    res = {}
    for f in fields:
        val_gt = row[f + '_gt']
        val_gemini = row[f + '_gemini']

        gt_missing = pd.isna(val_gt)
        gemini_missing = pd.isna(val_gemini)
        if gt_missing or gemini_missing:
            # Match only when BOTH sides are missing.
            res[f] = bool(gt_missing and gemini_missing)
        else:
            res[f] = bool(val_gt == val_gemini)

    desc_gt = _normalise_description(row['job_description_gt'])
    desc_gemini = _normalise_description(row['job_description_gemini'])

    if desc_gt and desc_gemini:
        # Substring check in either direction.
        res['job_description'] = desc_gemini in desc_gt or desc_gt in desc_gemini
    else:
        # At least one side has no text: only "no text" matches "no text".
        res['job_description'] = desc_gt == desc_gemini

    res['all_match'] = all(res.values())
    return pd.Series(res)

# Score every merged row, then report the totals.
eval_results = merged.apply(check_match_fixed, axis=1)

# Rows where every compared field agreed.
exact_match_count = eval_results['all_match'].sum()
print(f"Exact Matches: {exact_match_count}")

# Individual field agreements across all rows ('all_match' itself excluded).
field_flags = eval_results.drop(columns='all_match')
print(f"Total Field Matches: {field_flags.values.sum()}")
merged

Exact Matches: 19
Total Field Matches: 139


Unnamed: 0,job_title_gt,company_name_gt,location_gt,salary_gt,salary_type_gt,job_description_gt,job_tags_gt,clean_url,job_title_gemini,company_name_gemini,location_gemini,salary_gemini,salary_type_gemini,job_description_gemini,job_tags_gemini
0,data administrator - research scientist 2,minnesota department of health,"saint paul, mn",32.4,hourly,job details working title data administrator j...,"day shift,full time,hourly pay,live in,local a...",https://jooble.org/away/5968186734884599128,data administrator - research scientist 2,minnesota department of health,"saint paul, mn",32.4,hourly,job details working title data administrator j...,"day shift,full time,hourly pay,live in,local a..."
1,mch data scientist student worker - student wo...,minnesota department of health,"saint paul, mn",19.85,hourly,job details working title mch data scientist s...,"day shift,flexible hours,full time,hourly pay,...",https://jooble.org/away/-5093056149542795597,mch data scientist student worker - student wo...,minnesota department of health,"saint paul, mn",19.85,hourly,job details working title mch data scientist s...,"day shift,flexible hours,full time,hourly pay,..."
2,data engineer,loxo,"austin, tx",,not listed,job description job description as an early hi...,"flexible hours,remote work",https://jooble.org/away/-4137854180822498642,data engineer,loxo,"austin, tx",,not listed,job description job description as an early hi...,"flexible hours,remote work"
3,2026 internship program - data engineering intern,toll brothers,"fort washington, pa",,not listed,creating an exceptional place to work that is ...,"internship,local area,work at office",https://jooble.org/desc/-8958610974848145229,2026 internship program - data engineering intern,toll brothers,"fort washington, pa",,not listed,creating an exceptional place to work that is ...,"internship,local area,work at office"
4,data scientist / ml engineer,infotron,"cape coral, fl",,not listed,additional benefits about the role imagine wri...,remote work,https://jooble.org/away/-4478983366068565842,data scientist / ml engineer,infotron,"cape coral, fl",,not listed,additional benefits about the role imagine wri...,remote work
5,data & ai engineer (remote),fortifyiq,"salem, ma",,not listed,job description job description were seeking a...,remote job,https://jooble.org/away/6138173150673953044,data & ai engineer (remote),fortifyiq,"salem, ma",,not listed,job description job description were seeking a...,remote job
6,data entry clerk remote | part-time or full-ti...,spot on media,remote,30.0,hourly,we’re looking for reliable and detailoriented ...,"extra income,flexible hours,full time,part tim...",https://jooble.org/desc/9003126740454289747,data entry clerk remote | part-time or full-ti...,spot on media,remote,30.0,hourly,we’re looking for reliable and detailoriented ...,"extra income,flexible hours,full time,part tim..."
7,data scientist,"buyers edge platform, llc","waltham, ma",,not listed,job description job description we are seeking...,"flexible hours,local area,remote work",https://jooble.org/away/5763598451851120629,data scientist,"buyers edge platform, llc","waltham, ma",,not listed,job description job description we are seeking...,"flexible hours,local area,remote work"
8,data scientist,recooty,"san francisco, ca",80000.0,yearly,about the role aries is seeking a data scienti...,remote work,https://jooble.org/away/2850857609125217330,data scientist,recooty,"san francisco, ca",80000.0,yearly,about the role aries is seeking a data scienti...,remote work
9,data analyst,zearn,remote,75000.0,yearly,solvers and own work that will drive us to ach...,"flexible hours,full time,remote job,temporary ...",https://jooble.org/desc/-2793415339610557791,data analyst,zearn,remote,75000.0,yearly,solvers and own work that will drive us to ach...,"flexible hours,full time,remote job,temporary ..."
