# 01_data_processing
- Loads job descriptions from a CSV file (from S3 when enabled, otherwise from local storage)
- Extracts qualifications from the JSON stored in the `label` column
- Assigns a unique ID to every job that does not already have one
- Saves each job's qualifications as an individual JSON file
- Creates formatted text for reference purposes

In [2]:
import pandas as pd
import json
import os
import uuid
import boto3
from botocore.exceptions import ClientError
from pathlib import Path

In [11]:
# S3 Configuration
# When S3_ENABLED is True, load_data() tries the bucket before the local file
# and every processed JSON file is also uploaded to the bucket.
S3_ENABLED = True  # Set to False to use local storage only
S3_BUCKET = "mirra-matcher-325"
S3_RAW_PATH = "data/raw/job_description_extraction_samples_1.csv"  # S3 key of the raw CSV file
# Concatenated directly with "<job_id>.json" to build upload keys, so the
# trailing slash is significant.
S3_PROCESSED_PREFIX = "data/processed/jobs/"  # S3 prefix for processed JSON files

# Local storage locations; also used as the download target when reading from S3
LOCAL_RAW_DIR = "../data/raw"
LOCAL_PROCESSED_DIR = "../data/processed/jobs"
# The S3 CSV is downloaded to this path, overwriting any existing local copy.
LOCAL_RAW_FILE = os.path.join(LOCAL_RAW_DIR, "job_descriptions.csv")

# Create the local directories up front (no error if they already exist)
os.makedirs(LOCAL_RAW_DIR, exist_ok=True)
os.makedirs(LOCAL_PROCESSED_DIR, exist_ok=True)

# S3 utility functions
def s3_file_exists(bucket, key):
    """Check whether an object exists in an S3 bucket.

    Issues a HEAD request for ``s3://bucket/key``.

    Args:
        bucket: Name of the S3 bucket.
        key: Key of the object inside the bucket.

    Returns:
        True if the HEAD request succeeds. False if it raises a
        ``ClientError``; when the error is anything other than "not found"
        (for example access denied), the error is printed so that a
        permission or configuration problem is not mistaken for a missing
        file.
    """
    s3_client = boto3.client('s3')
    try:
        s3_client.head_object(Bucket=bucket, Key=key)
        return True
    except ClientError as e:
        # HEAD responses carry no body, so a missing key is reported with the
        # bare HTTP status "404" as its error code.
        error_code = str(e.response.get("Error", {}).get("Code", ""))
        if error_code not in ("404", "NoSuchKey", "NotFound"):
            print(f"Could not check s3://{bucket}/{key}: {e}")
        return False

In [12]:
def download_from_s3(bucket, key, local_path):
    """Fetch a single S3 object and write it to a local path.

    Args:
        bucket: Name of the S3 bucket to read from.
        key: Key of the object inside the bucket.
        local_path: Destination path on the local file system.

    Returns:
        True when the download completes, False when boto3 raises a
        ``ClientError`` (the error is printed).
    """
    client = boto3.client('s3')
    succeeded = False
    try:
        print(f"Downloading s3://{bucket}/{key} to {local_path}")
        client.download_file(bucket, key, local_path)
    except ClientError as err:
        print(f"Error downloading from S3: {err}")
    else:
        succeeded = True
    return succeeded


In [13]:
def upload_to_s3(local_path, bucket, key):
    """Upload a file from local storage to S3.

    Args:
        local_path: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        key: Key to store the object under.

    Returns:
        True if the upload succeeds, False if S3 rejects it (the error is
        printed).
    """
    # boto3's managed upload_file() re-raises client errors as
    # S3UploadFailedError, which is not a ClientError subclass, so it has to
    # be caught explicitly for failed uploads to return False.
    from boto3.exceptions import S3UploadFailedError

    s3_client = boto3.client('s3')
    try:
        print(f"Uploading {local_path} to s3://{bucket}/{key}")
        s3_client.upload_file(local_path, bucket, key)
        return True
    except (ClientError, S3UploadFailedError) as e:
        print(f"Error uploading to S3: {e}")
        return False

In [14]:
# fallback data used by load_data() when no CSV can be obtained
def _build_sample_dataframe():
    """Return a one-row DataFrame holding a single sample job for testing."""
    return pd.DataFrame({
        "id": ["sample_001"],
        "title": ["Software Engineer - ML Tools Support"],
        "description": ["Sample job description"],
        "label": ['''{"details":{"job_title":["Software Engineer - ML Tools Support - 314915"],"job_title_base":["Software Engineer"],"company_name":["Trillium Teamologies"],"industry":[],"employment_type":["Full-time"],"wage":[],"location":[{"city":"","state":"MI","country":"US"}],"wfh_policy":["Hybrid"],"travel_required":{"required":false,"hours_weekly":0},"benefits":{"medical":false,"dental":false,"vision":false,"mental_health":false,"hsa":false,"fsa":false,"401k_match":false,"equity":false,"unlimited_pto":false,"tuition_reimbursement":false,"bonus":false,"other":[]},"company_stage":[],"tax_terms":["Direct-hire"],"experience_level":["Senior"],"work_schedule":[],"work_authorization":["Does Not Offer Sponsorship"]},"mandatory":{"hard_skills":[{"skill":["Agile practices in solution delivery"],"minyears":[0]}],"education":[{"education_level":["Bachelor's"],"field_of_study":["Computer Science"]}],"credentials":[{"credential":["General Coding Proficiency Test"]}],"professional_background":[{"background":["software engineer"],"minyears":[4],"industry":[]}]},"preferred":{"hard_skills":[{"skill":["Python"],"minyears":[0]}],"education":[],"credentials":[],"professional_background":[]},"responsibility":{"hard_skills":[{"skill":["Manage machine learning tools using Agile Methodology"]}],"professional_background":[{"background":["Collaborate with other software engineers"]}]}}''']
    })


# function to load data from either s3 or local storage
def load_data():
    """
    Load job description data from CSV file.

    Sources are tried in this order:
      1. S3 (only when S3_ENABLED): the object is downloaded to
         LOCAL_RAW_FILE and read from there.
      2. The local file LOCAL_RAW_FILE.
      3. An interactive Colab upload, when running inside Google Colab. The
         uploaded CSV is saved to LOCAL_RAW_FILE and, when S3 is enabled,
         uploaded to S3 as well.
      4. A built-in one-row sample DataFrame, when Colab is unavailable or
         the upload dialog was dismissed without selecting a file.

    Returns:
        pandas.DataFrame with the job descriptions.
    """
    # will try loading from S3 first if enabled
    if S3_ENABLED and s3_file_exists(S3_BUCKET, S3_RAW_PATH):
        # Download to local temporary file
        if download_from_s3(S3_BUCKET, S3_RAW_PATH, LOCAL_RAW_FILE):
            df = pd.read_csv(LOCAL_RAW_FILE)
            print(f"Loaded {len(df)} rows from S3: s3://{S3_BUCKET}/{S3_RAW_PATH}")
            return df
        print("Failed to download file from S3")

    # If S3 loading failed or is disabled, try local file
    if os.path.exists(LOCAL_RAW_FILE):
        df = pd.read_csv(LOCAL_RAW_FILE)
        print(f"Loaded {len(df)} rows from local file: {LOCAL_RAW_FILE}")
        return df

    # If no file is found, try interactive methods or provide sample
    print(f"File not found locally: {LOCAL_RAW_FILE}")
    try:
        # Optional dependency: only importable inside a Colab runtime. The
        # try block is kept this narrow so that an ImportError raised while
        # reading the CSV is not mistaken for "not running in Colab".
        from google.colab import files
    except ImportError:
        print("using sample data")
        return _build_sample_dataframe()

    print("Uploading file via Colab...")
    uploaded = files.upload()
    if not uploaded:
        # The upload dialog was cancelled: there is no filename to read.
        print("No file uploaded; using sample data")
        return _build_sample_dataframe()

    filename = next(iter(uploaded))
    df = pd.read_csv(filename)

    # Save to the expected local location
    df.to_csv(LOCAL_RAW_FILE, index=False)
    print(f"Saved uploaded file to {LOCAL_RAW_FILE}")

    # Also upload to S3 if enabled
    if S3_ENABLED:
        upload_to_s3(LOCAL_RAW_FILE, S3_BUCKET, S3_RAW_PATH)

    return df

In [15]:
# Load the raw job descriptions: S3 first when enabled, then the local CSV,
# then a Colab upload or the built-in sample row.
df = load_data()

Downloading s3://mirra-matcher-325/data/raw/job_description_extraction_samples_1.csv to ../data/raw/job_descriptions.csv
Loaded 25 rows from S3: s3://mirra-matcher-325/data/raw/job_description_extraction_samples_1.csv


In [16]:
# List the columns of the loaded frame, header and list on separate lines
print("\nDataFrame columns:", df.columns.tolist(), sep="\n")


DataFrame columns:
['snapshot_id', 'id', 'title', 'description', 'location', 'salary', 'pay_rate', 'telecommute_option', 'job_length', 'tax_terms', 'source_app', 'company_name', 'web_url', 'company', 'year', 'month', 'day', 'id_rank', 'rn', 'description_length', 'description_new', 'reformatted', 'label']


In [17]:
def extract_qualifications(label_json):
    """
    Parse the value of the 'label' column into a qualifications dictionary.

    Args:
        label_json: A JSON string (str/bytes), an already-parsed dict, or a
            missing value (None / NaN / pd.NA).

    Returns:
        dict: The parsed JSON object. An input dict is returned unchanged.
        An empty dict is returned when the value is missing, is not text,
        is not valid JSON, or is valid JSON that is not an object (e.g. a
        list or a number), so callers can always rely on dict methods.
    """
    if isinstance(label_json, dict):
        return label_json

    if not isinstance(label_json, (str, bytes, bytearray)):
        # pd.isna() returns an array for list-like input, which cannot be
        # used in a boolean test, so only ask it about scalars.
        is_missing = pd.api.types.is_scalar(label_json) and pd.isna(label_json)
        if not is_missing:
            print(f"JSON parsing error: unsupported label type {type(label_json).__name__}")
        return {}

    try:
        # ValueError covers json.JSONDecodeError and bytes decoding failures.
        parsed = json.loads(label_json)
    except ValueError as e:
        print(f"JSON parsing error: {e}")
        return {}

    if not isinstance(parsed, dict):
        print(f"JSON parsing error: expected a JSON object, got {type(parsed).__name__}")
        return {}
    return parsed

In [18]:
# Parse every row's 'label' value into a dict; extract_qualifications returns
# {} for missing values and for strings that fail to parse.
df["qualifications"] = df["label"].apply(extract_qualifications)

In [19]:
# Give every job an ID: build the whole column when it is absent, otherwise
# fill only the rows whose ID is missing.
if "id" not in df.columns:
    print("Adding job IDs where missing...")
    df["id"] = [f"job_{uuid.uuid4()}" for _ in range(len(df))]
else:
    missing_id_mask = df["id"].isna()
    if missing_id_mask.any():
        print("Adding job IDs where missing...")
        new_ids = [f"job_{uuid.uuid4()}" for _ in range(int(missing_id_mask.sum()))]
        df.loc[missing_id_mask, "id"] = new_ids


In [21]:
# save each job as an individual JSON file
saved_count = 0      # files written locally
uploaded_count = 0   # files that also reached S3

# Create the S3 client once instead of once per job. A failure here is
# reported and uploads are skipped, matching the loop's best-effort handling.
s3_client = None
if S3_ENABLED:
    try:
        s3_client = boto3.client('s3')
    except Exception as e:
        print(f"Error creating S3 client: {str(e)}")

# Only the two needed columns are walked, which avoids building a Series per row.
for job_id, qualifications in zip(df["id"], df["qualifications"]):
    # Skip rows with empty qualifications
    if not qualifications:
        continue

    # Ensure job_id is in the qualifications (this updates the dict stored in df)
    qualifications["job_id"] = job_id

    # Save locally first
    local_output_path = os.path.join(LOCAL_PROCESSED_DIR, f"{job_id}.json")
    with open(local_output_path, 'w') as f:
        json.dump(qualifications, f, indent=2)
    saved_count += 1

    # Then upload to S3 if S3 is enabled; a failed upload is reported but
    # does not stop the remaining jobs from being processed.
    if s3_client is not None:
        s3_key = f"{S3_PROCESSED_PREFIX}{job_id}.json"
        try:
            s3_client.upload_file(local_output_path, S3_BUCKET, s3_key)
            print(f"Uploaded to s3://{S3_BUCKET}/{s3_key}")
            uploaded_count += 1
        except Exception as e:
            print(f"Error uploading to S3: {str(e)}")

print(f"\nProcessed and saved {saved_count} job descriptions locally to {LOCAL_PROCESSED_DIR}")
if S3_ENABLED:
    # Report the real upload count so failed uploads are not hidden.
    print(f"and uploaded {uploaded_count} of {saved_count} to S3 at s3://{S3_BUCKET}/{S3_PROCESSED_PREFIX}")


Uploaded to s3://mirra-matcher-325/data/processed/jobs/a2998f35-d332-4fee-8170-c9c946fab8e0.json
Uploaded to s3://mirra-matcher-325/data/processed/jobs/5e00aafa-57c3-4c06-8b06-1fe311bc8e4e.json
Uploaded to s3://mirra-matcher-325/data/processed/jobs/f20e91c6-5042-44a7-a065-6a9454797a47.json
Uploaded to s3://mirra-matcher-325/data/processed/jobs/0d652f5d-9822-4b9b-8274-65674eb68b42.json
Uploaded to s3://mirra-matcher-325/data/processed/jobs/90a9f50c-584c-49a4-ac5e-5928ff8a4b94.json
Uploaded to s3://mirra-matcher-325/data/processed/jobs/44c8bc5c-49c1-4467-b7b4-fdb2e7e11790.json
Uploaded to s3://mirra-matcher-325/data/processed/jobs/f71df3a6-441f-44af-bd75-f6d922e08a37.json
Uploaded to s3://mirra-matcher-325/data/processed/jobs/69c34396-8455-47cf-a8df-d09dbd7e31ea.json
Uploaded to s3://mirra-matcher-325/data/processed/jobs/9d15576d-1d0f-4396-88d7-4ea76a13d639.json
Uploaded to s3://mirra-matcher-325/data/processed/jobs/babf4bb0-7f3a-48e6-be4b-53e3934f9e0a.json
Uploaded to s3://mirra-matcher

In [22]:
# formatted text version for backup/reference
def format_for_embedding(qualifications):
    """
    Format the structured qualifications JSON into a clean text string.
    This is for reference only - the embedding process will use the JSON files.

    Args:
        qualifications: Parsed qualifications dict. The keys read are
            ``details.job_title`` and ``mandatory.hard_skills`` /
            ``mandatory.education``. Missing keys and JSON ``null`` values
            are treated as empty.

    Returns:
        str: Three lines, "Job Title: ...", "Skills: ..." and
        "Education: ...". Only the first entry of each item's ``skill`` and
        ``education_level`` list is used. Education entries with neither a
        level nor a field are skipped.
    """
    qualifications = qualifications or {}
    # `or {}` / `or []` also covers keys that are present but null.
    details = qualifications.get("details") or {}
    mandatory = qualifications.get("mandatory") or {}

    job_title = ", ".join(details.get("job_title") or [])

    # Process hard skills: first skill name of every item that has one
    hard_skills = ", ".join(
        item["skill"][0]
        for item in mandatory.get("hard_skills") or []
        if item.get("skill")
    )

    # Process education information
    education_entries = []
    for edu in mandatory.get("education") or []:
        levels = edu.get("education_level") or []
        level = levels[0] if levels else ""
        fields = ", ".join(edu.get("field_of_study") or [])
        if level and fields:
            education_entries.append(f"{level} in {fields}")
        elif level or fields:
            # Only one part is present: emit it alone, without a dangling " in ".
            education_entries.append(level or fields)
    education = ", ".join(education_entries)

    formatted_text = (
        f"Job Title: {job_title}\n"
        f"Skills: {hard_skills}\n"
        f"Education: {education}"
    )
    return formatted_text


In [24]:
# Preview the first job, if there is one: its formatted text and where its
# JSON file was written.
if len(df):
    sample_index = 0
    sample_job = df.iloc[sample_index]
    sample_text = format_for_embedding(sample_job["qualifications"])
    print("\nSample job formatted text:")
    print(sample_text)

    # Local path and (when enabled) S3 URI of the saved JSON
    job_id = sample_job["id"]
    local_json_path = os.path.join(LOCAL_PROCESSED_DIR, f'{job_id}.json')
    print(f"\nSaved locally to: {local_json_path}")
    if S3_ENABLED:
        print(f"Uploaded to S3: s3://{S3_BUCKET}/{S3_PROCESSED_PREFIX}{job_id}.json")

print("\nData processing complete!")


Sample job formatted text:
Job Title: Software Engineer - ML Tools Support - 314915
Skills: Agile practices in solution delivery, OpenShift, Google Cloud Platform, Gen AI projects, coding and software engineering best practices, Docker, Kubernetes, GitHub, end-to-end Machine Learning and Gen AI technology stack
Education: Bachelor’s in Computer Science, Computer Engineering, Related

Saved locally to: ../data/processed/jobs/a2998f35-d332-4fee-8170-c9c946fab8e0.json
Uploaded to S3: s3://mirra-matcher-325/data/processed/jobs/a2998f35-d332-4fee-8170-c9c946fab8e0.json

Data processing complete!
