# 01_data_processing
- Loads job descriptions from a CSV file stored in S3
- Extracts qualifications from JSON format
- Assigns unique IDs to each job
- Saves individual job description files as JSON
- Creates formatted text for reference purposes

In [14]:
import pandas as pd
import json
import os
import uuid
import boto3
from botocore.exceptions import ClientError
from pathlib import Path

In [15]:
# S3 configuration for this notebook.
# NOTE(review): load_data() repeats the bucket name and raw CSV key as literal
# default arguments instead of reading these constants -- keep them in sync.
S3_BUCKET = "mirra-matcher-32"  # Bucket the processed job JSON files are written to
S3_RAW_PATH = "data/raw/job_description_extraction_samples_3.csv"  # Object key of the raw CSV file
S3_PROCESSED_PREFIX = "data/processed/jobs/"  # Key prefix; each job is saved as <prefix><job_id>.json

# S3 utility functions
def s3_file_exists(bucket, key):
    """Check whether an object exists in an S3 bucket.

    Issues a HEAD request for ``s3://bucket/key``.

    Args:
        bucket: S3 bucket name.
        key: Object key inside the bucket.

    Returns:
        True if the HEAD request succeeds. False if S3 answers with a client
        error; when that error is anything other than "not found" (for
        example a permission problem), a diagnostic line is printed so the
        failure is not mistaken for a missing file.
    """
    s3_client = boto3.client('s3')
    try:
        s3_client.head_object(Bucket=bucket, Key=key)
        return True
    except ClientError as e:
        # Pull the service error code out of the botocore error response.
        error_info = (getattr(e, "response", None) or {}).get("Error", {})
        error_code = str(error_info.get("Code", ""))
        if error_code not in ("404", "NoSuchKey", "NotFound"):
            # Still best-effort (return False), but say why the check failed.
            print(f"Could not check s3://{bucket}/{key} (error code: {error_code or 'unknown'})")
        return False

In [16]:
def s3_write_json(data, bucket, key):
    """Serialize ``data`` to JSON and upload it to ``s3://bucket/key``.

    Args:
        data: JSON-serializable object to store.
        bucket: Destination S3 bucket name.
        key: Destination object key.

    Returns:
        True when the upload succeeded, False when serialization or the
        upload raised (the error is printed, not re-raised).
    """
    client = boto3.client('s3')
    try:
        # Pretty-printed JSON, sent as UTF-8 bytes.
        payload = json.dumps(data, indent=2).encode('utf-8')
        client.put_object(
            Bucket=bucket,
            Key=key,
            Body=payload,
            ContentType='application/json',
        )
        print(f"Successfully wrote JSON to s3://{bucket}/{key}")
    except Exception as err:
        print(f"Error writing JSON to S3: {str(err)}")
        return False
    return True


In [17]:
def load_data(bucket="mirra-matcher-32", key="data/raw/job_description_extraction_samples_3.csv"):
    """Load the job description CSV from S3 into a DataFrame.

    The object is read straight from S3; nothing is written to or read from
    the local file system.

    Args:
        bucket: S3 bucket name.
        key: S3 key of the CSV file.

    Returns:
        DataFrame with the CSV contents, or None if the existence check fails
        or if reading/parsing the object raises (a message is printed in both
        cases).
    """
    # Bail out early when the existence check says the object is not there.
    if not s3_file_exists(bucket, key):
        print(f"File not found in S3: s3://{bucket}/{key}")
        return None

    try:
        # Fetch the object and let pandas parse the streamed response body.
        obj = boto3.resource('s3').Object(bucket, key)
        df = pd.read_csv(obj.get()['Body'])

        print(f"Successfully loaded {len(df)} rows from S3: s3://{bucket}/{key}")
        return df

    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading data from S3: {str(e)}")
        return None


In [18]:
# Load the raw job descriptions from S3. load_data() returns None when the
# object is missing or cannot be read, so stop here with a clear error rather
# than failing in a later cell with an AttributeError on None.
df = load_data()
if df is None:
    raise RuntimeError("Could not load job description data from S3; see the message printed above.")

Successfully loaded 50 rows from S3: s3://mirra-matcher-32/data/raw/job_description_extraction_samples_3.csv


In [19]:
# Show the column names of the loaded frame under a heading (one print call,
# heading and list separated by a newline).
print("\nDataFrame columns:", df.columns.tolist(), sep="\n")


DataFrame columns:
['snapshot_id', 'id', 'title', 'description', 'location', 'salary', 'pay_rate', 'telecommute_option', 'job_length', 'tax_terms', 'source_app', 'company_name', 'web_url', 'company', 'year', 'month', 'day', 'id_rank', 'rn', 'description_length', 'description_new', 'reformatted', 'extracted']


In [20]:
def extract_qualifications(label_json):
    """Turn one raw qualifications value into a dictionary.

    Intended for cells of the 'extracted' column, which hold either a JSON
    string, an already-parsed dict, or a missing value.

    Args:
        label_json: A dict, a JSON string/bytes, or a missing value
            (None / NaN / pd.NA).

    Returns:
        The dict itself when a dict is passed in, the parsed JSON object when
        the input is valid JSON encoding an object, and an empty dict in every
        other case (missing value, malformed JSON, JSON that is not an object,
        unsupported input type). Problems other than a missing value are
        printed.
    """
    if isinstance(label_json, dict):
        return label_json

    if not isinstance(label_json, (str, bytes, bytearray)):
        # Missing values are expected and stay silent. The is_scalar guard
        # keeps pd.isna from returning an array (ambiguous truth value) for
        # list-like inputs.
        is_missing = pd.api.types.is_scalar(label_json) and pd.isna(label_json)
        if not is_missing:
            print(f"JSON parsing error: unsupported input type {type(label_json).__name__}")
        return {}

    try:
        parsed = json.loads(label_json)
    except (ValueError, TypeError) as e:
        # json.JSONDecodeError is a ValueError; undecodable bytes land here too.
        print(f"JSON parsing error: {e}")
        return {}

    # Callers treat the result as a dict, so reject JSON arrays/scalars.
    if not isinstance(parsed, dict):
        print(f"JSON parsing error: expected a JSON object, got {type(parsed).__name__}")
        return {}
    return parsed

In [21]:
# Parse every 'extracted' cell into a dict and keep the result in a new column.
df["qualifications"] = df["extracted"].map(extract_qualifications)

In [22]:
# Guarantee that every job has an identifier: create the whole column when it
# is absent, otherwise fill only the rows whose id is missing.
if "id" not in df.columns:
    print("Adding job IDs where missing...")
    df["id"] = [f"job_{uuid.uuid4()}" for _ in df.index]
else:
    missing_id_mask = df["id"].isna()
    if missing_id_mask.any():
        print("Adding job IDs where missing...")
        # One fresh UUID-based id per row flagged by the mask.
        df.loc[missing_id_mask, "id"] = [f"job_{uuid.uuid4()}" for _ in range(missing_id_mask.sum())]


In [23]:
# Write one JSON document per job to S3 and count the successful uploads.
saved_count = 0
for _, row in df.iterrows():
    job_id, qualifications = row["id"], row["qualifications"]

    # Rows without any qualifications are not exported.
    if qualifications:
        # Stamp the job's ID into the payload (the row's dict is updated in place).
        qualifications["job_id"] = job_id

        # Object key: <processed prefix><job id>.json
        output_key = f"{S3_PROCESSED_PREFIX}{job_id}.json"
        saved_count += 1 if s3_write_json(qualifications, S3_BUCKET, output_key) else 0

print(f"\nProcessed and saved {saved_count} job descriptions to s3://{S3_BUCKET}/{S3_PROCESSED_PREFIX}")


Successfully wrote JSON to s3://mirra-matcher-32/data/processed/jobs/f9186343-53c2-4026-a46f-d9ceb057e449.json
Successfully wrote JSON to s3://mirra-matcher-32/data/processed/jobs/2f43bd3b-bc7f-457e-ae11-f9da2482f696.json
Successfully wrote JSON to s3://mirra-matcher-32/data/processed/jobs/bb6d2493-a506-4364-bdfe-145ed44307ce.json
Successfully wrote JSON to s3://mirra-matcher-32/data/processed/jobs/8944afca-58e7-465f-88bc-669154500898.json
Successfully wrote JSON to s3://mirra-matcher-32/data/processed/jobs/6ea4e3e3-4762-4116-8813-149442bd5fb1.json
Successfully wrote JSON to s3://mirra-matcher-32/data/processed/jobs/f39db910-904d-4300-aedd-5a468d275770.json
Successfully wrote JSON to s3://mirra-matcher-32/data/processed/jobs/a15701e5-b2a4-4bb9-8a64-7779f3b2447f.json
Successfully wrote JSON to s3://mirra-matcher-32/data/processed/jobs/bea37244-69ae-457a-86da-86a3f1fe4033.json
Successfully wrote JSON to s3://mirra-matcher-32/data/processed/jobs/41e06724-0b8d-41f8-8c6e-40b3cf68f03e.json
S