#01_data_processing


In [1]:
import pandas as pd
import json
import os
import uuid
import boto3
from botocore.exceptions import ClientError
from pathlib import Path

In [2]:
# S3 Configuration
S3_BUCKET = "mirra-matcher-325"  # S3 bucket name
# NOTE(review): load_data() defaults to the key "data/raw/final_inference_data.csv",
# which differs from this value — confirm which key is the intended one.
S3_RAW_PATH = "final_inference_data.csv"  # S3 key of the raw CSV file
S3_PROCESSED_PREFIX = "data/processed/jobs/"  # S3 prefix for processed JSON files

# S3 utility functions
def s3_file_exists(bucket, key):
    """Check whether an object is reachable in an S3 bucket.

    Issues a HEAD request for ``s3://bucket/key``.

    Args:
        bucket: S3 bucket name.
        key: S3 object key.

    Returns:
        True if the HEAD request succeeds. False if it raises ``ClientError``
        (object missing, access denied, or any other service-side error).
        Errors other than "not found" are printed so they are not mistaken
        for a missing file.
    """
    s3_client = boto3.client('s3')
    try:
        s3_client.head_object(Bucket=bucket, Key=key)
        return True
    except ClientError as e:
        error_code = e.response.get("Error", {}).get("Code", "")
        # A missing object is the expected negative answer; anything else
        # (e.g. 403) is worth surfacing even though the result is still False.
        if error_code not in ("404", "NoSuchKey", "NotFound"):
            print(f"Could not check s3://{bucket}/{key} (error code {error_code}): {e}")
        return False

In [3]:
def s3_write_json(data, bucket, key):
    """Serialize ``data`` as indented JSON and upload it to S3.

    Args:
        data: JSON-serializable object to store.
        bucket: Destination S3 bucket name.
        key: Destination S3 object key.

    Returns:
        True when the upload succeeds; False when serialization or the
        upload raises (the error is printed rather than re-raised).
    """
    client = boto3.client('s3')
    succeeded = False
    try:
        # Two-space indented JSON text, sent as UTF-8 bytes
        payload = json.dumps(data, indent=2).encode('utf-8')
        client.put_object(
            Body=payload,
            Bucket=bucket,
            Key=key,
            ContentType='application/json',
        )
        print(f"Successfully wrote JSON to s3://{bucket}/{key}")
        succeeded = True
    except Exception as err:
        print(f"Error writing JSON to S3: {str(err)}")
    return succeeded


In [4]:
def load_data(bucket="mirra-matcher-325", key="data/raw/final_inference_data.csv"):
    """
    Load job description data from a CSV file stored in S3.

    The object body is streamed from S3 straight into pandas; no local file
    is written.

    Args:
        bucket: S3 bucket name
        key: S3 key of the CSV file

    Returns:
        DataFrame with job description data, or None if the object is not
        found or cannot be read/parsed (the reason is printed)
    """
    # Bail out early with a clear message when the object is not reachable
    if not s3_file_exists(bucket, key):
        print(f"File not found in S3: s3://{bucket}/{key}")
        return None

    try:
        # Get a handle on the object and let pandas parse the CSV body directly
        obj = boto3.resource('s3').Object(bucket, key)
        df = pd.read_csv(obj.get()['Body'])

        print(f"Successfully loaded {len(df)} rows from S3: s3://{bucket}/{key}")
        return df

    except Exception as e:
        print(f"Error loading data from S3: {str(e)}")
        return None


In [None]:
df = load_data()
# load_data() returns None on any failure; stop here with a clear error
# instead of letting later cells fail on a None DataFrame.
if df is None:
    raise RuntimeError("Could not load job description data from S3; see the message printed above.")

In [10]:
def extract_qualifications(data):
    """
    Parse one ``json_data`` cell into a qualifications dict.

    Accepts an existing dict, a Python-dict literal string, or a JSON string.
    Anything missing, unparseable, or not a dict yields an empty dict, so
    callers can rely on always getting a dict back.

    Args:
        data: Raw cell value (dict, str, NaN/None, or any other object).

    Returns:
        The parsed dict, or ``{}`` when no dict could be obtained.
    """
    import ast

    # Already parsed: hand it back untouched
    if isinstance(data, dict):
        return data

    # pd.isna() returns an array for list-likes (ambiguous truth value),
    # so only run the missing-value check on scalars.
    if pd.api.types.is_scalar(data) and pd.isna(data):
        return {}

    text = data if isinstance(data, str) else str(data)

    # Try Python-literal syntax first (single quotes, None/True/False),
    # then JSON (null/true/false).
    for parse in (ast.literal_eval, json.loads):
        try:
            parsed = parse(text)
        except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
            # Malformed input for this parser; fall through to the next one
            continue
        # A valid literal that is not a dict (list, number, string) is not
        # usable as qualifications, so it must not leak to the caller.
        if isinstance(parsed, dict):
            return parsed

    return {}

In [None]:
# Parse every raw json_data entry with extract_qualifications and keep the
# result in its own column.
df["qualifications"] = (
    df["json_data"]
    .apply(extract_qualifications)
)

In [None]:
# Count rows whose parsed qualifications contain at least one entry
non_empty_rows = (df["qualifications"].apply(len) > 0).sum()
print(f"\nNumber of rows with non-empty qualifications: {non_empty_rows}")

In [None]:
# Create a unique ID for each job if not present
if "id" not in df.columns:
    print("Adding job IDs where missing...")
    df["id"] = [f"job_{uuid.uuid4()}" for _ in range(len(df))]
else:
    # Compute the missing-ID mask once and reuse it
    missing_id = df["id"].isna()
    if missing_id.any():
        print("Adding job IDs where missing...")
        # Cast to object first: writing string IDs into a numeric column is
        # deprecated implicit upcasting in pandas.
        df["id"] = df["id"].astype(object)
        df.loc[missing_id, "id"] = [f"job_{uuid.uuid4()}" for _ in range(int(missing_id.sum()))]


In [None]:
# Save each job as an individual JSON file
saved_count = 0
skipped_count = 0
failed_count = 0
# Only two columns are needed, so iterate them directly rather than
# materialising a Series per row with iterrows().
for job_id, qualifications in zip(df["id"], df["qualifications"]):
    # Skip rows with empty qualifications, and anything that is not a dict
    # (the job_id assignment below would raise on it)
    if not isinstance(qualifications, dict) or not qualifications:
        skipped_count += 1
        continue

    # Ensure job_id is in the qualifications
    qualifications["job_id"] = job_id

    # Save to S3
    output_key = f"{S3_PROCESSED_PREFIX}{job_id}.json"
    if s3_write_json(qualifications, S3_BUCKET, output_key):
        saved_count += 1
    else:
        failed_count += 1

print(f"\nProcessed and saved {saved_count} job descriptions to s3://{S3_BUCKET}/{S3_PROCESSED_PREFIX}")
# Surface what was not written so a partial run is not mistaken for a full one
if skipped_count or failed_count:
    print(f"Skipped {skipped_count} rows without usable qualifications; {failed_count} uploads failed")
