# The first thing we have to do is collect some schemas that classify occupations.

We will begin with the SSYK schema from Statistics Sweden.

The document is available at https://www.scb.se/contentassets/c9d055b6f2114b62bd23c33602b56da5/ov9999_2012a01_br_x70br1201.pdf

A local copy is stored at `data/pdf/ssyk.pdf` (the code below opens it via the relative path `../data/pdf/ssyk.pdf`).

In [5]:
import fitz  # PyMuPDF
import json
import re
import os

# Path to the SSYK PDF, relative to the directory the notebook runs from
pdf_path = '../data/pdf/ssyk.pdf'

# Page range to extract. process_document iterates
# range(start_page - 1, end_page), so these are 1-based and inclusive.
start_page = 33  # replace with your actual start page
end_page = 35  # replace with your actual end page

# Generic occupation entry:
#   code        - 1 to 4 digits at the start of a line
#   title       - the rest of that line (after one whitespace character)
#   description - everything that follows, stopping just before the next
#                 newline that is followed by 1-4 digits and whitespace
#                 (or running to the end of the text if there is none)
# MULTILINE lets ^ anchor at every line start; DOTALL lets "." span newlines
# so a description can cover several lines.
pattern = re.compile(
    r'^(?P<code>\d{1,4})\s(?P<title>[^\n]+)\n(?P<description>(?:.(?!\n\d{1,4}\s))*.)',
    re.MULTILINE | re.DOTALL
)
# Entries introduced by a "Yrkesgrupp NNN" heading (three-digit code):
#   group 1 - the three-digit code
#   group 2 - the title, i.e. the remainder of the heading line
#   group 3 - the description, lazily matched up to the next
#             "Yrkesgrupp NNN" heading, a newline followed by four digits,
#             or the end of the text
level_3_pattern = re.compile(
    r'Yrkesgrupp (\d{3})\s*(.*?)\n(.*?)(?=\nYrkesgrupp \d{3}|\n\d{4}|\Z)',
    re.DOTALL
)

# Directory that receives the per-page and combined JSON files
json_path = '../data/schemas/ssyk/'

# Function to process a single page
def process_page(page):
    """Extract occupation records from a single PDF page.

    The page text is scanned with the module-level ``pattern`` and
    ``level_3_pattern`` regexes; hits from the first are listed before hits
    from the second.

    Args:
        page: Object exposing ``get_text()`` that returns the page text.

    Returns:
        A list of dicts with keys ``code``, ``title``, ``description``
        (each whitespace-stripped) and ``level`` (the number of characters
        in the matched code).
    """
    text = page.get_text()
    # Both regexes yield (code, title, description) triples via findall.
    hits = pattern.findall(text) + level_3_pattern.findall(text)
    return [
        {
            "code": code.strip(),
            "title": title.strip(),
            "description": description.strip(),
            "level": len(code),
        }
        for code, title, description in hits
    ]

# Function to process the entire document and save each page's data to a separate JSON file
def process_document(pdf_path, start_page, end_page, json_path):
    """Extract occupations from a page range and write them out as JSON.

    For every page in the range, one ``occupations_page_<n>.json`` file is
    written (``<n>`` is the 1-based page number). Afterwards all records are
    written together to ``combined_occupations.json``.

    Args:
        pdf_path: Path of the PDF to read.
        start_page: First page to process (1-based, inclusive).
        end_page: Last page to process (1-based, inclusive).
        json_path: Directory that receives the JSON files; it is created
            if it does not exist yet.

    Returns:
        The list of occupation dicts collected from all processed pages.
    """
    # Create the output directory up front so the first open() cannot fail
    # on a fresh checkout. An empty path means "current directory".
    if json_path:
        os.makedirs(json_path, exist_ok=True)

    all_occupations = []
    # The context manager closes the document even if a page fails to parse
    # or a file cannot be written.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(start_page - 1, end_page):
            # load_page takes a 0-based index.
            page = pdf_document.load_page(page_num)
            page_occupations = process_page(page)
            all_occupations.extend(page_occupations)
            # Save the current page occupations to a JSON file
            page_file = os.path.join(json_path, f'occupations_page_{page_num+1}.json')
            with open(page_file, 'w', encoding='utf-8') as f:
                json.dump(page_occupations, f, ensure_ascii=False, indent=4)

    # Save all occupations to a single JSON file
    with open(os.path.join(json_path, 'combined_occupations.json'), 'w', encoding='utf-8') as f:
        json.dump(all_occupations, f, ensure_ascii=False, indent=4)
    return all_occupations

# Run the extraction over the configured page range; this writes one JSON
# file per page plus a combined file under json_path.
all_occupations = process_document(
    pdf_path=pdf_path,
    start_page=start_page,
    end_page=end_page,
    json_path=json_path,
)


In [6]:
import re
import json
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path, start_page, end_page):
    """Return the concatenated text of a page range of a PDF.

    Args:
        pdf_path: Path of the PDF to read.
        start_page: First page to read (1-based, inclusive).
        end_page: Last page to read (1-based, inclusive).

    Returns:
        The text of all pages in the range, joined without a separator.
    """
    with fitz.open(pdf_path) as doc:
        # load_page expects 0-based indices, hence the start_page - 1.
        page_texts = [
            doc.load_page(index).get_text()
            for index in range(start_page - 1, end_page)
        ]
    return "".join(page_texts)

def clean_and_join_lines(text_lines):
    """Normalise extracted text and split it into one chunk per coded entry.

    Steps: undo end-of-line hyphenation, mark every newline that is followed
    by a 1-4 digit code and whitespace as a split point, flatten the
    remaining newlines to spaces, then split on the markers.

    Args:
        text_lines: Either the text as a single string, or an iterable of
            lines (these are joined with newlines first).

    Returns:
        A list of strings; every element after the first starts with a
        1-4 digit code.
    """
    # A plain string must not go through str.join: joining a str iterates
    # its characters and would put a newline between every character.
    if isinstance(text_lines, str):
        text = text_lines
    else:
        text = '\n'.join(text_lines)
    # Re-join words that were hyphenated across a line break.
    text = re.sub(r'-\s*\n\s*', '', text)
    # Mark the start of each coded entry before newlines are flattened.
    text = re.sub(r'\n(?=\d{1,4}\s)', '##SPLIT##', text)
    text = text.replace('\n', ' ')
    return text.split('##SPLIT##')

def is_likely_occupation_line(line):
    """Tell whether a line looks like an occupation entry.

    A line qualifies when it starts with 1-4 digits, followed by whitespace
    and then a non-digit character, and does not contain the substring
    "fotnot" (compared case-insensitively).

    Args:
        line: The text line to test.

    Returns:
        True if the line qualifies, otherwise False (always a bool).
    """
    if re.match(r'^\d{1,4}\s+\D', line) is None:
        return False
    return 'fotnot' not in line.lower()

def parse_cleaned_text_to_json(text_lines):
    """Turn cleaned "<code> <title>" lines into occupation records.

    Lines that do not start with a 1-4 digit code followed by whitespace and
    a title are skipped.

    Args:
        text_lines: Iterable of single-line strings.

    Returns:
        A list of dicts with keys ``code`` (the digits as a string),
        ``title`` (stripped, including any trailing level suffix) and
        ``level`` (the number of digits in the code).
    """
    occupations = []
    # The optional suffix is the word "niv" + a-with-ring, a whitespace
    # character and a digit. The letter is written as the escape \u00e5
    # (understood by the re module) so the pattern survives re-encoding of
    # this file; the literal had previously been garbled into mojibake and
    # could never match.
    pattern = re.compile(r'^(\d{1,4})\s+(.+?)(?:\s+(niv\u00e5\s\d))?\s*$', re.MULTILINE)

    for line in text_lines:
        match = pattern.match(line)
        if not match:
            continue
        code, title, level_suffix = match.groups()
        if level_suffix:
            # The suffix is captured separately but belongs to the title.
            title += f' {level_suffix}'
        occupations.append({
            "code": code,
            "title": title.strip(),
            "level": len(code)
        })

    return occupations

def save_to_json(data, file_path):
    """Write ``data`` to ``file_path`` as UTF-8 encoded, indented JSON.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX``
    escapes, and nested structures are indented by four spaces.

    Args:
        data: JSON-serialisable object to write.
        file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)

# Input PDF and the page range to read (1-based, inclusive: the extractor
# iterates range(start_page - 1, end_page))
pdf_path = '../data/pdf/ssyk.pdf'
start_page, end_page = 23, 32

# Pipeline: raw text -> cleaned chunks -> likely occupation lines -> records
extracted_text = extract_text_from_pdf(pdf_path, start_page, end_page)
cleaned_lines = clean_and_join_lines(extracted_text)
filtered_lines = list(filter(is_likely_occupation_line, cleaned_lines))
occupations_json = parse_cleaned_text_to_json(filtered_lines)

# Write the code/title/level records next to the notebook
json_file_path = 'ssyk_schema_codes_titles_levels.json'
save_to_json(occupations_json, json_file_path)
