In [0]:
from pypdf import PdfReader
import os 
import re
import logging
import sys
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
from pyspark.sql.types import *

In [0]:
#Setup basic logging function for tracking and troubleshooting

def setup_logging():
    os.makedirs('logs', exist_ok=True)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(f'logs/mo_codes_run_{datetime.now().strftime('%Y%m%d')}.log'),
            logging.StreamHandler(sys.stdout)
        ]
    )
    return logging.getLogger(__name__)

In [0]:
#Function to identify correct files and select the latest one for transformation

def get_correct_file(logger):
    mo_codes_files_count = 0
    volume = []
    directory_path = '/Volumes/api_crime_data_2020_to_2026_01_30/default/full_crime_data'
    for dirpath,_,filenames in os.walk(directory_path):
        for file in filenames:
            if 'MO_CODES' in file:
                mo_codes_files_count += 1
                logger.info(f'{file} is in the directory')
                volume.append(os.path.join(dirpath, file))
            else:
                pass
            logger.info(f'{mo_codes_files_count} MO code files.')
    volume_sorted = sorted(volume, reverse=True)
    upload_mo_codes_file = volume_sorted[0]
    logger.info(f'Loading lastest MO codes file: {upload_mo_codes_file}')
    logger.info(f'Operation complete.')
    return upload_mo_codes_file

In [0]:
def appended_mo_codes_pages(logger, reader):

    logger.info(f'Combining all MO pages.')

    full_text = []
    for page in reader.pages:
        page_text = page.extract_text()
        full_text.append(page_text)
    logger.info(f'Transformation complete.')
    return full_text

In [0]:
def separate_mo_codes_pages(logger, all_pages_text):

    logger.info(f'Separating MO codes and MO descriptions from text.')

    full_text_combined = ''.join(all_pages_text)
    full_text_new_line_removed = re.sub(r'\n',',', full_text_combined)
    full_text_split = full_text_new_line_removed.split(',')

    logger.info(f'Transformation complete.')

    return full_text_split

In [0]:
def extract_only_mo_codes_with_regex(logger, page_text_in_lines):

    logger.info(f'Removing junk text.')

    pattern = r'^[0-9][0-9][0-9][0-9]'
    parsed_mo_codes = []

    for page_text in page_text_in_lines:
        if re.match(pattern, page_text):
            parsed_mo_codes.append(page_text)
    #print(parsed_mo_codes)

    logger.info(f'Transformation complete.')

    return parsed_mo_codes


In [0]:
def repackage_mo_codes_into_2_column_structure(logger, parsed_mo_codes):

    logger.info(f'Reformatting MO codes and MO descriptions into usable format.')

    mo_code_list = []
    mo_code_description_list = []

    for line in parsed_mo_codes:
        mo_code, mo_code_description = line.split(' ', 1)
        mo_code_list.append(mo_code)
        mo_code_description_list.append(mo_code_description)
    
    mo_code_desc_structure = list(zip(mo_code_list, mo_code_description_list))

    logger.info(f'Transformation complete.')

    return mo_code_desc_structure


In [0]:
def create_spark_dataframe(logger, mo_code_desc_structure):

    logger.info(f'Turning object into Spark dataframe')

    spark = SparkSession.builder.appName('mo_codes').getOrCreate()

    df = spark.createDataFrame(mo_code_desc_structure,['mo_codes', 'mo_descriptions'])

    logger.info(f'Transformation complete.')

    return df

In [0]:
def check_or_create_table(logger, df, table_name, schema_path):

    #logger.info(f'Checking to see if {table_name} exists.')
    
    if spark.catalog.tableExists(f'{schema_path}.{table_name}'):
        logger.info("Table exists; overwriting existing table with updated data.")
        df.write.mode("overwrite").saveAsTable(f'{schema_path}.{table_name}')
    else:
        logger.info('Table does not exist; table will be created.')
        #table = spark.createDataFrame([], schema)
        df.write.format('delta').mode('ignore').saveAsTable(f'{schema_path}.{table_name}')
        logger.info(f'Created {table_name} table.')


In [0]:
def copy_as_consumption_view(logger, table_name, view_name, schema_path):

    logger.info(f'Cloning main table into consumable view.')

    spark.sql(f"""CREATE OR REPLACE TABLE {schema_path}.{view_name} SHALLOW CLONE {schema_path}.{table_name}""")

    logger.info(f'Cloning complete.')
    
    return

In [0]:
def la_crime_data_mo_codes_pipeline_main():

    logger = setup_logging()

    latest_mo_codes_file = get_correct_file(logger)

    reader = PdfReader(latest_mo_codes_file)

    all_pages_text = appended_mo_codes_pages(logger, reader)

    page_text_in_lines = separate_mo_codes_pages(logger, all_pages_text)
    
    parsed_mo_codes = extract_only_mo_codes_with_regex(logger, page_text_in_lines)

    mo_code_desc_structure = repackage_mo_codes_into_2_column_structure(logger, parsed_mo_codes)

    df = create_spark_dataframe(logger, mo_code_desc_structure)

    mo_codes_schema_path = 'api_crime_data_2020_to_2026_01_30.default'
    table_name = 'mo_codes_mt'
    view_name  = 'mo_codes_vw'
    check_or_create_table(logger, df, table_name, mo_codes_schema_path)
    copy_as_consumption_view(logger,table_name, view_name, mo_codes_schema_path)

    logger.info("MO Codes successfully loaded into table.")



In [0]:
la_crime_data_mo_codes_pipeline_main()

2026-02-09 03:47:11,582 - INFO - MO_CODES_Numerical_20180627.pdf is in the directory
2026-02-09 03:47:11,583 - INFO - 1 MO code files.
2026-02-09 03:47:11,584 - INFO - MO_CODES_Numerical_20191119.pdf is in the directory
2026-02-09 03:47:11,584 - INFO - 2 MO code files.
2026-02-09 03:47:11,585 - INFO - 2 MO code files.
2026-02-09 03:47:11,604 - INFO - 2 MO code files.
2026-02-09 03:47:11,605 - INFO - Loading lastest MO codes file: /Volumes/api_crime_data_2020_to_2026_01_30/default/full_crime_data/MO_CODES_Numerical_20191119.pdf
2026-02-09 03:47:11,606 - INFO - Operation complete.
2026-02-09 03:47:11,790 - INFO - Combining all MO pages.
2026-02-09 03:47:12,104 - INFO - Transformation complete.
2026-02-09 03:47:12,106 - INFO - Separating MO codes and MO descriptions from text.
2026-02-09 03:47:12,107 - INFO - Transformation complete.
2026-02-09 03:47:12,108 - INFO - Removing junk text.
2026-02-09 03:47:12,110 - INFO - Transformation complete.
2026-02-09 03:47:12,110 - INFO - Reformatting 