In [1]:
import os
import pandas as pd
import pyspark
from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession 
import pyspark.sql.functions as f

# Step 1: Define the Spark configuration (local master with 36 threads,
# driver/executor memory and the cap on results collected to the driver).
conf = (
    SparkConf()
    .setAppName("PodProcessing")
    .setMaster("local[36]")
    .set("spark.executor.memory", "300g")
    .set("spark.driver.memory", "300g")
    .set("spark.driver.maxResultSize", "10g")
)

# Step 2: Create the SparkSession, or reuse the one that already exists.
# Calling SparkContext(conf=conf) directly raises when the cell is re-run in
# the same kernel because only one SparkContext may be active; getOrCreate()
# makes the cell safe to execute repeatedly.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Step 3: Keep the SparkContext available under its original name.
sc = spark.sparkContext

In [None]:
# Install the MARC tooling used by the conversion cell further down:
# pymarc is imported there and marctable is invoked as a command-line tool.
!pip3.11 install pymarc poetry marctable

In [None]:
# Upgrade pip itself for the pip3.11 installation.
!pip3.11 install --upgrade pip

In [None]:
import os

# Add the user-level bin directory to PATH (presumably where pip places
# console scripts such as marctable -- verify for this image).
# Guarded so that re-running the cell does not append the same entry again,
# and tolerant of PATH being unset.
_local_bin = '/home/jovyan/.local/bin'
_current_path = os.environ.get('PATH', '')
if _local_bin not in _current_path.split(os.pathsep):
    os.environ['PATH'] = (
        _current_path + os.pathsep + _local_bin if _current_path else _local_bin
    )

# Initial load only
Converts MARC to Parquet format for faster processing.

In [None]:
import os
import glob
import tempfile
import logging
from pymarc import Record
from pymarc.marcxml import parse_xml_to_array

# Configure logging
# INFO and above are emitted with a "timestamp - level - message" prefix.
# `logger` is the module-level logger used by the functions defined in this cell.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def get_files():
    """Return the MARC XML files found anywhere under /home/jovyan/work.

    Files whose base name contains 'opensearch' (case-insensitive) are
    excluded because they are not MARC XML.

    Returns:
        A sorted list of matching file paths (possibly empty).
    """
    # glob only descends into sub-directories when the pattern contains '**';
    # recursive=True on its own has no effect, so the previous '*.xml'
    # pattern only ever matched the top-level directory.
    # NOTE(review): output names are derived from the base name only, so two
    # inputs with the same file name in different folders map to one output.
    all_files = glob.glob('/home/jovyan/work/**/*.xml', recursive=True)
    files = sorted(
        path for path in all_files
        if 'opensearch' not in os.path.basename(path).lower()
    )
    logger.info(f"Found {len(files)} marc files: {files}")
    return files

def _write_marc_records(records, destination):
    """Write each pymarc record to ``destination`` using ``record.as_marc()``.

    Raises:
        ValueError: on the first item that is not a ``Record`` instance.
    """
    for record in records:
        if not isinstance(record, Record):
            raise ValueError("Invalid MARC record")
        destination.write(record.as_marc())


def _run_marctable(mrc_path, parquet_path):
    """Run ``marctable parquet`` and return True when it exits with status 0."""
    import subprocess

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact, unlike a string handed to os.system.
    command = ['marctable', 'parquet', mrc_path, parquet_path]
    logger.info(f"Running marctable command: {' '.join(command)}")
    try:
        completed = subprocess.run(command, check=False)
    except OSError as e:
        # e.g. the marctable executable is not on PATH.
        logger.error(f"Could not start marctable: {e}")
        return False
    return completed.returncode == 0


def process_file(file):
    """Convert one MARC file to Parquet via a temporary MRC file and marctable.

    Names ending in ``.xml`` are parsed as MARCXML; any other file is read
    with ``MARCReader`` as binary MARC. The records are re-serialised to a
    temporary file, which is passed to the ``marctable parquet`` command.
    The temporary file is always removed before returning.

    Args:
        file: Path of the input file.

    Returns:
        True if marctable produced the Parquet file, False on any failure
        (failures are logged, not raised).
    """
    logger.info(f"Processing file {file}")

    # Define the output directory for Parquet files
    output_dir = '/home/jovyan/work/marc/parquet'
    os.makedirs(output_dir, exist_ok=True)

    # Determine the output file name: replace .xml with .parquet
    # (kept identical to the naming used by marc2parquet's "already processed" check).
    base = os.path.basename(file)
    output_file = os.path.join(output_dir, base.replace('.xml', '.parquet'))

    is_xml = file.endswith('.xml')
    records = None
    if is_xml:
        try:
            # parse_xml_to_array takes a path or a file-like object, not a
            # bytes payload, so hand it the UTF-8 text stream directly.
            with open(file, 'r', encoding='utf-8') as f_in:
                records = parse_xml_to_array(f_in)
            logger.info(f"Parsed {len(records)} MARC records from XML")
        except Exception as e:
            logger.error(f"Error processing XML file {file} with utf-8 encoding: {e}")
            return False

    temp_file = None
    try:
        try:
            with tempfile.NamedTemporaryFile(delete=False) as temp_out:
                temp_file = temp_out.name
                if is_xml:
                    _write_marc_records(records, temp_out)
                else:
                    from pymarc import MARCReader
                    with open(file, 'rb') as f_in:
                        reader = MARCReader(f_in, to_unicode=True, force_utf8=True, utf8_handling='replace')
                        _write_marc_records(reader, temp_out)
            logger.info(f"Wrote temporary MRC file {temp_file}")
        except Exception as e:
            logger.error(f"Error converting file {file} to MRC: {e}")
            return False

        if not _run_marctable(temp_file, output_file):
            logger.error(f"Error executing marctable command for file {file}")
            # Remove a partial output so that a later run does not mistake
            # this file for one that was already processed.
            if os.path.isfile(output_file):
                os.remove(output_file)
            return False

        logger.info(f"Created Parquet file {output_file}")
        return True
    finally:
        # Single cleanup point; temp_file stays None if it was never created.
        if temp_file is not None and os.path.exists(temp_file):
            os.remove(temp_file)

def marc2parquet():
    """Convert every discovered MARC file that has no Parquet output yet.

    Files whose expected output already exists are skipped and do not count
    towards the totals.

    Returns:
        True when every attempted conversion succeeded (also True when
        nothing was attempted), otherwise False.
    """
    parquet_dir = '/home/jovyan/work/marc/parquet'
    outcomes = []
    for path in get_files():
        # Expected output name: same base name with .xml swapped for .parquet.
        target = os.path.join(parquet_dir, os.path.basename(path).replace('.xml', '.parquet'))
        if not os.path.exists(target):
            outcomes.append(process_file(path))
        else:
            logger.info(f"Skipping already processed file {path}")
    ok_count = sum(outcomes)
    logger.info(f"Processed {len(outcomes)} files, {ok_count} successful, {len(outcomes) - ok_count} failed")
    return all(outcomes)

# Run the function and capture the result
# `result` is True only if every attempted file converted successfully; it is
# also True when all files were skipped because their output already existed.
result = marc2parquet()
logger.info(f"marc2parquet result: {result}")

# Load Parquet
The main benefits of Parquet (like predicate pushdown and column pruning) are realized during file I/O.

# Load Parquet to Spark DataFrame

In [2]:
# Every institution's Parquet dump lives in the same directory; only the
# file name differs.
_PARQUET_DIR = '/home/jovyan/work/marc/parquet'


def _load_parquet(file_name):
    """Read one Parquet dump from the shared directory into a Spark DataFrame."""
    return spark.read.parquet(f'{_PARQUET_DIR}/{file_name}')


spark_df_penn = _load_parquet('penn-2022-07-20-full-marc21.parquet')
spark_df_brown = _load_parquet('brown-2022-06-14-full-marc21.parquet')
spark_df_chicago = _load_parquet('chicago-2022-06-22-full-marc21.parquet')
spark_df_columbia = _load_parquet('columbia.parquet')
spark_df_cornell = _load_parquet('cornell-2023-04-28-full-marc21.parquet')
spark_df_dartmouth = _load_parquet('dartmouth-2022-08-19-full-marc21.parquet')
spark_df_duke = _load_parquet('duke-2025-01-29-full-marc21.parquet_clean_rev.parquet')
spark_df_harvard = _load_parquet('harvard.parquet')
spark_df_jhu = _load_parquet('jhu-2023-08-23-full-marc21.parquet')
spark_df_mit = _load_parquet('mit-marc21.parquet')
spark_df_princeton = _load_parquet('princeton-2022-06-17-full-marc21.parquet')
spark_df_stanford = _load_parquet('stanford-2024-08-28-full-marc21.parquet')
spark_df_yale = _load_parquet('yale-2022-06-17-full-marc21.parquet')


# Import matching libraries

In [3]:
!pip install fuzzywuzzy
!pip install python-Levenshtein
%pip install langdetect


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Find unique by Matchkey and lccn / isbn
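The match key is built from (1) the normalized title (245), (2) the edition (250), and (3) the publication fields (260). It is used only when a record has neither an 010 (LCCN) nor an 020 (ISBN) value; otherwise the record is matched on those identifiers.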

In [4]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, BooleanType, ArrayType
from pyspark.sql.functions import udf, col, collect_set, explode
import re
import unicodedata
from langdetect import detect

# 1. Register each institution's DataFrame as a temp view named after the
#    institution, so the SQL below can refer to it by that name.
_views_by_name = {
    "penn": spark_df_penn,
    "brown": spark_df_brown,
    "chicago": spark_df_chicago,
    "columbia": spark_df_columbia,
    "cornell": spark_df_cornell,
    "dartmouth": spark_df_dartmouth,
    "duke": spark_df_duke,
    "harvard": spark_df_harvard,
    "jhu": spark_df_jhu,
    "mit": spark_df_mit,
    "princeton": spark_df_princeton,
    "stanford": spark_df_stanford,
    "yale": spark_df_yale,
}
for _view_name, _view_frame in _views_by_name.items():
    _view_frame.createOrReplaceTempView(_view_name)

# 2. Pull the matching fields out of each view: F010 (LCCN), F020 (ISBN),
#    F245 (title), F250 (edition) and F260 (publication/distribution), plus
#    F001 and F007. The columbia and harvard queries do not select F007.
_FIELDS_WITH_F007 = "F001, F007, F010, F020, F245, F250, F260"
_FIELDS_WITHOUT_F007 = "F001, F010, F020, F245, F250, F260"


def _select_fields(view_name, fields=_FIELDS_WITH_F007):
    """Run ``SELECT <fields> FROM <view_name>`` against the registered temp view."""
    return spark.sql(f"SELECT {fields} FROM {view_name}")


penn_245 = _select_fields("penn")
brown_245 = _select_fields("brown")
chicago_245 = _select_fields("chicago")
columbia_245 = _select_fields("columbia", _FIELDS_WITHOUT_F007)
cornell_245 = _select_fields("cornell")
dartmouth_245 = _select_fields("dartmouth")
duke_245 = _select_fields("duke")
harvard_245 = _select_fields("harvard", _FIELDS_WITHOUT_F007)
jhu_245 = _select_fields("jhu")
mit_245 = _select_fields("mit")
princeton_245 = _select_fields("princeton")
stanford_245 = _select_fields("stanford")
yale_245 = _select_fields("yale")

# 3. Normalize titles (F245), edition (F250), and publication/distribution (F260)
# Remove diacritics, punctuation, lowercase, and extra whitespace.
def normalize(text):
    """Normalize a MARC field value for use in a match key.

    Text detected as English has diacritics removed and every character
    other than ASCII letters, digits and whitespace stripped. Any other text
    (including text whose language cannot be detected) keeps its characters.
    In both cases the result is lowercased, trimmed and has runs of
    whitespace collapsed to a single space.

    Args:
        text: A string, a list of strings (joined with spaces; null entries
            are ignored), or None.

    Returns:
        The normalized string, or None when ``text`` is empty/None.
    """
    if not text:
        return None

    if isinstance(text, list):
        # Skip null entries: " ".join raises TypeError on None, which would
        # fail the whole Spark task.
        text = " ".join(part for part in text if part)

    try:
        # langdetect is non-deterministic unless seeded, which would let the
        # same title normalize differently between rows or runs and so break
        # key matching. The seed is set here rather than at module level so
        # that it also applies inside Spark's Python worker processes.
        from langdetect import DetectorFactory
        DetectorFactory.seed = 0
        lang = detect(text)
    except Exception:
        # Detection fails on empty/very short/ambiguous input; treat the
        # value as non-English and keep its characters.
        lang = "unknown"

    if lang == "en":
        # English: remove diacritics and punctuation.
        text = unicodedata.normalize('NFD', text)
        text = ''.join(c for c in text if not unicodedata.combining(c))
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    return re.sub(r'\s+', ' ', text.lower().strip())

normalize_udf = udf(normalize, StringType())


def normalized_df(df):
    """Return ``df`` with normalized copies of the title, edition and publication fields.

    Adds ``normalized_title`` (from F245), ``normalized_edition`` (from F250)
    and ``normalized_pub`` (from F260), each produced by ``normalize_udf``.
    """
    column_pairs = (
        ("normalized_title", "F245"),
        ("normalized_edition", "F250"),
        ("normalized_pub", "F260"),
    )
    enriched = df
    for target_name, marc_field in column_pairs:
        enriched = enriched.withColumn(target_name, normalize_udf(F.col(marc_field)))
    return enriched

# Add the normalized columns to every institution's frame; each name is
# rebound to its normalized version.
(
    penn_245, brown_245, chicago_245, columbia_245, cornell_245,
    dartmouth_245, duke_245, harvard_245, jhu_245, mit_245,
    princeton_245, stanford_245, yale_245,
) = [
    normalized_df(_frame)
    for _frame in (
        penn_245, brown_245, chicago_245, columbia_245, cornell_245,
        dartmouth_245, duke_245, harvard_245, jhu_245, mit_245,
        princeton_245, stanford_245, yale_245,
    )
]

# 4. Create a match key from the normalized title, edition, and publication/distribution.
def create_match_key(title, edition, pub):
    """Build the fallback match key from normalized title, edition and publication.

    The parts are joined with underscores; an empty/None edition or
    publication value is simply left out.

    Returns:
        The key string, or None when there is no title.
    """
    if not title:
        return None
    optional_parts = [part for part in (edition, pub) if part]
    return "_".join([title, *optional_parts])

match_key_udf = udf(create_match_key, StringType())


# 5. Tag every frame with its institution name and compute its match key.
def _with_source_and_key(df, source_name):
    """Return ``df`` plus a literal ``source`` column and a ``match_key`` column."""
    match_key = match_key_udf(
        F.col("normalized_title"),
        F.col("normalized_edition"),
        F.col("normalized_pub"),
    )
    return df.withColumn("source", F.lit(source_name)).withColumn("match_key", match_key)


penn_df = _with_source_and_key(penn_245, "penn")
brown_df = _with_source_and_key(brown_245, "brown")
chicago_df = _with_source_and_key(chicago_245, "chicago")
columbia_df = _with_source_and_key(columbia_245, "columbia")
cornell_df = _with_source_and_key(cornell_245, "cornell")
dartmouth_df = _with_source_and_key(dartmouth_245, "dartmouth")
duke_df = _with_source_and_key(duke_245, "duke")
harvard_df = _with_source_and_key(harvard_245, "harvard")
jhu_df = _with_source_and_key(jhu_245, "jhu")
mit_df = _with_source_and_key(mit_245, "mit")
princeton_df = _with_source_and_key(princeton_245, "princeton")
stanford_df = _with_source_and_key(stanford_245, "stanford")
yale_df = _with_source_and_key(yale_245, "yale")

# 6. Stack the per-institution frames into one DataFrame.
# NOTE(review): columbia_df and harvard_df are not part of this union; they
# were selected without F007, so their column layout differs from the rest.
all_df = penn_df
for _other_df in (brown_df, chicago_df, cornell_df, dartmouth_df, jhu_df,
                  mit_df, princeton_df, stanford_df, yale_df, duke_df):
    all_df = all_df.union(_other_df)

# 7. Create a helper UDF to extract ISBN and LCCN values as a trimmed array.
def get_ids(f010, f020):
    """Collect the trimmed, non-empty identifier strings from F010 and F020.

    Each argument may be a string, a list (non-string items are ignored) or
    anything else (ignored). F010 values come first, followed by F020 values.

    Returns:
        A list of stripped identifier strings; empty when none are present.
    """
    def _clean(value):
        # Normalize the input to a list of candidate strings, then trim and
        # drop the ones that end up empty.
        if isinstance(value, str):
            candidates = [value]
        elif isinstance(value, list):
            candidates = [item for item in value if isinstance(item, str)]
        else:
            candidates = []
        return [stripped for stripped in (c.strip() for c in candidates) if stripped]

    return _clean(f010) + _clean(f020)

get_ids_udf = udf(get_ids, ArrayType(StringType()))

# 8-9. Attach the identifier list (ISBN and/or LCCN) to every record, then
# choose the matching keys: the identifiers when there are any, otherwise a
# one-element array holding the match key.
_has_identifiers = F.size(F.col("id_list")) > 0
all_df = (
    all_df
    .withColumn("id_list", get_ids_udf(F.col("F010"), F.col("F020")))
    .withColumn(
        "key_array",
        F.when(_has_identifiers, F.col("id_list")).otherwise(F.array(F.col("match_key"))),
    )
)

# 10. One row per (record, key): a record with several identifiers appears
# once for each of them.
all_df_exploded = all_df.withColumn("key", explode("key_array"))

# 11. For every key, gather the distinct sources it occurs in and count its rows.
grouped = (
    all_df_exploded
    .groupBy("key")
    .agg(
        collect_set("source").alias("sources"),
        F.count("*").alias("record_count"),
    )
)

# 12. The institutions a key must appear in to count as "common".
all_sources = {
    "penn", "brown", "chicago", "cornell", "dartmouth", "duke",
    "jhu", "mit", "princeton", "stanford", "yale",
}


# 13. Predicate used to pick out the keys shared by every institution.
def has_all_sources(sources):
    """Return True when ``sources`` contains every name in ``all_sources``."""
    return all_sources.issubset(sources)

# Keep only the keys whose collected sources cover every entry of all_sources.
has_all_sources_udf = udf(has_all_sources, BooleanType())
common_fuzzy = grouped.filter(has_all_sources_udf(F.col("sources")))

# 14. Count common records.
# This counts keys (groups), not individual bibliographic records.
common_fuzzy_count = common_fuzzy.count()
print(f"Number of fuzzy common (all sources) records: {common_fuzzy_count}")

# 15. Derive unique records:
# Perform an anti-join to remove records that are common across all sources,
# then deduplicate based on the match_key so that only one copy of the same book remains.
# NOTE(review): the anti-join works on exploded rows, so a record with several
# identifiers is only dropped for the keys that are common; its rows for other
# keys remain -- confirm this is intended.
# NOTE(review): dropDuplicates keeps one arbitrary row per match_key, and rows
# with a null match_key presumably collapse into a single row -- confirm.
unique_exploded = all_df_exploded.join(common_fuzzy.select("key"), on="key", how="left_anti")
unique_df = unique_exploded.dropDuplicates(["match_key"])

# 16. Keep the surviving rows whose source is Penn (no per-source count is computed here).
unique_penn = unique_df.filter(F.col("source") == "penn")


# 17. Save results
# Three outputs: the unique records (F001, F245, source), the common keys with
# their sources and row counts, and the Penn-only subset with all columns.
unique_df.select("F001", "F245", "source").write.mode("overwrite") \
    .parquet("/home/jovyan/work/marc/unique_fuzzy_records.parquet")
common_fuzzy.select("key", "sources", "record_count").write.mode("overwrite") \
    .parquet("/home/jovyan/work/marc/common_fuzzy_records.parquet")
unique_penn.write.mode("overwrite").parquet("/home/jovyan/work/marc/unique_penn.parquet")

Number of fuzzy common (all sources) records: 49175


In [5]:
# Writes unique_penn to the same path, in overwrite mode, as the last
# statement of the previous cell.
# NOTE(review): this looks redundant and recomputes the whole pipeline -- confirm it is still needed.
unique_penn.write.mode("overwrite").parquet("/home/jovyan/work/marc/unique_penn.parquet")


# Analyze results

In [14]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.functions import col, substring, array_join, udf

# One-letter F007 code -> human-readable format description.
format_dict = dict(
    a="Map",
    c="Electronic resource",
    d="Globe",
    f="Tactile material",
    g="Projected graphic",
    h="Microform",
    k="Nonprojected graphic",
    m="Motion picture",
    o="Kit",
    q="Notated music",
    r="Remote-sensing image",
    s="Sound recording",
    t="Text",
    v="Videorecording",
    z="Unspecified",
)


def get_F007_desc(code):
    """Return the description for an F007 code, or "Unknown" if it is not in ``format_dict``."""
    if code in format_dict:
        return format_dict[code]
    return "Unknown"

# Spark UDF wrapper around get_F007_desc, returning a string column.
get_F007_desc_udf = udf(get_F007_desc, StringType())

# Read the parquet
# Reloading from disk means this cell does not depend on the in-memory
# unique_penn DataFrame built by the matching cell.
unique_penn = spark.read.parquet("/home/jovyan/work/marc/unique_penn.parquet")

def convert_array_columns_to_str(df, columns):
    """Replace each listed column with a ``<name>_str`` column.

    Array columns are joined into a single "; "-delimited string; any other
    column is only renamed. Names not present in ``df`` are skipped.

    Args:
        df: The input Spark DataFrame.
        columns: Column names to convert.

    Returns:
        The DataFrame with the converted/renamed columns.
    """
    for name in columns:
        if name not in df.columns:
            continue
        str_name = name + "_str"
        if isinstance(df.schema[name].dataType, ArrayType):
            joined = array_join(col(name), "; ")
            df = df.withColumn(str_name, joined).drop(name)
        else:
            df = df.withColumnRenamed(name, str_name)
    return df

# Columns that may hold arrays; they are flattened so they can be written as CSV.
columns_to_convert = ["F007", "F010", "F020", "F250", "F260", "id_list", "key_array"]  # Add others if needed
unique_penn = convert_array_columns_to_str(unique_penn, columns_to_convert)

# The first character of F007_str is the format code; look up its description.
unique_penn = (
    unique_penn
    .withColumn("F007_code", substring(col("F007_str"), 1, 1))
    .withColumn("F007_desc", get_F007_desc_udf(col("F007_code")))
)

# One CSV output per known format code.
# NOTE(review): rows whose F007_code is null or not in format_dict are not
# written to any of these outputs -- confirm that is intended.
for code, description in format_dict.items():
    file_stem = description.replace(' ', '_')
    (
        unique_penn
        .where(col("F007_code") == code)
        .write.mode("overwrite")
        .option("header", True)
        .csv(f"/home/jovyan/work/marc/unique_penn_{file_stem}.csv")
    )

print("Exported files per format code successfully.")

Exported files per format code successfully.


In [19]:
import glob
import os
import pandas as pd

# Format dictionary
# Same F007 code -> description mapping as in the CSV export cell; the
# descriptions are used below to rebuild the CSV folder names, so the two
# definitions must stay in sync.
format_dict = {
    'a': "Map",
    'c': "Electronic resource",
    'd': "Globe",
    'f': "Tactile material",
    'g': "Projected graphic",
    'h': "Microform",
    'k': "Nonprojected graphic",
    'm': "Motion picture",
    'o': "Kit",
    'q': "Notated music",
    'r': "Remote-sensing image",
    's': "Sound recording",
    't': "Text",
    'v': "Videorecording",
    'z': "Unspecified"
}

# Effective maximum number of data rows per sheet (leaving space for the header)
MAX_DATA_ROWS_PER_SHEET = 1048575

# Path to where CSV folders are located
base_path = "/home/jovyan/work/marc/"

for code, desc in format_dict.items():
    folder_name = f"unique_penn_{desc.replace(' ', '_')}.csv"
    csv_folder = os.path.join(base_path, folder_name)

    # Only include part files (ignore metadata like _SUCCESS).
    # Sorted so the row order in the workbook is the same on every run.
    csv_files = sorted(glob.glob(os.path.join(csv_folder, "part-*.csv")))
    if not csv_files:
        continue

    # Read and combine CSV partitions.
    # dtype=str keeps identifiers (control numbers, ISBNs, LCCNs) as text;
    # type inference would turn all-numeric columns into numbers and drop
    # leading zeros.
    # NOTE(review): on_bad_lines="skip" silently drops rows pandas cannot
    # parse -- confirm this data loss is acceptable.
    df_list = []
    for csv_file in csv_files:
        try:
            df_chunk = pd.read_csv(csv_file, engine="python", on_bad_lines="skip", dtype=str)
        except pd.errors.EmptyDataError:
            # A zero-byte part file has neither header nor rows; skip it.
            continue
        df_list.append(df_chunk)
    if not df_list:
        continue
    combined_df = pd.concat(df_list, ignore_index=True)

    # Determine the number of sheets needed. Always write at least one sheet:
    # a workbook with no sheets cannot be saved, so an empty frame becomes a
    # header-only sheet instead of an error.
    num_rows = len(combined_df)
    sheet_count = max(1, (num_rows + MAX_DATA_ROWS_PER_SHEET - 1) // MAX_DATA_ROWS_PER_SHEET)

    # Create an Excel file and split the DataFrame into sheets
    excel_filename = f"unique_penn_{desc.replace(' ', '_')}.xlsx"
    excel_path = os.path.join(base_path, excel_filename)

    with pd.ExcelWriter(excel_path) as writer:
        for i in range(sheet_count):
            start_idx = i * MAX_DATA_ROWS_PER_SHEET
            chunk_df = combined_df.iloc[start_idx:start_idx + MAX_DATA_ROWS_PER_SHEET, :]
            sheet_name = desc[:20] + f"_part{i+1}"  # Truncate sheet name if needed
            chunk_df.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"Finished writing Excel for format: {desc}")

Finished writing Excel for format: Map
Finished writing Excel for format: Electronic resource
Finished writing Excel for format: Globe
Finished writing Excel for format: Tactile material
Finished writing Excel for format: Projected graphic
Finished writing Excel for format: Microform
Finished writing Excel for format: Nonprojected graphic
Finished writing Excel for format: Motion picture
Finished writing Excel for format: Kit
Finished writing Excel for format: Notated music
Finished writing Excel for format: Remote-sensing image
Finished writing Excel for format: Sound recording
Finished writing Excel for format: Text
Finished writing Excel for format: Videorecording
Finished writing Excel for format: Unspecified
