In [1]:
import os
import pandas as pd
import pyspark
from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession 
import pyspark.sql.functions as f

# Step 1: Define Spark Configuration
conf = SparkConf() \
    .setAppName("PodProcessing") \
    .setMaster("local[4]") \
    .set("spark.executor.memory", "350g") \
    .set("spark.driver.memory", "350g")

# Step 2: Initialize SparkContext with the Configuration
sc = SparkContext(conf=conf)

# Step 3: Initialize SparkSession
spark = SparkSession(sc)

In [None]:
!pip3.11 install pymarc poetry marctable

In [None]:
!pip3.11 install --upgrade pip

In [2]:
import os

# Add the directory to PATH
os.environ['PATH'] += os.pathsep + '/home/jovyan/.local/bin'

# Initial load only

In [None]:
import os
import glob
import tempfile
import logging
from pymarc import MARCReader, Record, XMLReader

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def get_files():
    # Get a list of all MARC files
    files = glob.glob('/home/jovyan/work/newmarc/*.mrc_clean_rev.mrc', recursive=True)
    logger.info(f"Found {len(files)} marc files")
    return files

def process_file(file):
    logger.info(f"Processing file {file}")

    # Define the output directory for Parquet files
    output_dir = '/home/jovyan/work/marc/parquet'
    os.makedirs(output_dir, exist_ok=True)

    # Create a temporary file to store valid MARC records
    try:
        with tempfile.NamedTemporaryFile(delete=False) as temp:
            temp_file = temp.name
    except Exception as e:
        logger.error(f"Error creating temporary file for {file}: {e}")
        return False

    # Process the file in chunks
    try:
        with open(file, 'rb') as f_in, open(temp_file, 'wb') as temp_out:
            if file.endswith('.xml'):
                reader = XMLReader(f_in)
            else:
                reader = MARCReader(f_in, to_unicode=True, force_utf8=True, utf8_handling='replace')
            for record in reader:
                if not isinstance(record, Record):
                    raise ValueError("Invalid MARC record")
                temp_out.write(record.as_marc())
    except Exception as e:
        logger.error(f"Error processing file {file}: {e}")
        os.remove(temp_file)
        return False

    # Run the marctable command
    output_file = os.path.join(output_dir, os.path.basename(file).replace('.mrc', '.parquet'))
    logger.info(f"Running marctable command: marctable parquet {temp_file} {output_file}")
    exit_status = os.system(f'marctable parquet {temp_file} {output_file}')
    if exit_status != 0:
        logger.error(f"Error executing marctable command for file {file}")
        os.remove(temp_file)
        return False
    else:
        logger.info(f"Created Parquet file {output_file}")

    # Delete the temporary file
    os.remove(temp_file)

    return True

def marc2parquet():
    files = get_files()
    results = []

    for file in files:
        # Check if the corresponding Parquet file already exists
        output_file = os.path.join('/home/jovyan/work/marc/parquet', os.path.basename(file).replace('.mrc', '.parquet'))
        if os.path.exists(output_file):
            logger.info(f"Skipping already processed file {file}")
            continue

        result = process_file(file)
        results.append(result)

    successful_files = sum(results)
    logger.info(f"Processed {len(results)} files, {successful_files} successful, {len(results) - successful_files} failed")

    # Return True if all files were processed successfully, otherwise False
    return all(results)

# Run the function and capture the result
result = marc2parquet()
logger.info(f"marc2parquet result: {result}")

# Load Parquet
The main benefits of Parquet (like predicate pushdown and column pruning) are realized during file I/O.

In [3]:
spark_df_penn = spark.read.parquet('/home/jovyan/work/marc/parquet/penn-2022-07-20-full-marc21.parquet')
spark_df_brown = spark.read.parquet('/home/jovyan/work/marc/parquet/brown-2022-06-14-full-marc21.parquet')
spark_df_chicago = spark.read.parquet('/home/jovyan/work/marc/parquet/chicago-2022-06-22-full-marc21.parquet')
# df_columbia = spark.read.parquet('/home/jovyan/work/marc/parquet/columbia-2022-06-17-full-marc21.parquet')
spark_df_cornell = spark.read.parquet('/home/jovyan/work/marc/parquet/cornell-2023-04-28-full-marc21.parquet')
spark_df_dartmouth = spark.read.parquet('/home/jovyan/work/marc/parquet/dartmouth-2022-08-19-full-marc21.parquet')
spark_df_duke = spark.read.parquet('/home/jovyan/work/marc/parquet/duke-2025-01-29-full-marc21.parquet_clean_rev.parquet')
# spark_df_harvard = spark.read.parquet('/home/jovyan/work/marc/parquet/harvard-2022-06-17-full-marc21.parquet')
spark_df_jhu = spark.read.parquet('/home/jovyan/work/marc/parquet/jhu-2023-08-23-full-marc21.parquet')
spark_df_mit = spark.read.parquet('/home/jovyan/work/marc/parquet/mit-marc21.parquet')
spark_df_princeton = spark.read.parquet('/home/jovyan/work/marc/parquet/princeton-2022-06-17-full-marc21.parquet')
spark_df_stanford = spark.read.parquet('/home/jovyan/work/marc/parquet/stanford-2024-08-28-full-marc21.parquet')
spark_df_yale = spark.read.parquet('/home/jovyan/work/marc/parquet/yale-2022-06-17-full-marc21.parquet')


In [4]:
!pip install fuzzywuzzy
!pip install python-Levenshtein



# Check for unique records across libraries
Querying marc record fields like columns in a sql database.

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf
from fuzzywuzzy import fuzz

# Create temp views
spark_df_penn.createOrReplaceTempView("penn")
spark_df_brown.createOrReplaceTempView("brown")
spark_df_chicago.createOrReplaceTempView("chicago")
spark_df_cornell.createOrReplaceTempView("cornell")
spark_df_dartmouth.createOrReplaceTempView("dartmouth")
spark_df_jhu.createOrReplaceTempView("jhu")
spark_df_mit.createOrReplaceTempView("mit")
spark_df_princeton.createOrReplaceTempView("princeton")
spark_df_stanford.createOrReplaceTempView("stanford")
spark_df_yale.createOrReplaceTempView("yale")

# Query to get F245 field
penn_oclc = spark.sql("SELECT F245 FROM penn")
brown_oclc = spark.sql("SELECT F245 FROM brown")
chicago_oclc = spark.sql("SELECT F245 FROM chicago")
cornell_oclc = spark.sql("SELECT F245 FROM cornell")
dartmouth_oclc = spark.sql("SELECT F245 FROM dartmouth")
jhu_oclc = spark.sql("SELECT F245 FROM jhu")
mit_oclc = spark.sql("SELECT F245 FROM mit")
princeton_oclc = spark.sql("SELECT F245 FROM princeton")
stanford_oclc = spark.sql("SELECT F245 FROM stanford")
yale_oclc = spark.sql("SELECT F245 FROM yale")

# UDF for fuzzy similarity
fuzzy_match_udf = udf(lambda x, y: fuzz.token_set_ratio(x, y) if x and y else 0, IntegerType())

# Normalize titles
def normalize_df(df):
    return df.withColumn("normalized", F.lower(F.regexp_replace(F.col("F245"), r"[^a-zA-Z0-9\s]", "")))

penn_oclc = normalize_df(penn_oclc)
brown_oclc = normalize_df(brown_oclc)
chicago_oclc = normalize_df(chicago_oclc)
cornell_oclc = normalize_df(cornell_oclc)
dartmouth_oclc = normalize_df(dartmouth_oclc)
jhu_oclc = normalize_df(jhu_oclc)
mit_oclc = normalize_df(mit_oclc)
princeton_oclc = normalize_df(princeton_oclc)
stanford_oclc = normalize_df(stanford_oclc)
yale_oclc = normalize_df(yale_oclc)

# Exact common intersection
common_oclc = (
    penn_oclc.select("F245")
    .intersect(brown_oclc.select("F245"))
    .intersect(chicago_oclc.select("F245"))
    .intersect(cornell_oclc.select("F245"))
    .intersect(dartmouth_oclc.select("F245"))
    .intersect(jhu_oclc.select("F245"))
    .intersect(mit_oclc.select("F245"))
    .intersect(princeton_oclc.select("F245"))
    .intersect(stanford_oclc.select("F245"))
    .intersect(yale_oclc.select("F245"))
)

unique_penn = penn_oclc.select("F245").subtract(common_oclc)
unique_brown = brown_oclc.select("F245").subtract(common_oclc)
unique_chicago = chicago_oclc.select("F245").subtract(common_oclc)
unique_cornell = cornell_oclc.select("F245").subtract(common_oclc)
unique_dartmouth = dartmouth_oclc.select("F245").subtract(common_oclc)
unique_jhu = jhu_oclc.select("F245").subtract(common_oclc)
unique_mit = mit_oclc.select("F245").subtract(common_oclc)
unique_princeton = princeton_oclc.select("F245").subtract(common_oclc)
unique_stanford = stanford_oclc.select("F245").subtract(common_oclc)
unique_yale = yale_oclc.select("F245").subtract(common_oclc)

print(f"Number of common records (exact): {common_oclc.count()}")
print(f"Number of unique records in Penn: {unique_penn.count()}")
print(f"Number of unique records in Brown: {unique_brown.count()}")
print(f"Number of unique records in Chicago: {unique_chicago.count()}")
print(f"Number of unique records in Cornell: {unique_cornell.count()}")
print(f"Number of unique records in Dartmouth: {unique_dartmouth.count()}")
print(f"Number of unique records in JHU: {unique_jhu.count()}")
print(f"Number of unique records in MIT: {unique_mit.count()}")
print(f"Number of unique records in Princeton: {unique_princeton.count()}")
print(f"Number of unique records in Stanford: {unique_stanford.count()}")
print(f"Number of unique records in Yale: {unique_yale.count()}")

# Example fuzzy matching for Penn vs Brown (repeat for others as necessary):
threshold = 85
common_fuzzy_penn_brown = (
    penn_oclc.alias("p")
    .crossJoin(brown_oclc.alias("b"))
    .withColumn("similarity", fuzzy_match_udf(F.col("p.normalized"), F.col("b.normalized")))
    .filter(F.col("similarity") >= threshold)
)
print(f"Fuzzy common count Penn-Brown: {common_fuzzy_penn_brown.count()}")

# Fuzzy match on title
Query marc records for similar titles using fuzzy matching.

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, BooleanType
from pyspark.sql.functions import udf, col, collect_set
import re

# 1. Create temp views
spark_df_penn.createOrReplaceTempView("penn")
spark_df_brown.createOrReplaceTempView("brown")
spark_df_chicago.createOrReplaceTempView("chicago")
#spark_df_columbia.createOrReplaceTempView("columbia")
spark_df_cornell.createOrReplaceTempView("cornell")
spark_df_dartmouth.createOrReplaceTempView("dartmouth")
spark_df_duke.createOrReplaceTempView("duke")
#spark_df_harvard.createOrReplaceTempView("harvard")
spark_df_jhu.createOrReplaceTempView("jhu")
spark_df_mit.createOrReplaceTempView("mit")
spark_df_princeton.createOrReplaceTempView("princeton")
spark_df_stanford.createOrReplaceTempView("stanford")
spark_df_yale.createOrReplaceTempView("yale")

# 2. Query to get 245 field from each table/view and drop duplicates
penn_245 = spark.sql("SELECT F001, F245, F007 FROM penn").dropDuplicates(["F245"])
brown_245 = spark.sql("SELECT F001, F245, F007 FROM brown").dropDuplicates(["F245"])
chicago_245 = spark.sql("SELECT F001, F245, F007 FROM chicago").dropDuplicates(["F245"])
# columbia_245 = spark.sql("SELECT F001, F245 FROM columbia").dropDuplicates(["F245"])
cornell_245 = spark.sql("SELECT F001, F245, F007 FROM cornell").dropDuplicates(["F245"])
dartmouth_245 = spark.sql("SELECT F001, F245, F007 FROM dartmouth").dropDuplicates(["F245"])
duke_245 = spark.sql("SELECT F001, F245, F007 FROM duke").dropDuplicates(["F245"])
# harvard_245 = spark.sql("SELECT F001, F245 FROM harvard").dropDuplicates(["F245"])
jhu_245 = spark.sql("SELECT F001, F245, F007 FROM jhu").dropDuplicates(["F245"])
mit_245 = spark.sql("SELECT F001, F245, F007 FROM mit").dropDuplicates(["F245"])
princeton_245 = spark.sql("SELECT F001, F245, F007 FROM princeton").dropDuplicates(["F245"])
stanford_245 = spark.sql("SELECT F001, F245, F007 FROM stanford").dropDuplicates(["F245"])
yale_245 = spark.sql("SELECT F001, F245, F007 FROM yale").dropDuplicates(["F245"])

# 3. Normalize titles: remove punctuation, lowercase, etc.
def normalize(text):
    if text:
        return re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()
    else:
        return None

normalize_udf = udf(normalize, StringType())

def normalized_df(df):
    return df.withColumn("normalized", normalize_udf(F.col("F245")))

penn_245 = normalized_df(penn_245)
brown_245 = normalized_df(brown_245)
chicago_245 = normalized_df(chicago_245)
# columbia_245 = normalized_df(columbia_245)
cornell_245 = normalized_df(cornell_245)
dartmouth_245 = normalized_df(dartmouth_245)
duke_245 = normalized_df(duke_245)
# harvard_245 = normalized_df(harvard_245)
jhu_245 = normalized_df(jhu_245)
mit_245 = normalized_df(mit_245)
princeton_245 = normalized_df(princeton_245)
stanford_245 = normalized_df(stanford_245)
yale_245 = normalized_df(yale_245)

# 4. Create a simple fingerprint by sorting tokens
def fingerprint(text):
    if text:
        tokens = text.split()
        tokens.sort()
        return " ".join(tokens)
    return None

fingerprint_udf = udf(fingerprint, StringType())

# 5. Add source columns and fingerprint column
penn_df = penn_245.withColumn("source", F.lit("penn")).withColumn("fp", fingerprint_udf(F.col("normalized")))
brown_df = brown_245.withColumn("source", F.lit("brown")).withColumn("fp", fingerprint_udf(F.col("normalized")))
chicago_df = chicago_245.withColumn("source", F.lit("chicago")).withColumn("fp", fingerprint_udf(F.col("normalized")))
# columbia_df = columbia_245.withColumn("source", F.lit("columbia")).withColumn("fp", fingerprint_udf(F.col("normalized")))
cornell_df = cornell_245.withColumn("source", F.lit("cornell")).withColumn("fp", fingerprint_udf(F.col("normalized")))
dartmouth_df = dartmouth_245.withColumn("source", F.lit("dartmouth")).withColumn("fp", fingerprint_udf(F.col("normalized")))
duke_df = duke_245.withColumn("source", F.lit("duke")).withColumn("fp", fingerprint_udf(F.col("normalized")))
# harvard_df = harvard_245.withColumn("source", F.lit("harvard")).withColumn("fp", fingerprint_udf(F.col("normalized")))
jhu_df = jhu_245.withColumn("source", F.lit("jhu")).withColumn("fp", fingerprint_udf(F.col("normalized")))
mit_df = mit_245.withColumn("source", F.lit("mit")).withColumn("fp", fingerprint_udf(F.col("normalized")))
princeton_df = princeton_245.withColumn("source", F.lit("princeton")).withColumn("fp", fingerprint_udf(F.col("normalized")))
stanford_df = stanford_245.withColumn("source", F.lit("stanford")).withColumn("fp", fingerprint_udf(F.col("normalized")))
yale_df = yale_245.withColumn("source", F.lit("yale")).withColumn("fp", fingerprint_udf(F.col("normalized")))

# 6. Union all DataFrames
all_df = (penn_df.union(brown_df).union(chicago_df).union(cornell_df)
          .union(dartmouth_df).union(jhu_df).union(mit_df).union(princeton_df)
          .union(stanford_df).union(yale_df).union(duke_df))

# 7. Group by fingerprint and collect the sources where that fingerprint appears
grouped = all_df.groupBy("fp").agg(
    collect_set("source").alias("sources"),
    F.count("*").alias("record_count")
)

# 8. Define the list of all sources
all_sources = {"penn", "brown", "chicago", "cornell", "dartmouth",
               "duke", "jhu", "mit", "princeton", "stanford", "yale"}

# 9. Identify fuzzy common records
def has_all_sources(sources):
    return set(sources) == all_sources

has_all_sources_udf = udf(has_all_sources, BooleanType())
common_fuzzy = grouped.filter(has_all_sources_udf(F.col("sources")))

# 10. Count fuzzy common records
common_fuzzy_count = common_fuzzy.count()
print(f"Number of fuzzy common (all sources) records: {common_fuzzy_count}")

# 11. Derive unique records (records not in common fuzzy groups)
unique_df = all_df.join(common_fuzzy.select("fp"), on="fp", how="left_anti")

# 12. Count unique records per source
unique_penn = unique_df.filter(F.col("source") == "penn")
print(f"Number of unique fuzzy records in Penn: {unique_penn.count()}")

unique_brown = unique_df.filter(F.col("source") == "brown")
print(f"Number of unique fuzzy records in Brown: {unique_brown.count()}")

unique_chicago = unique_df.filter(F.col("source") == "chicago")
print(f"Number of unique fuzzy records in Chicago: {unique_chicago.count()}")

unique_cornell = unique_df.filter(F.col("source") == "cornell")
print(f"Number of unique fuzzy records in Cornell: {unique_cornell.count()}")

unique_dartmouth = unique_df.filter(F.col("source") == "dartmouth")
print(f"Number of unique fuzzy records in Dartmouth: {unique_dartmouth.count()}")

unique_jhu = unique_df.filter(F.col("source") == "jhu")
print(f"Number of unique fuzzy records in JHU: {unique_jhu.count()}")

unique_mit = unique_df.filter(F.col("source") == "mit")
print(f"Number of unique fuzzy records in MIT: {unique_mit.count()}")

unique_princeton = unique_df.filter(F.col("source") == "princeton")
print(f"Number of unique fuzzy records in Princeton: {unique_princeton.count()}")

unique_stanford = unique_df.filter(F.col("source") == "stanford")
print(f"Number of unique fuzzy records in Stanford: {unique_stanford.count()}")

unique_yale = unique_df.filter(F.col("source") == "yale")
print(f"Number of unique fuzzy records in Yale: {unique_yale.count()}")

#unique_columbia = unique_df.filter(F.col("source") == "columbia")
#print(f"Number of unique fuzzy records in Columbia: {unique_columbia.count()}")

unique_duke = unique_df.filter(F.col("source") == "duke")
print(f"Number of unique fuzzy records in Duke: {unique_duke.count()}")

#unique_harvard = unique_df.filter(F.col("source") == "harvard")
#print(f"Number of unique fuzzy records in Harvard: {unique_harvard.count()}")

# 13. Save the unique records to a file
unique_df.select("F001", "F245", "source").write.mode("overwrite") \
  .parquet("/home/jovyan/work/marc/unique_fuzzy_records.parquet")

# 14. Save the common fuzzy records to a file
common_fuzzy.select("fp", "sources", "record_count").write.mode("overwrite") \
  .parquet("/home/jovyan/work/marc/common_fuzzy_records.parquet")

# 15. Save the unique records per source to a file
unique_penn.write.mode("overwrite").parquet("/home/jovyan/work/marc/unique_penn.parquet")
unique_brown.write.mode("overwrite").parquet("/home/jovyan/work/marc/unique_brown.parquet")
unique_chicago.write.mode("overwrite").parquet("/home/jovyan/work/marc/unique_chicago.parquet")
unique_cornell.write.mode("overwrite").parquet("/home/jovyan/work/marc/unique_cornell.parquet")
unique_dartmouth.write.mode("overwrite").parquet("/home/jovyan/work/marc/unique_dartmouth.parquet")
unique_jhu.write.mode("overwrite").parquet("/home/jovyan/work/marc/unique_jhu.parquet")
unique_mit.write.mode("overwrite").parquet("/home/jovyan/work/marc/unique_mit.parquet")
unique_princeton.write.mode("overwrite").parquet("/home/jovyan/work/marc/unique_princeton.parquet")
unique_stanford.write.mode("overwrite").parquet("/home/jovyan/work/marc/unique_stanford.parquet")
unique_yale.write.mode("overwrite").parquet("/home/jovyan/work/marc/unique_yale.parquet")
# unique_columbia.write.mode("overwrite").parquet("/home/jovyan/work/marc/unique_columbia.parquet")
unique_duke.write.mode("overwrite").parquet("/home/jovyan/work/marc/unique_duke.parquet")
# unique_harvard.write.mode("overwrite").parquet("/home/jovyan/work/marc/unique_harvard.parquet")

In [8]:
# analyze the unique records from penn
unique_penn = spark.read.parquet('/home/jovyan/work/marc/unique_penn.parquet')

# Create temp view (if needed)
unique_penn.createOrReplaceTempView("unique_penn")

# Exclude records where the first character of F007 is "c" (electronic resources)
unique_print_penn = unique_penn.filter(F.substring(F.col("F007"), 1, 1) != "c")

# Now select the rarest titles (ordering by F245 here is only illustrative; adjust as needed)
rarest_titles = unique_print_penn.select("F001", "F245").orderBy(F.col("F245").desc())
rarest_titles.show(10)

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `F007` cannot be resolved. Did you mean one of the following? [`F001`, `F245`, `fp`, `source`, `normalized`].;
'Filter NOT (substring('F007, 1, 1) = c)
+- Relation [fp#8773,F001#8774,F245#8775,normalized#8776,source#8777] parquet
