In [None]:
import os
import pandas as pd
import pyspark
from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession 
from pyspark.ml.fpm import FPGrowth
import pyspark.sql.functions as f

# Step 1: Define Spark Configuration
# Local mode with 4 worker threads. The memory settings are only read when the
# JVM is first launched, so restart the kernel after changing them.
conf = (
    SparkConf()
    .setAppName("PodProcessing")
    .setMaster("local[4]")
    .set("spark.executor.memory", "300g")
    .set("spark.driver.memory", "300g")
)

# Step 2: Initialize SparkContext with the Configuration
# getOrCreate() reuses a context that is already running, so re-executing this
# cell no longer fails with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

# Step 3: Initialize SparkSession
# The builder attaches to the context above (or to an existing session) instead
# of constructing a second session object on every run.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Now you can continue with your Spark operations

In [None]:
!pip3.11 install pymarc

In [None]:
!pip3.11 install "numpy<2"

In [None]:
!pip3.11 install poetry

In [None]:
!pip3.11 install --upgrade pip

In [None]:
!pip3.11 install marctable

In [None]:
import os

# Add the directory to PATH
# (presumably where user-level pip installs place command-line scripts — confirm).
# Only append when the entry is missing so re-running the cell does not keep
# growing PATH, and cope with PATH being unset.
local_bin_dir = '/home/jovyan/.local/bin'
current_path = os.environ.get('PATH', '')
if local_bin_dir not in current_path.split(os.pathsep):
    os.environ['PATH'] = (
        current_path + os.pathsep + local_bin_dir if current_path else local_bin_dir
    )

In [None]:
%pip install --upgrade pymarc

# Initial load only

In [None]:
import os
import glob
import tempfile
import logging
from pymarc import MARCReader, Record

# Logging setup: INFO level and above, each line prefixed with timestamp and level
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def get_files(pattern='/home/jovyan/work/stanford-2024-08-28-full-marc21.mrc'):
    """Return the MARC files matching ``pattern``.

    Args:
        pattern: Glob pattern (or a plain path) of the files to convert.
            ``**`` is honoured because the glob is recursive. Defaults to the
            single file this notebook was written for.

    Returns:
        Sorted list of matching paths; empty when nothing matches.
    """
    # Sorted so the processing order is the same on every run.
    files = sorted(glob.glob(pattern, recursive=True))
    logger.info(f"Found {len(files)} marc files")
    return files

def process_file(file):
    """Convert one MARC file to Parquet with the ``marctable`` command-line tool.

    The records of ``file`` are first re-serialised into a temporary binary MARC
    file, which is then handed to ``marctable parquet``. Paths ending in
    ``.xml`` are parsed as MARCXML; everything else is read as binary MARC.

    Args:
        file: Path of the input MARC file.

    Returns:
        True if the Parquet file was created, False if any step failed. Failures
        are logged, not raised.
    """
    # Imported locally so this function carries everything it newly depends on.
    import subprocess
    from pymarc import map_xml

    logger.info(f"Processing file {file}")

    # Define the output directory for Parquet files
    output_dir = '/home/jovyan/work/marc/parquet'
    os.makedirs(output_dir, exist_ok=True)

    # Create a temporary file to hold the re-serialised records
    try:
        with tempfile.NamedTemporaryFile(delete=False) as temp:
            temp_file = temp.name
    except Exception as e:
        logger.error(f"Error creating temporary file for {file}: {e}")
        return False

    try:
        # Stream the records into the temporary file one at a time
        try:
            with open(file, 'rb') as f_in, open(temp_file, 'wb') as temp_out:
                if file.endswith('.xml'):
                    # map_xml calls the function once per parsed record.
                    map_xml(lambda record: temp_out.write(record.as_marc()), f_in)
                else:
                    for record in MARCReader(f_in):
                        if not isinstance(record, Record):
                            raise ValueError("Invalid MARC record")
                        temp_out.write(record.as_marc())
        except Exception as e:
            logger.error(f"Error processing file {file}: {e}")
            return False

        # Swap the input extension for .parquet (same result as before for *.mrc)
        stem = os.path.splitext(os.path.basename(file))[0]
        output_file = os.path.join(output_dir, stem + '.parquet')
        output_existed = os.path.exists(output_file)

        # Run the marctable command. An argument list (no shell) keeps paths
        # containing spaces or shell metacharacters intact.
        logger.info(f"Running marctable command: marctable parquet {temp_file} {output_file}")
        try:
            exit_status = subprocess.run(['marctable', 'parquet', temp_file, output_file]).returncode
        except OSError as e:
            # e.g. the marctable executable is not on PATH
            logger.error(f"Error executing marctable command for file {file}: {e}")
            return False

        if exit_status != 0:
            logger.error(f"Error executing marctable command for file {file}")
            # Drop output created by this failed run so the exists-check in
            # marc2parquet does not later treat the file as already processed.
            if not output_existed and os.path.isfile(output_file):
                os.remove(output_file)
            return False

        logger.info(f"Created Parquet file {output_file}")
        return True
    finally:
        # Delete the temporary file on every exit path
        if os.path.exists(temp_file):
            os.remove(temp_file)

def marc2parquet():
    """Convert every file returned by ``get_files()`` to Parquet.

    Files whose Parquet output already exists are skipped. A summary of
    processed, successful, failed and skipped files is logged.

    Returns:
        True if every file that was actually processed succeeded (also True
        when nothing needed processing), otherwise False.
    """
    output_dir = '/home/jovyan/work/marc/parquet'
    files = get_files()
    if not files:
        # Make the "nothing to do" case visible instead of a silent success.
        logger.warning("No MARC files found; nothing to convert")

    results = []
    skipped = 0

    for file in files:
        # Check if the corresponding Parquet file already exists
        stem = os.path.splitext(os.path.basename(file))[0]
        output_file = os.path.join(output_dir, stem + '.parquet')
        if os.path.exists(output_file):
            logger.info(f"Skipping already processed file {file}")
            skipped += 1
            continue

        results.append(process_file(file))

    successful_files = sum(results)
    logger.info(
        f"Processed {len(results)} files, {successful_files} successful, "
        f"{len(results) - successful_files} failed, {skipped} skipped"
    )

    # Return True if all files were processed successfully, otherwise False
    return all(results)

# Run the conversion when this cell executes. `result` is the boolean returned by
# marc2parquet(): True only if every file it processed was converted successfully.
result = marc2parquet()
logger.info(f"marc2parquet result: {result}")

# Load Parquet

In [None]:
# Directory holding the per-library Parquet files
PARQUET_DIR = '/home/jovyan/work/marc/parquet'

spark_df_penn = spark.read.parquet(f'{PARQUET_DIR}/penn-2022-07-20-full-marc21.parquet')
spark_df_brown = spark.read.parquet(f'{PARQUET_DIR}/brown-2022-06-14-full-marc21.parquet')
spark_df_chicago = spark.read.parquet(f'{PARQUET_DIR}/chicago-2022-06-22-full-marc21.parquet')
# Fixed: the closing quote was missing here, which made the whole cell a SyntaxError.
spark_df_dartmouth = spark.read.parquet(f'{PARQUET_DIR}/dartmouth-2022-08-19-full-marc21.parquet')
# Fixed: this previously re-read the Chicago file, so "cornell" was a copy of Chicago.
# NOTE(review): the exact Cornell file name is not known from this notebook, so a
# glob is used; it reads every matching file — replace with the exact name.
spark_df_cornell = spark.read.parquet(f'{PARQUET_DIR}/cornell-*-full-marc21.parquet')
spark_df_jhu = spark.read.parquet(f'{PARQUET_DIR}/jhu-2023-08-23-full-marc21.parquet')
spark_df_mit = spark.read.parquet(f'{PARQUET_DIR}/mit-marc21.parquet')
spark_df_princeton = spark.read.parquet(f'{PARQUET_DIR}/princeton-2022-06-17-full-marc21.parquet')
spark_df_yale = spark.read.parquet(f'{PARQUET_DIR}/yale-2022-06-17-full-marc21.parquet')


# Check for unique records across libraries

In [None]:
# Using Spark SQL, select one MARC field from every library and compare the
# results: how many distinct values are common to all libraries, and how many
# are unique to each library (held by that library and by no other).

# Column compared across libraries.
# NOTE(review): the variable names say "oclc" (which suggests F035), but the
# queries select F245 — confirm which field is intended; change it here only.
MATCH_FIELD = "F245"

# view name -> (label used in the printed report, source DataFrame)
libraries = {
    "penn": ("Penn", spark_df_penn),
    "brown": ("Brown", spark_df_brown),
    "chicago": ("Chicago", spark_df_chicago),
    "cornell": ("Cornell", spark_df_cornell),
    "jhu": ("JHU", spark_df_jhu),
    "mit": ("MIT", spark_df_mit),
    "princeton": ("Princeton", spark_df_princeton),
    "yale": ("Yale", spark_df_yale),
}

# Register each DataFrame as a temp view and select the comparison field
field_values = {}
for view_name, (_, frame) in libraries.items():
    frame.createOrReplaceTempView(view_name)
    field_values[view_name] = spark.sql(f"SELECT {MATCH_FIELD} FROM {view_name}")

# Keep the per-library names available to later cells (dicts preserve order)
(penn_oclc, brown_oclc, chicago_oclc, cornell_oclc,
 jhu_oclc, mit_oclc, princeton_oclc, yale_oclc) = field_values.values()

# Values present in every library
common_oclc = None
for values in field_values.values():
    common_oclc = values if common_oclc is None else common_oclc.intersect(values)

# Values unique to each library: subtract the union of all OTHER libraries.
# (Subtracting only `common_oclc`, as before, also counted values shared by
# some-but-not-all libraries, which is not "unique".)
unique_values = {}
for view_name, values in field_values.items():
    others = None
    for other_name, other_values in field_values.items():
        if other_name == view_name:
            continue
        others = other_values if others is None else others.union(other_values)
    unique_values[view_name] = values.subtract(others)

(unique_penn, unique_brown, unique_chicago, unique_cornell,
 unique_jhu, unique_mit, unique_princeton, unique_yale) = unique_values.values()

# Each count() launches a Spark job, so run every count exactly once
common_count = common_oclc.count()
unique_counts = {name: frame.count() for name, frame in unique_values.items()}

# print the results
print(f"Number of common records: {common_count}")
for view_name, (label, _) in libraries.items():
    print(f"Number of unique records in {label}: {unique_counts[view_name]}")






# Some trial SQL queries (older)

In [None]:

# Register a DataFrame as the SQL view "df" and select its `001` column.
# NOTE(review): `spark_df` is not assigned in any cell shown in this notebook —
# confirm which DataFrame it should refer to before running on a fresh kernel.
spark_df.createOrReplaceTempView("df")
df_001 = spark.sql("SELECT `001` FROM df")
# Are there any null values? Counts the rows whose `001` is null.
df_001.filter(df_001['001'].isNull()).count()
# A result of 0 means every record has an `001` value.


In [None]:
# Rough count of Portuguese-language items: rows of the "df" view whose `008`
# value contains the substring "por" anywhere.
portuguese_sql = "SELECT * FROM df WHERE `008` LIKE '%por%'"
por = spark.sql(portuguese_sql)
count_por = por.count()
print(count_por)
