In [1]:
import os
import shutil

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, from_json, transform, regexp_extract, when, lit, floor, concat, count
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, ArrayType

In [2]:
# Display the kernel's current working directory.
os.getcwd()

'/home/msds2025/jvalera/bdcc2025/bdcc-lab-openlib/notebooks'

In [3]:
# Switch the working directory to the repository folder; a later cell builds
# its output path from os.getcwd().
# NOTE(review): absolute, user-specific path -- the notebook will not run
# unchanged from another checkout; confirm whether this should be configurable.
os.chdir('/home/msds2025/jvalera/bdcc2025/bdcc-lab-openlib/')

In [4]:
def human_readable_size(size_bytes):
    """Format a byte count as a string with two decimals and a binary unit.

    The value is divided by 1024 until it drops below 1024 or the largest
    supported unit (PB) is reached, so a string is always returned.

    Args:
        size_bytes: Size in bytes (int or float).

    Returns:
        A string such as ``"41.89 GB"``.
    """
    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
    size = float(size_bytes)
    for unit in units[:-1]:
        if size < 1024:
            return f"{size:.2f} {unit}"
        size /= 1024
    # Anything that reaches this point is at least 1024 TB: report it in the
    # largest unit instead of falling off the end and returning None.
    return f"{size:.2f} {units[-1]}"

In [5]:
# Report the on-disk size of the compressed Open Library dump.
file_path = "/mnt/data/public/openlibrary/20250426/ol_cdump_latest.txt.gz"
file_size_bytes = os.stat(file_path).st_size
_size_label = human_readable_size(file_size_bytes)
print(f"ol_cdump_latest.txt.gz: {_size_label}")

ol_cdump_latest.txt.gz: 41.89 GB


# Initialize Spark

In [6]:
# Create (or reuse) the notebook's Spark session, running in local mode.
_session_builder = SparkSession.builder.appName("Open Library Dataframes Creation")
spark = _session_builder.master('local[*]').getOrCreate()

# Prepare Dataframes

In [7]:
# Lift pandas' display limits on column width and column count so the
# toPandas() previews below are not truncated.
for _display_option in ("display.max_colwidth", "display.max_columns"):
    pd.set_option(_display_option, None)

In [8]:
# Load the compressed dump as a text DataFrame; each line lands in the
# single "value" column that the next cell splits.
file_path = "/mnt/data/public/openlibrary/20250426/ol_cdump_latest.txt.gz"
df = spark.read.format("text").load(file_path)

In [9]:
# Show how many partitions back the raw text DataFrame.
# NOTE(review): the recorded output is 1 -- presumably because the input is a
# single .gz file; confirm before relying on parallelism in later stages.
df.rdd.getNumPartitions()

1

In [10]:
# Each dump line is tab-separated: record type, key, revision, timestamp and
# a JSON payload. The split is capped at 5 parts so that a tab occurring
# inside the trailing JSON payload stays in json_data instead of cutting it
# short at fields[4].
df_split = (df
            .select(split(col("value"), "\t", 5).alias("fields"))
            .select(
                    col("fields")[0].alias("record_type"),
                    col("fields")[1].alias("key"),
                    col("fields")[2].cast(IntegerType()).alias("revision"),
                    col("fields")[3].cast(TimestampType()).alias("timestamp"),
                    col("fields")[4].alias("json_data")
            )
           )

# Preview the first ten parsed rows.
df_split.limit(10).toPandas()

Unnamed: 0,record_type,key,revision,timestamp,json_data
0,/type/author,/authors/OL5345273A,1,2008-10-05 09:14:51.676778,"{""name"": ""TOM.KIYIHN"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-05T09:14:51.676778""}, ""key"": ""/authors/OL5345273A"", ""type"": {""key"": ""/type/author""}, ""revision"": 1}"
1,/type/author,/authors/OL5345271A,1,2008-10-04 15:55:36.974594,"{""name"": ""Aksakov, S. T."", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-04T15:55:36.974594""}, ""key"": ""/authors/OL5345271A"", ""type"": {""key"": ""/type/author""}, ""revision"": 1}"
2,/type/author,/authors/OL5345272A,1,2008-10-04 16:04:08.619520,"{""name"": ""Latham,Barbarahttp://webcat.camosun.bc.ca/cambooks/njpm/tableof.htm"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-04T16:04:08.61952""}, ""key"": ""/authors/OL5345272A"", ""type"": {""key"": ""/type/author""}, ""revision"": 1}"
3,/type/edition,/books/OL17806216M,1,2008-10-01 06:49:20.851055,"{""table_of_contents"": [{""type"": ""/type/text"", ""value"": ""Gender as a category of analysis in vernacular architect studies / Angel Kwolek-Folland""}, {""type"": ""/type/text"", ""value"": ""\""I'm a lumberjack and I'm ok\"" : the built environment and varied masculinities in the industrial age / Deryck W. Holdsworth""}, {""type"": ""/type/text"", ""value"": ""The Masonic lodge room, 1870-1930 : a sacred space of masculine spiritual hierarchy / William D. Moore""}, {""type"": ""/type/text"", ""value"": ""Grammar codes, and performance : linguistic and sociolinguistic models in the study of vernacular architecture / Michael Ann Williams and M. Jane Young""}, {""type"": ""/type/text"", ""value"": ""Building an urban identity : the clustered spires of Frederick, Maryland / Diane Shaw""}, {""type"": ""/type/text"", ""value"": ""\""A school house well arranged\"" : Baltimore public school buildings on the Lancasterian plan, 1829-1839 / Peter E. Kurtze""}, {""type"": ""/type/text"", ""value"": ""Letting in \""the world\"" : (re)interpretive tensions in the Quaker meeting house / Susan Garfinkel""}, {""type"": ""/type/text"", ""value"": ""Man\u0303ana, man\u0303ana : racial stereotypes and the Anglo rediscovery of the Southwest's vernacular architecture, 1890-1920 / Abigail A. Van Slyck""}, {""type"": ""/type/text"", ""value"": ""Association, residence, and shop : an appropriation of commercial blocks in North American Chinatowns / Christopher L. Yip""}, {""type"": ""/type/text"", ""value"": ""\""Snug li'l house with flue and oven\"" : nineteenth-century reforms in plantation slave housing / John Michael Vlach""}, {""type"": ""/type/text"", ""value"": ""Cheap and tasteful dwellings in popular architecture / Jan Jennings""}, {""type"": ""/type/text"", ""value"": ""Cheap, quick, and easy, part II : pressed metal ceilings, 1880-1930 / Pamela H. 
Simpson""}, {""type"": ""/type/text"", ""value"": ""The Eichler home : intention and experience in postwar suburbia / Annmarie Adams""}, {""type"": ""/type/text"", ""value"": ""Rural adaptations of suburban bungalows, Sussex County, Delaware / Susan Mulchahey Chase""}, {""type"": ""/type/text"", ""value"": ""Building in stone in southwestern Pennyslvania : patterns and process / Karen Koegler""}, {""type"": ""/type/text"", ""value"": ""Private dwellings, public ways, and the landscape of early rural capitalism in Virginia's Shenandoah Valley / Warren R. Hofstra""}, {""type"": ""/type/text"", ""value"": ""The architectural and social topography of early-nineteenth-century Portsmouth, New Hampshire / Bernard L. Herman""}, {""type"": ""/type/text"", ""value"": ""From roadside camps to garden homes : housing and community planning for California's migrant work force, 1935-1941 / Greg Hise.""}], ""series"": [""Perspectives in vernacular architecture -- 5."", ""Perspectives in vernacular architecture (Knoxville, Tenn.) -- 5.""], ""lc_classifications"": [""NA705 .G36 1995""], ""contributions"": [""Cromley, Elizabeth C."", ""Hudgins, Carter L.""], ""edition_name"": ""1st ed."", ""title"": ""Gender, class, and shelter"", ""languages"": [{""key"": ""/languages/eng""}], ""subjects"": [""Vernacular architecture -- United States."", ""Architecture and society -- United States.""], ""publish_country"": ""tnu"", ""by_statement"": ""edited by Elizabeth Collins Cromley and Carter L. Hudgins."", ""type"": {""key"": ""/type/edition""}, ""revision"": 1, ""publishers"": [""University of Tennessee Press""], ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-01T06:49:20.851055""}, ""key"": ""/books/OL17806216M"", ""publish_places"": [""Knoxville""], ""pagination"": ""xiv, 272 p. :"", ""dewey_decimal_class"": [""720/.1/060973""], ""notes"": {""type"": ""/type/text"", ""value"": ""Includes bibliographical references (p. 
[259]-264) and index.""}, ""number_of_pages"": 272, ""lccn"": [""94018715""], ""isbn_10"": [""087049872X""], ""publish_date"": ""1995""}"
4,/type/i18n,/i18n/strings.uk,1,2008-10-04 18:19:08.241118,"{""string_upload"": ""\u0417\u0430\u0432\u0430\u043d\u0442\u0430\u0436\u0438\u0442\u0438"", ""string_submit"": ""\u0412\u0456\u0434\u0456\u0441\u043b\u0430\u0442\u0438"", ""string_site_subtitle"": """", ""string_site_title"": ""\u0417\u0430\u0433\u043e\u043b\u043e\u0432\u043e\u043a"", ""string_published_in"": ""\u041e\u043f\u0443\u0431\u043b\u0456\u043a\u043e\u0432\u0430\u043d\u043e \u0443"", ""string_prev"": ""\u041f\u043e\u0432\u0435\u0440\u043d\u0443\u0442\u0438\u0441\u044c"", ""string_deleted"": ""\u0426\u044f \u0441\u0442\u043e\u0440\u0456\u043d\u043a\u0430 \u0432\u0438\u0434\u0430\u043b\u0435\u043d\u0430"", ""string_change_language"": ""\u0417\u043c\u0456\u043d\u0438\u0442\u0438 \u043c\u043e\u0432\u0443"", ""string_page_does_not_exist"": ""\u0421\u0442\u043e\u0440\u0456\u043d\u043a\u0430 \u0432\u0456\u0434\u0441\u0443\u0442\u043d\u044f"", ""string_next"": ""\u0412\u043f\u0435\u0440\u0435\u0434"", ""string_login"": ""\u0423\u0432\u0456\u0439\u0442\u0438"", ""string_create_it"": ""\u0421\u0442\u0432\u043e\u0440\u0438\u0442\u0438 \u0441\u0442\u043e\u0440\u0456\u043d\u043a\u0443"", ""string_open_library"": ""\u0412\u0456\u0434\u043a\u0440\u0438\u0442\u0430 \u0431\u0456\u0431\u043b\u0456\u043e\u0442\u0435\u043a\u0430 - Open Library"", ""string_about_us"": ""\u041f\u0440\u043e \u043d\u0430\u0441"", ""ns"": ""/"", ""type"": {""key"": ""/type/i18n""}, ""string_create"": ""\u0421\u0442\u0432\u043e\u0440\u0438\u0442\u0438"", ""string_back_to_ol"": ""\u041f\u043e\u0432\u0435\u0440\u043d\u0443\u0442\u0438\u0441\u044c \u0434\u043e \""\u041f\u0440\u043e \u0412\u0456\u0434\u043a\u0440\u0438\u0442\u0443 \u0411\u0456\u0431\u043b\u0456\u043e\u0442\u0435\u043a\u0443\"""", ""string_preferences"": ""\u041d\u0430\u043b\u0430\u0448\u0442\u0443\u0432\u0430\u043d\u043d\u044f"", ""string_has_fulltext"": ""\u041c\u0430\u0454 \u043f\u043e\u0432\u043d\u0438\u0439 \u0442\u0435\u043a\u0441\u0442"", ""string_logout"": 
""\u0412\u0438\u0439\u0442\u0438"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-04T18:19:08.241118""}, ""key"": ""/i18n/strings.uk"", ""string_contact_us"": ""\u0417\u0432`\u044f\u0436\u0456\u0442\u044c\u0441\u044f \u0437 \u043d\u0430\u043c\u0438"", ""string_powered_by_infogami"": ""\u0421\u0442\u0432\u043e\u0440\u0435\u043d\u043e <a href=\""%s\"">Infogami</a>"", ""lang"": ""bg"", ""string_welcome_user"": ""\u041b\u0430\u0441\u043a\u0430\u0432\u043e \u043f\u0440\u043e\u0441\u0438\u043c\u043e, <a href=\""%s\"">%s</a>!"", ""string_facet_year"": ""\u0414\u0430\u0442\u0430 \u043f\u0443\u0431\u043b\u0456\u043a\u0430\u0446\u0456\u0457"", ""string_not_found"": ""\u041d\u0435 \u0437\u043d\u0430\u0439\u0434\u0435\u043d\u043e"", ""string_history"": ""\u0406\u0441\u0442\u043e\u0440\u0456\u044f"", ""string_hello"": ""\u041f\u0440\u0438\u0432\u0456\u0442"", ""string_internet_archive"": ""\u0406\u043d\u0442\u0435\u0440\u043d\u0435\u0442 \u0430\u0440\u0445\u0456\u0432 (IA)"", ""revision"": 1}"
5,/type/author,/authors/OL5345269A,1,2008-10-03 20:38:12.544508,"{""name"": ""Bi"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-03T20:38:12.544508""}, ""key"": ""/authors/OL5345269A"", ""type"": {""key"": ""/type/author""}, ""revision"": 1}"
6,/type/author,/authors/OL5345270A,1,2008-10-03 20:38:37.118889,"{""name"": ""Bianco Nicholas"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-03T20:38:37.118889""}, ""key"": ""/authors/OL5345270A"", ""type"": {""key"": ""/type/author""}, ""revision"": 1}"
7,/type/author,/authors/OL5345265A,1,2008-10-02 21:00:12.012264,"{""name"": ""Crisp, Tony"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-02T21:00:12.012264""}, ""key"": ""/authors/OL5345265A"", ""type"": {""key"": ""/type/author""}, ""revision"": 1}"
8,/type/author,/authors/OL5345268A,1,2008-10-03 13:50:01.827605,"{""name"": ""Jean-Pierre ROSSIE"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-03T13:50:01.827605""}, ""key"": ""/authors/OL5345268A"", ""type"": {""key"": ""/type/author""}, ""revision"": 1}"
9,/type/author,/authors/OL5345264A,1,2008-10-02 21:00:12.066032,"{""name"": ""Crisp, Tony"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-02T21:00:12.066032""}, ""key"": ""/authors/OL5345264A"", ""type"": {""key"": ""/type/author""}, ""revision"": 1}"


## Works Dataframe

In [11]:
# Keep only the work records, then preview the first ten.
_is_work = col("record_type") == "/type/work"
df_works = df_split.where(_is_work)
df_works.limit(10).toPandas()

Unnamed: 0,record_type,key,revision,timestamp,json_data
0,/type/work,/works/OL5967449W,3,2013-08-23 21:48:46.541299,"{""created"": {""type"": ""/type/datetime"", ""value"": ""2009-12-10T18:16:15.578495""}, ""subject_places"": [""Texas""], ""subjects"": [""Folk music"", ""History and criticism""], ""latest_revision"": 3, ""key"": ""/works/OL5967449W"", ""title"": ""Music in Texas--frontier to 1900"", ""authors"": [{""type"": {""key"": ""/type/author_role""}, ""author"": {""key"": ""/authors/OL1481792A""}}, {""type"": {""key"": ""/type/author_role""}, ""author"": {""key"": ""/authors/OL1461795A""}}], ""type"": {""key"": ""/type/work""}, ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2013-08-23T21:48:46.541299""}, ""revision"": 3}"
1,/type/work,/works/OL5918506W,3,2013-08-23 21:49:55.575503,"{""created"": {""type"": ""/type/datetime"", ""value"": ""2009-12-10T18:11:13.513027""}, ""subject_places"": [""Beaumont (Tex.)""], ""subjects"": [""History""], ""latest_revision"": 3, ""key"": ""/works/OL5918506W"", ""title"": ""Beaumont, a chronicle of promise"", ""authors"": [{""type"": {""key"": ""/type/author_role""}, ""author"": {""key"": ""/authors/OL1461795A""}}, {""type"": {""key"": ""/type/author_role""}, ""author"": {""key"": ""/authors/OL1481792A""}}], ""type"": {""key"": ""/type/work""}, ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2013-08-23T21:49:55.575503""}, ""revision"": 3}"
2,/type/work,/works/OL10007810W,3,2010-04-28 06:54:19.472104,"{""title"": ""La cuisine a la plancha (connaitre)"", ""created"": {""type"": ""/type/datetime"", ""value"": ""2009-12-11T01:58:28.508302""}, ""covers"": [3156622], ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2010-04-28T06:54:19.472104""}, ""latest_revision"": 3, ""key"": ""/works/OL10007810W"", ""authors"": [{""type"": ""/type/author_role"", ""author"": {""key"": ""/authors/OL3973957A""}}], ""type"": {""key"": ""/type/work""}, ""revision"": 3}"
3,/type/work,/works/OL9172590W,4,2013-08-23 22:18:31.678649,"{""title"": ""Cloud Dancer"", ""created"": {""type"": ""/type/datetime"", ""value"": ""2009-12-10T23:58:01.797117""}, ""covers"": [4967856], ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2013-08-23T22:18:31.678649""}, ""latest_revision"": 4, ""key"": ""/works/OL9172590W"", ""authors"": [{""type"": {""key"": ""/type/author_role""}, ""author"": {""key"": ""/authors/OL2674508A""}}], ""type"": {""key"": ""/type/work""}, ""revision"": 4}"
4,/type/work,/works/OL17802418W,1,2017-10-31 23:48:21.594355,"{""title"": ""Leer y redactar en la universidad : del caos de las ideas al texto estructurado"", ""created"": {""type"": ""/type/datetime"", ""value"": ""2017-10-31T23:48:21.594355""}, ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2017-10-31T23:48:21.594355""}, ""latest_revision"": 1, ""key"": ""/works/OL17802418W"", ""authors"": [{""type"": {""key"": ""/type/author_role""}}], ""type"": {""key"": ""/type/work""}, ""revision"": 1}"
5,/type/work,/works/OL17802421W,1,2017-11-01 03:39:02.464948,"{""title"": ""HELLBURNER"", ""created"": {""type"": ""/type/datetime"", ""value"": ""2017-11-01T03:39:02.464948""}, ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2017-11-01T03:39:02.464948""}, ""latest_revision"": 1, ""key"": ""/works/OL17802421W"", ""authors"": [{""type"": {""key"": ""/type/author_role""}, ""author"": {""key"": ""/authors/OL7046811A""}}], ""type"": {""key"": ""/type/work""}, ""revision"": 1}"
6,/type/work,/works/OL16806239W,1,2013-08-23 22:51:32.436808,"{""title"": ""An Alef-Bet Kabalah"", ""created"": {""type"": ""/type/datetime"", ""value"": ""2013-08-23T22:51:32.436808""}, ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2013-08-23T22:51:32.436808""}, ""latest_revision"": 1, ""key"": ""/works/OL16806239W"", ""authors"": [{""type"": {""key"": ""/type/author_role""}}], ""type"": {""key"": ""/type/work""}, ""revision"": 1}"
7,/type/work,/works/OL16806240W,1,2013-08-23 23:02:41.862836,"{""title"": ""Che Forever"", ""created"": {""type"": ""/type/datetime"", ""value"": ""2013-08-23T23:02:41.862836""}, ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2013-08-23T23:02:41.862836""}, ""latest_revision"": 1, ""key"": ""/works/OL16806240W"", ""authors"": [{""type"": {""key"": ""/type/author_role""}, ""author"": {""key"": ""/authors/OL274904A""}}], ""type"": {""key"": ""/type/work""}, ""revision"": 1}"
8,/type/work,/works/OL17802473W,2,2017-11-01 13:51:00.576128,"{""title"": ""Apartamento en la Costa Brava"", ""created"": {""type"": ""/type/datetime"", ""value"": ""2017-11-01T13:50:36.367953""}, ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2017-11-01T13:51:00.576128""}, ""latest_revision"": 2, ""key"": ""/works/OL17802473W"", ""authors"": [{""type"": {""key"": ""/type/author_role""}, ""author"": {""key"": ""/authors/OL1540485A""}}], ""type"": {""key"": ""/type/work""}, ""revision"": 2}"
9,/type/work,/works/OL17802492W,1,2017-11-01 16:03:25.631548,"{""title"": ""Artisti romi \u00een arte contemporana"", ""created"": {""type"": ""/type/datetime"", ""value"": ""2017-11-01T16:03:25.631548""}, ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2017-11-01T16:03:25.631548""}, ""latest_revision"": 1, ""key"": ""/works/OL17802492W"", ""authors"": [{""type"": {""key"": ""/type/author_role""}}], ""type"": {""key"": ""/type/work""}, ""revision"": 1}"


In [12]:
def _nullable(name, data_type):
    """Return a nullable StructField called ``name`` of type ``data_type``."""
    return StructField(name, data_type, True)


def _key_ref():
    """Schema of a ``{"key": ...}`` reference object."""
    return StructType([_nullable("key", StringType())])


def _typed_value():
    """Schema of a ``{"type": ..., "value": ...}`` object."""
    return StructType([
        _nullable("type", StringType()),
        _nullable("value", StringType()),
    ])


# One entry of a work's "authors" array.
# NOTE(review): the sample rows show "type" both as an object
# ({"key": "/type/author_role"}) and as a bare string ("/type/author_role");
# it is declared as an object here -- confirm the JSON parser tolerates both.
_work_author_entry = StructType([
    _nullable("type", _key_ref()),
    _nullable("author", _key_ref()),
    _nullable("role", StringType()),
    _nullable("as", StringType()),
])

# One entry of a work's "links" array.
_work_link = StructType([
    _nullable("title", StringType()),
    _nullable("url", StringType()),
])

# Fields read from each work's JSON payload; every field is nullable.
works_json_schema = StructType([
    _nullable("key", StringType()),
    _nullable("title", StringType()),
    _nullable("subtitle", StringType()),
    _nullable("type", _key_ref()),
    _nullable("authors", ArrayType(_work_author_entry)),
    _nullable("covers", ArrayType(IntegerType())),
    _nullable("links", ArrayType(_work_link)),
    _nullable("id", IntegerType()),
    _nullable("lc_classifications", ArrayType(StringType())),
    _nullable("subjects", ArrayType(StringType())),
    _nullable("first_publish_date", StringType()),
    _nullable("description", _typed_value()),
    _nullable("notes", _typed_value()),
    _nullable("revision", IntegerType()),
    _nullable("latest_revision", IntegerType()),
    _nullable("created", _typed_value()),
    _nullable("last_modified", _typed_value()),
])

In [13]:
# Decode each work's JSON text into a struct column named "json_parsed".
_works_payload = from_json(col("json_data"), works_json_schema)
df_works_parsed = df_works.withColumn("json_parsed", _works_payload)

In [14]:
# Flatten the parsed work payload into the columns kept for analysis.
_work_json = col("json_parsed")

# Reduce every authors entry to its author key.
# NOTE: an entry without an "author" object produces a null element, which
# is why the preview output contains author_keys values such as [None].
_work_author_keys = transform(
    _work_json["authors"],
    lambda entry: entry["author"]["key"],
)

df_works_final = df_works_parsed.select(
    "record_type",
    col("key").alias("work_key"),
    _work_json["title"].alias("title"),
    _work_json["subtitle"].alias("subtitle"),
    _work_author_keys.alias("author_keys"),
    _work_json["subjects"].alias("subjects"),
    _work_json["first_publish_date"].alias("first_publish_date"),
)

In [15]:
# Preview the first ten flattened work rows as a pandas DataFrame.
df_works_final.limit(10).toPandas()

Unnamed: 0,record_type,work_key,title,subtitle,author_keys,subjects,first_publish_date
0,/type/work,/works/OL5967449W,Music in Texas--frontier to 1900,,"[/authors/OL1481792A, /authors/OL1461795A]","[Folk music, History and criticism]",
1,/type/work,/works/OL5918506W,"Beaumont, a chronicle of promise",,"[/authors/OL1461795A, /authors/OL1481792A]",[History],
2,/type/work,/works/OL10007810W,La cuisine a la plancha (connaitre),,[/authors/OL3973957A],,
3,/type/work,/works/OL9172590W,Cloud Dancer,,[/authors/OL2674508A],,
4,/type/work,/works/OL17802418W,Leer y redactar en la universidad : del caos de las ideas al texto estructurado,,[None],,
5,/type/work,/works/OL17802421W,HELLBURNER,,[/authors/OL7046811A],,
6,/type/work,/works/OL16806239W,An Alef-Bet Kabalah,,[None],,
7,/type/work,/works/OL16806240W,Che Forever,,[/authors/OL274904A],,
8,/type/work,/works/OL17802473W,Apartamento en la Costa Brava,,[/authors/OL1540485A],,
9,/type/work,/works/OL17802492W,Artisti romi în arte contemporana,,[None],,


In [16]:
# Start from an empty output directory under the current working directory.
# The directory is removed from Python rather than with a shell `rm`, so the
# value of the output_dir variable -- not the literal text "output_dir" -- is
# what gets deleted. ignore_errors covers the first run, when it is absent.
output_dir = os.path.join(os.getcwd(), "partitioned_data")
shutil.rmtree(output_dir, ignore_errors=True)
os.makedirs(output_dir, exist_ok=True)

In [17]:
# Write the works table as Parquet, replacing any existing export at that path.
works_output_path = f"{output_dir}/works_partitioned"
df_works_final.write.mode("overwrite").parquet(works_output_path)

In [18]:
# Rebind df_works_final to the Parquet export written above.
df_works_final = spark.read.format("parquet").load(works_output_path)

In [19]:
# Show how many partitions back the works DataFrame after the Parquet reload.
df_works_final.rdd.getNumPartitions()

31

## Authors Dataframe

In [20]:
# Keep only the author records, then preview the first ten.
_is_author = col("record_type") == "/type/author"
df_authors = df_split.where(_is_author)
df_authors.limit(10).toPandas()

Unnamed: 0,record_type,key,revision,timestamp,json_data
0,/type/author,/authors/OL5345273A,1,2008-10-05 09:14:51.676778,"{""name"": ""TOM.KIYIHN"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-05T09:14:51.676778""}, ""key"": ""/authors/OL5345273A"", ""type"": {""key"": ""/type/author""}, ""revision"": 1}"
1,/type/author,/authors/OL5345271A,1,2008-10-04 15:55:36.974594,"{""name"": ""Aksakov, S. T."", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-04T15:55:36.974594""}, ""key"": ""/authors/OL5345271A"", ""type"": {""key"": ""/type/author""}, ""revision"": 1}"
2,/type/author,/authors/OL5345272A,1,2008-10-04 16:04:08.619520,"{""name"": ""Latham,Barbarahttp://webcat.camosun.bc.ca/cambooks/njpm/tableof.htm"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-04T16:04:08.61952""}, ""key"": ""/authors/OL5345272A"", ""type"": {""key"": ""/type/author""}, ""revision"": 1}"
3,/type/author,/authors/OL5345269A,1,2008-10-03 20:38:12.544508,"{""name"": ""Bi"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-03T20:38:12.544508""}, ""key"": ""/authors/OL5345269A"", ""type"": {""key"": ""/type/author""}, ""revision"": 1}"
4,/type/author,/authors/OL5345270A,1,2008-10-03 20:38:37.118889,"{""name"": ""Bianco Nicholas"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-03T20:38:37.118889""}, ""key"": ""/authors/OL5345270A"", ""type"": {""key"": ""/type/author""}, ""revision"": 1}"
5,/type/author,/authors/OL5345265A,1,2008-10-02 21:00:12.012264,"{""name"": ""Crisp, Tony"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-02T21:00:12.012264""}, ""key"": ""/authors/OL5345265A"", ""type"": {""key"": ""/type/author""}, ""revision"": 1}"
6,/type/author,/authors/OL5345268A,1,2008-10-03 13:50:01.827605,"{""name"": ""Jean-Pierre ROSSIE"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-03T13:50:01.827605""}, ""key"": ""/authors/OL5345268A"", ""type"": {""key"": ""/type/author""}, ""revision"": 1}"
7,/type/author,/authors/OL5345264A,1,2008-10-02 21:00:12.066032,"{""name"": ""Crisp, Tony"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-02T21:00:12.066032""}, ""key"": ""/authors/OL5345264A"", ""type"": {""key"": ""/type/author""}, ""revision"": 1}"
8,/type/author,/authors/OL5345266A,1,2008-10-02 22:32:21.621743,"{""name"": ""Si"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-02T22:32:21.621743""}, ""key"": ""/authors/OL5345266A"", ""type"": {""key"": ""/type/author""}, ""revision"": 1}"
9,/type/author,/authors/OL5345267A,1,2008-10-02 22:32:39.685528,"{""name"": ""Sikundar,Sylvia"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-02T22:32:39.685528""}, ""key"": ""/authors/OL5345267A"", ""type"": {""key"": ""/type/author""}, ""revision"": 1}"


In [21]:
# Fields read from each author's JSON payload; every field is nullable.
authors_json_schema = (
    StructType()
    .add("key", StringType(), True)
    .add("name", StringType(), True)
    .add("location", StringType(), True)
    # "type" is a {"key": ...} reference object.
    .add("type", StructType().add("key", StringType(), True), True)
    .add("revision", IntegerType(), True)
    # "last_modified" is a {"type": ..., "value": ...} object.
    .add(
        "last_modified",
        StructType()
        .add("type", StringType(), True)
        .add("value", StringType(), True),
        True,
    )
)

In [22]:
# Decode each author's JSON text into a struct column named "json_parsed".
_authors_payload = from_json(col("json_data"), authors_json_schema)
df_authors_parsed = df_authors.withColumn("json_parsed", _authors_payload)

In [23]:
# Flatten the parsed author payload: keep the dump's own columns and add the
# name and location taken from the JSON.
_author_json = col("json_parsed")
df_authors_final = df_authors_parsed.select(
    "record_type",
    col("key").alias("author_key"),
    "revision",
    "timestamp",
    _author_json["name"].alias("name"),
    _author_json["location"].alias("location"),
)

In [24]:
# Preview the first ten flattened author rows as a pandas DataFrame.
df_authors_final.limit(10).toPandas()

Unnamed: 0,record_type,author_key,revision,timestamp,name,location
0,/type/author,/authors/OL5345273A,1,2008-10-05 09:14:51.676778,TOM.KIYIHN,
1,/type/author,/authors/OL5345271A,1,2008-10-04 15:55:36.974594,"Aksakov, S. T.",
2,/type/author,/authors/OL5345272A,1,2008-10-04 16:04:08.619520,"Latham,Barbarahttp://webcat.camosun.bc.ca/cambooks/njpm/tableof.htm",
3,/type/author,/authors/OL5345269A,1,2008-10-03 20:38:12.544508,Bi,
4,/type/author,/authors/OL5345270A,1,2008-10-03 20:38:37.118889,Bianco Nicholas,
5,/type/author,/authors/OL5345265A,1,2008-10-02 21:00:12.012264,"Crisp, Tony",
6,/type/author,/authors/OL5345268A,1,2008-10-03 13:50:01.827605,Jean-Pierre ROSSIE,
7,/type/author,/authors/OL5345264A,1,2008-10-02 21:00:12.066032,"Crisp, Tony",
8,/type/author,/authors/OL5345266A,1,2008-10-02 22:32:21.621743,Si,
9,/type/author,/authors/OL5345267A,1,2008-10-02 22:32:39.685528,"Sikundar,Sylvia",


In [27]:
# Write the authors table as Parquet, replacing any existing export at that path.
authors_output_path = f"{output_dir}/authors_partitioned"
df_authors_final.write.mode("overwrite").parquet(authors_output_path)

In [28]:
# Rebind df_authors_final to the Parquet export written above.
df_authors_final = spark.read.format("parquet").load(authors_output_path)

In [29]:
# Show how many partitions back the authors DataFrame after the Parquet reload.
df_authors_final.rdd.getNumPartitions()

16

## Editions Dataframe

In [30]:
# Keep only the edition records, then preview the first ten.
_is_edition = col("record_type") == "/type/edition"
df_eds = df_split.where(_is_edition)
df_eds.limit(10).toPandas()

Unnamed: 0,record_type,key,revision,timestamp,json_data
0,/type/edition,/books/OL17806216M,1,2008-10-01 06:49:20.851055,"{""table_of_contents"": [{""type"": ""/type/text"", ""value"": ""Gender as a category of analysis in vernacular architect studies / Angel Kwolek-Folland""}, {""type"": ""/type/text"", ""value"": ""\""I'm a lumberjack and I'm ok\"" : the built environment and varied masculinities in the industrial age / Deryck W. Holdsworth""}, {""type"": ""/type/text"", ""value"": ""The Masonic lodge room, 1870-1930 : a sacred space of masculine spiritual hierarchy / William D. Moore""}, {""type"": ""/type/text"", ""value"": ""Grammar codes, and performance : linguistic and sociolinguistic models in the study of vernacular architecture / Michael Ann Williams and M. Jane Young""}, {""type"": ""/type/text"", ""value"": ""Building an urban identity : the clustered spires of Frederick, Maryland / Diane Shaw""}, {""type"": ""/type/text"", ""value"": ""\""A school house well arranged\"" : Baltimore public school buildings on the Lancasterian plan, 1829-1839 / Peter E. Kurtze""}, {""type"": ""/type/text"", ""value"": ""Letting in \""the world\"" : (re)interpretive tensions in the Quaker meeting house / Susan Garfinkel""}, {""type"": ""/type/text"", ""value"": ""Man\u0303ana, man\u0303ana : racial stereotypes and the Anglo rediscovery of the Southwest's vernacular architecture, 1890-1920 / Abigail A. Van Slyck""}, {""type"": ""/type/text"", ""value"": ""Association, residence, and shop : an appropriation of commercial blocks in North American Chinatowns / Christopher L. Yip""}, {""type"": ""/type/text"", ""value"": ""\""Snug li'l house with flue and oven\"" : nineteenth-century reforms in plantation slave housing / John Michael Vlach""}, {""type"": ""/type/text"", ""value"": ""Cheap and tasteful dwellings in popular architecture / Jan Jennings""}, {""type"": ""/type/text"", ""value"": ""Cheap, quick, and easy, part II : pressed metal ceilings, 1880-1930 / Pamela H. 
Simpson""}, {""type"": ""/type/text"", ""value"": ""The Eichler home : intention and experience in postwar suburbia / Annmarie Adams""}, {""type"": ""/type/text"", ""value"": ""Rural adaptations of suburban bungalows, Sussex County, Delaware / Susan Mulchahey Chase""}, {""type"": ""/type/text"", ""value"": ""Building in stone in southwestern Pennyslvania : patterns and process / Karen Koegler""}, {""type"": ""/type/text"", ""value"": ""Private dwellings, public ways, and the landscape of early rural capitalism in Virginia's Shenandoah Valley / Warren R. Hofstra""}, {""type"": ""/type/text"", ""value"": ""The architectural and social topography of early-nineteenth-century Portsmouth, New Hampshire / Bernard L. Herman""}, {""type"": ""/type/text"", ""value"": ""From roadside camps to garden homes : housing and community planning for California's migrant work force, 1935-1941 / Greg Hise.""}], ""series"": [""Perspectives in vernacular architecture -- 5."", ""Perspectives in vernacular architecture (Knoxville, Tenn.) -- 5.""], ""lc_classifications"": [""NA705 .G36 1995""], ""contributions"": [""Cromley, Elizabeth C."", ""Hudgins, Carter L.""], ""edition_name"": ""1st ed."", ""title"": ""Gender, class, and shelter"", ""languages"": [{""key"": ""/languages/eng""}], ""subjects"": [""Vernacular architecture -- United States."", ""Architecture and society -- United States.""], ""publish_country"": ""tnu"", ""by_statement"": ""edited by Elizabeth Collins Cromley and Carter L. Hudgins."", ""type"": {""key"": ""/type/edition""}, ""revision"": 1, ""publishers"": [""University of Tennessee Press""], ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-01T06:49:20.851055""}, ""key"": ""/books/OL17806216M"", ""publish_places"": [""Knoxville""], ""pagination"": ""xiv, 272 p. :"", ""dewey_decimal_class"": [""720/.1/060973""], ""notes"": {""type"": ""/type/text"", ""value"": ""Includes bibliographical references (p. 
[259]-264) and index.""}, ""number_of_pages"": 272, ""lccn"": [""94018715""], ""isbn_10"": [""087049872X""], ""publish_date"": ""1995""}"
1,/type/edition,/books/OL17841395M,1,2008-10-01 11:02:13.766308,"{""pagination"": ""$29.95"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-01T11:02:13.766308""}, ""title"": ""Elizabeth Blackburn and the story of telomeres"", ""notes"": {""type"": ""/type/text"", ""value"": ""SP""}, ""number_of_pages"": 95, ""isbn_10"": [""262026228""], ""key"": ""/books/OL17841395M"", ""authors"": [{""key"": ""/authors/OL5345253A""}], ""publish_places"": [""MIT, 2007""], ""type"": {""key"": ""/type/edition""}, ""subjects"": [""History of Medicine""], ""revision"": 1}"
2,/type/edition,/books/OL17841396M,1,2008-10-01 11:02:14.299218,"{""publishers"": [""Medical Research Foundation of Oregon""], ""pagination"": ""1 sheet :"", ""table_of_contents"": [{""type"": ""/type/text"", ""value"": ""History and purpose.""}, {""type"": ""/type/text"", ""value"": ""Personnel.""}, {""type"": ""/type/text"", ""value"": ""Facilities.""}, {""type"": ""/type/text"", ""value"": ""Finances and organization.""}, {""type"": ""/type/text"", ""value"": ""Map of campus, dated April 1962 (buildings under construction or completed).""}], ""title"": ""Oregon Regional Primate Research Center facts."", ""oclc_number"": [""190823722""], ""languages"": [{""key"": ""/languages/eng""}], ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-01T11:02:14.299218""}, ""publish_date"": ""1962"", ""publish_country"": ""oru"", ""key"": ""/books/OL17841396M"", ""authors"": [{""key"": ""/authors/OL4684114A""}], ""publish_places"": [""Portland, Or""], ""type"": {""key"": ""/type/edition""}, ""subjects"": [""Oregon Regional Primate Research Center."", ""Primates."", ""Research -- Oregon.""], ""revision"": 1}"
3,/type/edition,/books/OL17841387M,1,2008-10-01 11:02:08.596896,"{""subtitle"": ""effect in female, black, and diabetic patients, and cost-effectiveness"", ""series"": [""Evidence report/technology assessment -- no. 82"", ""AHRQ publication -- no. 03-E045""], ""lc_classifications"": [""RC685.C53 P437 2003""], ""contributions"": [""Shekelle, Paul G."", ""Morton, Sally C."", ""Southern California Evidence-Based Practice Center/RAND.""], ""title"": ""Pharmacologic management of heart failure and left ventricular systolic dysfunction"", ""languages"": [{""key"": ""/languages/eng""}], ""subjects"": [""Heart failure -- Chemotherapy -- Evaluation."", ""Heart failure -- Chemotherapy -- Cost effectiveness."", ""Adrenergic beta blockers -- Therapeutic use -- Evaluation."", ""Angiotensin converting enzyme -- Inhibitors -- Therapeutic use -- Evaluation."", ""Diabetes -- Complications -- Chemotherapy -- Evaluation."", ""Evidence-based medicine."", ""Heart Failure -- drug therapy."", ""Ventricular Dysfunction, Left -- drug therapy."", ""Evidence-Based Medicine -- methods.""], ""publish_country"": ""mdu"", ""by_statement"": ""prepared for Agency for Healthcare Research and Quality, U.S. Department of Health and Human Services ; prepared by Southern California-RAND Evidence-based Practice Center, Santa Monica, CA ; program director, Paul Shekelle, program co-director, Sally Morton ... [et al.]."", ""type"": {""key"": ""/type/edition""}, ""revision"": 1, ""publishers"": [""U.S. Dept. of Health and Human Services, Public Health Service, Agency for Healthcare Research and Quality""], ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-01T11:02:08.596896""}, ""key"": ""/books/OL17841387M"", ""publish_places"": [""Rockville, Md""], ""oclc_number"": [""54960165""], ""pagination"": ""x, 163 p. :"", ""dewey_decimal_class"": [""616.1/29061""], ""notes"": {""type"": ""/type/text"", ""value"": ""\""Prepared for Agency for Healthcare Research and Quality, U.S. 
Department of Health and Human Services.\""\n\n\""July 2003.\""\n\n\""Contract no.: 290-97-0001.\""\n\n\""\""Bibliography\"": p. 101-128.\n\nIncludes bibliographical references (p. 83-85).""}, ""number_of_pages"": 163, ""lccn"": [""2003628991""], ""publish_date"": ""2003""}"
4,/type/edition,/books/OL17841388M,1,2008-10-01 11:02:09.629327,"{""publishers"": [""W.M. Hinton & Co., Printers""], ""pagination"": ""10 p. ;"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-01T11:02:09.629327""}, ""title"": ""Disease-germs"", ""type"": {""key"": ""/type/edition""}, ""notes"": {""type"": ""/type/text"", ""value"": ""\""An address delivered at the anniversary meeting of the San Francisco Medical Society, November, 1879.\""\n\nCover title.""}, ""number_of_pages"": 10, ""languages"": [{""key"": ""/languages/eng""}], ""subjects"": [""Germ theory of disease."", ""Bacteria."", ""Bacteria.""], ""publish_date"": ""1879"", ""publish_country"": ""cau"", ""key"": ""/books/OL17841388M"", ""authors"": [{""key"": ""/authors/OL2456514A""}], ""by_statement"": ""by J.H. Wythe."", ""publish_places"": [""S.F. [i.e. San Francisco]""], ""oclc_number"": [""15642021""], ""revision"": 1}"
5,/type/edition,/books/OL17841389M,1,2008-10-01 11:02:10.116704,"{""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-01T11:02:10.116704""}, ""title"": ""Current diagnosis and treatment: emergency medicine"", ""notes"": {""type"": ""/type/text"", ""value"": ""$69.95""}, ""isbn_10"": [""780071443197""], ""key"": ""/books/OL17841389M"", ""authors"": [{""key"": ""/authors/OL5345248A""}], ""publish_places"": [""McGraw Hill, 2008""], ""type"": {""key"": ""/type/edition""}, ""subjects"": [""Emergency Medicine""], ""revision"": 1}"
6,/type/edition,/books/OL17841390M,1,2008-10-01 11:02:10.530593,"{""title"": ""Midwifery & Childbirth in America"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-01T11:02:10.530593""}, ""key"": ""/books/OL17841390M"", ""authors"": [{""key"": ""/authors/OL5345249A""}], ""type"": {""key"": ""/type/edition""}, ""revision"": 1}"
7,/type/edition,/books/OL17841391M,1,2008-10-01 11:02:10.954860,"{""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-01T11:02:10.95486""}, ""title"": ""Fundamentals of musculoskeletal ultrasound"", ""notes"": {""type"": ""/type/text"", ""value"": ""$79.00""}, ""isbn_10"": [""416035931""], ""key"": ""/books/OL17841391M"", ""authors"": [{""key"": ""/authors/OL5345250A""}], ""publish_places"": [""Saunders, 2007""], ""type"": {""key"": ""/type/edition""}, ""subjects"": [""Radiology""], ""revision"": 1}"
8,/type/edition,/books/OL17841366M,1,2008-10-01 11:01:55.432347,"{""title"": ""Tami's test record"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-01T11:01:55.432347""}, ""key"": ""/books/OL17841366M"", ""authors"": [{""key"": ""/authors/OL5345237A""}], ""publish_places"": [""2007""], ""type"": {""key"": ""/type/edition""}, ""revision"": 1}"
9,/type/edition,/books/OL17841392M,1,2008-10-01 11:02:11.453848,"{""publishers"": [""Oregon Health & Science University, Dept. of Surgery""], ""pagination"": ""65 p. :"", ""table_of_contents"": [{""type"": ""/type/text"", ""value"": ""Introduction.""}, {""type"": ""/type/text"", ""value"": ""History of Oregon Health & Science University.""}, {""type"": ""/type/text"", ""value"": ""\""The downtown surgeons\""""}, {""type"": ""/type/text"", ""value"": ""History of the surgical residency programs.""}, {""type"": ""/type/text"", ""value"": ""OHSU surgery residency program.""}, {""type"": ""/type/text"", ""value"": ""History of OHSU surgery divisions.""}, {""type"": ""/type/text"", ""value"": ""March to a world class department of surgery.""}, {""type"": ""/type/text"", ""value"": ""Historic events in the Department of Surgery.""}, {""type"": ""/type/text"", ""value"": ""References.""}, {""type"": ""/type/text"", ""value"": ""Interviewees and manuscript reviewers.""}], ""title"": ""History of the Department of Surgery, 1867-2007"", ""oclc_number"": [""187326800""], ""notes"": {""type"": ""/type/text"", ""value"": ""Includes bibliographical references (p. 63-64).""}, ""number_of_pages"": 65, ""edition_name"": ""1st ed."", ""languages"": [{""key"": ""/languages/eng""}], ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2008-10-01T11:02:11.453848""}, ""publish_date"": ""2007"", ""publish_country"": ""oru"", ""key"": ""/books/OL17841392M"", ""authors"": [{""key"": ""/authors/OL5345251A""}], ""by_statement"": ""written by Patricia A. Southard."", ""publish_places"": [""Portland, Or""], ""type"": {""key"": ""/type/edition""}, ""subjects"": [""Oregon Health & Science University. -- Dept. of Surgery."", ""Oregon Health Sciences University. -- Dept. of Surgery."", ""University of Oregon Health Sciences Center. -- Dept. of Surgery."", ""University of Oregon. -- Medical School. -- Dept. 
of Surgery."", ""Surgery -- history -- Oregon."", ""Schools, Medical -- history -- Oregon.""], ""revision"": 1}"


In [31]:
def _key_ref_struct():
    """Build the struct for a reference object of the form {"key": "..."}."""
    return StructType([
        StructField("key", StringType(), True)
    ])


def _typed_value_struct():
    """Build the struct for a {"type": "...", "value": "..."} object."""
    return StructType([
        StructField("type", StringType(), True),
        StructField("value", StringType(), True)
    ])


# Explicit schema used to parse the JSON payload of edition records.
# Every field is nullable; keys absent from a record come back as null and
# keys not listed here are ignored by from_json.
editions_json_schema = StructType([
    StructField("key", StringType(), True),
    StructField("title", StringType(), True),
    StructField("subtitle", StringType(), True),
    StructField("type", _key_ref_struct(), True),
    StructField("authors", ArrayType(_key_ref_struct()), True),
    StructField("works", ArrayType(_key_ref_struct()), True),
    StructField("identifiers", StructType([]), True),  # Generic object, skipped for now
    StructField("isbn_10", ArrayType(StringType()), True),
    StructField("isbn_13", ArrayType(StringType()), True),
    StructField("lccn", ArrayType(StringType()), True),
    StructField("ocaid", StringType(), True),
    StructField("oclc_numbers", ArrayType(StringType()), True),
    # The edition records previewed in this notebook use the singular key
    # "oclc_number"; declare both spellings so the value is not silently dropped.
    StructField("oclc_number", ArrayType(StringType()), True),
    StructField("local_id", ArrayType(StringType()), True),
    StructField("covers", ArrayType(IntegerType()), True),
    StructField("links", ArrayType(StructType([
        StructField("title", StringType(), True),
        StructField("url", StringType(), True)
    ])), True),
    StructField("languages", ArrayType(_key_ref_struct()), True),
    StructField("translated_from", ArrayType(_key_ref_struct()), True),
    StructField("translation_of", StringType(), True),
    StructField("by_statement", StringType(), True),
    StructField("weight", StringType(), True),
    StructField("edition_name", StringType(), True),
    StructField("number_of_pages", IntegerType(), True),
    StructField("pagination", StringType(), True),
    StructField("physical_dimensions", StringType(), True),
    StructField("physical_format", StringType(), True),
    StructField("copyright_date", StringType(), True),
    StructField("publish_country", StringType(), True),
    StructField("publish_date", StringType(), True),
    StructField("publish_places", ArrayType(StringType()), True),
    StructField("publishers", ArrayType(StringType()), True),
    StructField("contributions", ArrayType(StringType()), True),
    StructField("dewey_decimal_class", ArrayType(StringType()), True),
    StructField("genres", ArrayType(StringType()), True),
    StructField("lc_classifications", ArrayType(StringType()), True),
    StructField("other_titles", ArrayType(StringType()), True),
    StructField("series", ArrayType(StringType()), True),
    StructField("source_records", ArrayType(StringType()), True),
    StructField("subjects", ArrayType(StringType()), True),
    StructField("work_titles", ArrayType(StringType()), True),
    # NOTE(review): previewed records hold {"type", "value"} objects in this array
    # while the elements are declared as strings - confirm how from_json
    # represents them before relying on this column.
    StructField("table_of_contents", ArrayType(StringType()), True),
    # NOTE(review): the three text fields below are declared as objects; verify
    # that the dump never stores them as plain strings, which would not match.
    StructField("description", _typed_value_struct(), True),
    StructField("first_sentence", _typed_value_struct(), True),
    StructField("notes", _typed_value_struct(), True),
    StructField("revision", IntegerType(), True),
    StructField("latest_revision", IntegerType(), True),
    StructField("created", _typed_value_struct(), True),
    StructField("last_modified", _typed_value_struct(), True)
])

In [32]:
# Decode the raw JSON text of each edition with the explicit editions schema,
# then attach the resulting struct next to the existing columns.
eds_json_struct = from_json(col("json_data"), editions_json_schema)
df_eds_parsed = df_eds.withColumn("json_parsed", eds_json_struct)

In [33]:
# Reference arrays ([{"key": ...}, ...]) reduced to arrays of the bare key strings.
eds_author_keys = transform(
    col("json_parsed.authors"), lambda ref: ref["key"]
).alias("author_keys")
eds_language_keys = transform(
    col("json_parsed.languages"), lambda ref: ref["key"]
).alias("language_keys")

# Parsed fields copied through unchanged under their own names.
eds_passthrough_fields = [
    "publish_country",
    "publish_date",
    "publish_places",
    "genres",
    "subjects",
]

# Flat projection of the parsed editions; column order defines the output layout.
eds_final_columns = [
    col("record_type"),
    col("key").alias("edition_key"),
    col("json_parsed.title").alias("title"),
    col("json_parsed.subtitle").alias("subtitle"),
    eds_author_keys,
    # NOTE(review): unlike author_keys and language_keys, work_keys keeps the
    # original array of {"key": ...} structs rather than plain key strings.
    col("json_parsed.works").alias("work_keys"),
    eds_language_keys,
    *[col(f"json_parsed.{name}").alias(name) for name in eds_passthrough_fields],
]

df_eds_final = df_eds_parsed.select(*eds_final_columns)

In [34]:
# Preview the first 10 flattened edition rows as a pandas DataFrame for display.
df_eds_final.limit(10).toPandas()

Unnamed: 0,record_type,edition_key,title,subtitle,author_keys,work_keys,language_keys,publish_country,publish_date,publish_places,genres,subjects
0,/type/edition,/books/OL17806216M,"Gender, class, and shelter",,,,[/languages/eng],tnu,1995.0,[Knoxville],,"[Vernacular architecture -- United States., Architecture and society -- United States.]"
1,/type/edition,/books/OL17841395M,Elizabeth Blackburn and the story of telomeres,,[/authors/OL5345253A],,,,,"[MIT, 2007]",,[History of Medicine]
2,/type/edition,/books/OL17841396M,Oregon Regional Primate Research Center facts.,,[/authors/OL4684114A],,[/languages/eng],oru,1962.0,"[Portland, Or]",,"[Oregon Regional Primate Research Center., Primates., Research -- Oregon.]"
3,/type/edition,/books/OL17841387M,Pharmacologic management of heart failure and left ventricular systolic dysfunction,"effect in female, black, and diabetic patients, and cost-effectiveness",,,[/languages/eng],mdu,2003.0,"[Rockville, Md]",,"[Heart failure -- Chemotherapy -- Evaluation., Heart failure -- Chemotherapy -- Cost effectiveness., Adrenergic beta blockers -- Therapeutic use -- Evaluation., Angiotensin converting enzyme -- Inhibitors -- Therapeutic use -- Evaluation., Diabetes -- Complications -- Chemotherapy -- Evaluation., Evidence-based medicine., Heart Failure -- drug therapy., Ventricular Dysfunction, Left -- drug therapy., Evidence-Based Medicine -- methods.]"
4,/type/edition,/books/OL17841388M,Disease-germs,,[/authors/OL2456514A],,[/languages/eng],cau,1879.0,[S.F. [i.e. San Francisco]],,"[Germ theory of disease., Bacteria., Bacteria.]"
5,/type/edition,/books/OL17841389M,Current diagnosis and treatment: emergency medicine,,[/authors/OL5345248A],,,,,"[McGraw Hill, 2008]",,[Emergency Medicine]
6,/type/edition,/books/OL17841390M,Midwifery & Childbirth in America,,[/authors/OL5345249A],,,,,,,
7,/type/edition,/books/OL17841391M,Fundamentals of musculoskeletal ultrasound,,[/authors/OL5345250A],,,,,"[Saunders, 2007]",,[Radiology]
8,/type/edition,/books/OL17841366M,Tami's test record,,[/authors/OL5345237A],,,,,[2007],,
9,/type/edition,/books/OL17841392M,"History of the Department of Surgery, 1867-2007",,[/authors/OL5345251A],,[/languages/eng],oru,2007.0,"[Portland, Or]",,"[Oregon Health & Science University. -- Dept. of Surgery., Oregon Health Sciences University. -- Dept. of Surgery., University of Oregon Health Sciences Center. -- Dept. of Surgery., University of Oregon. -- Medical School. -- Dept. of Surgery., Surgery -- history -- Oregon., Schools, Medical -- history -- Oregon.]"


In [35]:
# Destination directory for the editions Parquet dataset, located under output_dir.
eds_output_path = "{}/eds_partitioned".format(output_dir)

# Write the flattened editions as Parquet, replacing output from any earlier run.
df_eds_final.write.mode("overwrite").parquet(eds_output_path)

In [36]:
# Reload the editions from the Parquet output; from here on df_eds_final refers to
# the on-disk dataset rather than the JSON parsing pipeline.
df_eds_final = spark.read.parquet(eds_output_path)

In [37]:
# Number of partitions in the RDD backing the editions DataFrame read from Parquet.
df_eds_final.rdd.getNumPartitions()

103

In [38]:
# Shut down the Spark session; the remaining cells only inspect output files via os.
spark.stop()

In [39]:
# Print the on-disk size of every entry in the editions Parquet output directory,
# followed by the directory total. Dedicated names are used so the notebook-level
# file_path / file_size_bytes set by earlier cells are not overwritten.
eds_parquet_dir = '/home/msds2025/jvalera/bdcc2025/bdcc-lab-openlib/partitioned_data/eds_partitioned/'
total_size = 0.0
for entry_name in os.listdir(eds_parquet_dir):
    entry_size_bytes = os.path.getsize(os.path.join(eds_parquet_dir, entry_name))
    total_size += entry_size_bytes
    print(f"{entry_name}: {human_readable_size(entry_size_bytes)}")
print(f"Total Size: {human_readable_size(total_size)}")

_SUCCESS: 0.00 B
part-00000-ac7e19f9-82eb-49d5-a0eb-c87bf900c2a5-c000.snappy.parquet: 12.86 GB
.part-00000-ac7e19f9-82eb-49d5-a0eb-c87bf900c2a5-c000.snappy.parquet.crc: 102.89 MB
._SUCCESS.crc: 8.00 B
Total Size: 12.96 GB


In [40]:
# Print the on-disk size of every entry in the authors Parquet output directory,
# followed by the directory total. Dedicated names are used so the notebook-level
# file_path / file_size_bytes set by earlier cells are not overwritten.
authors_parquet_dir = '/home/msds2025/jvalera/bdcc2025/bdcc-lab-openlib/partitioned_data/authors_partitioned/'
total_size = 0.0
for entry_name in os.listdir(authors_parquet_dir):
    entry_size_bytes = os.path.getsize(os.path.join(authors_parquet_dir, entry_name))
    total_size += entry_size_bytes
    print(f"{entry_name}: {human_readable_size(entry_size_bytes)}")
print(f"Total Size: {human_readable_size(total_size)}")

_SUCCESS: 0.00 B
part-00000-1d12f884-9f4d-48b9-884d-37d03c766b68-c000.snappy.parquet: 446.62 MB
.part-00000-1d12f884-9f4d-48b9-884d-37d03c766b68-c000.snappy.parquet.crc: 3.49 MB
._SUCCESS.crc: 8.00 B
Total Size: 450.11 MB


In [41]:
# Print the on-disk size of every entry in the works Parquet output directory,
# followed by the directory total. Dedicated names are used so the notebook-level
# file_path / file_size_bytes set by earlier cells are not overwritten.
works_parquet_dir = '/home/msds2025/jvalera/bdcc2025/bdcc-lab-openlib/partitioned_data/works_partitioned/'
total_size = 0.0
for entry_name in os.listdir(works_parquet_dir):
    entry_size_bytes = os.path.getsize(os.path.join(works_parquet_dir, entry_name))
    total_size += entry_size_bytes
    print(f"{entry_name}: {human_readable_size(entry_size_bytes)}")
print(f"Total Size: {human_readable_size(total_size)}")

_SUCCESS: 0.00 B
part-00000-7a7024c9-2d18-4186-aab9-f9c08a405214-c000.snappy.parquet: 3.84 GB
.part-00000-7a7024c9-2d18-4186-aab9-f9c08a405214-c000.snappy.parquet.crc: 30.71 MB
._SUCCESS.crc: 8.00 B
Total Size: 3.87 GB
