In [2]:
import polars as pl

from py2neo import Graph

graph = Graph("bolt://localhost:7687", auth=("neo4j", "password"))

In [13]:
df = pl.from_dicts(graph.run("""
    match (artwork: Artwork) -- (artist: Artist)
    optional match (artist) -- (movement: Movement)
    optional match (artwork) -- (medium: Medium)
    return artwork.id as id, artwork.name as artwork, artist.name as artist, movement.name as movement, medium.name as medium, artwork.image_url as url
""").data()) \
    .with_columns([
        pl.col("url").str.extract(r"([^/]*)$").alias("filename"),
        pl.col("url").str.extract(r"^(?:[^/]*/){4}([^/]*)").alias("catalog")
    ]) \
    .join(pl.read_parquet("../data/Artwork.parquet.gzip", columns=["id", "url"]).rename({"url": "source_url"}), on="id") \
    .drop(["id", "url"]) \
    # .to_pandas()

print(df.shape)
df.head()

(11851, 7)


artwork,artist,movement,medium,filename,catalog,source_url
str,str,str,str,str,str,str
"""Peach Trees in Blossom""","""Vincent Van Gogh""","""Post-Impressionism""",,"""Peach+Trees+in+Blossom.j...","""Vincent+Van+Gogh""","""http://wikigallery.org/w..."
"""Backyards Of Old Houses ...","""Vincent Van Gogh""","""Post-Impressionism""",,"""Backyards+Of+Old+Houses+...","""Vincent+Van+Gogh""","""http://wikigallery.org/w..."
"""Bank Of The Oise At Auve...","""Vincent Van Gogh""","""Post-Impressionism""",,"""Bank+Of+The+Oise+At+Auve...","""Vincent+Van+Gogh""","""http://wikigallery.org/w..."
"""Autumn Landscape At Dusk...","""Vincent Van Gogh""","""Post-Impressionism""",,"""Autumn+Landscape+At+Dusk...","""Vincent+Van+Gogh""","""http://wikigallery.org/w..."
"""Blossoming Almond Tree""","""Vincent Van Gogh""","""Post-Impressionism""","""Oil on canvas""","""Blossoming+Almond+Tree.j...","""Vincent+Van+Gogh""","""http://wikigallery.org/w..."


In [14]:
graph.run("match (g: Generated) return g.url as url").to_data_frame().shape[0] + \
    graph.run("match (a: Artwork) return a.image_url as url").to_data_frame().shape[0]

13964

In [28]:
print("Total number of artwork entries with non-missing movement: ", df.shape[0] - df.select(["artwork", "movement"]).null_count().get_column("movement")[0])
df_count = df.select(["artwork", "movement"]).filter(pl.col("movement").is_not_null()).groupby("artwork").count()
print("Total number of unique artworks with non-missing movement: ", df_count.shape[0])
print("Artworks with only a single movement matching: ", df_count.filter(pl.col("count") == 1).shape[0])
print("Artworks with up to two movements matching: ", df_count.filter(pl.col("count") <= 2).shape[0])

Total number of artwork entries with non-missing movement:  5398
Total number of unique artworks with non-missing movement:  3979
Artworks with only a single movement matching:  2872
Artworks with up to two movements matching:  3712


In [48]:
df_movements_by_artwork = df.groupby("filename").count().sort("count", reverse=True)
df_movements_by_artwork.filter(pl.col("count") > 2).shape[0] / df_movements_by_artwork.shape[0]

0.04395604395604396

In [10]:
df = pl.from_dicts(graph.run("""
    match (s: Specialization) -- (a: Artist)
    match (a) -[r: DIED_IN]-> (deathplace: Place)
    optional match (a) -- (m: Movement)
    optional match (a) -- (ac: Academy)
    return a.name as artist, deathplace.name as deathplace, r.cause as cause_of_death, m.name as movement, s.name as spec, ac.name as education
    order by artist
""").data(), schema={"artist": str, "deathplace": str, "cause_of_death": str, "movement": str, "spec": str, "education": str})

df.head()

artist,deathplace,cause_of_death,movement,spec,education
str,str,str,str,str,str
"""(Giovanni Anto...","""Venice""",,,"""Landscapes""",
"""(Giovanni Anto...","""Venice""",,,"""Etching""",
"""(after) (Giova...","""Venice""",,,"""Landscapes""",
"""(after) (Giova...","""Venice""",,,"""Etching""",
"""(after) Alexis...","""Paris""",,,"""Painting""",
"""(after) Andrea...","""Florence""",,"""Renaissance""","""Painting""",
"""(after) Anthon...","""Antwerp""",,"""Renaissance""","""Painting""",
"""(after) Carlo ...","""Rome""",,"""Baroque""","""Painting""",
"""(after) Dyck, ...","""London""",,"""Baroque""","""Painting""",
"""(after) Franco...","""Paris""",,"""Baroque""","""Painting""",


In [11]:
pl.Config.set_fmt_str_lengths(25)
df.get_column("cause_of_death").value_counts().sort("counts")

cause_of_death,counts
str,u32
"""Suicide (allegedly)""",2
"""Gunshot wound""",8
,480


In [22]:
df.filter(pl.col("name").str.contains(r"Rubens")).get_column("name").to_list()

['Peter Paul Rubens', 'and Snyders, F. Rubens, Peter Paul']

In [25]:
df.filter(pl.col("name").str.contains(r"after")).get_column("name").to_list()

['(after) Hieronymus Bosch',
 '(after) Pietro Antonio Rotari',
 '(after) William Etty',
 '(after) Philippe De Champaigne',
 '(after) Harmenszoon Van Rijn Rembrandt',
 '(after) Dyck, Sir Anthony van',
 '(after) (Giovanni Antonio Canal) Canaletto',
 '(after) Millais, Sir John Everett',
 '(after) Carel Fabritius',
 '(after) Franz Von Defregger',
 '(after) Giovanni Paolo Panini',
 '(after) Pugin, Augustus Charles',
 '(after) Francois Boucher',
 '(after) Sir Edward Coley Burne-Jones',
 '(after) Alexis-Simon Belle',
 '(after) Willem Van Mieris Leiden',
 '(after) Ambrosius Benson',
 '(after) Federico Zuccaro',
 '(after) Sebastian Vrancx',
 '(after) Antonis Mor',
 '(after) Sir Joshua Reynolds',
 '(after) Carlo Maratta Or Maratti',
 '(after) Abraham Janssens Van Nuyssen',
 '(after) Luca Cambiaso',
 '(after) Tiziano Vecellio (Titian)',
 '(after) Kneller, Sir Godfrey',
 '(after) Louis De Caullery',
 '(after) Huysmans, Jacob',
 '(after) Hoefnagel, Joris',
 '(after) Landseer, Sir Edwin',
 '(after) 