In [None]:
import sys
sys.path.append('../')

In [None]:
import pandas as pd

# Load MOMA datasets
moma_artists = pd.read_csv('data/Artists.csv')
moma_artworks = pd.read_csv('data/Artworks.csv')

# Load PainterPalette dataset
painter_palette = pd.read_csv('data/PainterPalette.csv')

In [29]:
moma_artists.isna().sum()

ConstituentID        0
DisplayName          0
ArtistBio         2205
Nationality          0
Gender               0
BeginDate            0
EndDate              0
Wiki QID         12420
ULAN             12736
birth_year           0
death_year           0
dtype: int64

In [30]:
moma_artworks.isna().sum()

Title                      0
Artist                     0
ConstituentID              0
ArtistBio               3915
Nationality                0
BeginDate                968
EndDate                  968
Gender                     0
Date                    1077
Medium                  7781
Dimensions              5386
CreditLine              1318
AccessionNumber            0
Classification             0
Department                 0
DateAcquired            3977
Cataloged                  0
ObjectID                   0
URL                    39600
ImageURL               46817
OnView                107728
Circumference (cm)    108520
Depth (cm)             97627
Diameter (cm)         107646
Height (cm)            18158
Length (cm)           108077
Weight (kg)           108285
Width (cm)             18734
Seat Height (cm)      108526
Duration (sec.)       106793
start_year                 0
end_year                   0
dtype: int64

In [31]:
painter_palette.isna().sum()

artist                           0
Nationality                  10361
citizenship                      0
gender                           0
styles                       10361
movement                         0
Art500k_Movements             8013
birth_place                      0
death_place                      0
birth_year                     692
death_year                    2606
FirstYear                     3161
LastYear                      3161
wikiart_pictures_count        7159
locations                       93
locations_with_years            93
styles_extended               7159
StylesCount                   7786
StylesYears                   8012
occupations                  10361
PaintingsExhibitedAt          6318
PaintingsExhibitedAtCount     6318
PaintingSchool                   0
Influencedby                 10361
Influencedon                 10361
Pupils                       10361
Teachers                     10361
FriendsandCoworkers          10361
Contemporary        

In [32]:
print(len(painter_palette.index))

10361


Clean MOMA Artists dataset:

In [25]:
# Ensure ConstituentID is unique
if moma_artists['ConstituentID'].duplicated().any():
    moma_artists = moma_artists.drop_duplicates(subset=['ConstituentID'])

In [26]:
# Extract birth and death years from ArtistBio
moma_artists['birth_year'] = moma_artists['ArtistBio'].str.extract(r'(\d{4})–').astype(float)  # Extract birth year
moma_artists['death_year'] = moma_artists['ArtistBio'].str.extract(r'–(\d{4})').astype(float)  # Extract death year

# Handle missing values (e.g., if no birth or death year is found)
moma_artists['Nationality'] = moma_artists['Nationality'].fillna('Unknown')
moma_artists['Gender'] = moma_artists['Gender'].fillna('Unknown')
moma_artists['birth_year'] = moma_artists['birth_year'].fillna(0).astype(int)
moma_artists['death_year'] = moma_artists['death_year'].fillna(0).astype(int)

# Save cleaned data
moma_artists.to_csv('/tmp/cleaned_moma_artists.csv', index=False)

Clean MOMA Artworks dataset:

In [27]:
# Ensure `artwork_id` is unique
if moma_artworks['Title'].duplicated().any():
    moma_artworks = moma_artworks.drop_duplicates(subset=['Title'])

# Convert ConstituentID column to string (to handle non-string values)
moma_artworks['ConstituentID'] = moma_artworks['ConstituentID'].astype(str)

# Split the ConstituentID column into a list of IDs
moma_artworks['ConstituentID'] = moma_artworks['ConstituentID'].str.split(', ')

# Explode the ConstituentID column
moma_artworks = moma_artworks.explode('ConstituentID')

# Convert ConstituentID to integers (handle any remaining invalid values)
moma_artworks['ConstituentID'] = pd.to_numeric(moma_artworks['ConstituentID'], errors='coerce')

# Extract start and end years from the date column
moma_artworks['start_year'] = moma_artworks['Date'].str.extract(r'(\d{4})').astype(float)  # Extract start year
moma_artworks['end_year'] = moma_artworks['Date'].str.extract(r'-(\d{2})').astype(float)  # Extract end year (last 2 digits)

# Handle cases where the end year is only 2 digits (e.g., "1976-77")
# Combine start year's century with end year's last 2 digits
moma_artworks['end_year'] = (
    moma_artworks['start_year'].astype(str).str[:2] +  # Get the first 2 digits of start year (century)
    moma_artworks['end_year'].astype(str).str[-2:]     # Get the last 2 digits of end year
)

# Convert end_year to numeric, handling invalid values
moma_artworks['end_year'] = pd.to_numeric(moma_artworks['end_year'], errors='coerce')

# Handle missing values
moma_artworks['Artist'] = moma_artworks['Artist'].fillna('Unknown')
moma_artworks['ConstituentID'] = moma_artworks['ConstituentID'].fillna(0).astype(int)
moma_artworks['Nationality'] = moma_artworks['Nationality'].fillna('Unknown')
moma_artworks['Gender'] = moma_artworks['Gender'].fillna('Unknown')
moma_artworks['Title'] = moma_artworks['Title'].fillna('Unknown')
moma_artworks['start_year'] = moma_artworks['start_year'].fillna(0).astype(int)
moma_artworks['end_year'] = moma_artworks['end_year'].fillna(0).astype(int)

# Save cleaned data
moma_artworks.to_csv('/tmp/cleaned_moma_artworks.csv', index=False)

Clean PainterPalette dataset:

In [28]:
# Handle missing or inconsistent data
painter_palette['Nationality'] = painter_palette['Nationality'].fillna('Unknown')
painter_palette['citizenship'] = painter_palette['Nationality'].fillna('Unknown')
painter_palette['gender'] = painter_palette['Nationality'].fillna('Unknown')
painter_palette['styles'] = painter_palette['styles'].fillna('Unknown')
painter_palette['movement'] = painter_palette['movement'].fillna('Unknown')
painter_palette['birth_place'] = painter_palette['birth_place'].fillna('Unknown')
painter_palette['death_place'] = painter_palette['death_place'].fillna('Unknown')
painter_palette['occupations'] = painter_palette['occupations'].fillna('Unknown')
painter_palette['Influencedby'] = painter_palette['Influencedby'].fillna('Unknown')
painter_palette['Influencedon'] = painter_palette['Influencedon'].fillna('Unknown')
painter_palette['Pupils'] = painter_palette['Pupils'].fillna('Unknown')
painter_palette['Teachers'] = painter_palette['Teachers'].fillna('Unknown')
painter_palette['FriendsandCoworkers'] = painter_palette['FriendsandCoworkers'].fillna('Unknown')
painter_palette['Contemporary'] = painter_palette['Contemporary'].fillna('Unknown')
painter_palette['PaintingSchool'] = painter_palette['PaintingSchool'].fillna('Unknown')

# Split multiple values into lists
painter_palette['styles'] = painter_palette['styles'].str.split(',')
painter_palette['Nationality'] = painter_palette['Nationality'].str.split(',')
painter_palette['Influencedby'] = painter_palette['Influencedby'].str.split(',')
painter_palette['Influencedon'] = painter_palette['Influencedon'].str.split(',')
painter_palette['Pupils'] = painter_palette['Pupils'].str.split(',')
painter_palette['Teachers'] = painter_palette['Teachers'].str.split(',')
painter_palette['FriendsandCoworkers'] = painter_palette['FriendsandCoworkers'].str.split(',')
painter_palette['Contemporary'] = painter_palette['Contemporary'].str.split(',')
painter_palette['occupations'] = painter_palette['occupations'].str.split(',')

# Save cleaned data
painter_palette.to_csv('/tmp/cleaned_painter_palette.csv', index=False)

Use psycopg2 to connect to PostgreSQL DB:

In [None]:
import psycopg2

# Connect to Postgres
conn = psycopg2.connect(
    dbname="art",
    user="postgres",
    password="",
    host="localhost",
    port="5432"
)
cursor = conn.cursor()

In [None]:
# Merge datasets

# Combine MOMA Artists and PainterPalette datasets
artists_combined = pd.merge(
    moma_artists,
    painter_palette,
    left_on='name',
    right_on='artist',
    how='left'
)

# Insert values into Artists table
for _, row in artists_combined.iterrows():
    cursor.execute("""
        INSERT INTO Artists (artist_id, name, nationality, gender, birth_year, death_year, influenced_by, influenced_on, pupils, teachers, friends_and_coworkers, contemporary, art_movement, occupations, painting_school)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """, (
        row['artist_id'],
        row['name'],
        row['nationality'],
        row['gender'],
        row['birth_year'],
        row['death_year'],
        row['influenced_by'],
        row['influenced_on'],
        row['pupils'],
        row['teachers'],
        row['friends_and_coworkers'],
        row['contemporary'],
        row['art_movement'],
        row['occupations'],
        row['painting_school']
    ))

In [None]:
# Insert values into Artworks table
for _, row in moma_artworks.iterrows():
    cursor.execute("""
        INSERT INTO Artworks (artwork_id, title, artist_id, date, medium, dimensions, acquisition_date, credit, catalogue, styles, movement, movements, styles_extended, styles_count, styles_years, paintings_exhibited_at, paintings_exhibited_at_count)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """, (
        row['artwork_id'],
        row['title'],
        row['artist_id'],
        row['date'],
        row['medium'],
        row['dimensions'],
        row['acquisition_date'],
        row['credit'],
        row['catalogue'],
        row['styles'],
        row['movement'],
        row['movements'],
        row['styles_extended'],
        row['styles_count'],
        row['styles_years'],
        row['paintings_exhibited_at'],
        row['paintings_exhibited_at_count']
    ))

In [None]:
# Extract unique movements
unique_movements = painter_palette[['movement', 'movements']].drop_duplicates()

# Insert into ArtMovements table
for _, row in unique_movements.iterrows():
    cursor.execute("""
        INSERT INTO ArtMovements (movement_name, styles, movements)
        VALUES (%s, %s, %s)
    """, (
        row['movement'],
        row['styles'],
        row['movements']
    ))

In [None]:
# Extract unique countries
unique_countries = painter_palette[['nationality', 'citizenship', 'birth_place', 'death_place', 'locations', 'locations_with_years']].drop_duplicates()

# Insert into Countries table
for _, row in unique_countries.iterrows():
    cursor.execute("""
        INSERT INTO Countries (nationality, citizenship, birth_place, death_place, locations, locations_with_years)
        VALUES (%s, %s, %s, %s, %s, %s)
    """, (
        row['nationality'],
        row['citizenship'],
        row['birth_place'],
        row['death_place'],
        row['locations'],
        row['locations_with_years']
    ))

In [None]:
# Extract unique exhibitions
unique_exhibitions = painter_palette[['paintings_exhibited_at', 'paintings_exhibited_at_count']].drop_duplicates()

# Insert into Exhibitions table
for _, row in unique_exhibitions.iterrows():
    cursor.execute("""
        INSERT INTO Exhibitions (paintings_exhibited_at, paintings_exhibited_at_count)
        VALUES (%s, %s)
    """, (
        row['paintings_exhibited_at'],
        row['paintings_exhibited_at_count']
    ))

In [None]:
# Extract unique occupations
unique_occupations = painter_palette['occupations'].explode().drop_duplicates()

# Insert into Occupations table
for occupation in unique_occupations:
    cursor.execute("""
        INSERT INTO Occupations (name)
        VALUES (%s)
    """, (occupation,))

Commit and close the connection:

In [None]:
# Commit changes
conn.commit()

# Close the connection
cursor.close()
conn.close()