In [None]:
import os

import numpy as np
import pandas as pd
import psycopg2

# Read the four source CSV files (paths are relative to the working directory)
# into their DataFrames in one pass.
moma_artists, moma_artworks, painter_palette, wikidata = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Artists.csv',
        'Artworks.csv',
        'PainterPalette.csv',
        'Total_Merged_Painters_Data.csv',
    )
)

Clean MOMA Artists dataset:

In [None]:
# Keep a single row per ConstituentID (the first occurrence wins).
moma_artists = moma_artists.drop_duplicates(subset=['ConstituentID'])

# Pull birth/death years out of ArtistBio: the four digits right before / right
# after an en dash. Bios without such a match end up with the sentinel year 0.
artist_bio = moma_artists['ArtistBio']
for year_col, year_regex in (('birth_year', r'(\d{4})–'), ('death_year', r'–(\d{4})')):
    moma_artists[year_col] = artist_bio.str.extract(year_regex).astype(float).fillna(0).astype(int)

# Missing categorical values are labelled explicitly.
for label_col in ('Nationality', 'Gender'):
    moma_artists[label_col] = moma_artists[label_col].fillna('Unknown')

Clean MOMA Artworks dataset:

In [None]:
# Ensure each artwork appears once. Only ObjectID is used as the key: titles are
# not unique across works (e.g. many pieces can share a title such as "Untitled"),
# so de-duplicating on Title would silently discard distinct artworks.
moma_artworks = moma_artworks.drop_duplicates(subset=['ObjectID'], keep='first')

# ConstituentID may hold several comma-separated ids; produce one row per
# (artwork, constituent) pair and convert the ids to integers.
moma_artworks['ConstituentID'] = moma_artworks['ConstituentID'].astype(str).str.split(', ')
moma_artworks = moma_artworks.explode('ConstituentID')
moma_artworks['ConstituentID'] = pd.to_numeric(moma_artworks['ConstituentID'], errors='coerce').fillna(0).astype(int)

# Drop rows whose id was missing or non-numeric (coerced to 0 above).
# .copy() so the column assignments below act on an independent frame.
moma_artworks = moma_artworks[moma_artworks['ConstituentID'] != 0].copy()

# Parse the Date column once:
#   group 0 -> first four-digit year (start year)
#   group 1 -> optional end year after a hyphen or en dash, written with either
#              four digits ("1929-1930") or two digits ("1976-77", "1905–06")
year_parts = moma_artworks['Date'].str.extract(r'(\d{4})(?:\s*[-–]\s*(\d{4}|\d{2})(?!\d))?')
start_year = pd.to_numeric(year_parts[0], errors='coerce').to_numpy(dtype=float, na_value=np.nan)
end_year = pd.to_numeric(year_parts[1], errors='coerce').to_numpy(dtype=float, na_value=np.nan)

# A two-digit end year borrows the century of the start year (1905 + "06" -> 1906).
start_century = np.floor(start_year / 100) * 100
end_year = np.where(end_year < 100, start_century + end_year, end_year)

# No range given (NaN), or an implausible end before the start: fall back to the
# start year. Comparisons with NaN are False, so missing ends take this branch.
end_year = np.where(end_year >= start_year, end_year, start_year)

# Dates without any four-digit year keep the sentinel 0 in both columns.
moma_artworks['start_year'] = np.nan_to_num(start_year, nan=0.0).astype(int)
moma_artworks['end_year'] = np.nan_to_num(end_year, nan=0.0).astype(int)

moma_artworks['Artist'] = moma_artworks['Artist'].fillna('Unknown')

Clean PainterPalette dataset:

In [None]:
# Clean PainterPalette dataset
columns_to_clean = [
    'Nationality', 'citizenship', 'gender', 'styles', 'movement', 'birth_place', 
    'death_place', 'occupations', 'Influencedby', 'Influencedon', 
    'Pupils', 'Teachers', 'FriendsandCoworkers', 'Contemporary', 'PaintingSchool'
]

# Label missing values explicitly so the split below always sees a string.
painter_palette[columns_to_clean] = painter_palette[columns_to_clean].fillna('Unknown')

# Turn comma-separated fields into lists. Items are stripped so that
# "painter, sculptor" yields ['painter', 'sculptor'] rather than
# ['painter', ' sculptor'] (which would create near-duplicate values downstream).
for col in ['styles', 'Nationality', 'Influencedby', 'Influencedon', 'Pupils', 'Teachers'
            , 'FriendsandCoworkers', 'Contemporary', 'occupations']:
    painter_palette[col] = painter_palette[col].str.split(',').map(
        # Non-string cells come back from .str.split as NaN; leave them untouched.
        lambda parts: [part.strip() for part in parts] if isinstance(parts, list) else parts
    )

Clean WikiData dataset:

In [None]:
# Clean WikiData dataset
columns_to_clean = [
    'Nationality', 'citizenship', 'gender', 'styles', 'movement', 'birth_place', 
    'death_place', 'occupations', 'Influencedby', 'Influencedon', 
    'Pupils', 'Teachers', 'FriendsandCoworkers', 'Contemporary', 'PaintingSchool'
]

# Label missing values explicitly so the split below always sees a string.
wikidata[columns_to_clean] = wikidata[columns_to_clean].fillna('Unknown')

# Turn comma-separated fields into lists. Items are stripped so that
# "painter, sculptor" yields ['painter', 'sculptor'] rather than
# ['painter', ' sculptor'] (which would create near-duplicate values downstream).
for col in ['styles', 'Nationality', 'Influencedby', 'Influencedon', 'Pupils', 'Teachers'
            , 'FriendsandCoworkers', 'Contemporary', 'occupations']:
    wikidata[col] = wikidata[col].str.split(',').map(
        # Non-string cells come back from .str.split as NaN; leave them untouched.
        lambda parts: [part.strip() for part in parts] if isinstance(parts, list) else parts
    )

Merge datasets:

In [None]:
# Fill in PainterPalette with data from WikiData dataset

# Left join keeps every PainterPalette row; overlapping WikiData columns get
# the '_wiki' suffix so they can be used as a fallback source below.
artists_combined = pd.merge(
    painter_palette, 
    wikidata, 
    left_on='artist', 
    right_on='artist_name', 
    how='left',
    suffixes=('', '_wiki')
)

# List of columns to fill from the wikidata dataset
columns_to_fill = ['Nationality', 'citizenship', 'gender', 'styles', 'movement', 'Art500k_Movements','birth_place', 'death_place', 'birth_year', 'death_year',
                   'locations', 'FirstYear', 'LastYear', 'wikiart_pictures_count', 'styles_extended', 'locations_with_years', 'StylesCount', 'StylesYears', 
                   'occupations', 'PaintingsExhibitedAt', 'PaintingsExhibitedAtCount', 'PaintingSchool', 'Influencedby', 'Influencedon', 'Pupils', 'Teachers', 
                   'FriendsandCoworkers', 'Contemporary', 'Type']


def _is_placeholder(value):
    """Return True when a cell carries no real information.

    Covers NaN/None, the literal 'Unknown' written by the cleaning cells, and
    lists made up solely of such entries (e.g. ['Unknown'] or an empty list).
    """
    if isinstance(value, list):
        return all(_is_placeholder(item) for item in value)
    if not pd.api.types.is_scalar(value):
        return False
    return bool(pd.isna(value)) or value == 'Unknown'


# Combine the columns to fill from both painter_palette and wikidata
for col in columns_to_fill:
    wiki_col = f'{col}_wiki'
    if col in artists_combined.columns and wiki_col in artists_combined.columns:
        wiki_values = artists_combined[wiki_col]

        # Step 1: fill genuinely missing (NaN) values from the wiki column.
        filled = artists_combined[col].combine_first(wiki_values)

        # Step 2: the cleaning cells replaced NaN with 'Unknown', so a plain
        # combine_first never fires for those columns. Also take the wiki value
        # when ours is only a placeholder and the wiki one is informative.
        own_is_placeholder = filled.map(_is_placeholder).astype(bool)
        wiki_is_usable = ~wiki_values.map(_is_placeholder).astype(bool)
        artists_combined[col] = filled.mask(own_is_placeholder & wiki_is_usable, wiki_values)
    else:
        print(f"Warning: Column {col} or {col}_wiki not found in the merged dataset.")

# Drop the columns that were added from wikidata dataset to avoid duplication
wiki_columns_present = [
    f'{col}_wiki' for col in columns_to_fill if f'{col}_wiki' in artists_combined.columns
]
artists_combined = artists_combined.drop(columns=wiki_columns_present)

In [None]:
# Attach MoMA artist records to the combined painters table. The join is a
# left join on the painter name ('artist') against MoMA's 'DisplayName', so
# painters without a MoMA match are kept.
artists_combined = artists_combined.merge(
    moma_artists,
    how='left',
    left_on='artist',
    right_on='DisplayName',
)

# Both inputs carry a 'Nationality' column, which the merge suffixes with
# _x (left) and _y (right). Keep the left-hand value under the plain name and
# discard both suffixed copies.
artists_combined = (
    artists_combined
    .assign(Nationality=artists_combined['Nationality_x'])
    .drop(columns=['Nationality_x', 'Nationality_y'])
)

In [None]:
# Attach MoMA artworks to each painter: left join of the painter name
# ('artist') against the artwork's 'Artist' column. A painter with several
# matching artworks yields one output row per artwork.
artists_combined = artists_combined.merge(
    moma_artworks,
    how='left',
    left_on='artist',
    right_on='Artist',
)

In [None]:
# Base names that may have been split into '<name>_x' / '<name>_y' by the merges.
columns_to_combine = ['birth_year', 'death_year', 'Nationality', 'Gender', 'ArtistBio', 'ConstituentID', 'BeginDate', 'EndDate']

for base_name in columns_to_combine:
    left_name, right_name = f'{base_name}_x', f'{base_name}_y'

    # Nothing to do unless both suffixed variants are present.
    if not {left_name, right_name}.issubset(artists_combined.columns):
        continue

    # Prefer the left-hand (_x) value; fall back to _y where _x is missing.
    coalesced = artists_combined[left_name].combine_first(artists_combined[right_name])
    artists_combined[base_name] = coalesced

    # The suffixed originals are no longer needed once coalesced.
    artists_combined = artists_combined.drop(columns=[left_name, right_name])

In [None]:
# Show every column (and row) when a frame is rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Persist the merged table for reuse outside the notebook.
artists_combined.to_csv('artists_combined.csv', index=False)

# Preview the result. This must be the last expression of the cell: a bare
# expression in the middle of a cell is evaluated but never displayed.
artists_combined.head()

Connect and insert into Postgres:

In [None]:
# Connect to Postgres.
# Connection settings are read from the standard libpq environment variables
# so credentials do not have to live in the notebook; the fallbacks reproduce
# the previous hard-coded local-development values.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "art"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", ""),
    host=os.environ.get("PGHOST", "localhost"),
    port=os.environ.get("PGPORT", "5432"),
)
cursor = conn.cursor()

In [None]:
# Insert one row per artist into the Artists table.

# Bounds of a 4-byte PostgreSQL INTEGER; values outside cannot be stored.
PG_INT_MIN, PG_INT_MAX = -2147483648, 2147483647


def _as_pg_int(value):
    """Return value as a Python int, or None if it is missing or out of INTEGER range."""
    if pd.isna(value) or value > PG_INT_MAX or value < PG_INT_MIN:
        return None
    return int(value)


def _null_if_missing(value):
    """Map scalar NaN/None to None so it is stored as SQL NULL rather than a float NaN."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value


# After the artwork merge an artist is repeated once per artwork. Rows that
# agree on the name and all four year fields either all get skipped below or
# collapse into a single insert via ON CONFLICT (first row wins), so only the
# first of each group needs to be sent to the database.
artist_key_columns = [
    name for name in ('artist', 'birth_year', 'death_year', 'FirstYear', 'LastYear')
    if name in artists_combined.columns
]
artist_rows = artists_combined.drop_duplicates(subset=artist_key_columns)

for _, row in artist_rows.iterrows():

    # Validate and convert the year columns.
    birth_year = _as_pg_int(row.get('birth_year', None))
    death_year = _as_pg_int(row.get('death_year', None))
    career_start_year = _as_pg_int(row.get('FirstYear', None))
    career_end_year = _as_pg_int(row.get('LastYear', None))

    # Skip rows where any year is missing or does not fit an INTEGER column.
    if any(year is None for year in (birth_year, death_year, career_start_year, career_end_year)):
        continue

    artist_name = row['artist']
    birth_place = _null_if_missing(row.get('birth_place', None))
    death_place = _null_if_missing(row.get('death_place', None))
    nationality = _null_if_missing(row.get('Nationality', None))
    citizenship = _null_if_missing(row.get('citizenship', None))
    gender = _null_if_missing(row.get('gender', None))

    # Insert data into the Artists table; an existing (artist_name, birth_year)
    # pair is left untouched.
    cursor.execute("""
        INSERT INTO Artists (
            artist_name, birth_year, birth_place, death_place, nationality, 
            citizenship, gender, death_year, career_start_year, career_end_year
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (artist_name, birth_year) DO NOTHING;
    """, (
        artist_name, birth_year, birth_place, death_place, nationality, 
        citizenship, gender, death_year, career_start_year, career_end_year
    ))


In [None]:
# Collect every distinct occupation found in the 'occupations' column.
# A cell may hold a list of occupations or a single value; null cells are ignored.
unique_occupations = set()

for cell in artists_combined['occupations'].dropna():
    # Wrap a lone value so both shapes go through the same update call.
    unique_occupations.update(cell if isinstance(cell, list) else [cell])

# Store each occupation once; names already present in the table are left alone.
for occupation_name in unique_occupations:
    cursor.execute("""
        INSERT INTO Occupations (occupation_name)
        VALUES (%s)
        ON CONFLICT (occupation_name) DO NOTHING;
    """, (occupation_name,))

In [None]:
# Link artists to their occupations in the Artist_Occupations table.

# Load the (artist_name, birth_year) pairs currently in the Artists table so
# occupations are only attached to artists that were actually inserted.
# Rows inserted earlier on this connection are visible here even before commit.
cursor.execute("SELECT artist_name, birth_year FROM Artists;")
existing_artists = set(cursor.fetchall())

# Iterate over each row in the DataFrame
for _, row in artists_combined.iterrows():
    artist_name = row.get('artist', None)
    birth_year = row.get('birth_year', None)

    # Skip if birth_year is NaN, too large, or artist_name is missing
    if pd.isna(artist_name) or pd.isna(birth_year) or birth_year > 2147483647:
        continue

    birth_year = int(birth_year)  # Convert birth_year to int

    # Ensure the artist exists in Artists table before inserting occupations
    if (artist_name, birth_year) not in existing_artists:
        continue  # Skip this row if artist doesn't exist in Artists table

    # Extract occupations from row
    occupations = row.get('occupations', None)
    unique_occupations = set()

    # Dispatch on the container type first: calling pd.notna() on a list returns
    # an array, and using that array in an `if` raises ValueError as soon as the
    # list has more than one element.
    if isinstance(occupations, str):  # A single comma-separated string
        unique_occupations.update(occupations.split(', '))
    elif isinstance(occupations, list):  # Already a list of occupations
        unique_occupations.update(occupations)
    elif isinstance(occupations, np.ndarray):  # Numpy array of occupations
        unique_occupations.update(occupations.tolist())
    elif pd.notna(occupations):  # Any other non-missing scalar: store its string form
        unique_occupations.add(str(occupations))

    # Insert valid occupations into the Artist_Occupations table
    for occupation in unique_occupations:
        cursor.execute("""
            INSERT INTO Artist_Occupations (artist_name, birth_year, occupation_name)
            VALUES (%s, %s, %s)
            ON CONFLICT (artist_name, birth_year, occupation_name) DO NOTHING;
        """, (artist_name, birth_year, occupation))

Commit and close the connection:

In [None]:
# Print the column names of the merged table as a plain Python list.
print(list(artists_combined.columns))

In [None]:
# Make all inserts from the cells above permanent.
conn.commit()

# Release the database resources: the cursor first, then its connection.
for db_resource in (cursor, conn):
    db_resource.close()