In [1]:
# ! conda install psycopg2 -y

In [2]:
# ! conda install tqdm -y

In [3]:
# ! conda install regex -y

In [2]:
import pandas as pd
import numpy as np
import psycopg2
from tqdm import tqdm
import regex as re
import json

### Combine Data Sources

Can skip this section and load combined file below

In [None]:
# Load datasets
moma_artists = pd.read_csv('data/Artists.csv')
moma_artworks = pd.read_csv('data/Artworks.csv')
painter_palette = pd.read_csv('data/PainterPalette.csv')
wikidata = pd.read_csv('data/Total_Merged_Painters_Data.csv')

Clean MOMA Artists dataset:

In [4]:
# Ensure ConstituentID is unique
moma_artists = moma_artists.drop_duplicates(subset=['ConstituentID'])

# Extract birth and death years from ArtistBio. Fill missing values and convert data types.
moma_artists['birth_year'] = moma_artists['ArtistBio'].str.extract(r'(\d{4})–').astype(float).fillna(0).astype(int)
moma_artists['death_year'] = moma_artists['ArtistBio'].str.extract(r'–(\d{4})').astype(float).fillna(0).astype(int)
moma_artists['Nationality'] = moma_artists['Nationality'].fillna('Unknown')
moma_artists['Gender'] = moma_artists['Gender'].fillna('Unknown')

Clean MOMA Artworks dataset:

In [5]:
# Ensure artwork is unique
moma_artworks = moma_artworks.drop_duplicates(subset=['ObjectID'], keep='first')
moma_artworks = moma_artworks.drop_duplicates(subset=['Title'])

# Convert ConstituentID column to string, split, explode, and convert to integers
moma_artworks['ConstituentID'] = moma_artworks['ConstituentID'].astype(str).str.split(', ')
moma_artworks = moma_artworks.explode('ConstituentID')
moma_artworks['ConstituentID'] = pd.to_numeric(moma_artworks['ConstituentID'], errors='coerce').fillna(0).astype(int)

# Filter out rows where ConstituentID is 0
moma_artworks = moma_artworks[moma_artworks['ConstituentID'] != 0]

# Extract start and end years from the date column
moma_artworks['start_year'] = moma_artworks['Date'].str.extract(r'(\d{4})').astype(float)  # Extract start year
moma_artworks['end_year'] = moma_artworks['Date'].str.extract(r'-(\d{2})').astype(float)  # Extract end year (last 2 digits)

# Handle cases where the end year is only 2 digits (e.g., "1976-77")
moma_artworks['start_year'] = moma_artworks['Date'].str.extract(r'(\d{4})').astype(float).fillna(0).astype(int)
moma_artworks['end_year'] = moma_artworks['Date'].str.extract(r'-(\d{2})').astype(float)
moma_artworks['end_year'] = (
    moma_artworks['start_year'].astype(str).str[:2] + 
    moma_artworks['end_year'].astype(str).str.zfill(2)
).fillna(moma_artworks['start_year'].astype(str))

moma_artworks['start_year'] = pd.to_numeric(moma_artworks['start_year'], errors='coerce').fillna(0).astype(int)
moma_artworks['end_year'] = pd.to_numeric(moma_artworks['end_year'], errors='coerce').fillna(0).astype(int)

moma_artworks['Artist'] = moma_artworks['Artist'].fillna('Unknown')

Clean PainterPalette dataset:

In [6]:
# Clean PainterPalette dataset
columns_to_clean = [
    'Nationality', 'citizenship', 'gender', 'styles', 'movement', 'birth_place', 
    'death_place', 'occupations', 'Influencedby', 'Influencedon', 
    'Pupils', 'Teachers', 'FriendsandCoworkers', 'Contemporary', 'PaintingSchool'
]

painter_palette[columns_to_clean] = painter_palette[columns_to_clean].fillna('Unknown')
for col in ['styles', 'Nationality', 'Influencedby', 'Influencedon', 'Pupils', 'Teachers'
            , 'FriendsandCoworkers', 'Contemporary', 'occupations']:
    painter_palette[col] = painter_palette[col].str.split(',')

Clean WikiData dataset:

In [7]:
# Clean WikiData dataset
columns_to_clean = [
    'Nationality', 'citizenship', 'gender', 'styles', 'movement', 'birth_place', 
    'death_place', 'occupations', 'Influencedby', 'Influencedon', 
    'Pupils', 'Teachers', 'FriendsandCoworkers', 'Contemporary', 'PaintingSchool'
]

wikidata[columns_to_clean] = wikidata[columns_to_clean].fillna('Unknown')
for col in ['styles', 'Nationality', 'Influencedby', 'Influencedon', 'Pupils', 'Teachers'
            , 'FriendsandCoworkers', 'Contemporary', 'occupations']:
    wikidata[col] = wikidata[col].str.split(',')

Merge datasets:

In [8]:
# Fill in PainterPalette with data from WikiData dataset

artists_combined = pd.merge(
    painter_palette, 
    wikidata, 
    left_on='artist', 
    right_on='artist_name', 
    how='left',
    suffixes=('', '_wiki')
)

# List of columns to fill from the wikidata dataset
columns_to_fill = ['Nationality', 'citizenship', 'gender', 'styles', 'movement', 'Art500k_Movements','birth_place', 'death_place', 'birth_year', 'death_year',
                   'locations', 'FirstYear', 'LastYear', 'wikiart_pictures_count', 'styles_extended', 'locations_with_years', 'StylesCount', 'StylesYears', 
                   'occupations', 'PaintingsExhibitedAt', 'PaintingsExhibitedAtCount', 'PaintingSchool', 'Influencedby', 'Influencedon', 'Pupils', 'Teachers', 
                   'FriendsandCoworkers', 'Contemporary', 'Type']

# Combine the columns to fill from both painter_palette and wikidata
for col in columns_to_fill:
    if col in artists_combined.columns and f'{col}_wiki' in artists_combined.columns:
        # Fill missing values in the target column (from painter_palette) with the values from the wiki column
        artists_combined[col] = artists_combined[col].combine_first(artists_combined[f'{col}_wiki'])
    else:
        print(f"Warning: Column {col} or {col}_wiki not found in the merged dataset.")

# Drop the columns that were added from wikidata dataset to avoid duplication
for col in columns_to_fill:
    wiki_col = f'{col}_wiki'
    if wiki_col in artists_combined.columns:
        artists_combined.drop(columns=[wiki_col], inplace=True)

In [9]:
# Merge with MOMA Artists dataset
artists_combined = pd.merge(
    artists_combined, 
    moma_artists, 
    left_on='artist', 
    right_on='DisplayName', 
    how='left'
)

artists_combined['Nationality'] = artists_combined['Nationality_x']
artists_combined = artists_combined.drop(columns=['Nationality_x', 'Nationality_y'])

In [10]:
# Merge with MOMA Artworks dataset
artists_combined = pd.merge(
    artists_combined, 
    moma_artworks, 
    left_on='artist', 
    right_on='Artist', 
    how='left'
)

In [11]:
# List of columns to combine that have '_x' and '_y' versions
columns_to_combine = ['birth_year', 'death_year', 'Nationality', 'Gender', 'ArtistBio', 'ConstituentID', 'BeginDate', 'EndDate']

# Combine the '_x' and '_y' columns
for col in columns_to_combine:
    col_x = col + '_x'
    col_y = col + '_y'
    
    # Ensure both columns exist in the dataframe
    if col_x in artists_combined.columns and col_y in artists_combined.columns:
        # Combine the columns (using _x values, and filling missing values from _y)
        artists_combined[col] = artists_combined[col_x].combine_first(artists_combined[col_y])
        
        # Drop the original '_x' and '_y' columns after combining
        artists_combined.drop(columns=[col_x, col_y], inplace=True)
    else:
        continue

In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

artists_combined.head()

artists_combined.to_csv('data/artists_combined.csv', index=False)

### Load Combined Data

In [3]:
artists_combined = pd.read_csv('data/artists_combined.csv')

  artists_combined = pd.read_csv('data/artists_combined.csv')


### Create Postgres Database

#### Connect To Postgres:

In [29]:
# Connect to Postgres
conn = psycopg2.connect(
    dbname="202Project",
    user="postgres",
    password="***REMOVED***",
    host="localhost",
    port="5432"
)
cursor = conn.cursor()

#### Insert Artists

In [13]:
# Iterate through artists_combined and insert into Artists table
for _, row in tqdm(artists_combined.iterrows(), total=artists_combined.shape[0]):

    # Check and clean year columns to ensure valid ranges
    birth_year = row.get('birth_year', None)
    death_year = row.get('death_year', None)
    career_start_year = row.get('FirstYear', None)
    career_end_year = row.get('LastYear', None)

    # Skip rows where year columns NaN or too large
    if pd.isna(birth_year) or birth_year > 2030:
        birth_year = None  # Skip this row

    if pd.isna(death_year) or death_year > 2030:
        death_year = None  # Skip this row

    if pd.isna(career_start_year) or career_start_year > 2030:
        career_start_year = None  # Skip this row

    if pd.isna(career_end_year):
        career_end_year = None  # Skip this row
        
    if type(career_end_year) == str:
        if "https" in career_end_year:
            career_end_year = None
        else:
            career_end_year = int(career_end_year.replace(".0", ""))
    if (career_end_year is not None) and career_end_year > 2030:
        career_end_year = None

    # Convert to int
    birth_year = int(birth_year) if birth_year is not None else None
    birth_year = None if birth_year == 0 else birth_year

    death_year = None if (death_year is not None) and death_year == 0 else death_year

    career_start_year = None if (career_start_year is not None) and career_start_year == 0 else career_start_year

    career_end_year = None if (career_end_year is not None) and career_end_year == 0 else career_end_year

    artist_name = row['artist']

    nationality = row.get('Nationality', None)
    if re.match(r'Q\d+', str(nationality)):
        nationality = None
    if nationality is not None:
        nationality = str(nationality).strip("[").strip("]").replace("'", "")
    if 'unknown' in str(nationality).lower():
        nationality = None
    citizenship = row.get('citizenship', None)
    if 'unknown' in str(citizenship).lower():
        citizenship = None
    gender = row.get('gender', None)
    if 'unknown' in str(gender).lower():
        gender = None

    # Insert data into the Artists table
    cursor.execute("""
        INSERT INTO Artists (
            artist_name, birth_year, nationality, 
            citizenship, gender, death_year, career_start_year, career_end_year
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (artist_name, birth_year_key) DO NOTHING;
    """, (
        artist_name, birth_year, nationality, 
        citizenship, gender, death_year, career_start_year, career_end_year
    ))

conn.commit()


  0%|          | 0/60713 [00:00<?, ?it/s]

100%|██████████| 60713/60713 [01:02<00:00, 972.39it/s] 


#### Inserting Occupations & Occupations_Artists

In [15]:
# Extract the "occupations" column from artists_combined DataFrame
occupations_data = artists_combined['occupations']

# Create a set to store unique occupations
unique_occupations = set()

# Process each row in the occupations column
for occupations in occupations_data:
    if isinstance(occupations, str):  
        # If it's a comma-separated string, split and strip spaces
        unique_occupations.update([occ.strip() for occ in occupations.split(', ') if occ.strip()])
    elif isinstance(occupations, list):  
        # If it's a list, clean each element
        unique_occupations.update([str(occ).strip() for occ in occupations if pd.notna(occ) and str(occ).strip()])
    elif isinstance(occupations, np.ndarray):  
        # Convert NumPy array to list and clean
        if not pd.isna(occupations).all():  
            unique_occupations.update([str(occ).strip() for occ in occupations.tolist() if pd.notna(occ) and str(occ).strip()])
    elif pd.notna(occupations):  
        # Handle any other non-null value
        unique_occupations.add(str(occupations).strip())

cleaner_occupations = set()
# more processing yay!
for occupation in tqdm(unique_occupations, desc="processing isolated occs"):
    occupation = occupation.replace("[","").replace("]","").replace("'","").replace("\"","")
    if re.match(r'Q\d+', occupation):
        continue
    cleaner_occupations.add(occupation)

# Insert unique occupations into the Occupations table
for occupation in tqdm(cleaner_occupations, desc="inserting"):
    try:
        cursor.execute("""
            INSERT INTO Occupations (occupation_name)
            VALUES (%s)
            ON CONFLICT (occupation_name) DO NOTHING;
        """, (occupation,))
    except Exception as e:
        print(f"Error inserting occupation {occupation} -> {e}")
        conn.rollback()

conn.commit()

print("Occupation Insertion Complete!")

processing isolated occs: 100%|██████████| 1753/1753 [00:00<00:00, 69607.92it/s]
inserting: 100%|██████████| 986/986 [00:00<00:00, 2582.35it/s]

Occupation Insertion Complete!





In [22]:
# Fetch existing artists from the database before processing
cursor.execute("SELECT artist_name FROM Artists;")
existing_artists = set(cursor.fetchall())  # Store as a set of tuples (artist_name, birth_year)
print("fetched existing artists")

# Iterate over each row in the DataFrame
for _, row in tqdm(artists_combined.iterrows(), total=artists_combined.shape[0], 
                   desc="iterating artists"):
    artist_name = row.get('artist', None)
    birth_year = row.get('birth_year', None)

    # Convert to int
    birth_year = int(birth_year) if (not pd.isna(birth_year)) and (birth_year is not None) else None
    birth_year = None if (pd.isna(birth_year)) or (birth_year == 0) or (birth_year > 2030) else birth_year

    # Ensure the artist exists in Artists table before inserting occupations
    if (artist_name) not in existing_artists:
        continue  # Skip this row if artist doesn't exist in Artists table

    # Extract occupations from row
    occupations = row.get('occupations', None)
    
    # Create a set to store unique occupations
    unique_occupations = set()

    # Process each row in the occupations column
    if isinstance(occupations, str):  
        # If it's a comma-separated string, split and strip spaces
        unique_occupations.update([occ.strip() for occ in occupations.split(', ') if occ.strip()])
    elif isinstance(occupations, list):  
        # If it's a list, clean each element
        unique_occupations.update([str(occ).strip() for occ in occupations if pd.notna(occ) and str(occ).strip()])
    elif isinstance(occupations, np.ndarray):  
        # Convert NumPy array to list and clean
        if not pd.isna(occupations).all():  
            unique_occupations.update([str(occ).strip() for occ in occupations.tolist() if pd.notna(occ) and str(occ).strip()])
    elif pd.notna(occupations):  
        # Handle any other non-null value
        unique_occupations.add(str(occupations).strip())

    cleaner_occupations = set()
    # more processing yay!
    for occupation in unique_occupations:
        occupation = occupation.replace("[","").replace("]","").replace("'","").replace("\"","")
        if re.match(r'Q\d+', occupation):
            continue
        cleaner_occupations.add(occupation)

    # Insert valid occupations into the Artist_Occupations table
    for occupation in cleaner_occupations:
        try:
            cursor.execute("""
                INSERT INTO Artist_Occupations (artist_name, birth_year, occupation_name)
                VALUES (%s, %s, %s)
                ON CONFLICT (artist_name, birth_year, occupation_name) DO NOTHING;
            """, (artist_name, birth_year, occupation))
        except Exception as e:
            print(f"Error inserting occupation for {artist_name}, {birth_year}: {occupation} -> {e}")
            conn.rollback()

# Commit all changes after insertions
conn.commit()

print("Artist_Occupations Insertion Complete!")


fetched existing artists


iterating artists:   0%|          | 0/60713 [00:00<?, ?it/s]

iterating artists: 100%|██████████| 60713/60713 [00:04<00:00, 12781.53it/s]

Artist_Occupations Insertion Complete!





In [21]:
pd.isna(birth_year)

True

In [20]:
type(birth_year)

float

#### Inserting Artworks, Additional Artists, Artists_Artworks

In [None]:
import re
import pandas as pd
import psycopg2
import psycopg2.extras

# 🔹 **Step 1: Fetch Existing Artists**
cursor.execute("SELECT artist_name, birth_year FROM Artists;")
existing_artists = set(cursor.fetchall())  # Store as a set of (artist_name, birth_year)

# 🔹 **Step 2: Prepare Data for Bulk Insert**
artists_to_insert = []
artworks_to_insert = []
artists_artworks_to_insert = []

for _, row in tqdm(moma_artworks.iterrows(), total=len(moma_artworks)):
    title = row.get('Title', None)
    artwork_date = row.get('Date', None)

    # 🔹 **Extract the first four-digit year from artwork_date**
    artwork_date_match = re.search(r'\d{4}', str(artwork_date)) if artwork_date else None
    artwork_date = int(artwork_date_match.group(0)) if artwork_date_match else None

    artists_info = {}

    artist_name = row.get('Artist', None)
    if (artist_name is not None) and ("," in artist_name):
        artists_list = artist_name.split(", ")
        for i in range(len(artists_list)):
            artists_info[i] = {}
            artists_info[i]['artist_name'] = artists_list[i]
    else:
        artists_info[0] = {}
        artists_info[0]['artist_name'] = artist_name

    birth_year = row.get('BeginDate', None)
    if (birth_year is not None) and (" " in birth_year):
        birth_year_list = birth_year.replace("(", "").replace(")", "").split(" ")
        for i in range(len(birth_year_list)):
            birth_year_match = re.search(r'\d{4}', str(birth_year_list[i])) if birth_year_list[i] else None
            birth_year = int(birth_year_match.group(0)) if birth_year_match else None
            artists_info[i]['birth_year'] = birth_year
    elif (birth_year is not None):
        birth_year_match = re.search(r'\d{4}', str(birth_year))
        birth_year = int(birth_year_match.group(0)) if birth_year_match else None
        artists_info[0]['birth_year'] = birth_year


    medium = row.get('Medium', None)
    department = row.get('Department', None)
    date_acquired = row.get('DateAcquired', None)
    art_classification = row.get('Classification', None)
    credit_line = row.get('CreditLine', None)

    nationality = row.get('Nationality', None)
    if "unknown" in str(nationality).lower():
        nationality = None
    elif (nationality is not None) and (") (" in nationality):
        nationality_list = nationality.strip("(").strip(")").split(") (")
        for i in range(len(nationality_list)):
            artists_info[i]['nationality'] = nationality_list[i]
    elif (nationality is not None):
        artists_info[0]['nationality'] = nationality.replace("(", "").replace(")", "")

    gender = row.get('Gender', None)
    if (gender is not None) and (") (" in gender):
        gender_list = gender.strip("(").strip(")").split(") (")
        for i in range(len(gender_list)):
            artists_info[i]['gender'] = gender_list[i]
    elif (gender is not None):
        artists_info[0]['gender'] = gender.replace("(", "").replace(")", "")

    death_year = row.get('EndDate', None)
    if (death_year is not None) and (" " in death_year):
        death_year_list = death_year.replace("(", "").replace(")", "").split(" ")
        for i in range(len(death_year_list)):
            death_year_match = re.search(r'\d{4}', str(death_year_list[i])) if death_year_list[i] else None
            death_year = int(death_year_match.group(0)) if death_year_match else None
            artists_info[i]['death_year'] = death_year
    elif (death_year is not None):
        death_year_match = re.search(r'\d{4}', str(death_year)) if death_year else None
        death_year = int(death_year_match.group(0)) if death_year_match else None
        artists_info[0]['death_year'] = death_year


    # 🔹 **Check if the artist exists, otherwise add to insert list**
    for i in range(len(artists_info)):
        artist_name = artists_info[i].get('artist_name', None)
        birth_year = artists_info[i].get('birth_year', None)

        # Skip if artist_name or birth_year is missing
        if (artist_name is None) or (birth_year is None):
            continue

        # Skip if artist_name and birth_year already exist in the database
        if (artist_name, birth_year) not in existing_artists:

            artists_to_insert.append((artist_name,
                                      birth_year,
                                      artists_info[i].get('nationality', None),
                                      artists_info[i].get('gender', None), 
                                      artists_info[i].get('death_year', None)))
            existing_artists.add((artist_name, birth_year))  # Add to cache to prevent duplicate inserts

            artists_artworks_to_insert.append((artist_name, birth_year, title, artwork_date))

    # 🔹 **Prepare artwork insert data**
    artworks_to_insert.append((title, artwork_date, medium, department, date_acquired, art_classification, credit_line))

pd.DataFrame(artists_to_insert, columns=['artist_name', 'birth_year', 'nationality', 'gender', 'death_year']).to_csv("clean_and_import/artists_to_insert.csv", index=False)
pd.DataFrame(artworks_to_insert, columns=['title', 'artwork_date', 'medium', 'department', 'date_acquired', 'art_classification', 'credit_line']).to_csv("clean_and_import/artworks_to_insert.csv", index=False)
pd.DataFrame(artists_artworks_to_insert, columns=['artist_name', 'birth_year', 'title', 'artwork_date']).to_csv("clean_and_import/artists_artworks_to_insert.csv", index=False)

# # 🔹 **Step 3: Batch Insert Artists**
# if artists_to_insert:
#     query = """
#         INSERT INTO Artists (artist_name, birth_year, nationality, gender, death_year)
#         VALUES %s
#         ON CONFLICT (artist_name, birth_year) DO NOTHING;
#     """
#     psycopg2.extras.execute_values(cursor, query, artists_to_insert)
#     print(f"Inserted {len(artists_to_insert)} new artists.")

# # 🔹 **Step 4: Batch Insert Artworks**
# if artworks_to_insert:
#     query = """
#         INSERT INTO Artworks (title, artwork_date, artist_name, birth_year, medium, department, date_acquired, art_classification, credit_line)
#         VALUES %s
#         ON CONFLICT (title, artwork_date) DO NOTHING;
#     """
#     psycopg2.extras.execute_values(cursor, query, artworks_to_insert)
#     print(f"Inserted {len(artworks_to_insert)} artworks.")

# # 🔹 **Step 5: Commit Changes**


# print(" Data Insertion Complete!")


100%|██████████| 118807/118807 [00:12<00:00, 9874.07it/s] 


##### Important Note

Use files "artists_to_insert.csv", "artworks_to_insert.csv", and "artists_artworks_to_insert.csv" to manually import data into tables using DataGrip.

The complexity of psycopg2.extras.execute_values and regulary query execution calls are too high in python to run in a reasonable amount of time.

#### Insert Artist Movement

In [56]:
# First, extract unique movements and insert them into the Movements table (assuming it exists)
movements_data = artists_combined['movement']

# Create a set to store unique movements
unique_movements = set()

# Process each row in the movements column with tqdm progress bar
print("Extracting unique movements...")
for movements in tqdm(movements_data, desc="Processing movements"):
    if isinstance(movements, str):  
        # If it's a comma-separated string, split and strip spaces
        unique_movements.update([mov.strip() for mov in movements.split(', ') if mov.strip()])
    elif isinstance(movements, list):  
        # If it's a list, clean each element
        unique_movements.update([str(mov).strip() for mov in movements if pd.notna(mov) and str(mov).strip()])
    elif isinstance(movements, np.ndarray):  
        # Convert NumPy array to list and clean
        if not pd.isna(movements).all():  
            unique_movements.update([str(mov).strip() for mov in movements.tolist() if pd.notna(mov) and str(mov).strip()])
    elif pd.notna(movements):  
        # Handle any other non-null value
        unique_movements.add(str(movements).strip())

# Insert unique movements into the Movements table with tqdm progress bar
print("Inserting unique movements into the Movements table...")
for movement in tqdm(unique_movements, desc="Inserting movements"):
    try:
        cursor.execute("""
            INSERT INTO Movements (movement_name)
            VALUES (%s)
            ON CONFLICT (movement_name) DO NOTHING;
        """, (movement,))
    except Exception as e:
        print(f"Error inserting movement {movement} -> {e}")
        conn.rollback()

# Now populate the Artist_Movements table
artist_movements = []

# Process the DataFrame to extract artist-movement relationships with years_active
print("Processing artist-movement relationships...")
for index, row in tqdm(artists_combined.iterrows(), total=len(artists_combined), desc="Processing artists"):
    artist_name = row['artist_name']
    birth_year = row['birth_year']

    if pd.isna(birth_year) or birth_year > 2030:
        continue  # Skip this row
    # Convert to int
    birth_year = int(birth_year)
    birth_year = None if birth_year == 0 else birth_year
    if birth_year is None:
        continue
    
    
    # Get movements for this artist
    movements = []
    
    if isinstance(row['movement'], str):
        movements = [mov.strip() for mov in row['movement'].split(', ') if mov.strip()]
    elif isinstance(row['movement'], list):
        movements = [str(mov).strip() for mov in row['movement'] if pd.notna(mov) and str(mov).strip()]
    elif isinstance(row['movement'], np.ndarray):
        movements = [str(mov).strip() for mov in row['movement'].tolist() if pd.notna(mov) and str(mov).strip()]
    elif pd.notna(row['movement']):
        movements = [str(row['movement']).strip()]

    # Add to our list of relationships to insert
    for movement in movements:
        if movement is not None:
            artist_movements.append({
                'artist_name': artist_name,
                'birth_year': birth_year,
                'movement_name': movement
            })

# Insert the artist-movement relationships using executemany with psycopg2
print("Inserting artist-movement relationships...")
try:
    cursor.executemany("""
        INSERT INTO Artist_Movements (artist_name, birth_year, movement_name)
        VALUES (%(artist_name)s, %(birth_year)s, %(movement_name)s)
        ON CONFLICT (artist_name, birth_year, movement_name) DO NOTHING;
    """, artist_movements)
    conn.commit()
    print(f"Successfully inserted {len(artist_movements)} artist-movement relationships")
except Exception as e:
    conn.rollback()
    print(f"Error batch inserting artist movements: {e}")

print("Artist Movements Insertion Complete!")

Extracting unique movements...


Processing movements: 100%|██████████| 60713/60713 [00:00<00:00, 584311.46it/s]


Inserting unique movements into the Movements table...


Inserting movements: 100%|██████████| 125/125 [00:00<00:00, 5440.20it/s]


Processing artist-movement relationships...


Processing artists: 100%|██████████| 60713/60713 [00:05<00:00, 11329.28it/s]


Inserting artist-movement relationships...
Successfully inserted 59705 artist-movement relationships
Artist Movements Insertion Complete!


#### Styles and Artist_Styles

In [57]:
# Extract unique styles, flatten lists if necessary
unique_styles = set()

for styles in artists_combined['styles'].dropna():
    if isinstance(styles, list):  # If stored as a list
        unique_styles.update(style.strip() for style in styles)  # Remove extra spaces
    else:
        unique_styles.add(styles.strip())  # If single value, clean and add directly

# Insert unique styles into the Styles table
for style in unique_styles:
    cursor.execute("""
        INSERT INTO Styles (style_name)
        VALUES (%s)
        ON CONFLICT (style_name) DO NOTHING;
    """, (style,))

# Commit the transaction
conn.commit()

In [58]:
# Iterate through each row in the DataFrame
for _, row in artists_combined.iterrows():
    artist_name = row['artist']
    birth_year = row['birth_year']

    # Skip if birth_year is NaN, too large, or artist_name is missing
    if pd.isna(artist_name) or pd.isna(birth_year) or birth_year > 2030:
        continue

    birth_year = int(birth_year)  # Convert birth_year to int
    
    # Process styles
    styles = row['styles']
    if isinstance(styles, str):  
        styles = [s.strip() for s in styles.split(',')]
    elif styles is None or (isinstance(styles, (list, pd.Series, pd.DataFrame)) and pd.isnull(styles).all()):  
        continue  # Skip if styles are missing or empty

    # Process StylesCount dictionary
    styles_count = {}
    if isinstance(row['StylesCount'], str):
        for entry in row['StylesCount'].strip('{}').split(', '):
            try:
                style, count = entry.rsplit(':', 1)
                styles_count[style.strip()] = int(count)
            except ValueError:
                continue  

    # Process StylesYears dictionary
    styles_years = {}
    if isinstance(row['StylesYears'], str):
        for entry in row['StylesYears'].split(','):
            try:
                style, years = entry.split(':', 1)
                styles_years[style.strip()] = years.strip()
            except ValueError:
                continue  

    # **Ensure artist exists in the Artists table**
    cursor.execute("""
        INSERT INTO Artists (artist_name, birth_year)
        VALUES (%s, %s)
        ON CONFLICT (artist_name, birth_year) DO NOTHING;
    """, (artist_name, birth_year))

    # **Ensure styles exist in the Styles table**
    for style in styles:
        cursor.execute("""
            INSERT INTO Styles (style_name)
            VALUES (%s)
            ON CONFLICT (style_name) DO NOTHING;
        """, (style,))

    # Insert each style associated with the artist
    for style in styles:
        style_count = styles_count.get(style, None)
        style_years = styles_years.get(style, None)

        cursor.execute("""
            INSERT INTO Artist_Styles (artist_name, birth_year, style_name, style_count, style_years)
            VALUES (%s, %s, %s, %s, %s)
            ON CONFLICT (artist_name, birth_year, style_name) DO NOTHING;
        """, (artist_name, birth_year, style, style_count, style_years))

# Commit changes to the database
conn.commit()

#### Teacher Relationships

In [5]:
artists_combined.columns

Index(['artist', 'citizenship', 'gender', 'styles', 'movement',
       'Art500k_Movements', 'birth_place', 'death_place', 'FirstYear',
       'LastYear', 'wikiart_pictures_count', 'locations',
       'locations_with_years', 'styles_extended', 'StylesCount', 'StylesYears',
       'occupations', 'PaintingsExhibitedAt', 'PaintingsExhibitedAtCount',
       'PaintingSchool', 'Influencedby', 'Influencedon', 'Pupils', 'Teachers',
       'FriendsandCoworkers', 'Contemporary', 'Type', 'artist_name',
       'Wikidata QID', 'DisplayName', 'Wiki QID', 'ULAN', 'Title', 'Artist',
       'Date', 'Medium', 'Dimensions', 'CreditLine', 'AccessionNumber',
       'Classification', 'Department', 'DateAcquired', 'Cataloged', 'ObjectID',
       'URL', 'ImageURL', 'OnView', 'Circumference (cm)', 'Depth (cm)',
       'Diameter (cm)', 'Height (cm)', 'Length (cm)', 'Weight (kg)',
       'Width (cm)', 'Seat Height (cm)', 'Duration (sec.)', 'start_year',
       'end_year', 'birth_year', 'death_year', 'Nationality'

In [11]:
artists_combined[artists_combined['Pupils'] != "['Unknown']"]['Pupils']

10                 ['The cycle of 5 paintings Deluge', '']
40       ['Artists2/Alexandre Jacques Chantron/Danae 18...
43                                    ['Marc Chagall', '']
61          ['Théodore Géricault', 'Eugene Delacroix', '']
66       ['Tamara de Lempicka', 'Georg Pauli', 'Dorrit ...
                               ...                        
60232                            ['Harry Phelan Gibb', '']
60235                          ['John Singer Sargent', '']
60240    ['Paul Émile Chabas', 'Alexandre-Jacques Chant...
60261                                ['Kanō Motonobu', '']
60262                   ['Donatello', 'Paolo Uccello', '']
Name: Pupils, Length: 1744, dtype: object

In [8]:
artists_combined[artists_combined['Teachers'] != "['Unknown']"]['Teachers']

11       ['Jules Joseph Lefebvre', 'Gustave Boulanger',...
32                               ['Aleksandra Ekster', '']
40                      ['William-Adolphe Bouguereau', '']
44            ['Anatol Petrytsky', 'Sergiy Grigoriev', '']
45                                      ['Ilya Repin', '']
                               ...                        
60224                           ['Volodymyr Orlovsky', '']
60240    ['Artists2/William Adolphe Bouguereau/Mother A...
60247                          ['Jacques-Louis David', '']
60257                        ['Mariotto Albertinelli', '']
60495                                ['Fernand Leger', '']
Name: Teachers, Length: 4015, dtype: object

In [7]:
artists_combined[artists_combined['Influencedon'] != "['Unknown']"]['Influencedon']

62                              ['Private Collection', '']
156                                ['Gustave Courbet', '']
169      ['Museo del Prado', ' Madrid', ' Spain', 'Muse...
249      ['Louvre', ' Paris', ' France', 'Royal Collect...
252                           ['Jean Lecomte du Nouÿ', '']
                               ...                        
60243    ['Edward Mitchell Bannister', 'Andres de Santa...
60247                             ['Eugene Delacroix', '']
60253                                ['132 x 96.1 cm', '']
60257    ['Louvre', ' Paris', ' France', 'National Gall...
60262                               ['Lorenzo Monaco', '']
Name: Influencedon, Length: 10865, dtype: object

In [6]:
artists_combined[artists_combined['Influencedby'] != "['Unknown']"]['Influencedby']

10       ['allegories-and-symbols', 'boats-and-ships', '']
31       ['National Museum of Ancient Art (MNAA)', ' Li...
34       ['Johannes Vermeer', 'Diego Velazquez', 'Claud...
40                      ['William-Adolphe Bouguereau', '']
44                                   ['Byzantine Art', '']
                               ...                        
60547                                 ['55 x 38.2 cm', '']
60663                     ['Rene Magritte', 'Balthus', '']
60664    ['El Greco', 'Francisco Goya', 'Rafael Zabalet...
60680    ['battles-and-wars', 'Shinpūren-Rebellion', '1...
60683    ['actors-and-performances', 'male-portraits', ...
Name: Influencedby, Length: 11264, dtype: object

In [9]:
artists_combined[artists_combined['FriendsandCoworkers'] != "['Unknown']"]['FriendsandCoworkers']

10       ['Artists2/Mikalojus Ciurlionis/Wrath Ii 1904....
11                                   ['Childe Hassam', '']
32       ['Victor Palmov', 'David Burliuk', 'Oleksandr ...
34       ['Edmund Charles Tarbell', 'Robert Lewis Reid'...
45       ['Artists2/Anna Ostroumova Lebedeva/Self Portr...
                               ...                        
60421       ['Yves Klein', 'Heinz Mack', 'Otto Piene', '']
60491                     ['Heinz Mack', 'Otto Piene', '']
60664             ['Saturnino Herran', 'Diego Rivera', '']
60680    ['Artists2/Tsukioka Yoshitoshi/Shinp Ren Rebel...
60683    ['Artists2/Utagawa Kunisada/Not_Detected_24036...
Name: FriendsandCoworkers, Length: 9374, dtype: object

In [11]:
from ast import literal_eval
import csv

In [17]:
artists_combined.columns

Index(['artist', 'citizenship', 'gender', 'styles', 'movement',
       'Art500k_Movements', 'birth_place', 'death_place', 'FirstYear',
       'LastYear', 'wikiart_pictures_count', 'locations',
       'locations_with_years', 'styles_extended', 'StylesCount', 'StylesYears',
       'occupations', 'PaintingsExhibitedAt', 'PaintingsExhibitedAtCount',
       'PaintingSchool', 'Influencedby', 'Influencedon', 'Pupils', 'Teachers',
       'FriendsandCoworkers', 'Contemporary', 'Type', 'artist_name',
       'Wikidata QID', 'DisplayName', 'Wiki QID', 'ULAN', 'Title', 'Artist',
       'Date', 'Medium', 'Dimensions', 'CreditLine', 'AccessionNumber',
       'Classification', 'Department', 'DateAcquired', 'Cataloged', 'ObjectID',
       'URL', 'ImageURL', 'OnView', 'Circumference (cm)', 'Depth (cm)',
       'Diameter (cm)', 'Height (cm)', 'Length (cm)', 'Weight (kg)',
       'Width (cm)', 'Seat Height (cm)', 'Duration (sec.)', 'start_year',
       'end_year', 'birth_year', 'death_year', 'Nationality'

In [30]:
cursor.execute("SELECT artist_name FROM Artists;")
existing_artists = set(cursor.fetchall())

# Function to clean artist names
def clean_artist_name(name):
    # Remove path prefixes
    name = re.sub(r'^Artists2/[^/]+/', '', name)
    # Remove file extensions and suffixes
    name = re.sub(r'/[^/]+\s\d{4}.*$', '', name)
    name = re.sub(r'/Not_Detected_\d+.*$', '', name)
    return name.strip()

# Parse relationship string into list of artist names
def parse_relationship(relationship_str):
    if pd.isna(relationship_str) or relationship_str == "['Unknown']":
        return []
    
    try:
        relationship_list = literal_eval(relationship_str)
        artists = []
        for item in relationship_list:
            if item and not any([
                item.startswith(('Louvre', 'Museo', 'National', 'Royal', 'Private')),
                re.match(r'^\d+\s*x\s*\d+', item),
                item.endswith(('cm', 'France', 'Spain')),
                re.match(r'^[a-z\-]+$', item)  # Lowercase with hyphens (tags)
            ]):
                artists.append(clean_artist_name(item))
        return artists
    except:
        return []

# Process each row to extract relationships
relationships = []

for index, row in tqdm(artists_combined.iterrows(), total=artists_combined.shape[0]):
    artist_name = row.get('artist', None)
    
    birth_year = row.get('birth_year', None)
    if pd.isna(birth_year) or birth_year > 2030:
        birth_year = None
    birth_year = int(birth_year) if birth_year is not None else None
    birth_year = None if birth_year == 0 else birth_year
    
    # Skip if this artist doesn't exist in the database
    if (artist_name,) not in existing_artists:
        continue
    
    # Process pupils
    pupils = parse_relationship(row.get('Pupils'))
    for pupil in pupils:
        if (pupil,) in existing_artists:
            relationships.append({
                'artist1_name': artist_name,
                'birth_year1': birth_year,
                'artist2_name': pupil,
                'relationship_type': 'Teacher'
            })
    
    # Process teachers
    teachers = parse_relationship(row.get('Teachers'))
    for teacher in teachers:
        if (teacher,) in existing_artists:
            relationships.append({
                'artist1_name': artist_name,
                'birth_year1': birth_year,
                'artist2_name': teacher,
                'relationship_type': 'Pupil'
            })
    
    # Process influences
    influences = parse_relationship(row.get('Influencedby'))
    for influence in influences:
        if (influence,) in existing_artists:
            relationships.append({
                'artist1_name': artist_name,
                'birth_year1': birth_year,
                'artist2_name': influence,
                'relationship_type': 'Influenced By'
            })
    
    influenced = parse_relationship(row.get('Influencedon'))
    for influence in influenced:
        if (influence,) in existing_artists:
            relationships.append({
                'artist1_name': artist_name,
                'birth_year1': birth_year,
                'artist2_name': influence,
                'relationship_type': 'Influenced On'
            })
    
    # Process friends
    friends = parse_relationship(row.get('FriendsandCoworkers'))
    for friend in friends:
        if (friend,) in existing_artists:
            relationships.append({
                'artist1_name': artist_name,
                'birth_year1': birth_year,
                'artist2_name': friend,
                'relationship_type': 'Friend'
            })

# Write relationships to CSV file
with open('clean_and_import_resources/artist_relationships.csv', 
          'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['artist1_name', 'birth_year1', 'artist2_name', 'relationship_type'])
    
    for rel in relationships:
        writer.writerow([
            rel['artist1_name'],
            rel['birth_year1'],
            rel['artist2_name'],
            rel['relationship_type']
        ])

print(f"Processed {len(relationships)} relationships between existing artists")

100%|██████████| 60713/60713 [00:27<00:00, 2227.71it/s]


Processed 127007 relationships between existing artists


#### Schools and Artist_Schools

In [None]:
from tqdm import tqdm
import json
import pandas as pd
import numpy as np

# First, create and populate the PaintingSchools table
painting_schools_data = artists_combined['PaintingSchool']

# Create a set to store unique painting schools
unique_schools = set()

# Process each row in the PaintingSchool column with tqdm progress bar
print("Extracting unique painting schools...")
for school in tqdm(painting_schools_data, desc="Processing schools"):
    if isinstance(school, str):  
        # If it's a comma-separated string, split and strip spaces
        unique_schools.update([sch.strip() for sch in school.split(', ') if sch.strip()])
    elif isinstance(school, list):  
        # If it's a list, clean each element
        unique_schools.update([str(sch).strip() for sch in school if pd.notna(sch) and str(sch).strip()])
    elif isinstance(school, np.ndarray):  
        # Convert NumPy array to list and clean
        if not pd.isna(school).all():  
            unique_schools.update([str(sch).strip() for sch in school.tolist() if pd.notna(sch) and str(sch).strip()])
    elif pd.notna(school):  
        # Handle any other non-null value
        unique_schools.add(str(school).strip())

# Insert unique painting schools into the PaintingSchools table with tqdm progress bar
print("Inserting unique painting schools into the Schools table...")
for school in tqdm(unique_schools, desc="Inserting schools"):
    try:
        cursor.execute("""
            INSERT INTO Schools (school_name)
            VALUES (%s)
            ON CONFLICT (school_name) DO NOTHING;
        """, (school,))
    except Exception as e:
        print(f"Error inserting painting school {school} -> {e}")
        conn.rollback()

# Now populate the Artist_PaintingSchools table
artist_schools = []

# Process the DataFrame to extract artist-school relationships
print("Processing artist-painting school relationships...")
for index, row in tqdm(artists_combined.iterrows(), total=len(artists_combined), desc="Processing artists"):
    artist_name = row['artist_name']
    birth_year = row['birth_year']
    
    # Get painting schools for this artist
    schools = []
    time_periods = {}
    
    # Check if we have a school_details column with structured data including time periods
    if 'school_details' in row and pd.notna(row['school_details']):
        # Assuming school_details is a dict or can be parsed as JSON with format:
        # {school_name: {"start_year": year, "end_year": year}, ...}
        if isinstance(row['school_details'], dict):
            school_details = row['school_details']
        elif isinstance(row['school_details'], str):
            try:
                school_details = json.loads(row['school_details'])
            except:
                school_details = {}
        
        for school_name, period in school_details.items():
            if school_name.strip():
                schools.append(school_name.strip())
                time_periods[school_name.strip()] = period
    
    # If no structured data, extract schools from the PaintingSchool column
    if not schools and 'PaintingSchool' in row and pd.notna(row['PaintingSchool']):
        if isinstance(row['PaintingSchool'], str):
            schools = [sch.strip() for sch in row['PaintingSchool'].split(', ') if sch.strip()]
        elif isinstance(row['PaintingSchool'], list):
            schools = [str(sch).strip() for sch in row['PaintingSchool'] if pd.notna(sch) and str(sch).strip()]
        elif isinstance(row['PaintingSchool'], np.ndarray):
            schools = [str(sch).strip() for sch in row['PaintingSchool'].tolist() if pd.notna(sch) and str(sch).strip()]
        elif pd.notna(row['PaintingSchool']):
            schools = [str(row['PaintingSchool']).strip()]
    
    # Add to our list of relationships to insert
    for school in schools:
        if school:
            # Get time period for this school if available, or set to empty dict
            period_json = time_periods.get(school, {})
            
            artist_schools.append({
                'artist_name': artist_name,
                'birth_year': birth_year,
                'school_name': school,
                'time_period': json.dumps(period_json) if period_json else None
            })

# Insert the artist-painting school relationships using executemany with psycopg2
print("Inserting artist-painting school relationships...")
try:
    cursor.executemany("""
        INSERT INTO Artist_Schools (artist_name, birth_year, school_name, time_period)
        VALUES (%(artist_name)s, %(birth_year)s, %(school_name)s, %(time_period)s::jsonb)
        ON CONFLICT (artist_name, birth_year, school_name) DO UPDATE 
        SET time_period = EXCLUDED.time_period;
    """, artist_schools)
    conn.commit()
    print(f"Successfully inserted {len(artist_schools)} artist-painting school relationships")
except Exception as e:
    conn.rollback()
    print(f"Error batch inserting artist painting schools: {e}")

print("Artist Painting Schools Insertion Complete!")

Commit and close the connection:

In [None]:
print(artists_combined.columns.tolist())

['artist', 'citizenship', 'gender', 'styles', 'movement', 'Art500k_Movements', 'birth_place', 'death_place', 'FirstYear', 'LastYear', 'wikiart_pictures_count', 'locations', 'locations_with_years', 'styles_extended', 'StylesCount', 'StylesYears', 'occupations', 'PaintingsExhibitedAt', 'PaintingsExhibitedAtCount', 'PaintingSchool', 'Influencedby', 'Influencedon', 'Pupils', 'Teachers', 'FriendsandCoworkers', 'Contemporary', 'Type', 'artist_name', 'Wikidata QID', 'DisplayName', 'Wiki QID', 'ULAN', 'Title', 'Artist', 'Date', 'Medium', 'Dimensions', 'CreditLine', 'AccessionNumber', 'Classification', 'Department', 'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ImageURL', 'OnView', 'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)', 'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)', 'Duration (sec.)', 'start_year', 'end_year', 'birth_year', 'death_year', 'Nationality', 'Gender', 'ArtistBio', 'ConstituentID', 'BeginDate', 'EndDate']


In [31]:
# Commit changes
conn.commit()

# Close the connection
cursor.close()
conn.close()