In [2]:
# ! conda install psycopg2 -y

Channels:
 - defaults
 - conda-forge
Platform: osx-64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/anaconda3/envs/dsc202project

  added / updated specs:
    - psycopg2


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    libpq-17.4                 |       h184c1cd_0         2.8 MB
    psycopg2-2.9.9             |  py310h46256e1_1         159 KB
    ------------------------------------------------------------
                                           Total:         3.0 MB

The following NEW packages will be INSTALLED:

  cyrus-sasl         pkgs/main/osx-64::cyrus-sasl-2.1.28-h3973f50_1 
  krb5               pkgs/main/osx-64::krb5-1.20.1-h428f121_1 
  libedit            pkgs/main/osx-64::libedit-3.1.20230828-h6c40b1e_0 
  libpq              pkgs/main/osx-64::libpq-17.4-h184c1cd_0 
  openldap           pkgs/main

In [25]:
# ! conda install tqdm -y

Channels:
 - defaults
 - conda-forge
Platform: osx-64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/anaconda3/envs/dsc202project

  added / updated specs:
    - tqdm


The following NEW packages will be INSTALLED:

  tqdm               pkgs/main/osx-64::tqdm-4.67.1-py310h20db666_0 



Downloading and Extracting Packages:

Preparing transaction: done
Verifying transaction: done
Executing transaction: done


In [29]:
from tqdm import tqdm

In [None]:
import pandas as pd
import numpy as np
import psycopg2

# Load datasets
moma_artists = pd.read_csv('Artists.csv')
moma_artworks = pd.read_csv('Artworks.csv')
painter_palette = pd.read_csv('PainterPalette.csv')
wikidata = pd.read_csv('Total_Merged_Painters_Data.csv')

Clean MOMA Artists dataset:

In [4]:
# Ensure ConstituentID is unique
moma_artists = moma_artists.drop_duplicates(subset=['ConstituentID'])

# Extract birth and death years from ArtistBio. Fill missing values and convert data types.
moma_artists['birth_year'] = moma_artists['ArtistBio'].str.extract(r'(\d{4})–').astype(float).fillna(0).astype(int)
moma_artists['death_year'] = moma_artists['ArtistBio'].str.extract(r'–(\d{4})').astype(float).fillna(0).astype(int)
moma_artists['Nationality'] = moma_artists['Nationality'].fillna('Unknown')
moma_artists['Gender'] = moma_artists['Gender'].fillna('Unknown')

Clean MOMA Artworks dataset:

In [5]:
# Ensure artwork is unique
moma_artworks = moma_artworks.drop_duplicates(subset=['ObjectID'], keep='first')
moma_artworks = moma_artworks.drop_duplicates(subset=['Title'])

# Convert ConstituentID column to string, split, explode, and convert to integers
moma_artworks['ConstituentID'] = moma_artworks['ConstituentID'].astype(str).str.split(', ')
moma_artworks = moma_artworks.explode('ConstituentID')
moma_artworks['ConstituentID'] = pd.to_numeric(moma_artworks['ConstituentID'], errors='coerce').fillna(0).astype(int)

# Filter out rows where ConstituentID is 0
moma_artworks = moma_artworks[moma_artworks['ConstituentID'] != 0]

# Extract start and end years from the date column
moma_artworks['start_year'] = moma_artworks['Date'].str.extract(r'(\d{4})').astype(float)  # Extract start year
moma_artworks['end_year'] = moma_artworks['Date'].str.extract(r'-(\d{2})').astype(float)  # Extract end year (last 2 digits)

# Handle cases where the end year is only 2 digits (e.g., "1976-77")
moma_artworks['start_year'] = moma_artworks['Date'].str.extract(r'(\d{4})').astype(float).fillna(0).astype(int)
moma_artworks['end_year'] = moma_artworks['Date'].str.extract(r'-(\d{2})').astype(float)
moma_artworks['end_year'] = (
    moma_artworks['start_year'].astype(str).str[:2] + 
    moma_artworks['end_year'].astype(str).str.zfill(2)
).fillna(moma_artworks['start_year'].astype(str))

moma_artworks['start_year'] = pd.to_numeric(moma_artworks['start_year'], errors='coerce').fillna(0).astype(int)
moma_artworks['end_year'] = pd.to_numeric(moma_artworks['end_year'], errors='coerce').fillna(0).astype(int)

moma_artworks['Artist'] = moma_artworks['Artist'].fillna('Unknown')

Clean PainterPalette dataset:

In [6]:
# Clean PainterPalette dataset
columns_to_clean = [
    'Nationality', 'citizenship', 'gender', 'styles', 'movement', 'birth_place', 
    'death_place', 'occupations', 'Influencedby', 'Influencedon', 
    'Pupils', 'Teachers', 'FriendsandCoworkers', 'Contemporary', 'PaintingSchool'
]

painter_palette[columns_to_clean] = painter_palette[columns_to_clean].fillna('Unknown')
for col in ['styles', 'Nationality', 'Influencedby', 'Influencedon', 'Pupils', 'Teachers'
            , 'FriendsandCoworkers', 'Contemporary', 'occupations']:
    painter_palette[col] = painter_palette[col].str.split(',')

Clean WikiData dataset:

In [7]:
# Clean WikiData dataset
columns_to_clean = [
    'Nationality', 'citizenship', 'gender', 'styles', 'movement', 'birth_place', 
    'death_place', 'occupations', 'Influencedby', 'Influencedon', 
    'Pupils', 'Teachers', 'FriendsandCoworkers', 'Contemporary', 'PaintingSchool'
]

wikidata[columns_to_clean] = wikidata[columns_to_clean].fillna('Unknown')
for col in ['styles', 'Nationality', 'Influencedby', 'Influencedon', 'Pupils', 'Teachers'
            , 'FriendsandCoworkers', 'Contemporary', 'occupations']:
    wikidata[col] = wikidata[col].str.split(',')

Merge datasets:

In [8]:
# Fill in PainterPalette with data from WikiData dataset

artists_combined = pd.merge(
    painter_palette, 
    wikidata, 
    left_on='artist', 
    right_on='artist_name', 
    how='left',
    suffixes=('', '_wiki')
)

# List of columns to fill from the wikidata dataset
columns_to_fill = ['Nationality', 'citizenship', 'gender', 'styles', 'movement', 'Art500k_Movements','birth_place', 'death_place', 'birth_year', 'death_year',
                   'locations', 'FirstYear', 'LastYear', 'wikiart_pictures_count', 'styles_extended', 'locations_with_years', 'StylesCount', 'StylesYears', 
                   'occupations', 'PaintingsExhibitedAt', 'PaintingsExhibitedAtCount', 'PaintingSchool', 'Influencedby', 'Influencedon', 'Pupils', 'Teachers', 
                   'FriendsandCoworkers', 'Contemporary', 'Type']

# Combine the columns to fill from both painter_palette and wikidata
for col in columns_to_fill:
    if col in artists_combined.columns and f'{col}_wiki' in artists_combined.columns:
        # Fill missing values in the target column (from painter_palette) with the values from the wiki column
        artists_combined[col] = artists_combined[col].combine_first(artists_combined[f'{col}_wiki'])
    else:
        print(f"Warning: Column {col} or {col}_wiki not found in the merged dataset.")

# Drop the columns that were added from wikidata dataset to avoid duplication
for col in columns_to_fill:
    wiki_col = f'{col}_wiki'
    if wiki_col in artists_combined.columns:
        artists_combined.drop(columns=[wiki_col], inplace=True)

In [9]:
# Merge with MOMA Artists dataset
artists_combined = pd.merge(
    artists_combined, 
    moma_artists, 
    left_on='artist', 
    right_on='DisplayName', 
    how='left'
)

artists_combined['Nationality'] = artists_combined['Nationality_x']
artists_combined = artists_combined.drop(columns=['Nationality_x', 'Nationality_y'])

In [10]:
# Merge with MOMA Artworks dataset
artists_combined = pd.merge(
    artists_combined, 
    moma_artworks, 
    left_on='artist', 
    right_on='Artist', 
    how='left'
)

In [11]:
# List of columns to combine that have '_x' and '_y' versions
columns_to_combine = ['birth_year', 'death_year', 'Nationality', 'Gender', 'ArtistBio', 'ConstituentID', 'BeginDate', 'EndDate']

# Combine the '_x' and '_y' columns
for col in columns_to_combine:
    col_x = col + '_x'
    col_y = col + '_y'
    
    # Ensure both columns exist in the dataframe
    if col_x in artists_combined.columns and col_y in artists_combined.columns:
        # Combine the columns (using _x values, and filling missing values from _y)
        artists_combined[col] = artists_combined[col_x].combine_first(artists_combined[col_y])
        
        # Drop the original '_x' and '_y' columns after combining
        artists_combined.drop(columns=[col_x, col_y], inplace=True)
    else:
        continue

In [12]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

artists_combined.head()

artists_combined.to_csv('artists_combined.csv', index=False)

### Connect To Postgres:

In [39]:
# Connect to Postgres
conn = psycopg2.connect(
    dbname="202Project",
    user="postgres",
    password="***REMOVED***",
    host="localhost",
    port="5432"
)
cursor = conn.cursor()

### Insert Artists

In [14]:
import pandas as pd

# Iterate through artists_combined and insert into Artists table
for _, row in tqdm(artists_combined.iterrows(), total=artists_combined.shape[0]):

    # Check and clean year columns to ensure valid ranges
    birth_year = row.get('birth_year', None)
    death_year = row.get('death_year', None)
    career_start_year = row.get('FirstYear', None)
    career_end_year = row.get('LastYear', None)

    # Skip rows where year columns NaN or too large
    if pd.isna(birth_year) or birth_year > 2147483647:
        continue  # Skip this row

    if pd.isna(death_year) or death_year > 2147483647:
        continue  # Skip this row

    if pd.isna(career_start_year) or career_start_year > 2147483647:
        continue  # Skip this row

    if pd.isna(career_end_year) or career_end_year > 2147483647:
        continue  # Skip this row

    # Convert to int
    birth_year = int(birth_year)
    death_year = int(death_year)
    career_start_year = int(career_start_year)
    career_end_year = int(career_end_year)

    artist_name = row['artist']
    birth_place = row.get('birth_place', None)  # Use None if no value exists
    death_place = row.get('death_place', None)
    nationality = row.get('Nationality', None)
    citizenship = row.get('citizenship', None)
    gender = row.get('gender', None)

    # Insert data into the Artists table
    cursor.execute("""
        INSERT INTO Artists (
            artist_name, birth_year, birth_place, death_place, nationality, 
            citizenship, gender, death_year, career_start_year, career_end_year
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (artist_name, birth_year) DO NOTHING;
    """, (
        artist_name, birth_year, birth_place, death_place, nationality, 
        citizenship, gender, death_year, career_start_year, career_end_year
    ))


### Inserting Occupations & Occupations_Artists

In [None]:
# # Extract unique occupations, flatten lists
# unique_occupations = set()

# for occupations in artists_combined['occupations'].dropna():
#     if isinstance(occupations, list):  # If stored as a list
#         unique_occupations.update(occupations)
#     else:
#         unique_occupations.add(occupations)  # If single value

# # Insert unique occupations into the Occupations table
# for occupation in unique_occupations:
#     cursor.execute("""
#         INSERT INTO Occupations (occupation_name)
#         VALUES (%s)
#         ON CONFLICT (occupation_name) DO NOTHING;
#     """, (occupation,))

In [17]:
# Extract the "occupations" column from artists_combined DataFrame
occupations_data = artists_combined['occupations']

# Create a set to store unique occupations
unique_occupations = set()

# Process each row in the occupations column
for occupations in occupations_data:
    if isinstance(occupations, str):  
        # If it's a comma-separated string, split and strip spaces
        unique_occupations.update([occ.strip() for occ in occupations.split(', ') if occ.strip()])
    elif isinstance(occupations, list):  
        # If it's a list, clean each element
        unique_occupations.update([str(occ).strip() for occ in occupations if pd.notna(occ) and str(occ).strip()])
    elif isinstance(occupations, np.ndarray):  
        # Convert NumPy array to list and clean
        if not pd.isna(occupations).all():  
            unique_occupations.update([str(occ).strip() for occ in occupations.tolist() if pd.notna(occ) and str(occ).strip()])
    elif pd.notna(occupations):  
        # Handle any other non-null value
        unique_occupations.add(str(occupations).strip())

# Insert unique occupations into the Occupations table
for occupation in unique_occupations:
    try:
        cursor.execute("""
            INSERT INTO Occupations (occupation_name)
            VALUES (%s)
            ON CONFLICT (occupation_name) DO NOTHING;
        """, (occupation,))
    except Exception as e:
        print(f"Error inserting occupation {occupation} -> {e}")
        conn.rollback()


print("Occupation Insertion Complete!")

Error inserting occupation athletic director -> current transaction is aborted, commands ignored until end of transaction block

Occupation Insertion Complete!


In [18]:
import pandas as pd
import numpy as np

# Fetch existing artists from the database before processing
cursor.execute("SELECT artist_name, birth_year FROM Artists;")
existing_artists = set(cursor.fetchall())  # Store as a set of tuples (artist_name, birth_year)

# Iterate over each row in the DataFrame
for _, row in artists_combined.iterrows():
    artist_name = row.get('artist', None)
    birth_year = row.get('birth_year', None)

    # Skip if birth_year is NaN, too large, or artist_name is missing
    if pd.isna(artist_name) or pd.isna(birth_year) or birth_year > 2147483647:
        continue

    birth_year = int(birth_year)  # Convert birth_year to int

    # Ensure the artist exists in Artists table before inserting occupations
    if (artist_name, birth_year) not in existing_artists:
        continue  # Skip this row if artist doesn't exist in Artists table

    # Extract occupations from row
    occupations = row.get('occupations', None)
    unique_occupations = set()

    # Ensure occupations exist and are valid
    if isinstance(occupations, str):  
        unique_occupations.update([occ.strip() for occ in occupations.split(', ') if occ.strip()])
    elif isinstance(occupations, list):  
        unique_occupations.update([str(occ).strip() for occ in occupations if pd.notna(occ) and str(occ).strip()])
    elif isinstance(occupations, np.ndarray):  
        # Ensure array is valid and not NaN
        if not pd.isna(occupations).all():  # Prevent ambiguous truth value
            unique_occupations.update([str(occ).strip() for occ in occupations.tolist() if pd.notna(occ) and str(occ).strip()])
    elif pd.notna(occupations):  
        unique_occupations.add(str(occupations).strip())

    # Insert valid occupations into the Artist_Occupations table
    for occupation in unique_occupations:
        try:
            cursor.execute("""
                INSERT INTO Artist_Occupations (artist_name, birth_year, occupation_name)
                VALUES (%s, %s, %s)
                ON CONFLICT (artist_name, birth_year, occupation_name) DO NOTHING;
            """, (artist_name, birth_year, occupation))
        except Exception as e:
            print(f"Error inserting occupation for {artist_name}, {birth_year}: {occupation} -> {e}")
            conn.rollback()

# Commit all changes after insertions
conn.commit()

print("Occupation Insertion Complete!")


Occupation Insertion Complete!


### Inserting Artworks, Additional Artists, Artists_Artworks

In [None]:
# ! conda install regex -y

Channels:
 - defaults
 - conda-forge
Platform: osx-64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/anaconda3/envs/dsc202project

  added / updated specs:
    - regex


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    regex-2024.11.6            |  py310h46256e1_0         345 KB
    ------------------------------------------------------------
                                           Total:         345 KB

The following NEW packages will be INSTALLED:

  regex              pkgs/main/osx-64::regex-2024.11.6-py310h46256e1_0 



Downloading and Extracting Packages:
                                                                                
Preparing transaction: done
Verifying transaction: done
Executing transaction: done


In [22]:
import regex as re

In [27]:
x = 'blah blah 1998'
match = re.search(r'\d{4}', x)
if match is not None:
    match = match.group(0)
display(match)

'1998'

In [57]:
#! pip install tqdm

In [36]:
nationality = '(American) (American) (French)'
" ".join(set(str(nationality).replace("(", "").replace(")", "").split(" ")))

'American French'

In [48]:
import re
import pandas as pd
import psycopg2
import psycopg2.extras

# 🔹 **Step 1: Fetch Existing Artists**
cursor.execute("SELECT artist_name, birth_year FROM Artists;")
existing_artists = set(cursor.fetchall())  # Store as a set of (artist_name, birth_year)

# 🔹 **Step 2: Prepare Data for Bulk Insert**
artists_to_insert = []
artworks_to_insert = []
artists_artworks_to_insert = []

for _, row in tqdm(moma_artworks.iterrows(), total=len(moma_artworks)):
    title = row.get('Title', None)
    artwork_date = row.get('Date', None)

    # 🔹 **Extract the first four-digit year from artwork_date**
    artwork_date_match = re.search(r'\d{4}', str(artwork_date)) if artwork_date else None
    artwork_date = int(artwork_date_match.group(0)) if artwork_date_match else None

    artists_info = {}

    artist_name = row.get('Artist', None)
    if (artist_name is not None) and ("," in artist_name):
        artists_list = artist_name.split(", ")
        for i in range(len(artists_list)):
            artists_info[i] = {}
            artists_info[i]['artist_name'] = artists_list[i]
    else:
        artists_info[0] = {}
        artists_info[0]['artist_name'] = artist_name

    birth_year = row.get('BeginDate', None)
    if (birth_year is not None) and (" " in birth_year):
        birth_year_list = birth_year.replace("(", "").replace(")", "").split(" ")
        for i in range(len(birth_year_list)):
            birth_year_match = re.search(r'\d{4}', str(birth_year_list[i])) if birth_year_list[i] else None
            birth_year = int(birth_year_match.group(0)) if birth_year_match else None
            artists_info[i]['birth_year'] = birth_year
    elif (birth_year is not None):
        birth_year_match = re.search(r'\d{4}', str(birth_year))
        birth_year = int(birth_year_match.group(0)) if birth_year_match else None
        artists_info[0]['birth_year'] = birth_year


    medium = row.get('Medium', None)
    department = row.get('Department', None)
    date_acquired = row.get('DateAcquired', None)
    art_classification = row.get('Classification', None)
    credit_line = row.get('CreditLine', None)

    nationality = row.get('Nationality', None)
    if "unknown" in str(nationality).lower():
        nationality = None
    elif (nationality is not None) and (") (" in nationality):
        nationality_list = nationality.strip("(").strip(")").split(") (")
        for i in range(len(nationality_list)):
            artists_info[i]['nationality'] = nationality_list[i]
    elif (nationality is not None):
        artists_info[0]['nationality'] = nationality.replace("(", "").replace(")", "")

    gender = row.get('Gender', None)
    if (gender is not None) and (") (" in gender):
        gender_list = gender.strip("(").strip(")").split(") (")
        for i in range(len(gender_list)):
            artists_info[i]['gender'] = gender_list[i]
    elif (gender is not None):
        artists_info[0]['gender'] = gender.replace("(", "").replace(")", "")

    death_year = row.get('EndDate', None)
    if (death_year is not None) and (" " in death_year):
        death_year_list = death_year.replace("(", "").replace(")", "").split(" ")
        for i in range(len(death_year_list)):
            death_year_match = re.search(r'\d{4}', str(death_year_list[i])) if death_year_list[i] else None
            death_year = int(death_year_match.group(0)) if death_year_match else None
            artists_info[i]['death_year'] = death_year
    elif (death_year is not None):
        death_year_match = re.search(r'\d{4}', str(death_year)) if death_year else None
        death_year = int(death_year_match.group(0)) if death_year_match else None
        artists_info[0]['death_year'] = death_year


    # 🔹 **Check if the artist exists, otherwise add to insert list**
    for i in range(len(artists_info)):
        artist_name = artists_info[i].get('artist_name', None)
        birth_year = artists_info[i].get('birth_year', None)

        # Skip if artist_name or birth_year is missing
        if (artist_name is None) or (birth_year is None):
            continue

        # Skip if artist_name and birth_year already exist in the database
        if (artist_name, birth_year) not in existing_artists:

            artists_to_insert.append((artist_name,
                                      birth_year,
                                      artists_info[i].get('nationality', None),
                                      artists_info[i].get('gender', None), 
                                      artists_info[i].get('death_year', None)))
            existing_artists.add((artist_name, birth_year))  # Add to cache to prevent duplicate inserts

            artists_artworks_to_insert.append((artist_name, birth_year, title, artwork_date))

    # 🔹 **Prepare artwork insert data**
    artworks_to_insert.append((title, artwork_date, medium, department, date_acquired, art_classification, credit_line))

pd.DataFrame(artists_to_insert, columns=['artist_name', 'birth_year', 'nationality', 'gender', 'death_year']).to_csv("artists_to_insert.csv", index=False)
pd.DataFrame(artworks_to_insert, columns=['title', 'artwork_date', 'medium', 'department', 'date_acquired', 'art_classification', 'credit_line']).to_csv("artworks_to_insert.csv", index=False)
pd.DataFrame(artists_artworks_to_insert, columns=['artist_name', 'birth_year', 'title', 'artwork_date']).to_csv("artists_artworks_to_insert.csv", index=False)

# # 🔹 **Step 3: Batch Insert Artists**
# if artists_to_insert:
#     query = """
#         INSERT INTO Artists (artist_name, birth_year, nationality, gender, death_year)
#         VALUES %s
#         ON CONFLICT (artist_name, birth_year) DO NOTHING;
#     """
#     psycopg2.extras.execute_values(cursor, query, artists_to_insert)
#     print(f"Inserted {len(artists_to_insert)} new artists.")

# # 🔹 **Step 4: Batch Insert Artworks**
# if artworks_to_insert:
#     query = """
#         INSERT INTO Artworks (title, artwork_date, artist_name, birth_year, medium, department, date_acquired, art_classification, credit_line)
#         VALUES %s
#         ON CONFLICT (title, artwork_date) DO NOTHING;
#     """
#     psycopg2.extras.execute_values(cursor, query, artworks_to_insert)
#     print(f"Inserted {len(artworks_to_insert)} artworks.")

# # 🔹 **Step 5: Commit Changes**


# print(" Data Insertion Complete!")


100%|██████████| 118807/118807 [00:12<00:00, 9874.07it/s] 


### Important Note

Use files "artists_to_insert.csv", "artworks_to_insert.csv", and "artists_artworks_to_insert.csv" to manually import data into tables using DataGrip.

The complexity of psycopg2.extras.execute_values and regulary query execution calls are too high in python to run in a reasonable amount of time.

In [None]:
# cursor.execute("SELECT artist_name, birth_year FROM Artists;")
# existing_artists = set(cursor.fetchall())  # Store as a set of (artist_name, birth_year)

# # 🔹 **Step 4: Insert Artworks**
# for _, row in moma_artworks.iterrows():
#     title = row.get('Title', None)  # Make sure this column name is correct
#     artwork_date = row.get('Date', None)
#     if artwork_date is not None:
#         artwork_date = re.search(r'\d{4}', artwork_date)
#         if artwork_date is not None:
#             artwork_date = int(artwork_date.group(0))  # Adjust column name if needed
#     artist_name = row.get('Artist', None)  # Adjust column name if needed
#     birth_year = row.get('BeginDate', None)
#     if birth_year is not None:
#         birth_year = re.search(r'\d{4}', birth_year)
#         if birth_year is not None:
#             birth_year = int(birth_year.group(0))  # Ensure it's properly handled
#     medium = row.get('Medium', None)
#     department = row.get('Department', None)
#     date_acquired = row.get('DateAcquired', None)
#     art_classification = row.get('Classification', None)
#     credit_line = row.get('CreditLine', None)

#     # don't add an artwork + artist if we don't have the artist's birth year
#     if birth_year is None:
#         continue

#     # 🔹 **Ensure the artist exists in the Artists table**
#     if (artist_name, birth_year) not in existing_artists:
#         # insert artist into artists table
#         nationality = row.get('Nationality', None)
#         gender = row.get('Gender', None)
#         death_year = row.get('EndDate', None)
#         if death_year is not None:
#             death_year = re.search(r'\d{4}', str(birth_year))
#             if death_year is not None:
#                 death_year = death_year.group(0)
#         cursor.execute("""
#             INSERT INTO Artists (artist_name, birth_year, nationality, gender, death_year)
#             VALUES (%s, %s, %s, %s, %s)
#             ON CONFLICT (artist_name, birth_year) DO NOTHING;
#         """, (artist_name, birth_year, nationality, gender, death_year))
#         #print(f"Skipping artwork '{title}' - Artist '{artist_name}' ({birth_year}) not found in Artists table.")
#         print(f"Artist '{artist_name}' ({birth_year})")
#         continue

#     # # 🔹 **Ensure birth_year is converted safely**
#     # birth_year = int(birth_year) if pd.notna(birth_year) else None

#     # 🔹 **Insert into Artworks table**
#     try:
#         cursor.execute("""
#             INSERT INTO Artworks (
#                 title, artwork_date, artist_name, birth_year, medium, department, 
#                 date_acquired, art_classification, credit_line
#             )
#             VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
#             ON CONFLICT (title, artwork_date) DO NOTHING;
#         """, (title, artwork_date, artist_name, birth_year, medium, department, date_acquired, art_classification, credit_line))

#     except Exception as e:
#         print(f"Error inserting artwork '{title}': {e}")
#         conn.rollback()



Artist 'Otto Wagner' (1841
Artist 'Christian de Portzamparc' (1944
Artist 'Emil Hoppe' (1876
Artist 'Bernard Tschumi' (1944
Artist 'Emil Hoppe' (1876
Artist 'Bernard Tschumi' (1944
Artist 'Bernard Tschumi' (1944
Artist 'Louis I. Kahn' (1901
Artist 'Bernard Tschumi' (1944
Artist 'Marcel Kammerer' (1878
Artist 'Bernard Tschumi' (1944
Artist 'Otto Schönthal' (1878
Artist 'Bernard Tschumi' (1944
Artist 'Otto Schönthal' (1878
Artist 'Bernard Tschumi' (1944
Artist 'Bernard Tschumi' (1944
Artist 'Bernard Tschumi' (1944
Artist 'Bernard Tschumi' (1944
Artist 'Bernard Tschumi' (1944
Artist 'Bernard Tschumi' (1944
Artist 'Hans Poelzig' (1869
Artist 'Raimund Abraham' (1933
Artist 'Peter Eisenman, Robert Cole' (1932
Artist 'Peter Eisenman, Robert Cole' (1932
Artist 'Rem Koolhaas, Madelon Vriesendorp' (1944
Artist 'Rem Koolhaas, Madelon Vriesendorp' (1944
Artist 'Roger C. Ferri' (1949
Artist 'Bernard Tschumi' (1944
Artist 'Roger C. Ferri' (1949
Artist 'Bernard Tschumi' (1944
Artist 'Roger C. Ferri' 

In [None]:
from tqdm import tqdm
import pandas as pd

#### Populate Movement Table

In [None]:
# # Extract the "movement" column from artists_combined DataFrame
m# ovement_data = artists_combined['movement']

# C# reate a set to store unique movement
uniq# ue_movement = set()

# Pro# cess each row in the movement column
for mo# vement in movement_data:
    if # isinstance(movement, str):  
        # # If it's a comma-separated string, split and strip spaces
        u# nique_moverr = ment.update([occ.strip() for occ in movement.split(', ') if cc# .strip()])
    elif isinstance(movement,#  list):  
        # If it's a list, clean each#  elementcurr = t.update([str(occ).strip() for occ in movement if pd.notna(occ) and str(occ)st# rip()])
    elif isinstance(movement, np.ndarr# ay):  
        # Convert NumPy array to list and # clean
        if not pd.isna(movement).all(# ):  
      curr = date([str(occ).strip() for occ in movement.tolist() if pd.notna(occ) and str(occ).strp(# )])
    elif pd.notna(movement):#   
        # Handle any other non-null valu# e
      curr = d(str(movement).strip
#     if curr == 'Unknown':
#         continue
#     unique_movement.update(curr)))

# # Insert unique occupations into the Occupations table
f# or movement in unique_movement:
  #   try:
   #      cursor.execute("""
    #         INSERT INTO Movements (movement_name)
     #        VALUES (%s)
      #       ON CONFLICT (movement_name) DO NOTHING;
       #  """, (movement,))
    exce# pt Exception as e:
        p# rint(f"Error inserting movement {movement} -> {e}")
        co# nn.rollback()


print("Moveme# nt Insertion Complete!")
cu

#### Insert Artist Movement

In [None]:
from tqdm import tqdm
import json
import pandas as pd
import numpy as np

# First, extract unique movements and insert them into the Movements table (assuming it exists)
movements_data = artists_combined['movements']

# Create a set to store unique movements
unique_movements = set()

# Process each row in the movements column with tqdm progress bar
print("Extracting unique movements...")
for movements in tqdm(movements_data, desc="Processing movements"):
    if isinstance(movements, str):  
        # If it's a comma-separated string, split and strip spaces
        unique_movements.update([mov.strip() for mov in movements.split(', ') if mov.strip()])
    elif isinstance(movements, list):  
        # If it's a list, clean each element
        unique_movements.update([str(mov).strip() for mov in movements if pd.notna(mov) and str(mov).strip()])
    elif isinstance(movements, np.ndarray):  
        # Convert NumPy array to list and clean
        if not pd.isna(movements).all():  
            unique_movements.update([str(mov).strip() for mov in movements.tolist() if pd.notna(mov) and str(mov).strip()])
    elif pd.notna(movements):  
        # Handle any other non-null value
        unique_movements.add(str(movements).strip())

# Insert unique movements into the Movements table with tqdm progress bar
print("Inserting unique movements into the Movements table...")
for movement in tqdm(unique_movements, desc="Inserting movements"):
    try:
        cursor.execute("""
            INSERT INTO Movements (movement_name)
            VALUES (%s)
            ON CONFLICT (movement_name) DO NOTHING;
        """, (movement,))
    except Exception as e:
        print(f"Error inserting movement {movement} -> {e}")
        conn.rollback()

# Now populate the Artist_Movements table
artist_movements = []

# Process the DataFrame to extract artist-movement relationships with years_active
print("Processing artist-movement relationships...")
for index, row in tqdm(artists_combined.iterrows(), total=len(artists_combined), desc="Processing artists"):
    artist_name = row['artist_name']
    birth_year = row['birth_year']
    
    # Get movements for this artist
    movements = []
    years_active = {}
    
    # Check if we have a movements_details column with structured data including years active
    if 'movements_details' in row and pd.notna(row['movements_details']):
        # Assuming movements_details is a dict or can be parsed as JSON with format:
        # {movement_name: {"start_year": year, "end_year": year}, ...}
        if isinstance(row['movements_details'], dict):
            movement_details = row['movements_details']
        elif isinstance(row['movements_details'], str):
            try:
                movement_details = json.loads(row['movements_details'])
            except:
                movement_details = {}
        
        for movement_name, active_years in movement_details.items():
            if movement_name.strip():
                movements.append(movement_name.strip())
                years_active[movement_name.strip()] = active_years
    
    # If no structured data, extract movements from the movements column
    if not movements and 'movements' in row and pd.notna(row['movements']):
        if isinstance(row['movements'], str):
            movements = [mov.strip() for mov in row['movements'].split(', ') if mov.strip()]
        elif isinstance(row['movements'], list):
            movements = [str(mov).strip() for mov in row['movements'] if pd.notna(mov) and str(mov).strip()]
        elif isinstance(row['movements'], np.ndarray):
            movements = [str(mov).strip() for mov in row['movements'].tolist() if pd.notna(mov) and str(mov).strip()]
        elif pd.notna(row['movements']):
            movements = [str(row['movements']).strip()]
    
    # Add to our list of relationships to insert
    for movement in movements:
        if movement:
            # Get years active for this movement if available, or set to empty dict
            years_json = years_active.get(movement, {})
            
            artist_movements.append({
                'artist_name': artist_name,
                'birth_year': birth_year,
                'movement_name': movement,
                'years_active': json.dumps(years_json) if years_json else None
            })

# Insert the artist-movement relationships using executemany with psycopg2
print("Inserting artist-movement relationships...")
try:
    cursor.executemany("""
        INSERT INTO Artist_Movements (artist_name, birth_year, movement_name, years_active)
        VALUES (%(artist_name)s, %(birth_year)s, %(movement_name)s, %(years_active)s::jsonb)
        ON CONFLICT (artist_name, birth_year, movement_name) DO UPDATE 
        SET years_active = EXCLUDED.years_active;
    """, artist_movements)
    conn.commit()
    print(f"Successfully inserted {len(artist_movements)} artist-movement relationships")
except Exception as e:
    conn.rollback()
    print(f"Error batch inserting artist movements: {e}")

print("Artist Movements Insertion Complete!")

### Schools and Artist_Schools

In [None]:
from tqdm import tqdm
import json
import pandas as pd
import numpy as np

# First, create and populate the PaintingSchools table
painting_schools_data = artists_combined['PaintingSchool']

# Create a set to store unique painting schools
unique_schools = set()

# Process each row in the PaintingSchool column with tqdm progress bar
print("Extracting unique painting schools...")
for school in tqdm(painting_schools_data, desc="Processing schools"):
    if isinstance(school, str):  
        # If it's a comma-separated string, split and strip spaces
        unique_schools.update([sch.strip() for sch in school.split(', ') if sch.strip()])
    elif isinstance(school, list):  
        # If it's a list, clean each element
        unique_schools.update([str(sch).strip() for sch in school if pd.notna(sch) and str(sch).strip()])
    elif isinstance(school, np.ndarray):  
        # Convert NumPy array to list and clean
        if not pd.isna(school).all():  
            unique_schools.update([str(sch).strip() for sch in school.tolist() if pd.notna(sch) and str(sch).strip()])
    elif pd.notna(school):  
        # Handle any other non-null value
        unique_schools.add(str(school).strip())

# Insert unique painting schools into the PaintingSchools table with tqdm progress bar
print("Inserting unique painting schools into the Schools table...")
for school in tqdm(unique_schools, desc="Inserting schools"):
    try:
        cursor.execute("""
            INSERT INTO Schools (school_name)
            VALUES (%s)
            ON CONFLICT (school_name) DO NOTHING;
        """, (school,))
    except Exception as e:
        print(f"Error inserting painting school {school} -> {e}")
        conn.rollback()

# Now populate the Artist_PaintingSchools table
artist_schools = []

# Process the DataFrame to extract artist-school relationships
print("Processing artist-painting school relationships...")
for index, row in tqdm(artists_combined.iterrows(), total=len(artists_combined), desc="Processing artists"):
    artist_name = row['artist_name']
    birth_year = row['birth_year']
    
    # Get painting schools for this artist
    schools = []
    time_periods = {}
    
    # Check if we have a school_details column with structured data including time periods
    if 'school_details' in row and pd.notna(row['school_details']):
        # Assuming school_details is a dict or can be parsed as JSON with format:
        # {school_name: {"start_year": year, "end_year": year}, ...}
        if isinstance(row['school_details'], dict):
            school_details = row['school_details']
        elif isinstance(row['school_details'], str):
            try:
                school_details = json.loads(row['school_details'])
            except:
                school_details = {}
        
        for school_name, period in school_details.items():
            if school_name.strip():
                schools.append(school_name.strip())
                time_periods[school_name.strip()] = period
    
    # If no structured data, extract schools from the PaintingSchool column
    if not schools and 'PaintingSchool' in row and pd.notna(row['PaintingSchool']):
        if isinstance(row['PaintingSchool'], str):
            schools = [sch.strip() for sch in row['PaintingSchool'].split(', ') if sch.strip()]
        elif isinstance(row['PaintingSchool'], list):
            schools = [str(sch).strip() for sch in row['PaintingSchool'] if pd.notna(sch) and str(sch).strip()]
        elif isinstance(row['PaintingSchool'], np.ndarray):
            schools = [str(sch).strip() for sch in row['PaintingSchool'].tolist() if pd.notna(sch) and str(sch).strip()]
        elif pd.notna(row['PaintingSchool']):
            schools = [str(row['PaintingSchool']).strip()]
    
    # Add to our list of relationships to insert
    for school in schools:
        if school:
            # Get time period for this school if available, or set to empty dict
            period_json = time_periods.get(school, {})
            
            artist_schools.append({
                'artist_name': artist_name,
                'birth_year': birth_year,
                'school_name': school,
                'time_period': json.dumps(period_json) if period_json else None
            })

# Insert the artist-painting school relationships using executemany with psycopg2
print("Inserting artist-painting school relationships...")
try:
    cursor.executemany("""
        INSERT INTO Artist_Schools (artist_name, birth_year, school_name, time_period)
        VALUES (%(artist_name)s, %(birth_year)s, %(school_name)s, %(time_period)s::jsonb)
        ON CONFLICT (artist_name, birth_year, school_name) DO UPDATE 
        SET time_period = EXCLUDED.time_period;
    """, artist_schools)
    conn.commit()
    print(f"Successfully inserted {len(artist_schools)} artist-painting school relationships")
except Exception as e:
    conn.rollback()
    print(f"Error batch inserting artist painting schools: {e}")

print("Artist Painting Schools Insertion Complete!")

Commit and close the connection:

In [None]:
print(artists_combined.columns.tolist())

['artist', 'citizenship', 'gender', 'styles', 'movement', 'Art500k_Movements', 'birth_place', 'death_place', 'FirstYear', 'LastYear', 'wikiart_pictures_count', 'locations', 'locations_with_years', 'styles_extended', 'StylesCount', 'StylesYears', 'occupations', 'PaintingsExhibitedAt', 'PaintingsExhibitedAtCount', 'PaintingSchool', 'Influencedby', 'Influencedon', 'Pupils', 'Teachers', 'FriendsandCoworkers', 'Contemporary', 'Type', 'artist_name', 'Wikidata QID', 'DisplayName', 'Wiki QID', 'ULAN', 'Title', 'Artist', 'Date', 'Medium', 'Dimensions', 'CreditLine', 'AccessionNumber', 'Classification', 'Department', 'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ImageURL', 'OnView', 'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)', 'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)', 'Duration (sec.)', 'start_year', 'end_year', 'birth_year', 'death_year', 'Nationality', 'Gender', 'ArtistBio', 'ConstituentID', 'BeginDate', 'EndDate']


In [37]:
# Commit changes
conn.commit()

# Close the connection
cursor.close()
conn.close()