In [None]:
import csv
import os
from ast import literal_eval

import numpy as np
import pandas as pd
import psycopg2
import regex as re
from tqdm import tqdm

### Combine Data Sources

In [None]:
# Read the four raw source tables from CSV files in the working directory:
# MoMA artists, MoMA artworks, PainterPalette and the merged WikiData painters file.
moma_artists, moma_artworks, painter_palette, wikidata = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Artists.csv',
        'Artworks.csv',
        'PainterPalette.csv',
        'Total_Merged_Painters_Data.csv',
    )
)

Clean MOMA Artists dataset:

In [None]:
# Ensure ConstituentID is unique
moma_artists = moma_artists.drop_duplicates(subset=['ConstituentID'])


def _extract_bio_year(bios, pattern):
    """Return, per bio, the 4-digit year captured by ``pattern`` as an int (0 when absent)."""
    years = bios.str.extract(pattern, expand=False)
    return pd.to_numeric(years, errors='coerce').fillna(0).astype(int)


# Extract birth and death years from ArtistBio; 0 marks "not found".
# Birth year is either the year in front of the range dash ("1930–1992") or the
# year following "born" ("born 1936"). The second form has no dash, so the
# previous dash-only pattern silently turned those birth years into 0.
# NOTE(review): the "born YYYY" form is assumed from the public MoMA data — confirm against Artists.csv.
moma_artists['birth_year'] = _extract_bio_year(
    moma_artists['ArtistBio'], r'(?:[Bb]orn\s+|(?=\d{4}\s*[–-]))(\d{4})'
)
# Death year is the year right after the range dash (en dash or plain hyphen).
moma_artists['death_year'] = _extract_bio_year(
    moma_artists['ArtistBio'], r'[–-]\s*(\d{4})'
)

# Use an explicit placeholder instead of NaN for the categorical columns.
moma_artists['Nationality'] = moma_artists['Nationality'].fillna('Unknown')
moma_artists['Gender'] = moma_artists['Gender'].fillna('Unknown')

Clean MOMA Artworks dataset:

In [None]:
# Ensure artwork is unique
moma_artworks = moma_artworks.drop_duplicates(subset=['ObjectID'], keep='first')
# NOTE(review): this also discards distinct artworks that merely share a title —
# confirm that keeping one row per title is intended.
moma_artworks = moma_artworks.drop_duplicates(subset=['Title'])

# Convert ConstituentID column to string, split, explode, and convert to integers
# (one row per artwork/constituent pair; unparseable ids become 0).
moma_artworks['ConstituentID'] = moma_artworks['ConstituentID'].astype(str).str.split(', ')
moma_artworks = moma_artworks.explode('ConstituentID')
moma_artworks['ConstituentID'] = pd.to_numeric(moma_artworks['ConstituentID'], errors='coerce').fillna(0).astype(int)

# Filter out rows where ConstituentID is 0
moma_artworks = moma_artworks[moma_artworks['ConstituentID'] != 0]


def _parse_year_range(dates):
    """Parse free-text dates into integer (start_year, end_year) arrays.

    Handles "1976", "1976-77" and "1976-1977" (hyphen or en dash). A two-digit
    end year borrows the century of the start year. When there is no usable end
    year, or it would precede the start year, the end year falls back to the
    start year. Dates without any 4-digit year yield 0 for both values.
    """
    parts = dates.str.extract(
        r'(?P<start>\d{4})(?:\s*[-–]\s*(?P<end>\d{4}|\d{2})(?!\d))?'
    )
    start = pd.to_numeric(parts['start'], errors='coerce').to_numpy(dtype=float)
    end = pd.to_numeric(parts['end'], errors='coerce').to_numpy(dtype=float)
    # 'nan' has length 3, so missing end years are never flagged as abbreviated.
    abbreviated = (parts['end'].astype(str).str.len() == 2).to_numpy()

    with np.errstate(invalid='ignore'):
        end = np.where(abbreviated, np.floor(start / 100) * 100 + end, end)
        # NaN comparisons are False, so a missing end year also falls back to start.
        end = np.where(end >= start, end, start)

    return np.nan_to_num(start).astype(int), np.nan_to_num(end).astype(int)


# Extract start and end years from date column.
# Plain arrays are assigned positionally because explode() left duplicate index labels.
moma_artworks['start_year'], moma_artworks['end_year'] = _parse_year_range(moma_artworks['Date'])

moma_artworks['Artist'] = moma_artworks['Artist'].fillna('Unknown')

Clean PainterPalette dataset:

In [None]:
# Text columns whose missing values are replaced by the 'Unknown' placeholder
columns_to_clean = [
    'Nationality', 'citizenship', 'gender', 'styles', 'movement', 'birth_place',
    'death_place', 'occupations', 'Influencedby', 'Influencedon',
    'Pupils', 'Teachers', 'FriendsandCoworkers', 'Contemporary', 'PaintingSchool'
]

# Replace NaN with 'Unknown' first, so every cell is a string before splitting
painter_palette[columns_to_clean] = painter_palette[columns_to_clean].fillna('Unknown')

# Turn the comma-separated columns into Python lists (one list per cell)
painter_palette = painter_palette.assign(**{
    list_column: painter_palette[list_column].str.split(',')
    for list_column in (
        'styles', 'Nationality', 'Influencedby', 'Influencedon', 'Pupils',
        'Teachers', 'FriendsandCoworkers', 'Contemporary', 'occupations',
    )
})

Clean WikiData dataset:

In [None]:
# Text columns whose missing values are replaced by the 'Unknown' placeholder
columns_to_clean = [
    'Nationality', 'citizenship', 'gender', 'styles', 'movement', 'birth_place',
    'death_place', 'occupations', 'Influencedby', 'Influencedon',
    'Pupils', 'Teachers', 'FriendsandCoworkers', 'Contemporary', 'PaintingSchool'
]

# Placeholder fill happens before the split so that every cell is a string
wikidata[columns_to_clean] = wikidata[columns_to_clean].fillna('Unknown')

# Comma-separated columns become lists of values
for list_column in (
    'styles', 'Nationality', 'Influencedby', 'Influencedon', 'Pupils',
    'Teachers', 'FriendsandCoworkers', 'Contemporary', 'occupations',
):
    comma_separated = wikidata[list_column]
    wikidata[list_column] = comma_separated.str.split(pat=',')

Merge datasets:

In [None]:
# Fill in PainterPalette with data from WikiData
artists_combined = pd.merge(
    painter_palette,
    wikidata,
    left_on='artist',
    right_on='artist_name',
    how='left',
    suffixes=('', '_wiki')
)

# List of columns to fill
columns_to_fill = ['Nationality', 'citizenship', 'gender', 'styles', 'movement', 'Art500k_Movements','birth_place', 'death_place', 'birth_year', 'death_year',
                   'locations', 'FirstYear', 'LastYear', 'wikiart_pictures_count', 'styles_extended', 'locations_with_years', 'StylesCount', 'StylesYears',
                   'occupations', 'PaintingsExhibitedAt', 'PaintingsExhibitedAtCount', 'PaintingSchool', 'Influencedby', 'Influencedon', 'Pupils', 'Teachers',
                   'FriendsandCoworkers', 'Contemporary', 'Type']


def _is_placeholder(value):
    """Return True when ``value`` carries no information: NaN/None, 'Unknown' or ['Unknown']."""
    if isinstance(value, list):
        return value == ['Unknown']
    if isinstance(value, str):
        return value == 'Unknown'
    return bool(pd.isna(value))


# Combine columns to fill from painter_palette and wikidata.
# The cleaning cells replace NaN with 'Unknown' (or ['Unknown'] after the split),
# so a plain combine_first() would never fill those columns. A PainterPalette
# placeholder is therefore blanked first whenever WikiData has a real value.
for col in columns_to_fill:
    wiki_col = f'{col}_wiki'
    if col not in artists_combined.columns or wiki_col not in artists_combined.columns:
        print(f"Warning: Column {col} or {col}_wiki not found in the merged dataset.")
        continue

    primary = artists_combined[col]
    fallback = artists_combined[wiki_col]
    fill_from_wiki = (
        primary.map(_is_placeholder).astype(bool)
        & ~fallback.map(_is_placeholder).astype(bool)
    )
    # Rows where both sides are placeholders keep the PainterPalette value.
    artists_combined[col] = primary.mask(fill_from_wiki).combine_first(fallback)

# Drop columns added from WikiData dataset to avoid duplication
wiki_columns = [f'{col}_wiki' for col in columns_to_fill if f'{col}_wiki' in artists_combined.columns]
artists_combined = artists_combined.drop(columns=wiki_columns)

In [None]:
# Left-join the MoMA artist records onto the combined painters table by display name
artists_combined = artists_combined.merge(
    moma_artists,
    how='left',
    left_on='artist',
    right_on='DisplayName',
)

# Both inputs carry a 'Nationality' column, so the merge produced
# 'Nationality_x' (left side) and 'Nationality_y' (MoMA side). The left-side
# value is kept as the single 'Nationality' column; the MoMA one is discarded.
artists_combined = (
    artists_combined
    .assign(Nationality=artists_combined['Nationality_x'])
    .drop(columns=['Nationality_x', 'Nationality_y'])
)

In [None]:
# Left-join the MoMA artworks onto the combined table by artist name.
# An artist with several matching artworks appears once per artwork afterwards.
artists_combined = artists_combined.merge(
    moma_artworks,
    how='left',
    left_on='artist',
    right_on='Artist',
)

In [None]:
# Base names of the columns that the merges may have split into '_x' / '_y' pairs
columns_to_combine = ['birth_year', 'death_year', 'Nationality', 'Gender', 'ArtistBio', 'ConstituentID', 'BeginDate', 'EndDate']

for base_name in columns_to_combine:
    left_name, right_name = base_name + '_x', base_name + '_y'

    # Nothing to do unless the merge really produced both halves of the pair
    if not {left_name, right_name}.issubset(artists_combined.columns):
        continue

    # Prefer the '_x' value and fall back to '_y' where '_x' is missing
    artists_combined[base_name] = artists_combined[left_name].combine_first(artists_combined[right_name])

    # The suffixed originals are no longer needed
    artists_combined = artists_combined.drop(columns=[left_name, right_name])

In [None]:
# Save artists_combined to CSV
# Note: list-valued cells (created by the str.split calls in the cleaning cells)
# are written as their string representation, e.g. "['a', 'b']", so they are
# plain strings again when the file is read back.
artists_combined.to_csv('artists_combined.csv', index=False)

### Load Combined Data

In [None]:
# Reload the combined table written by the previous section; presumably this lets
# the database cells run without repeating the merge steps.
artists_combined = pd.read_csv('artists_combined.csv')

### Create Postgres Database

#### Connect To Postgres:

In [None]:
# Connect to Postgres
# Connection settings are read from the standard libpq environment variables so
# that real credentials never have to be written into the notebook. The literal
# values below are only placeholders, used when a variable is not set.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "dbname"),
    user=os.environ.get("PGUSER", "user"),
    password=os.environ.get("PGPASSWORD", "password"),
    host=os.environ.get("PGHOST", "host"),
    port=os.environ.get("PGPORT", "port"),
)
cursor = conn.cursor()

#### Insert Artists

In [None]:
# Years later than this are treated as bad data and stored as NULL
MAX_VALID_YEAR = 2030


def _clean_year(value, max_year=MAX_VALID_YEAR):
    """Return ``value`` as a plain ``int`` year, or ``None`` when it is unusable.

    Missing values, 0 (the "unknown" placeholder), URLs or other non-numeric
    strings, and years later than ``max_year`` all map to ``None``. Returning a
    built-in ``int`` (not a numpy scalar) keeps the value adaptable by psycopg2.
    """
    if isinstance(value, str):
        text = value.strip()
        if not text or "https" in text:
            return None
        try:
            value = float(text)
        except ValueError:
            return None
    if value is None or pd.isna(value) or np.isinf(value):
        return None
    if value > max_year:
        return None
    year = int(value)
    return None if year == 0 else year


def _clean_text(value):
    """Return ``value`` as stripped text, or ``None`` if it is missing or marked unknown."""
    if value is None or pd.isna(value):
        return None
    text = str(value).strip()
    if not text or 'unknown' in text.lower():
        return None
    return text


def _clean_nationality(value):
    """Normalize a nationality cell: drop list brackets/quotes and reject bare Q-ids."""
    text = _clean_text(value)
    if text is None:
        return None
    text = text.strip("[").strip("]").replace("'", "")
    # The Q-id check must run after the brackets are removed; otherwise a value
    # such as "['Q142']" starts with "[" and never matches.
    if not text or re.match(r'Q\d+', text):
        return None
    return text


# Iterate through artists_combined and insert into Artists table
for _, row in tqdm(artists_combined.iterrows(), total=artists_combined.shape[0]):

    artist_name = row['artist']
    if pd.isna(artist_name):
        continue  # an artist row without a name cannot be keyed

    # Insert data into Artists table
    cursor.execute("""
        INSERT INTO Artists (
            artist_name, birth_year, nationality, 
            citizenship, gender, death_year, career_start_year, career_end_year
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (artist_name, birth_year_key) DO NOTHING;
    """, (
        artist_name,
        _clean_year(row.get('birth_year', None)),
        _clean_nationality(row.get('Nationality', None)),
        _clean_text(row.get('citizenship', None)),
        _clean_text(row.get('gender', None)),
        _clean_year(row.get('death_year', None)),
        _clean_year(row.get('FirstYear', None)),
        _clean_year(row.get('LastYear', None)),
    ))

# Commit changes
conn.commit()


#### Inserting Places, Artist_Birth_Places, & Artist_Death_Places



In [None]:
# Extract unique place names from 'birth_place' and 'death_place' columns.
# dropna() is required here: NaN is truthy, so the `if place` check alone would
# let missing values through and store them as the text 'NaN'.
unique_places = pd.concat([artists_combined['birth_place'], artists_combined['death_place']]).dropna().unique()

# Insert unique places into Places table
insert_query = "INSERT INTO Places (place_name) VALUES (%s) ON CONFLICT (place_name) DO NOTHING;"

for place in unique_places:
    if str(place).strip():  # Ensure no empty / whitespace-only places inserted
        cursor.execute(insert_query, (place,))

# Commit changes
conn.commit()

In [None]:
# Insert Artist Birth Places Data into table
insert_artist_query = """
    INSERT INTO Artists (artist_name, birth_year)
    VALUES (%s, %s)
    ON CONFLICT (artist_name, birth_year_key) DO NOTHING;
"""

insert_birth_place_query = """
    INSERT INTO Artist_Birth_Places (artist_name, birth_year, place_name)
    VALUES (%s, %s, %s)
    ON CONFLICT (artist_name, birth_year_key, place_name) DO NOTHING;
"""

# Iterate through extracted data and insert each artist's birth place
for _, row in artists_combined.iterrows():

    artist_name = row.get('artist', None)
    birth_year = row.get('birth_year', None)
    birth_place = row['birth_place']

    # Skip rows with a missing name, year or place (NaN is truthy, so it must be
    # tested explicitly rather than with a plain `if value`).
    if pd.isna(artist_name) or pd.isna(birth_year) or pd.isna(birth_place):
        continue

    # Use the same validity rule as the Artists insert: 0 is the "unknown"
    # placeholder and years after 2030 are bad data. Inserting them here would
    # create extra Artists rows that disagree with the ones already loaded.
    if birth_year == 0 or birth_year > 2030:
        continue

    if not str(artist_name).strip() or not str(birth_place).strip():
        continue

    birth_year = int(birth_year)  # plain int, adaptable by psycopg2

    # Insert into Artists table if the artist is not already present
    cursor.execute(insert_artist_query, (artist_name, birth_year))

    # Insert artist's birth place into Artist_Birth_Places table
    cursor.execute(insert_birth_place_query, (artist_name, birth_year, birth_place))

# Commit changes
conn.commit()


In [None]:
# Adjusted insert queries
insert_artist_query = """
    INSERT INTO Artists (artist_name, birth_year)
    VALUES (%s, %s)
    ON CONFLICT (artist_name, birth_year_key) DO NOTHING;
"""

insert_death_place_query = """
    INSERT INTO Artist_Death_Places (artist_name, death_year, place_name)
    VALUES (%s, %s, %s)
    ON CONFLICT (artist_name, death_year_key, place_name) DO NOTHING;
"""

# Iterate through extracted data and insert each artist's death place
for _, row in artists_combined.iterrows():

    artist_name = row.get('artist', None)
    death_year = row.get('death_year', None)
    death_place = row['death_place']

    # Skip rows with a missing name, year or place (NaN is truthy, so it must be
    # tested explicitly rather than with a plain `if value`).
    if pd.isna(artist_name) or pd.isna(death_year) or pd.isna(death_place):
        continue

    # 0 is the "unknown" placeholder and years after 2030 are bad data.
    if death_year == 0 or death_year > 2030:
        continue

    if not str(artist_name).strip() or not str(death_place).strip():
        continue

    death_year = int(death_year)  # plain int, adaptable by psycopg2

    # The Artists row is keyed by BIRTH year. Passing death_year here, as the
    # previous version did, created bogus artists "born" in their death year.
    birth_year = row.get('birth_year', None)
    if pd.isna(birth_year) or birth_year == 0 or birth_year > 2030:
        birth_year = None
    else:
        birth_year = int(birth_year)

    # Insert artist into Artists table if not already present
    cursor.execute(insert_artist_query, (artist_name, birth_year))

    # Insert artist's death place into Artist_Death_Places table
    cursor.execute(insert_death_place_query, (artist_name, death_year, death_place))

# Commit changes
conn.commit()

#### Inserting Occupations & Occupations_Artists

In [None]:
# Create and populate the Occupations table
occupations_data = artists_combined['occupations']


def _extract_occupations(raw_value):
    """Split one raw ``occupations`` cell into a set of clean, lower-case names.

    Missing cells and cells containing 'unknown' yield an empty set. List
    brackets and quotes left over from the CSV round-trip are removed, and
    entries that look like bare Q-ids or are empty are dropped.
    """
    if not isinstance(raw_value, str):
        return set()
    lowered = raw_value.lower()
    if 'unknown' in lowered:
        return set()

    names = set()
    for piece in lowered.split(","):
        name = piece.replace("[", "").replace("]", "").replace("'", "").replace("\"", "").strip()
        # The text is already lower-cased, so the id pattern must use 'q';
        # matching an upper-case 'Q' here could never succeed.
        if not name or re.match(r'q\d+', name):
            continue
        names.add(name)
    return names


# Collect the unique occupation names across all artists
print("Extracting unique occupations...")
cleaner_occupations = set()
for occupation in tqdm(occupations_data, desc="Processing occupations"):
    cleaner_occupations.update(_extract_occupations(occupation))

# Insert unique occupations into Occupations table with tqdm progress bar
print("Inserting unique occupations into the Occupations table...")
for occupation in tqdm(sorted(cleaner_occupations), desc="Inserting occupations"):
    try:
        cursor.execute("""
            INSERT INTO Occupations (occupation_name)
            VALUES (%s)
            ON CONFLICT (occupation_name) DO NOTHING;
        """, (occupation,))
    except Exception as e:
        print(f"Error inserting occupation {occupation} -> {e}")
        conn.rollback()

conn.commit()

In [None]:
# Populate Artist_Occupations table
artist_occupations = []
seen_occupation_links = set()  # the frame has one row per artwork, so links repeat

# Process DataFrame to extract artist-occupation relationships
print("Processing artist-occupation relationships...")
for index, row in tqdm(artists_combined.iterrows(), total=len(artists_combined), desc="Processing artists"):
    # Use the same name column as the Artists insert ('artist'). 'artist_name'
    # is the WikiData merge key and is NaN for painters without a WikiData match.
    artist_name = row['artist']
    if pd.isna(artist_name):
        continue

    birth_year = row['birth_year']
    if pd.isna(birth_year) or birth_year > 2030:
        birth_year = None
    birth_year = int(birth_year) if birth_year is not None else None
    birth_year = None if birth_year == 0 else birth_year

    # Get occupations for this artist
    occupation = row.get('occupations', None)
    if not isinstance(occupation, str):
        continue
    occupation = occupation.lower()
    if 'unknown' in occupation:
        continue

    for piece in occupation.split(","):
        occupation_name = piece.replace("[", "").replace("]", "").replace("'", "").replace("\"", "").strip()
        # The text is lower-cased above, so the Q-id pattern must use 'q'.
        if not occupation_name or re.match(r'q\d+', occupation_name):
            continue

        link = (artist_name, birth_year, occupation_name)
        if link in seen_occupation_links:
            continue
        seen_occupation_links.add(link)

        # Add to our list of relationships to insert
        artist_occupations.append({
            'artist_name': artist_name,
            'birth_year': birth_year,
            'occupation_name': occupation_name,
        })

# Insert artist-occupation relationships using executemany with psycopg2
print("Inserting artist-occupation relationships...")
try:
    cursor.executemany("""
        INSERT INTO Artist_Occupations (artist_name, birth_year, occupation_name)
        VALUES (%(artist_name)s, %(birth_year)s, %(occupation_name)s)
        ON CONFLICT (artist_name, birth_year_key, occupation_name) DO NOTHING;
    """, artist_occupations)
    conn.commit()
    print(f"Successfully inserted {len(artist_occupations)} artist-occupation relationships")
except Exception as e:
    conn.rollback()
    print(f"Error batch inserting artist occupations: {e}")

print("Artist Occupations Insertion Complete")

#### Schools and Artist_Schools

In [None]:
# Create and populate PaintingSchools table
painting_schools_data = artists_combined['PaintingSchool']

# Create set to store unique painting schools
unique_schools = set()

# Process each row in PaintingSchool column with tqdm progress bar
print("Extracting unique painting schools...")
for school in tqdm(painting_schools_data, desc="Processing schools"):
    # Only text cells can be split; NaN and other non-strings are skipped.
    if not isinstance(school, str):
        continue
    if 'unknown' in school.lower():
        continue
    for s in school.lower().split(","):
        s = s.strip()
        # Stray or trailing commas produce empty pieces. They are skipped here,
        # matching the `if school:` guard in the Artist_Schools cell.
        if s:
            unique_schools.add(s)

# Insert unique painting schools into PaintingSchools table with tqdm progress bar
print("Inserting unique painting schools into the Schools table...")
for school in tqdm(sorted(unique_schools), desc="Inserting schools"):
    try:
        cursor.execute("""
            INSERT INTO Schools (school_name)
            VALUES (%s)
            ON CONFLICT (school_name) DO NOTHING;
        """, (school,))
    except Exception as e:
        print(f"Error inserting painting school {school} -> {e}")
        conn.rollback()

# Commit changes
conn.commit()

In [None]:
# Populate Artist_PaintingSchools table
artist_schools = []
seen_school_links = set()  # the frame has one row per artwork, so links repeat

# Extract artist-school relationships
print("Processing artist-painting school relationships...")
for index, row in tqdm(artists_combined.iterrows(), total=len(artists_combined), desc="Processing artists"):
    # Use the same name column as the Artists insert ('artist'). 'artist_name'
    # is the WikiData merge key and is NaN for painters without a WikiData match.
    artist_name = row['artist']
    if pd.isna(artist_name):
        continue

    birth_year = row['birth_year']
    if pd.isna(birth_year) or birth_year > 2030:
        birth_year = None
    birth_year = int(birth_year) if birth_year is not None else None
    birth_year = None if birth_year == 0 else birth_year

    # Get painting schools for this artist
    school = row.get('PaintingSchool', None)
    if not isinstance(school, str):
        continue
    if 'unknown' in school.lower():
        continue

    for school_name in school.lower().split(","):
        school_name = school_name.strip()
        if not school_name:
            continue

        link = (artist_name, birth_year, school_name)
        if link in seen_school_links:
            continue
        seen_school_links.add(link)

        # Add to our list of relationships to insert
        artist_schools.append({
            'artist_name': artist_name,
            'birth_year': birth_year,
            'school_name': school_name,
        })

# Insert artist-painting school relationships using executemany with psycopg2
print("Inserting artist-painting school relationships...")
try:
    cursor.executemany("""
        INSERT INTO Artist_Schools (artist_name, birth_year, school_name)
        VALUES (%(artist_name)s, %(birth_year)s, %(school_name)s)
        ON CONFLICT (artist_name, birth_year_key, school_name) DO NOTHING;
    """, artist_schools)
    conn.commit()
    print(f"Successfully inserted {len(artist_schools)} artist-painting school relationships")
except Exception as e:
    conn.rollback()
    print(f"Error batch inserting artist painting schools: {e}")

print("Artist Painting Schools Insertion Complete")

#### Inserting Artworks, Additional Artists, Artists_Artworks

In [None]:
# Fetch existing artists from the database to prevent duplicate inserts
cursor.execute("SELECT artist_name, birth_year FROM Artists;")
existing_artists = set(cursor.fetchall())  # Store as a set of (artist_name, birth_year)


def _first_year(value):
    """Return the first 4-digit number found in ``value`` as an int, or ``None``."""
    match = re.search(r'\d{4}', str(value))
    return int(match.group(0)) if match else None


def _split_years(value):
    """Turn a '(1930) (1945)' style field into [1930, 1945]; unparseable entries become None."""
    if value is None or (not isinstance(value, str) and pd.isna(value)):
        return []
    tokens = str(value).replace("(", "").replace(")", "").split(" ")
    return [_first_year(token) for token in tokens]


def _split_groups(value, drop_unknown=False):
    """Turn a '(American) (French)' style field into ['American', 'French'].

    Empty entries become ``None``; with ``drop_unknown`` entries containing
    'unknown' do too. Non-string input (e.g. NaN) yields an empty list.
    """
    if not isinstance(value, str):
        return []
    if ") (" in value:
        parts = value.strip("(").strip(")").split(") (")
    else:
        parts = [value.replace("(", "").replace(")", "")]
    cleaned = []
    for part in parts:
        part = part.strip()
        if not part or (drop_unknown and "unknown" in part.lower()):
            cleaned.append(None)
        else:
            cleaned.append(part)
    return cleaned


def _item_at(values, position):
    """Return ``values[position]``, or ``None`` when the list is too short."""
    return values[position] if position < len(values) else None


# Initiate lists to store data for batch insert
artists_to_insert = []
artworks_to_insert = []
artists_artworks_to_insert = []

# moma_artworks was exploded to one row per constituent, so the same artwork
# (and the same artist/artwork pair) can show up several times.
seen_object_ids = set()
seen_links = set()

for _, row in tqdm(moma_artworks.iterrows(), total=len(moma_artworks)):
    title = row.get('Title', None)
    object_id = row.get('ObjectID', None)

    # Extract the first four-digit year from the artwork date
    artwork_date = _first_year(row.get('Date', None))

    # Several artists are stored in one comma-separated field.
    # NOTE(review): a single name that itself contains ", " would be split too.
    artist_field = row.get('Artist', None)
    if isinstance(artist_field, str):
        artist_names = artist_field.split(", ") if "," in artist_field else [artist_field]
    else:
        artist_names = []

    # Per-artist attributes are positional lists parallel to the name list
    birth_years = _split_years(row.get('BeginDate', None))
    death_years = _split_years(row.get('EndDate', None))
    nationalities = _split_groups(row.get('Nationality', None), drop_unknown=True)
    genders = _split_groups(row.get('Gender', None))

    medium = row.get('Medium', None)
    department = row.get('Department', None)
    date_acquired = row.get('DateAcquired', None)
    art_classification = row.get('Classification', None)
    credit_line = row.get('CreditLine', None)

    for position, artist_name in enumerate(artist_names):
        birth_year = _item_at(birth_years, position)

        # Skip if artist_name or birth_year is missing
        if not artist_name or birth_year is None:
            continue

        # Queue the artist only if it is not already known
        if (artist_name, birth_year) not in existing_artists:
            artists_to_insert.append((artist_name,
                                      birth_year,
                                      _item_at(nationalities, position),
                                      _item_at(genders, position),
                                      _item_at(death_years, position)))
            existing_artists.add((artist_name, birth_year))  # Add to cache to prevent duplicate inserts

        # The artist/artwork link is needed for EVERY artist of the artwork,
        # not only for artists seen for the first time.
        link_key = (object_id, artist_name, birth_year)
        if link_key not in seen_links:
            seen_links.add(link_key)
            artists_artworks_to_insert.append((artist_name, birth_year, title, artwork_date))

    # Record each artwork once, even though it appears once per constituent
    if object_id is None or object_id not in seen_object_ids:
        seen_object_ids.add(object_id)
        artworks_to_insert.append((title, artwork_date, medium, department, date_acquired, art_classification, credit_line))

# Write the import files next to the notebook (relative paths, like the other CSV files)
pd.DataFrame(artists_to_insert, columns=['artist_name', 'birth_year', 'nationality', 'gender', 'death_year']).to_csv("artists_to_insert.csv", index=False)
pd.DataFrame(artworks_to_insert, columns=['title', 'artwork_date', 'medium', 'department', 'date_acquired', 'art_classification', 'credit_line']).to_csv("artworks_to_insert.csv", index=False)
pd.DataFrame(artists_artworks_to_insert, columns=['artist_name', 'birth_year', 'title', 'artwork_date']).to_csv("artists_artworks_to_insert.csv", index=False)

#### Important Note

Use files "artists_to_insert.csv", "artworks_to_insert.csv", and "artists_artworks_to_insert.csv" to manually import data into tables using DataGrip.

Inserting these rows from Python, whether with `psycopg2.extras.execute_values` or with regular per-row query execution calls, is too slow to finish in a reasonable amount of time.

#### Insert Movement and Artist Movement

In [None]:
def _parse_movements(value):
    """Return the list of movement names contained in one ``movement`` cell.

    Strings are split on ', '; lists and NumPy arrays are used element-wise; any
    other non-null scalar is treated as a single name. Empty names and the
    'Unknown' placeholder written by the cleaning step are dropped.
    """
    if isinstance(value, str):
        candidates = value.split(', ')
    elif isinstance(value, np.ndarray):
        candidates = value.tolist()
    elif isinstance(value, list):
        candidates = value
    elif pd.notna(value):
        candidates = [value]
    else:
        candidates = []

    names = []
    for candidate in candidates:
        if candidate is None or (isinstance(candidate, float) and pd.isna(candidate)):
            continue
        name = str(candidate).strip()
        if name and name.lower() != 'unknown':
            names.append(name)
    return names


# Extract unique movements and insert into Movements table
movements_data = artists_combined['movement']

# Create set to store unique movements
unique_movements = set()

# Process each row in movements column with tqdm progress bar
print("Extracting unique movements...")
for movements in tqdm(movements_data, desc="Processing movements"):
    unique_movements.update(_parse_movements(movements))

# Insert unique movements into Movements table with tqdm progress bar
print("Inserting unique movements into the Movements table...")
for movement in tqdm(sorted(unique_movements), desc="Inserting movements"):
    try:
        cursor.execute("""
            INSERT INTO Movements (movement_name)
            VALUES (%s)
            ON CONFLICT (movement_name) DO NOTHING;
        """, (movement,))
    except Exception as e:
        print(f"Error inserting movement {movement} -> {e}")
        conn.rollback()

# Commit the movements now, so a failed relationship batch below cannot roll them back
conn.commit()

# Populate the Artist_Movements table
artist_movements = []
seen_movement_links = set()  # the frame has one row per artwork, so links repeat

# Process DataFrame to extract artist-movement relationships
print("Processing artist-movement relationships...")
for index, row in tqdm(artists_combined.iterrows(), total=len(artists_combined), desc="Processing artists"):
    # Use the same name column as the Artists insert ('artist'). 'artist_name'
    # is the WikiData merge key and is NaN for painters without a WikiData match.
    artist_name = row['artist']
    birth_year = row['birth_year']

    if pd.isna(artist_name) or pd.isna(birth_year) or birth_year > 2030:
        continue  # Skip unusable rows

    birth_year = int(birth_year) # Convert to int
    if birth_year == 0:
        continue  # 0 is the "unknown year" placeholder

    # Add to our list of relationships to insert
    for movement in _parse_movements(row['movement']):
        link = (artist_name, birth_year, movement)
        if link in seen_movement_links:
            continue
        seen_movement_links.add(link)
        artist_movements.append({
            'artist_name': artist_name,
            'birth_year': birth_year,
            'movement_name': movement
        })

# Insert artist-movement relationships using executemany with psycopg2
print("Inserting artist-movement relationships...")
try:
    cursor.executemany("""
        INSERT INTO Artist_Movements (artist_name, birth_year, movement_name)
        VALUES (%(artist_name)s, %(birth_year)s, %(movement_name)s)
        ON CONFLICT (artist_name, birth_year_key, movement_name) DO NOTHING;
    """, artist_movements)
    conn.commit()
    print(f"Successfully inserted {len(artist_movements)} artist-movement relationships")
except Exception as e:
    conn.rollback()
    print(f"Error batch inserting artist movements: {e}")

print("Artist Movements Insertion Complete")

#### Styles and Artist_Styles

In [None]:
# artists_combined holds one row per artwork, so the same artist/style data
# repeats many times. Rows already handled are skipped to avoid re-running
# identical queries; the resulting table contents are the same.
processed_style_rows = set()

# Iterate through each row in the df
for _, row in tqdm(artists_combined.iterrows(), total=artists_combined.shape[0], desc="Inserting styles"):
    artist_name = row['artist']
    birth_year = row['birth_year']

    # Skip if birth_year is NaN, a placeholder (0 / -999999), too large, or artist_name is missing
    if pd.isna(artist_name) or pd.isna(birth_year) or birth_year in (0, -999999) or birth_year > 2030:
        continue

    birth_year = int(birth_year)

    row_key = (artist_name, birth_year, str(row['styles']), str(row['StylesCount']), str(row['StylesYears']))
    if row_key in processed_style_rows:
        continue
    processed_style_rows.add(row_key)

    # Fetch birth_year_key
    cursor.execute("""
        SELECT birth_year_key
        FROM Artists
        WHERE artist_name = %s AND birth_year = %s;
    """, (artist_name, birth_year))
    result = cursor.fetchone()

    if result is None:
        continue  # Skip if artist record doesn't exist

    birth_year_key = result[0]

    # Process styles
    styles = row['styles']
    if isinstance(styles, str):
        # Strip brackets, quotes, and unnecessary spaces
        styles = re.sub(r'[\[\]"\']', '', styles).strip()
        styles = [s.strip() for s in styles.split(',')]
    elif isinstance(styles, list):
        styles = [str(s) for s in styles if pd.notna(s)]
    else:
        continue  # NaN / None / anything else cannot be iterated as style names

    # Process StylesCount dictionary
    styles_count = {}
    if isinstance(row['StylesCount'], str):
        clean_styles_count = re.sub(r'[\[\]{}"\']', '', row['StylesCount']).strip()  # Clean brackets, quotes
        for entry in clean_styles_count.split(', '):
            try:
                style, count = entry.rsplit(':', 1)
                styles_count[style.strip()] = int(count)
            except ValueError:
                continue  # entry without a "name:count" shape

    # Process StylesYears dictionary
    styles_years = {}
    if isinstance(row['StylesYears'], str):
        clean_styles_years = re.sub(r'[\[\]{}"\']', '', row['StylesYears']).strip()  # Clean brackets, quotes
        for entry in clean_styles_years.split(','):
            try:
                style, years = entry.split(':', 1)
                styles_years[style.strip()] = years.strip()
            except ValueError:
                continue  # entry without a "name:years" shape

    # Insert styles into the Styles table if they don't exist
    for style in styles:
        style = re.sub(r'[\[\]"\']', '', style).strip()  # Clean each style name

        # Skip empty names and the 'Unknown' placeholder written by the cleaning step
        if not style or style.lower() == 'unknown':
            continue

        # Insert style if it doesn't already exist
        cursor.execute("""
            INSERT INTO Styles (style_name)
            VALUES (%s)
            ON CONFLICT (style_name) DO NOTHING;
        """, (style,))

        # Check if the artist_name, birth_year_key, and style_name combination already exists
        cursor.execute("""
            SELECT 1
            FROM Artist_Styles
            WHERE artist_name = %s AND birth_year_key = %s AND style_name = %s;
        """, (artist_name, birth_year_key, style))

        # If combination exists, skip insertion for this style
        if cursor.fetchone():
            continue

        # If no existing record, insert it
        cursor.execute("""
            INSERT INTO Artist_Styles (artist_name, birth_year, style_name, style_count, style_years)
            VALUES (%s, %s, %s, %s, %s);
        """, (artist_name, birth_year, style, styles_count.get(style, None), styles_years.get(style, None)))

# Commit changes
conn.commit()

#### Process Artist Relationships for CSV Import

In [None]:
# Cache every artist name currently stored in the Artists table.
# Each fetched row is a 1-tuple, so membership tests must use `(name,)`.
cursor.execute("SELECT artist_name FROM Artists;")
existing_artists = {name_row for name_row in cursor.fetchall()}

# Clean artist names
def clean_artist_name(name):
    """Reduce a raw relationship entry to a bare artist name.

    Three substitutions are applied in order, then surrounding whitespace is
    trimmed:

    1. a leading ``Artists2/<folder>/`` prefix is removed;
    2. a trailing ``/<text><whitespace><4 digits>...`` segment is removed;
    3. a trailing ``/Not_Detected_<digits>...`` segment is removed.
    """
    removal_patterns = (
        r'^Artists2/[^/]+/',
        r'/[^/]+\s\d{4}.*$',
        r'/Not_Detected_\d+.*$',
    )
    cleaned = name
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()

# Parse relationship string into list of artist names
def parse_relationship(relationship_str):
    """Parse a stringified list of related artists into cleaned artist names.

    Parameters
    ----------
    relationship_str : str or missing value
        Text of a Python list literal such as ``"['A', 'B']"``. Any value that
        is not a string (NaN, None, ...) and the placeholder ``"['Unknown']"``
        are treated as "no relationships".

    Returns
    -------
    list of str
        Every kept entry, passed through ``clean_artist_name``. An entry is
        dropped when it is empty, is not a string, starts with one of
        'Louvre', 'Museo', 'National', 'Royal', 'Private', looks like a
        dimension (``<digits> x <digits>``), ends with 'cm', 'France' or
        'Spain', or consists only of lowercase letters and hyphens.
        An empty list is returned when the text cannot be parsed or does not
        evaluate to a list, tuple or set.
    """
    # Non-string input (NaN, None, already-parsed objects) carries no parsable text.
    if not isinstance(relationship_str, str) or relationship_str == "['Unknown']":
        return []

    # Only the failure modes of literal_eval are treated as "no data";
    # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
    try:
        parsed = literal_eval(relationship_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []

    # A bare string would otherwise be iterated character by character.
    if not isinstance(parsed, (list, tuple, set)):
        return []

    artists = []
    for item in parsed:
        # Skip empty or non-string entries instead of discarding the whole list.
        if not isinstance(item, str) or not item:
            continue
        looks_like_non_artist = (
            item.startswith(('Louvre', 'Museo', 'National', 'Royal', 'Private'))
            or re.match(r'^\d+\s*x\s*\d+', item)
            or item.endswith(('cm', 'France', 'Spain'))
            or re.match(r'^[a-z\-]+$', item)  # Lowercase with hyphens (tags)
        )
        if looks_like_non_artist:
            continue
        artists.append(clean_artist_name(item))
    return artists

# Process each row to extract relationships.
# Each pair maps a source column of artists_combined to the relationship_type
# label written for entries found in that column. The order of the pairs fixes
# the order in which relationships are emitted for a given artist.
RELATIONSHIP_COLUMNS = (
    ('Pupils', 'Teacher'),
    ('Teachers', 'Pupil'),
    ('Influencedby', 'Influenced By'),
    ('Influencedon', 'Influenced On'),
    ('FriendsandCoworkers', 'Friend'),
)

relationships = []

for _idx, row in tqdm(artists_combined.iterrows(), total=artists_combined.shape[0]):
    artist_name = row.get('artist', None)

    # Skip if this artist doesn't exist in the database.
    # existing_artists holds 1-tuples, hence the tuple membership test.
    if (artist_name,) not in existing_artists:
        continue

    # Normalise the birth year: missing values, values above 2030 and values
    # that truncate to 0 are all recorded as None.
    birth_year = row.get('birth_year', None)
    if pd.isna(birth_year) or birth_year > 2030 or int(birth_year) == 0:
        birth_year = None
    else:
        birth_year = int(birth_year)

    for source_column, relationship_type in RELATIONSHIP_COLUMNS:
        for related_name in parse_relationship(row.get(source_column)):
            # Keep only links whose other end is also a known artist.
            if (related_name,) in existing_artists:
                relationships.append({
                    'artist1_name': artist_name,
                    'birth_year1': birth_year,
                    'artist2_name': related_name,
                    'relationship_type': relationship_type
                })

# Dump the collected relationships to disk: one header row, then one line per
# relationship, with fields in the same order as the header.
output_columns = ['artist1_name', 'birth_year1', 'artist2_name', 'relationship_type']

with open('clean_and_import_resources/artist_relationships.csv',
          'w', newline='', encoding='utf-8') as csv_file:
    relationship_writer = csv.writer(csv_file)
    relationship_writer.writerow(output_columns)
    relationship_writer.writerows(
        [entry[column] for column in output_columns]
        for entry in relationships
    )

print(f"Processed {len(relationships)} relationships between existing artists")

#### Close Connection

In [None]:
# Release the database resources: close the cursor first, then the connection.
cursor.close()
conn.close()