### MoMA and PainterPallette Data Augmentation with Wikidata

This notebook adds additional information missing from the static data with wikidata api calls.

Upload data from Artists.txt, Artwork.txt, and PainterPalette.csv

In [None]:
import sys
sys.path.append('../')

In [None]:
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import json
import re

#### Read in PainterPalette CSV

In [None]:
pp_df = pd.read_csv("202 Project/PainterPalette.csv")  
column_names = [
    "ConstituentID", "DisplayName", "ArtistBio", "Nationality", "Gender",
    "BeginDate", "EndDate", "Wiki QID", "ULAN"
]

# Read the file as CSV, handling missing values
artists_df = pd.read_csv("202 Project/Artists.txt", header=None, names=column_names, dtype={"ConstituentID": str}, na_values=["", " ", "null"])

artworks_df = pd.read_csv("202 Project/Artworks.txt", header=None, names=column_names, dtype={"ConstituentID": str}, na_values=["", " ", "null"])



  artworks_df = pd.read_csv("202 Project/Artworks.txt", header=None, names=column_names, dtype={"ConstituentID": str}, na_values=["", " ", "null"])


#### Format for the artists.txt

  "ConstituentID": 1,  
  "DisplayName": "Robert Arneson",   
  "ArtistBio": "American, 1930–1992",  
  "Nationality": "American",  
  "Gender": "male",  
  "BeginDate": 1930,  
  "EndDate": 1992,  
  "Wiki QID": null,  
  "ULAN": null  



In [3]:
print(artists_df)

       ConstituentID          DisplayName                         ArtistBio  \
0      ConstituentID          DisplayName                         ArtistBio   
1                  1       Robert Arneson               American, 1930–1992   
2                  2       Doroteo Arnaiz                Spanish, born 1936   
3                  3          Bill Arnold               American, born 1941   
4                  4      Charles Arnoldi               American, born 1946   
...              ...                  ...                               ...   
15663         138843      Liliana Maresca            Argentine, 1951 – 1994   
15664         138844          Elba Bairon               Bolivian, born 1947   
15665         138845      Marcia Schvartz                         born 1955   
15666         138858  Anastasia Samoylova  American, born Russia, born 1984   
15667         138892       Sadie Red Wing             Indigenous, born 1990   

       Nationality  Gender  BeginDate  EndDate  Wik

In [None]:
#Check if ConstituentID has duplicates

duplicates = artists_df.duplicated(subset="ConstituentID", keep=False) 
dupe_counts = artists_df["ConstituentID"].value_counts()
dupes = dupe_counts[dupe_counts > 1]  

print(dupes)


Series([], Name: count, dtype: int64)


In [None]:
# Total number of rows
total_rows = len(artists_df)

# Count NaN rows
null_rows = artists_df[["Nationality", "Gender", "BeginDate", "EndDate"]].isnull().any(axis=1).sum()

# Counts zeros
zero_rows = (artists_df[["BeginDate", "EndDate"]] == 0).any(axis=1).sum()

percent_null_rows = (null_rows / total_rows) * 100
percent_zero_rows = (zero_rows / total_rows) * 100

# Checking for missing values (NaN/null)
missing_counts = artists_df[["Nationality", "Gender", "BeginDate", "EndDate"]].isnull().sum()

# Checking for zero values in numerical columns
zero_counts = (artists_df[["BeginDate", "EndDate"]] == 0).sum()

# Display results
print("Missing (NaN) values per column:")
print(missing_counts)

print("\nZero values per column:")
print(zero_counts)

print(f"\nPercentage of rows with at least one NaN value: {percent_null_rows:.2f}%")
print(f"Percentage of rows with at least one zero value: {percent_zero_rows:.2f}%")


Missing (NaN) values per column:
Nationality    2499
Gender         3276
BeginDate         0
EndDate           0
dtype: int64

Zero values per column:
BeginDate    0
EndDate      0
dtype: int64

Percentage of rows with at least one NaN value: 25.02%
Percentage of rows with at least one zero value: 0.00%


#### Pulls artist dataframes from WikiData

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import json
import re

#Query to pull wikidata information
query = """
SELECT ?artist ?artistLabel ?nationalityLabel ?birthDate WHERE {
  ?artist wdt:P31 wd:Q5;  # Instance of human
         wdt:P106 wd:Q1028181;  # Occupation: artist
         wdt:P27 ?nationality;  # Country of citizenship
         OPTIONAL { ?artist wdt:P569 ?birthDate. }  # Date of birth (optional to prevent missing values)
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
LIMIT 1***REMOVED***  # Limit results to prevent timeouts
"""

# Initialize SPARQL wrapper
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery(query)
sparql.setReturnFormat(JSON)

try:

    response = sparql.query().response.read().decode("utf-8")

    # JSON validation
    if not response.strip().startswith("{"):
        raise ValueError("SPARQL endpoint did not return JSON. Response received:\n" + response)

    # Error management
    cleaned_response = re.sub(r"[\x00-\x1F\x7F]", "", response)

    
    results = json.loads(cleaned_response)

    # Process results into a DF
    data = []
    for result in results.get("results", {}).get("bindings", []):
        data.append({
            "artist": result["artistLabel"]["value"],
            "nationality": result.get("nationalityLabel", {}).get("value", None),
            "birthDate": result.get("birthDate", {}).get("value", None)
        })

    wikidata_df = pd.DataFrame(data)


    wiki_len = len(wikidata_df)
    print(f"Total records fetched: {wiki_len}")
    print(wikidata_df.head())

except json.JSONDecodeError as e:
    print(f"JSONDecodeError: {e}")
except ValueError as ve:
    print(f"ValueError: {ve}")
except Exception as ex:
    print(f"Unexpected error: {ex}")


Total records fetched: 1***REMOVED***
                  artist     nationality             birthDate
0       Matthias Brandes         Germany  1950-01-01T00:00:00Z
1       Mikhail Larionov          France  1881-06-03T00:00:00Z
2       Mikhail Larionov  Russian Empire  1881-06-03T00:00:00Z
3       André Cluysenaar         Belgium  1872-05-31T00:00:00Z
4  Pierre-Auguste Renoir          France  1841-02-25T00:00:00Z


In [None]:
#Fetch Records

import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import json
import re

# Define the SPARQL query to fetch as many relevant columns as possible
query = """
SELECT ?artist ?artistLabel ?nationalityLabel ?citizenshipLabel ?genderLabel ?birthDate ?deathDate ?birthPlaceLabel ?deathPlaceLabel 
       ?movementLabel ?occupationLabel ?paintingSchoolLabel ?influencedByLabel ?influencedOnLabel ?pupilsLabel ?teachersLabel ?friendsLabel
WHERE {
  ?artist wdt:P31 wd:Q5;  # Instance of human
         wdt:P106 wd:Q1028181.  # Occupation: artist
  OPTIONAL { ?artist wdt:P27 ?nationality. }  # Country of citizenship
  OPTIONAL { ?artist wdt:P27 ?citizenship. }  # Citizenship
  OPTIONAL { ?artist wdt:P21 ?gender. }  # Gender
  OPTIONAL { ?artist wdt:P569 ?birthDate. }  # Date of birth
  OPTIONAL { ?artist wdt:P570 ?deathDate. }  #  Date of death
  OPTIONAL { ?artist wdt:P19 ?birthPlace. }  # Birthplace
  OPTIONAL { ?artist wdt:P20 ?deathPlace. }  # Deathplace
  OPTIONAL { ?artist wdt:P135 ?movement. }  # Artistic movement
  OPTIONAL { ?artist wdt:P106 ?occupation. }  # Occupation
  OPTIONAL { ?artist wdt:P1027 ?paintingSchool. }  # Painting school
  OPTIONAL { ?artist wdt:P737 ?influencedBy. }  # Influenced by
  OPTIONAL { ?artist wdt:P737 ?influencedOn. }  # Influenced on
  OPTIONAL { ?artist wdt:P802 ?pupils. }  # Pupils
  OPTIONAL { ?artist wdt:P108 ?teachers. }  # Teachers
  OPTIONAL { ?artist wdt:P1416 ?friends. }  # Friends and coworkers
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
LIMIT 1***REMOVED***  # Limit results to prevent timeouts
"""

# Initialize SPARQL wrapper
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery(query)
sparql.setReturnFormat(JSON)

try:
    # Fetch response safely
    response = sparql.query().response.read().decode("utf-8")

    # Validate if response is JSON
    if not response.strip().startswith("{"):
        raise ValueError("SPARQL endpoint did not return JSON. Response received:\n" + response)

    # Remove invalid control characters that may cause JSONDecodeError
    cleaned_response = re.sub(r"[\x00-\x1F\x7F]", "", response)

    # Convert JSON response safely
    results = json.loads(cleaned_response)

    # Process results into a DataFrame
    data = []
    for result in results.get("results", {}).get("bindings", []):
        data.append({
            "artist": result["artistLabel"]["value"],
            "nationality": result.get("nationalityLabel", {}).get("value", None),
            "citizenship": result.get("citizenshipLabel", {}).get("value", None),
            "gender": result.get("genderLabel", {}).get("value", None),
            "birth_year": result.get("birthDate", {}).get("value", None),
            "death_year": result.get("deathDate", {}).get("value", None),
            "birth_place": result.get("birthPlaceLabel", {}).get("value", None),
            "death_place": result.get("deathPlaceLabel", {}).get("value", None),
            "movement": result.get("movementLabel", {}).get("value", None),
            "occupation": result.get("occupationLabel", {}).get("value", None),
            "painting_school": result.get("paintingSchoolLabel", {}).get("value", None),
            "influenced_by": result.get("influencedByLabel", {}).get("value", None),
            "influenced_on": result.get("influencedOnLabel", {}).get("value", None),
            "pupils": result.get("pupilsLabel", {}).get("value", None),
            "teachers": result.get("teachersLabel", {}).get("value", None),
            "friends_and_coworkers": result.get("friendsLabel", {}).get("value", None),
        })

    wikidata_df = pd.DataFrame(data)

    # Save the fetched data to a CSV file
    wikidata_df.to_csv("Wikidata_Artists_Info.csv", index=False)

    # Display results
    print(f"Total records fetched: {len(wikidata_df)}")
    print(wikidata_df.head())

except json.JSONDecodeError as e:
    print(f"JSONDecodeError: {e}")
except ValueError as ve:
    print(f"ValueError: {ve}")
except Exception as ex:
    print(f"Unexpected error: {ex}")


Total records fetched: 1***REMOVED***
      artist     nationality     citizenship gender            birth_year  \
0  Rembrandt  Dutch Republic  Dutch Republic   male  1606-07-15T00:00:00Z   
1  Rembrandt  Dutch Republic  Dutch Republic   male  1606-07-15T00:00:00Z   
2  Rembrandt  Dutch Republic  Dutch Republic   male  1606-07-15T00:00:00Z   
3  Rembrandt  Dutch Republic  Dutch Republic   male  1606-07-15T00:00:00Z   
4  Rembrandt  Dutch Republic  Dutch Republic   male  1606-07-15T00:00:00Z   

             death_year birth_place death_place                   movement  \
0  1669-10-04T00:00:00Z      Leiden   Amsterdam  Dutch Golden Age painting   
1  1669-10-04T00:00:00Z      Leiden   Amsterdam  Dutch Golden Age painting   
2  1669-10-04T00:00:00Z      Leiden   Amsterdam  Dutch Golden Age painting   
3  1669-10-04T00:00:00Z      Leiden   Amsterdam  Dutch Golden Age painting   
4  1669-10-04T00:00:00Z      Leiden   Amsterdam  Dutch Golden Age painting   

      occupation painting_scho

In [87]:
null_counts = wikidata_df.isnull().sum()

# Print missing values for each column
print("Missing (NaN) values per column:")
print(null_counts)

Missing (NaN) values per column:
artist                       0
nationality                 12
citizenship                 12
gender                       0
birth_year                   0
death_year                 120
birth_place                  0
death_place                125
movement                   953
occupation                   0
painting_school          1***REMOVED***
influenced_by             1678
influenced_on             1678
pupils                    5371
teachers                  7411
friends_and_coworkers    1***REMOVED***
dtype: int64


#### Saving CSV for download

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import json
import re
from collections import defaultdict

# Define the SPARQL query to fetch as many relevant columns as possible
query = """
SELECT ?artist ?artistLabel ?nationalityLabel ?citizenshipLabel ?genderLabel ?birthDate ?deathDate ?birthPlaceLabel ?deathPlaceLabel 
       ?movementLabel ?occupationLabel ?paintingSchoolLabel ?influencedByLabel ?influencedOnLabel ?pupilsLabel ?teachersLabel ?friendsLabel
WHERE {
  ?artist wdt:P31 wd:Q5;  # Instance of human
         wdt:P106 wd:Q1028181.  # Occupation: artist
  OPTIONAL { ?artist wdt:P27 ?nationality. }  # Country of citizenship
  OPTIONAL { ?artist wdt:P27 ?citizenship. }  # Citizenship
  OPTIONAL { ?artist wdt:P21 ?gender. }  # Gender
  OPTIONAL { ?artist wdt:P569 ?birthDate. }  # Date of birth
  OPTIONAL { ?artist wdt:P570 ?deathDate. }  # Date of death
  OPTIONAL { ?artist wdt:P19 ?birthPlace. }  # Birthplace
  OPTIONAL { ?artist wdt:P20 ?deathPlace. }  # Deathplace
  OPTIONAL { ?artist wdt:P135 ?movement. }  # Artistic movement
  OPTIONAL { ?artist wdt:P106 ?occupation. }  # Occupation
  OPTIONAL { ?artist wdt:P1027 ?paintingSchool. }  # Painting school
  OPTIONAL { ?artist wdt:P737 ?influencedBy. }  # Influenced by
  OPTIONAL { ?artist wdt:P737 ?influencedOn. }  # Influenced on
  OPTIONAL { ?artist wdt:P802 ?pupils. }  # Pupils
  OPTIONAL { ?artist wdt:P108 ?teachers. }  # Teachers
  OPTIONAL { ?artist wdt:P1416 ?friends. }  # Friends and coworkers
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
LIMIT 1***REMOVED***  # Limit results to prevent timeouts
"""

# Initialize SPARQL wrapper
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery(query)
sparql.setReturnFormat(JSON)

try:
    response = sparql.query().response.read().decode("utf-8")

    # JSON Validation
    if not response.strip().startswith("{"):
        raise ValueError("SPARQL endpoint did not return JSON. Response received:\n" + response)

    # Remove invalid control characters that may cause JSONDecodeError
    cleaned_response = re.sub(r"[\x00-\x1F\x7F]", "", response)

    # Convert JSON response safely
    results = json.loads(cleaned_response)

    # Initialize a dictionary to store unique artist data with lists for multiple values
    artist_data = defaultdict(lambda: {
        "nationality": [], "citizenship": [], "gender": [], "birth_year": None, "death_year": None,
        "birth_place": [], "death_place": [], "movement": [], "occupation": [], "painting_school": [],
        "influenced_by": [], "influenced_on": [], "pupils": [], "teachers": [], "friends_and_coworkers": []
    })

    # Process results into the dictionary, aggregating multiple values into lists
    for result in results.get("results", {}).get("bindings", []):
        artist_name = result["artistLabel"]["value"]

        artist_data[artist_name]["nationality"].append(result.get("nationalityLabel", {}).get("value", None))
        artist_data[artist_name]["citizenship"].append(result.get("citizenshipLabel", {}).get("value", None))
        artist_data[artist_name]["gender"].append(result.get("genderLabel", {}).get("value", None))
        artist_data[artist_name]["birth_place"].append(result.get("birthPlaceLabel", {}).get("value", None))
        artist_data[artist_name]["death_place"].append(result.get("deathPlaceLabel", {}).get("value", None))
        artist_data[artist_name]["movement"].append(result.get("movementLabel", {}).get("value", None))
        artist_data[artist_name]["occupation"].append(result.get("occupationLabel", {}).get("value", None))
        artist_data[artist_name]["painting_school"].append(result.get("paintingSchoolLabel", {}).get("value", None))
        artist_data[artist_name]["influenced_by"].append(result.get("influencedByLabel", {}).get("value", None))
        artist_data[artist_name]["influenced_on"].append(result.get("influencedOnLabel", {}).get("value", None))
        artist_data[artist_name]["pupils"].append(result.get("pupilsLabel", {}).get("value", None))
        artist_data[artist_name]["teachers"].append(result.get("teachersLabel", {}).get("value", None))
        artist_data[artist_name]["friends_and_coworkers"].append(result.get("friendsLabel", {}).get("value", None))

        # Assign unique values for birth and death year (assuming single values)
        if artist_data[artist_name]["birth_year"] is None:
            artist_data[artist_name]["birth_year"] = result.get("birthDate", {}).get("value", None)
        if artist_data[artist_name]["death_year"] is None:
            artist_data[artist_name]["death_year"] = result.get("deathDate", {}).get("value", None)

    # Convert lists with single values into single values
    for artist in artist_data:
        for key in artist_data[artist]:
            if isinstance(artist_data[artist][key], list):
                artist_data[artist][key] = list(filter(None, artist_data[artist][key]))  # Remove None values

    # Convert dictionary to DF
    final_wikidata_df = pd.DataFrame.from_dict(artist_data, orient="index").reset_index().rename(columns={"index": "artist"})

    # Save and display the final DataFrame
    final_wikidata_df.to_csv("Wikidata_Artists_Info_Aggregated.csv", index=False)



except json.JSONDecodeError as e:
    print(f"JSONDecodeError: {e}")
except ValueError as ve:
    print(f"ValueError: {ve}")
except Exception as ex:
    print(f"Unexpected error: {ex}")


#### PULL_ARTIST_FROM_WIKI


In [8]:
#Pulls artist name and qid from the wikidata source
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

# SPARQL endpoint for Wikidata
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

# SPARQL query to fetch all painters and their QIDs
def fetch_all_painters():
    query = """
    SELECT ?artist ?artistLabel WHERE {
      ?artist wdt:P31 wd:Q5;  # Instance of human
              wdt:P106 wd:Q1028181.  # Occupation: Painter
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    LIMIT 12500
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    
    painter_data = []
    for result in results["results"]["bindings"]:
        qid = result["artist"]["value"].split("/")[-1]  # Extract QID
        name = result.get("artistLabel", {}).get("value", "Unknown")
        painter_data.append((name, qid))
    
    return painter_data

# Fetch all painters
painters = fetch_all_painters()

# Convert to DataFrame
painter_df = pd.DataFrame(painters, columns=["Artist Name", "Wikidata QID"])

# Save to CSV
painter_df.to_csv("202 Project/All_Painters_QID_3
s.csv", index=False)

# Print results
print(painter_df)

                    Artist Name Wikidata QID
0                George W. Bush         Q207
1                  Claude Monet         Q296
2               Diego Velázquez         Q297
3                      El Greco         Q301
4                     Bob Dylan         Q392
...                         ...          ...
12495  Friedrich Karl Rupprecht     Q1460252
12496    Karl Friedrich Ströher     Q1460262
12497     Friedrich Karl Thauer     Q1460265
12498                Jakob Häne     Q1460276
12499               János Viski     Q1460294

[12500 rows x 2 columns]


In [23]:
#This merges the painter palette csv and the all painters qid generated from the query, only pulls 
#MERGES PAINTER PALETTE AND ALL PAINTER QID AND OUTPUTS A NEW MERGED PAINTERS CSV
import pandas as pd

# Load both datasets
painters_qid_df = pd.read_csv("202 Project/All_Painters_QIDs.csv")  # Ensure this file exists
painter_palette_df = pd.read_csv("202 Project/PainterPalette.csv")  # Ensure this file exists

# Print column names for debugging
print("All_Painters_QIDs.csv columns:", painters_qid_df.columns)
print("PainterPalette.csv columns:", painter_palette_df.columns)

# Standardize column names based on actual names in files
painters_qid_df.rename(columns={"Artist Name": "artist_name"}, inplace=True)
painter_palette_df.rename(columns={"artist": "artist_name"}, inplace=True)  # Fixing the column name

# Merge datasets on artist_name
merged_df = pd.merge(painter_palette_df, painters_qid_df, on="artist_name", how="inner")

# Save merged data to a new CSV
merged_df.to_csv("202 Project/Merged_Painters_Data.csv", index=False)

# Print the first few rows of the merged dataset
print(merged_df.head())


All_Painters_QIDs.csv columns: Index(['Artist Name', 'Wikidata QID'], dtype='object')
PainterPalette.csv columns: Index(['artist', 'Nationality', 'citizenship', 'gender', 'styles', 'movement',
       'Art500k_Movements', 'birth_place', 'death_place', 'birth_year',
       'death_year', 'FirstYear', 'LastYear', 'wikiart_pictures_count',
       'locations', 'locations_with_years', 'styles_extended', 'StylesCount',
       'StylesYears', 'occupations', 'PaintingsExhibitedAt',
       'PaintingsExhibitedAtCount', 'PaintingSchool', 'Influencedby',
       'Influencedon', 'Pupils', 'Teachers', 'FriendsandCoworkers',
       'Contemporary', 'Type'],
      dtype='object')
                    artist_name            Nationality     citizenship  \
0            Bracha L. Ettinger  French,Jewish,Israeli          Israel   
1  Maria Helena Vieira da Silva      French,Portuguese          France   
2   Eleanor Fortescue-Brickdale                British  United Kingdom   
3      Anna Ostroumova-Lebedeva     

In [22]:
#ALTERNATIVE TO KEEP ALL THE ROWS EVEN IF THERE IS NO QID RESULT

import pandas as pd

# Load both datasets
painters_qid_df = pd.read_csv("202 Project/All_Painters_QID_combined_200k.csv")  # Ensure this file exists
painter_palette_df = pd.read_csv("202 Project/PainterPalette.csv")  # Ensure this file exists

# Standardize column names (strip spaces and lower case for uniformity)
painters_qid_df.rename(columns={"Artist Name": "artist_name"}, inplace=True)
painter_palette_df.rename(columns={"artist": "artist_name"}, inplace=True)

# Merge datasets on artist_name, keeping all rows from painter_palette_df
merged_df = pd.merge(painter_palette_df, painters_qid_df, on="artist_name", how="left")

# Save merged data to a new CSV
merged_df.to_csv("202 Project/Total_Merged_Painters_Data.csv", index=False)

# Print the first few rows of the merged dataset
print(merged_df.head())

                         artist_name            Nationality     citizenship  \
0                 Bracha L. Ettinger  French,Jewish,Israeli          Israel   
1                 William H. Johnson               American   United States   
2                 Alexey  Bogolyubov                Russian             NaN   
3                 O. Louis Guglielmi      American,Egyptian   United States   
4  Mikalojus Konstantinas Ciurlionis             Lithuanian  Russian Empire   

   gender                                             styles  \
0  female                              New European Painting   
1    male  Cubism, Expressionism, Futurism, Naïve Art (Pr...   
2     NaN                               Realism, Romanticism   
3    male               Cubism, Expressionism, Magic Realism   
4    male                                          Symbolism   

                                  movement  \
0                    New European Painting   
1  Harlem Renaissance (New Negro Movement)   
2 

#### Pulls wikidata 12,500 at a time to a combined csv

In [10]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import time

# SPARQL endpoint for Wikidata
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

# Parameters
LIMIT = 12500  # Number of results per query
OFFSET = 0  # Starting offset
all_painters = []

def fetch_painters(offset):
    """Fetch painters from Wikidata with pagination."""
    query = f"""
    SELECT ?artist ?artistLabel WHERE {{
      ?artist wdt:P31 wd:Q5;  # Instance of human
              wdt:P106 wd:Q1028181.  # Occupation: Painter
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    LIMIT {LIMIT}
    OFFSET {offset}
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    
    try:
        results = sparql.query().convert()
        painter_data = []
        for result in results["results"]["bindings"]:
            qid = result["artist"]["value"].split("/")[-1]  # Extract QID
            name = result.get("artistLabel", {}).get("value", "Unknown")
            painter_data.append((name, qid))
        return painter_data
    except Exception as e:
        print(f"Error at offset {offset}: {e}")
        return []

# Fetch all painters in chunks
while True:
    print(f"Fetching data with OFFSET {OFFSET}...")
    painters_chunk = fetch_painters(OFFSET)
    
    if not painters_chunk:  # Stop when no more results are returned
        break
    
    all_painters.extend(painters_chunk)
    OFFSET += LIMIT  # Move to next batch
    time.sleep(5)  # Avoid overloading the server

# Convert to DataFrame
painter_df = pd.DataFrame(all_painters, columns=["Artist Name", "Wikidata QID"])

# Save to CSV
painter_df.to_csv("202 Project/All_Painters_QID_combined.csv", index=False)

# Print results
print(f"Total painters fetched: {len(painter_df)}")
print(painter_df.head())


Fetching data with OFFSET 0...
Fetching data with OFFSET 12500...
Fetching data with OFFSET 25000...
Fetching data with OFFSET 37500...
Fetching data with OFFSET 5***REMOVED***...
Fetching data with OFFSET 62500...
Fetching data with OFFSET 75000...
Fetching data with OFFSET 87500...
Fetching data with OFFSET 1***REMOVED***0...
Fetching data with OFFSET 112500...
Fetching data with OFFSET 125000...
Fetching data with OFFSET 137500...
Error at offset 137500: Expecting property name enclosed in double quotes: line 36549 column 2 (char 900223)
Total painters fetched: 137500
       Artist Name Wikidata QID
0   George W. Bush         Q207
1     Claude Monet         Q296
2  Diego Velázquez         Q297
3         El Greco         Q301
4        Bob Dylan         Q392


#### Testing to see if i can pull all 13,000 rows from the painter palette dataset with artist names 'artist_names.csv' with the corresponding columns from wikidata

In [23]:
# from SPARQLWrapper import SPARQLWrapper, JSON
# import pandas as pd
# import time

# # Load the artist names from the CSV file
# input_csv = "202 Project/Artist_Names.csv"  # Update the correct path
# df = pd.read_csv(input_csv)

# # Ensure the column name is correct
# artist_names = df["Artist Name"].dropna().unique().tolist()  # Adjust column name if necessary

# # SPARQL endpoint
# sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

# # Batch settings
# BATCH_SIZE = 500  # Query 500 artists per request
# SLEEP_TIME = 5  # Wait time between queries
# all_results = []

# def fetch_artist_data(artist_batch):
#     """Fetch artist QIDs and additional properties from Wikidata."""
#     names_query = " ".join([f'"{name}"' for name in artist_batch])

#     query = f"""
#     SELECT ?artist ?artistLabel ?nationalityLabel ?citizenshipLabel ?genderLabel ?birthDate ?deathDate 
#            ?birthPlaceLabel ?deathPlaceLabel ?movementLabel ?occupationLabel ?paintingSchoolLabel 
#            ?influencedByLabel ?influencedOnLabel ?pupilsLabel ?teachersLabel ?friendsLabel WHERE {{
#       ?artist wdt:P31 wd:Q5;  # Instance of human
#               wdt:P106 wd:Q1028181.  # Occupation: artist
#       OPTIONAL {{ ?artist wdt:P27 ?nationality. }}  # Country of citizenship
#       OPTIONAL {{ ?artist wdt:P27 ?citizenship. }}  # Citizenship
#       OPTIONAL {{ ?artist wdt:P21 ?gender. }}  # Gender
#       OPTIONAL {{ ?artist wdt:P569 ?birthDate. }}  # Date of birth
#       OPTIONAL {{ ?artist wdt:P570 ?deathDate. }}  # Date of death
#       OPTIONAL {{ ?artist wdt:P19 ?birthPlace. }}  # Birthplace
#       OPTIONAL {{ ?artist wdt:P20 ?deathPlace. }}  # Deathplace
#       OPTIONAL {{ ?artist wdt:P135 ?movement. }}  # Artistic movement
#       OPTIONAL {{ ?artist wdt:P106 ?occupation. }}  # Occupation
#       OPTIONAL {{ ?artist wdt:P1027 ?paintingSchool. }}  # Painting school
#       OPTIONAL {{ ?artist wdt:P737 ?influencedBy. }}  # Influenced by
#       OPTIONAL {{ ?artist wdt:P737 ?influencedOn. }}  # Influenced on
#       OPTIONAL {{ ?artist wdt:P802 ?pupils. }}  # Pupils
#       OPTIONAL {{ ?artist wdt:P108 ?teachers. }}  # Teachers
#       OPTIONAL {{ ?artist wdt:P1416 ?friends. }}  # Friends and coworkers
#       SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
#       FILTER(?artistLabel IN ({names_query}))  # Only fetch names in our batch
#     }}
#     """

#     sparql.setQuery(query)
#     sparql.setReturnFormat(JSON)

#     try:
#         results = sparql.query().convert()
#         artist_data = []
#         for result in results["results"]["bindings"]:
#             qid = result["artist"]["value"].split("/")[-1]  # Extract QID
#             name = result.get("artistLabel", {}).get("value", "Unknown")
#             nationality = result.get("nationalityLabel", {}).get("value", "Unknown")
#             citizenship = result.get("citizenshipLabel", {}).get("value", "Unknown")
#             gender = result.get("genderLabel", {}).get("value", "Unknown")
#             birth_date = result.get("birthDate", {}).get("value", "Unknown")
#             death_date = result.get("deathDate", {}).get("value", "Unknown")
#             birth_place = result.get("birthPlaceLabel", {}).get("value", "Unknown")
#             death_place = result.get("deathPlaceLabel", {}).get("value", "Unknown")
#             movement = result.get("movementLabel", {}).get("value", "Unknown")
#             occupation = result.get("occupationLabel", {}).get("value", "Unknown")
#             painting_school = result.get("paintingSchoolLabel", {}).get("value", "Unknown")
#             influenced_by = result.get("influencedByLabel", {}).get("value", "Unknown")
#             influenced_on = result.get("influencedOnLabel", {}).get("value", "Unknown")
#             pupils = result.get("pupilsLabel", {}).get("value", "Unknown")
#             teachers = result.get("teachersLabel", {}).get("value", "Unknown")
#             friends = result.get("friendsLabel", {}).get("value", "Unknown")
#             artist_data.append((name, qid, nationality, citizenship, gender, birth_date, death_date, birth_place, 
#                                 death_place, movement, occupation, painting_school, influenced_by, influenced_on, 
#                                 pupils, teachers, friends))
#         return artist_data
#     except Exception as e:
#         print(f"Error fetching data: {e}")
#         return []

# # Process the artist names in batches
# for i in range(0, len(artist_names), BATCH_SIZE):
#     batch = artist_names[i : i + BATCH_SIZE]  # Get batch of 500
#     print(f"Fetching {len(batch)} artists... ({i}/{len(artist_names)})")

#     batch_results = fetch_artist_data(batch)
#     all_results.extend(batch_results)

#     time.sleep(SLEEP_TIME)  # Avoid overloading Wikidata

# # Convert to DataFrame
# columns = ["Artist Name", "Wikidata QID", "Nationality", "Citizenship", "Gender", "Birthdate", "Deathdate", 
#            "Birthplace", "Deathplace", "Movement", "Occupation", "Painting School", "Influenced By", 
#            "Influenced On", "Pupils", "Teachers", "Friends"]
# final_df = pd.DataFrame(all_results, columns=columns)

# # Save results to CSV
# output_csv = "202 Project/PP_and_wikidata_combined"
# final_df.to_csv(output_csv, index=False)

# # Print summary
# print(f"Total artists retrieved: {len(final_df)}")
# print(final_df.head())

#### Add QIDS to Arist_Names.csv dataset for easier processing 

In [24]:
# from SPARQLWrapper import SPARQLWrapper, JSON
# import pandas as pd
# import time

# # Load the artist names from CSV
# input_csv = "202 Project/Artist_Names.csv"  # Update the correct path
# df = pd.read_csv(input_csv)

# # Ensure correct column name
# artist_names = df["Artist Name"].dropna().unique().tolist()

# # SPARQL endpoint
# sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

# # Batch settings
# BATCH_SIZE = 25  # Query 25 artists at a time
# SLEEP_TIME = 5  # Avoid rate limits
# qid_results = []

# def fetch_qids(artist_batch):
#     """Fetch Wikidata QIDs for a batch of artist names using FILTER and rdfs:label."""
#     filters = " || ".join([f'CONTAINS(LCASE(?artistLabel), "{name.lower()}")' for name in artist_batch])

#     query = f"""
#     SELECT ?artist ?artistLabel WHERE {{
#       ?artist wdt:P31 wd:Q5;  # Instance of human
#               wdt:P106 wd:Q1028181.  # Occupation: Painter
#       ?artist rdfs:label ?artistLabel.
#       FILTER (LANG(?artistLabel) = "en")  # Only English labels
#       FILTER ({filters})
#     }}
#     """

#     sparql.setQuery(query)
#     sparql.setReturnFormat(JSON)

#     try:
#         results = sparql.query().convert()
#         artist_data = []
#         for result in results["results"]["bindings"]:
#             qid = result["artist"]["value"].split("/")[-1]
#             name = result.get("artistLabel", {}).get("value", "Unknown")
#             artist_data.append((name, qid))
#         return artist_data
#     except Exception as e:
#         print(f"Error fetching QIDs: {e}")
#         return []

# # Process in batches to prevent long URIs
# for i in range(0, len(artist_names), BATCH_SIZE):
#     batch = artist_names[i : i + BATCH_SIZE]
#     print(f"Fetching QIDs for {len(batch)} artists... ({i}/{len(artist_names)})")
    
#     batch_results = fetch_qids(batch)
#     qid_results.extend(batch_results)
    
#     time.sleep(SLEEP_TIME)  # Avoid overloading Wikidata

# # Convert QIDs to DataFrame and save
# qid_df = pd.DataFrame(qid_results, columns=["Artist Name", "Wikidata QID"])
# qid_df.to_csv("202 Project/Artist_QIDs.csv", index=False)

# print(f"Total artists with QIDs: {len(qid_df)}")
# print(qid_df.head())


#### Pulling artist names and their corresponding qid to gather list of 200,000 artists

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import time

# SPARQL endpoint for Wikidata
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

# Parameters
LIMIT = 1***REMOVED***  # Number of results per query
OFFSET = 0  
SLEEP_TIME = 5  #time between queries
all_painters = []

def fetch_painters(offset):
    """Fetch painters from Wikidata with pagination and error handling."""
    query = f"""
    SELECT ?artist ?artistLabel WHERE {{
      ?artist wdt:P31 wd:Q5;  # Instance of human
              wdt:P106 wd:Q1028181.  # Occupation: Painter
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    LIMIT {LIMIT}
    OFFSET {offset}
    """
    
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    
    try:
        results = sparql.query().convert()
        painter_data = []
        for result in results["results"]["bindings"]:
            qid = result["artist"]["value"].split("/")[-1]  # Extract QID
            name = result.get("artistLabel", {}).get("value", "Unknown")
            painter_data.append((name, qid))
        return painter_data
    except Exception as e:
        print(f"Error at offset {offset}: {e}")
        return []

# Fetch all painters in partitions
while True:
    print(f"Fetching data with OFFSET {OFFSET}...")
    painters_chunk = fetch_painters(OFFSET)
    
    if not painters_chunk:  
        break
    
    all_painters.extend(painters_chunk)
    OFFSET += LIMIT  # Move to next batch
    
   #Stopping to ensure query limit isn't reached
    time.sleep(SLEEP_TIME)

# Convert to DataFrame
painter_df = pd.DataFrame(all_painters, columns=["Artist Name", "Wikidata QID"])

# Save to CSV
painter_df.to_csv("202 Project/All_Painters_QID_combined_200k.csv", index=False)

# Print results
print(f"Total painters fetched: {len(painter_df)}")
print(painter_df.head())


Fetching data with OFFSET 0...
Fetching data with OFFSET 1***REMOVED***...
Fetching data with OFFSET 2***REMOVED***...
Fetching data with OFFSET 3***REMOVED***...
Fetching data with OFFSET 4***REMOVED***...
Fetching data with OFFSET 5***REMOVED***...
Fetching data with OFFSET 6***REMOVED***...
Fetching data with OFFSET 7***REMOVED***...
Fetching data with OFFSET 8***REMOVED***...
Fetching data with OFFSET 9***REMOVED***...
Fetching data with OFFSET 1***REMOVED***0...
Fetching data with OFFSET 11***REMOVED***...
Fetching data with OFFSET 12***REMOVED***...
Fetching data with OFFSET 13***REMOVED***...
Fetching data with OFFSET 14***REMOVED***...
Fetching data with OFFSET 15***REMOVED***...
Fetching data with OFFSET 16***REMOVED***...
Fetching data with OFFSET 17***REMOVED***...
Fetching data with OFFSET 18***REMOVED***...
Fetching data with OFFSET 19***REMOVED***...
Fetching data with OFFSET 2***REMOVED***0...
Fetching data with OFFSET 21***REMOVED***...
Fetching data with OFFSET 22***RE

#### PainterPalette rows with a corresponding Wikidata QID 

In [None]:
import pandas as pd

# Load file
file_path = "202 Project/Total_Merged_Painters_Data.csv"  # Update if needed
df = pd.read_csv(file_path)

# Count non-null values in the 'Wikidata QID' column
qid_non_null_count = df["Wikidata QID"].notna().sum()

# Print the result
print(f"Number of rows with a valid 'Wikidata QID': {qid_non_null_count}")


Number of rows with a valid 'Wikidata QID': 5334


#### Importing additional information based on the QIDS generated 

In [None]:
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON, POST

# Load the QIDs from your file
df = pd.read_csv("C:\\Users\\***REMOVED***\\.vscode\\Total_Merged_Painters_Data.csv")

# Extract and clean QIDs
qid_list = df["Wikidata QID"].dropna().unique()
batch_size = 100  # Wikidata query limit
batches = [qid_list[i:i + batch_size] for i in range(0, len(qid_list), batch_size)]

# Wikidata SPARQL endpoint
endpoint_url = "https://query.wikidata.org/sparql"

# Function to execute SPARQL query using POST
def run_sparql_query(query):
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    sparql.setMethod(POST)  # Use POST to avoid URI too long errors
    return sparql.query().convert()

# Initialize results list
all_results = []

# Process each batch
total_batches = len(batches)
for batch_index, batch in enumerate(batches, start=1):
    qid_values = " ".join(f'wd:{qid}' for qid in batch)
    
    query = f"""
    SELECT ?artist ?artistLabel ?nationalityLabel ?citizenshipLabel ?genderLabel ?birthDate ?deathDate 
           ?birthPlaceLabel ?deathPlaceLabel ?movementLabel ?occupationLabel ?paintingSchoolLabel 
           ?influencedByLabel ?influencedOnLabel ?pupilsLabel ?teachersLabel ?friendsLabel
    WHERE {{
      VALUES ?artist {{{qid_values}}}

      OPTIONAL {{ ?artist wdt:P27 ?nationality. }}
      OPTIONAL {{ ?artist wdt:P27 ?citizenship. }}
      OPTIONAL {{ ?artist wdt:P21 ?gender. }}
      OPTIONAL {{ ?artist wdt:P569 ?birthDate. }}
      OPTIONAL {{ ?artist wdt:P570 ?deathDate. }}
      OPTIONAL {{ ?artist wdt:P19 ?birthPlace. }}
      OPTIONAL {{ ?artist wdt:P20 ?deathPlace. }}
      OPTIONAL {{ ?artist wdt:P135 ?movement. }}
      OPTIONAL {{ ?artist wdt:P106 ?occupation. }}
      OPTIONAL {{ ?artist wdt:P1027 ?paintingSchool. }}
      OPTIONAL {{ ?artist wdt:P737 ?influencedBy. }}
      OPTIONAL {{ ?artist wdt:P737 ?influencedOn. }}
      OPTIONAL {{ ?artist wdt:P802 ?pupils. }}
      OPTIONAL {{ ?artist wdt:P108 ?teachers. }}
      OPTIONAL {{ ?artist wdt:P1416 ?friends. }}

      # Fetch human-readable labels instead of URLs
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """
    
    # Run query and collect results
    try:
        results = run_sparql_query(query)
        for result in results["results"]["bindings"]:
            row = {key: result[key]["value"] if key in result else None for key in [
                "artist", "artistLabel", "nationalityLabel", "citizenshipLabel", "genderLabel", 
                "birthDate", "deathDate", "birthPlaceLabel", "deathPlaceLabel", "movementLabel",
                "occupationLabel", "paintingSchoolLabel", "influencedByLabel", "influencedOnLabel", 
                "pupilsLabel", "teachersLabel", "friendsLabel"
            ]}
            all_results.append(row)
        
        print(f" Finished batch {batch_index}/{total_batches}")

    except Exception as e:
        print(f" Error fetching batch {batch_index}/{total_batches}: {e}")

# Save results to CSV
output_file = "wikidata_results_fixed.csv"
df_results = pd.DataFrame(all_results)
df_results.to_csv(output_file, index=False, encoding="utf-8") 

print(f"\n Data saved to {output_file}")


#### Creating code to compress the ~400k row outputs with artist information spread over hundreds of rows into one list per artist with a list of row values

In [None]:
import pandas as pd

# Load the data
df = pd.read_csv("wikidata_results_fixed.csv", low_memory=False, encoding="utf-8")

# Define the columns to aggregate based on updated SPARQL query
columns_to_aggregate = [
    "artistLabel", "nationalityLabel", "citizenshipLabel", "genderLabel", "birthDate", "deathDate", 
    "birthPlaceLabel", "deathPlaceLabel", "movementLabel", "occupationLabel", "paintingSchoolLabel", 
    "influencedByLabel", "influencedOnLabel", "pupilsLabel", "teachersLabel", "friendsLabel"
]

# Function to aggregate values into lists while removing duplicates and NaNs
def aggregate_values(series):
    unique_values = set(series.dropna())  # Remove NaN values and get unique values
    return list(unique_values) if len(unique_values) > 1 else next(iter(unique_values), None)

# Group by 'artist' and apply the aggregation function to each column
df_grouped = df.groupby("artist").agg(aggregate_values)

# Reset index to save back as a CSV file
df_grouped = df_grouped.reset_index()

# Save the compressed data to a new CSV file
compressed_output_file = "wikidata_results_compressed_fix.csv"
df_grouped.to_csv(compressed_output_file, index=False, encoding="utf-8")

print(f" Data compressed and saved to {compressed_output_file}")



#### Stripping the QID from the link column to be able to join with the Painter Palette column

In [None]:
# Reload necessary libraries after execution state reset
import pandas as pd

#File path
file_path = "<YOUR wikidata_results_compressed_fix_with_QID FILE PATH>"
df = pd.read_csv(file_path, low_memory=False, encoding="utf-8")

# Ensure 'artist' column is right type
df["artist"] = df["artist"].astype(str)

# Extract the QID from the 'artist' column using regex 
df["QID"] = df["artist"].str.extract(r'(Q\d+)$')

# Save the updated file with the new QID column
updated_file_path = "YOUR NEW FILEPATH"
df.to_csv(updated_file_path, index=False, encoding="utf-8")


#### Joining Wikipada compressed csv with the Painter Palette Data to join the Painter Palette data with it's corresponding wikidata

In [None]:
# Load two files
file_wikidata = "<YOUR wikidata_results_compressed_fix_with_QID FILEPATH>"
file_painters = "<YOUR Total_Merged_Painters_Data FILEPATH>"

# Read files
df_wikidata = pd.read_csv(file_wikidata, low_memory=False, encoding="utf-8")
df_painters = pd.read_csv(file_painters, low_memory=False, encoding="utf-8")

# Merge the two datasets on the 'QID' column from Wikidata and 'Wikidata QID' from the painters dataset
df_merged = df_painters.merge(df_wikidata, how="left", left_on="Wikidata QID", right_on="QID")

# Save 
merged_file_path = "<YOUR NEW OUTPUT Merged_painters_finished.csv FILEPATH>"
df_merged.to_csv(merged_file_path, index=False, encoding="utf-8")


merged_file_path
