# Setup
## Downloading dependencies
###### *Cell 1*

In [None]:
from IPython.display import clear_output
# If it ain't here, you pip it. https://www.w3schools.com/python/python_ref_modules.asp
!pip install --upgrade kaggle
!pip install --upgrade pandas
clear_output()

## Importing dependencies
###### *Cell 2*

In [1]:
import os
import tkinter as tk
from tkinter import filedialog
import json
from pathlib import Path
import numpy as np
import pandas as pd
import re

## Initialising the Kaggle CLI
### Option #1: New token
If you don't already have a token or have lost the file to your current token, in the settings of your Kaggle account, click on the button 'Generate New Token'. Follow the instructions and copy the alphanumeric string at the top when such a floating dialogue appears. Then, run cell 3 and paste the key when prompted.
###### *Cell 3*

In [None]:
def kaggle_key() :
    key = input('Paste your token here, or leave empty if your token comes in the form of a JSON file: ')
    clear_output()
    if len(key) > 3 :
        # Create api.txt in working directory, where `KAGGLE_API_TOKEN {key}`.
        api_file = Path("api.txt")
        with api_file.open("w", encoding="utf-8") as f:
            f.write('KAGGLE_API_TOKEN ' + key)
        os.environ['KAGGLE_API_TOKEN'] = key
        os.environ.pop('KAGGLE_USERNAME', None)
        os.environ.pop('KAGGLE_KEY', None)
        print('The last four characters of your API key are: ' + key[-4:] + '. If you suspect that you have entered something wrong, run cell 4 again. Otherwise, you may move to the next section.')
        return True
    else :
        return False
    
def legacy_kaggle_key() :
    '''
    Opens a file dialog, validates the selection, and returns the path 
    if a valid JSON file is selected. Handles all edge cases.
    '''
    root = tk.Tk()
    root.withdraw()
    root.call('wm', 'attributes', '.', '-topmost', True)
    
    file_path = tk.filedialog.askopenfilename(
        title='Find the file with your Kaggle API key...',
        filetypes=(('JSON files', '*.json'), ('All files', '*.*'))
    )
    
    root.destroy()

    if not file_path :
        # Case: User closes the dialog without selecting anything
        print('No file selected. Run cell 5 again if you like.')
        return
    
    # Now try to open and validate the *contents* of the JSON file
    try :
        with open(file_path, 'r') as f :
            data = json.load(f)
        
        # Case: User selects the correct JSON file that contains their API key
        if 'key' in data and isinstance(data['username'], str) and isinstance(data['key'], str) :
            # Create api.txt in working directory, where `KAGGLE_USERNAME {data['username']}` and `KAGGLE_KEY {data['key']}`.
            api_file = Path("api.txt")
            with api_file.open("w", encoding="utf-8") as f:
                f.write('KAGGLE_USERNAME ' + data['username'] + '\nKAGGLE_KEY ' + data['key'])
            os.environ['KAGGLE_USERNAME'] = data['username']
            os.environ['KAGGLE_KEY'] = data['key']
            os.environ.pop('KAGGLE_API_TOKEN', None)
            print('The last four characters of your API key are: ' + data['key'][-4:] + '. If you suspect that this is not an alphanumeric string, find another file by running cell 5 again. Otherwise, you may move to the next section.')
            
        else :
            # Case: User selects a JSON file, but it's not one that contains their API key
            print('The \'key\' field is missing or invalid. To find another file, run this cell again.')
            
    except Exception :
        print('This file may not contain valid JSON. To find another file, run this cell again.')
        return

# If api.txt is in the working directory, separate by newline, then separate by spaces. Each line has the form '{key} {value}`, where the environment variable `key` should be created with value `value`.
# Otherwise, run `kaggle_key()`. If that returns `False`, then run `legacy_kaggle_key()`.

api_file = Path('api.txt')

if api_file.exists() :
    try :
        with api_file.open('r', encoding='utf-8') as f :
            for line in f :
                line = line.strip()
                if not line or line.startswith('#') :
                    continue

                parts = line.split(None, 1)  # split on first whitespace
                if len(parts) != 2 :
                    continue

                env_key, env_value = parts
                os.environ[env_key] = env_value

    except OSError as e :
        print('Found api.txt but could not read it:\n' + e)
else :
    print('api.txt was not found.')

preview = os.environ.get('KAGGLE_API_TOKEN') or os.environ.get('KAGGLE_KEY')
if preview:
    print('Loaded existing credentials from api.txt. The last four characters of your API key are: ' + preview[-4:] + '. If that looks wrong, run cell 4 or 5.')
else:
    # No api.txt present; go through interactive flow
    if not kaggle_key():
        legacy_kaggle_key()

###### *Cell 4*

In [None]:
if not kaggle_key() :
    print('You probably didn\'t enter a valid key. Run this cell again if you like.')

###### *Cell 5*

In [None]:
legacy_kaggle_key()

###### *Cell 6*

In [None]:
kaggle_module_description = !pip show kaggle
os.environ['PATH'] = os.environ['PATH'] + kaggle_module_description[-3][10:-13] + 'Scripts;'
!kaggle datasets download flkuhm/art-price-dataset -p dataset -f artDataset.csv

I assume:
- Your kernel is running on Python 3.13, and Windows 11.
- You have 'tcl/tk and IDLE' checked this Python environment was installed. In other words, if you were to create and run a cell anywhere in this notebook with the following line `!pip freeze`, you are able to find `tkinter` in the output.
- You are not running the kernel with any virtual environment.
# Preparing the dataset
###### *Cell 7*

In [2]:
raw_dataset = dataset = pd.read_csv('dataset/artDataset.csv')
raw_dataset

Unnamed: 0.1,Unnamed: 0,price,artist,title,yearCreation,signed,condition,period,movement
0,0,28.500 USD,Tommaso Ottieri,Bayreuth Opera,2021,Signed on verso,This work is in excellent condition.,Contemporary,Baroque
1,1,3.000 USD,Pavel Tchelitchew,Drawings of the Opera,First Half 20th Century,Signed and titled,Not examined out of frame.No obvious signs of ...,Post-War,Surrealism
2,2,5.000 USD,Leo Gabin,Two on Sidewalk,2016,"Signed, titled and dated on verso",This work is in excellent condition.,Contemporary,Abstract
3,3,5.000 USD,Matthias Dornfeld,Blumenszene,2010,"Signed, titled and dated on the reverse with t...",This work is in excellent condition.There is m...,Contemporary,Abstract
4,4,2.500 USD,Alexis Marguerite Teplin,Feverish Embarkation,2001,Signed on verso,This work is in excellent condition.,Contemporary,Abstract
...,...,...,...,...,...,...,...,...,...
749,749,680 USD,Jane Kent,Miracle Grow #17,2012,Signed and dated on lower right.,Not examined out of frame.No obvious signs of ...,Contemporary,Abstract
750,750,1.275 USD,Gary Bower,Rolph Series,1970,[nan],Not examined out of frame.Significant undulati...,Contemporary,Geometric Abstraction
751,751,680 USD,Jane Kent,Untitled,2012,[nan],Not examined out of frame.No apparent imperfec...,Contemporary,Geometric Abstraction
752,752,1.275 USD,T. L. Solien,Juniper,1986,[nan],Not examined outside of frame.Pinholes at edge...,Contemporary,Abstract


The current columns are:
- `Unnamed: 0`: why even lol
- `price`: Numerical
- `artist`: Categorical
- `title`: Not needed?
- `yearCreation`: Numerical, but can be split into a categorical component
- `signed`: Word frequency
- `condition`: Word frequency
- `period`: Categorical, but might line up with `yearCreation`.
- `movement`: Categorical or word frequency

## `Unnamed: 0`
At first glance, the values in this column line up with the values as prescribed by the leftmost index column. There are a variety of parameters when using `pandas.DataFrame.to_csv()` to save a pandas DataFrame to `.csv`. If there is at least one column filled entirely with unique, non-empty values, `index_label` can be used to designate one of them as the index column. Otherwise, `index` can be used to influence whether a new column of indices is created. If an index-like column already exists in the DataFrame, but isn't designated as such, pandas will treat it like any other column, as it could contain real information. As the `Unnamed: 0` column seems to be a common enough phenomenon within datasets uploaded to Kaggle (https://www.kaggle.com/discussions/general/354943), I believe this is what happened in the creation of this dataset.

To find out if `Unnamed: 0` is effectively an index column, I created a filter to find any rows whose `Unnamed: 0` value is different from the index column's.
###### *Cell 8*

In [3]:
dataset[dataset['Unnamed: 0'] != dataset.index]

Unnamed: 0.1,Unnamed: 0,price,artist,title,yearCreation,signed,condition,period,movement


There were none. Therefore, all rows had values which corresponded with their indices. Since `Unnamed: 0` is an index column and does not contain any other unique information, I chose to remove it.
###### *Cell 9*

In [4]:
dataset = dataset.drop(columns = ['Unnamed: 0'])

## `price`
The prices in each row seem to follow a pattern. The United States dollar (USD) is specified as a currency, and the thousands are separated by periods (`.`). This is reminiscent of a convention of writing large numbers that is popular in continental Europe. In contrast to the Anglophone convention, which uses commas (`,`) to separate thousands, and a single period to mark the beginning of the decimal part, the continental convention swaps them around. Nevertheless, characters like these, including the `USD` suffix, cause the prices to be represented as strings. I will need to convert them into numerical representations so that I can carry out sorting and linear regression.

I sought to verify these assumptions:
1. That all prices are denominated in USD.
2. That the period is used as a thousands separator at all times.

First, I searched for any rows whose price was not annnotated with `' USD'`.
###### *Cell 10*

In [5]:
dataset[~dataset['price'].str.endswith(' USD')]

Unnamed: 0,price,artist,title,yearCreation,signed,condition,period,movement


There were none, meaning that all prices were annotated in `' USD'`. There could be additional clerical errors, like `'USD'` appearing more than once. To test that, I provisionally removed the `' USD'` suffix in a copy of the column, then tested for the presence of additional letters by combining all the strings into one and listing all the unique characters within that string.
###### *Cell 11*

In [6]:
column_price_numerical = dataset['price'].str.replace(' USD', '')
set(column_price_numerical.str.cat())

{'.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}

The removal of `' USD'` was a success, and caused all the strings to no longer contain any letters. This proved to me that all prices are indeed denominated in USD, removing the need to perform currency conversions. Furthermore, the period is the only symbol to appear, with whitespaces (` `) and commas notably absent.

I could use this uniformity in formatting to my advantage, and further assume that all prices are natural numbers, i.e., none of the prices have decimal parts. I still considered the edge case that the period is used both as a thousands separator and a decimal separator, and to disprove that I checked the prices against the following regular expression `^\d{1,3}(\.\d{3})*$`. Here are some characteristics:
- Without a period, there can only be up to 3 free digits. With periods, there can only be up to 3 leading digits before the leftmost period.
- There must be at least 1 free digit. Without a period, this ensures the number has at least 1 digit. With periods, this ensures the leftmost period is not exposed (`0.100` as opposed to `.100`).
- Every period must be succeeded by exactly 3 digits. There can be as many groups as possible of periods and 3 trailing digits to represent the powers of a thousand, like millions and billions.
- This expression specifically disallows the common convention of denoting cents with a period and 2 trailing digits.

###### *Cell 12*

In [7]:
column_price_numerical.str.fullmatch(r'^\d{1,3}(\.\d{3})*$').all()

np.True_

Since all prices fit the above pattern, I'm sufficiently convinced that all prices are to the nearest dollar, and that the period is only ever used as a thousands separator. I acknowledge the deeper edge case that prices are shown to three decimal places (i.e., thousandths of a US dollar). However, apart from being highly unlikely that prices are denominated in anything other than dollars and cents, the smallest division of the US dollar is the cent, which is a hundredth of a dollar. Should the agreed-upon price contain part of a cent, buyers may find it hard to pay the exact amount. Still, in case of any further doubt, you may manually review the dataset on Kaggle or a spreadsheet viewer of your choice. You may find and remove any entry whose price you feel should be reasonably interpreted as being a thousandth of a dollar, and run this section again.

Since I assumed the period is not a decimal separator, I assumed too that removing periods will not cause the price to be interpreted as 100 times larger. Thus, I removed all the periods and parsed all strings as integers, which completed the preprocessing of this column. The prices could now be sorted and used in regression, as seen below.
###### *Cell 13*

In [8]:
dataset['price'] = pd.to_numeric(column_price_numerical.str.replace('.', ''))
dataset.sort_values(by = ['price'])

Unnamed: 0,price,artist,title,yearCreation,signed,condition,period,movement
400,595,Paul Manes,Untitled,1986,Signed and dated “Paul Manes ‘86” lower right;...,This work is in good condition.Not examined ou...,Post-War,Abstract
443,600,George Platt Lynes,Nude Figures On A Brass Bed,[nan],[nan],This work is in good condition.Slight rippling...,Modern,Realism
445,600,George Platt Lynes,Laurie Douglas Horbach,1944,[nan],This work is in good condition.Slight rippling...,Modern,Realism
444,600,George Platt Lynes,Kate Lawson,1935,[nan],This work is in good condition.Slight rippling...,Modern,Realism
384,650,David Levinthal,Barbie 64,1998 / 2011,Signed and numbered in ink verso,This work is in excellent condition with no si...,Contemporary,Pop Art
...,...,...,...,...,...,...,...,...
418,71500,Larry Bell,New Test #1,2003,"Signed, titled and dated bottom right corner r...",Not examined out of frame.Moderate sheet undul...,Contemporary,Minimalism
415,75000,Biff Elrod,Ceiling Painting,2018,Signed verso,This work is in excellent condition.,Contemporary,Pop Art
352,128000,John Baldessari,"National City (W,1,2,3,4,5,6,B)",1996 - 2009,Signed on the reverse.,Excellent condition.,Contemporary,Pop Art
148,135000,Vija Celmins,"Untitled (Desert), Untitled (Ocean), Untitled ...",1975,Signed by artist on bottom right,"The work is in excellent condition, direct fro...",Contemporary,Photorealism


## `artist`
Some artists are more famous than others, and consistently command higher prices. Therefore, the provenance of a work should be a predictor of its price. To model this, I sought to turn this column into two features:
  - Dummy (one-hot) encodings for prolific artists. The result is as many columns of dummy encodings as there are artists I wish to track. Any work would take a value of `1` on the column that represents its artist, and `0` everywhere else. The names for each of these columns shall be copied from the strings in `artist`.
  - `works_by_artist`: The number of works in this dataset that are from the same artist. I hypothesised that even if a buyer was unfamiliar with a particular artist, the fact that an auction house's catalogue features many works from the same artist could give an impression of relevance, popularity or market confidence. The result is a numerical column.

My main consideration was the variability introduced during data entry. Different works may have been catalogued decades apart, by different appraisers, who may or may not follow the prevailing formatting guides of their time. Auction houses somewhat mitigate this through authority control, using internal databases to reconcile pseudonyms, name changes (e.g., after marriage), and other variants of a creator’s identity. However, mismatches still arise, and many must be resolved by the curators'/buyers' understanding. For example, if a system doesn't have the right heuristics to link two identities together, it may consider two works, attributed to "PICASSO" and "Pablo Picasso" respectively, to be unrelated. It is up to the buyer to make that connection between the two names, and correctly identify that either of these works are just as valuable as the other.

Considering that this dataset may contain legacy formatting and clerical inconsistencies, and considering that a heuristic approach would not fit within the amount of effort I expect in this section, I decided on human judgement as the most practical choice in consolidating artist names. So, I performed a manual pass to merge entries that were clearly intended to refer to the same person. First, I needed to grasp the space of possible data I needed to process.
###### *Cell 14*

In [9]:
def tablify(array, columns, transpose = False) :
    rows = int(np.ceil(len(array) / columns))
    array_copy = array + [''] * (rows * columns - len(array))

    if transpose :
        array_as_table = np.array(array_copy).reshape(columns, rows).transpose()
    else :
        array_as_table = np.array(array_copy).reshape(rows, columns)
    print(pd.DataFrame(array_as_table).to_string(index = False, header = False))

artist_names = dataset['artist'].value_counts(dropna = False).sort_index().index.to_list()
tablify(artist_names, 5, True)

                       Aaron Siskind                    David Fokos       Hossein Edalatkhah            Louis Fabien             Ralph Morse
                     Abelardo Morell                  David Hornung                 Hung Liu       Louise  Bourgeois          Ray Ciarrocchi
                      Adolf Schreyer                  David Lamelas      Ian Hamilton Finlay          Louise Donegan      Raymond Cauchetier
                  Adrian Wiszniewski                David Levinthal             Irene Mamiye          Lucas  Samaras           Reed Anderson
                    Adrianne Wortzel                     David Roth            Ivan Albright            Lucien Hervé         Richard Ballard
                    After Kota Ezawa                   David Storey             JOHN BELLANY            Lucy Sallick       Richard Bernstein
                         Agnes Story                     David True              JOHN GIBSON        Ludwig Bemelmans        Richard Chiriani
             

I created a helper function that displays long lists in a tabular format, and passed in the unique artist names alphabetically. This sped up my discovery of potential clerical inconsistencies. In many cases, small formatting variations such as differences in spacing, punctuation, or capitalisation become obvious when the entries are viewed side-by-side. For example, I could immediately see on the rightmost column that "T.L. Solien" appeared just below "T. L. Solien", as well as another variant a little further up: "Solien T.L.".

Listed last in the alphabetical ordering, was `nan`.
###### *Cell 15*

In [10]:
dataset[dataset['artist'].isna()]

Unnamed: 0,price,artist,title,yearCreation,signed,condition,period,movement
725,1275,,[nan],[nan],Signed and dated in pencil to verso,Not examined out of frame.Minor sheet undulati...,Contemporary,Realism


There was surprisingly only one `nan` value in this entire dataset. However, this lone `nan` caused friction with many of the pandas functions, and I had to include extra parameters in my function calls, like `dropna = False`. So I replaced it with the string "unknown". This choice avoided conflicts with real artist names, and retained its convenient position at the bottom of alphabetical listings.
###### *Cell 16*

In [11]:
dataset.loc[725, 'artist'] = 'unknown'

With the full set of unique names in alphabetical order, I began manually identifying connections between these names—an example of data linkage, as I later learnt the process is called.

ChatGPT was useful in accelerating the discovery of possible matches. As someone unfamiliar with all of the artists in the dataset, it brought to the table contextual information, such as an artist's history, aliases, or typical subject matter. With this knowledge, it could propose matches that I wouldn't have otherwise considered. However, given the length of the list of names, it hallucinated often, made up new names and proposed links between unrelated artists. Therefore, I used these suggestions as prompts for further verification rather than as authoritative claims.
###### *Cell 17*

In [12]:
dataset[dataset['artist'].str.contains('Kota')]

Unnamed: 0,price,artist,title,yearCreation,signed,condition,period,movement
312,3495,Kota Ezawa,The Melody of Destiny,2008,[nan],The transparency and light box are in generall...,Contemporary,Conceptual
313,3495,Kota Ezawa,Conical Intersect (After Gordon Matta-Clark 1975),2005,[nan],The transparency and light box are in generall...,Contemporary,Realism
315,3995,Kota Ezawa,Earth from Moon (After NASA 1969),2006,[nan],The transparency and light box are in generall...,Contemporary,Realism
316,3995,After Kota Ezawa,The Bohemians (After August Sander 1924),2006,[nan],The transparency and light box are in generall...,Contemporary,Realism


One such case where I overruled ChatGPT was with [Kota Ezawa](https://en.wikipedia.org/wiki/Kota_Ezawa). He is a German and Japanese artist currently based in San Francisco, whose works remix existing media (like photographs and movie stills) in his style of flat colours and a limited palette. As a result, the titles of his works contain credits to the source material, in the form of "(After)".

<table>
    <tr>
        <td>
            <img src="https://sothebys-md.brightspotcdn.com/72/8a/8076c8744baeafc979cf7d160c85/bsp7f-front.png" />
        </td>
        <td>
            <img src="https://sothebys-md.brightspotcdn.com/e3/21/b2d2428f4169b020afa671e01d5f/bsp7g-front.png" />
        </td>
    </tr>
    <tr>
        <td colspan="2"> <!-- Use colspan to span across 2 columns -->
            <p><center><i>"Conical Intersect" (left) and "The Bohemians" (right). Images provided by Sotheby's online catalogue.</i></center></p>
        </td>
    </tr>
</table>

ChatGPT suggested that "The Bohemians (After August Sander 1924)", attributed to "After Kota Ezawa", was the work of an unnamed artist, who was reinterpreting Ezawa’s own interpretation of August Sander. However, I considered this unlikely. If The Bohemians had been made by someone else, I would expect that it was presented in a different condition than the other genuine Ezawas, be offered at a different price, or was at least dated later than them. But in all these respects it was similar to the other Ezawas. Most of the descriptive fields had a consistent format, if not outright identical, and appeared in adjacent index positions. This implied to me that they were catalogued at the same time and by the same appraiser.

Still, I verified my assumptions with a manual search. Although the work has been made unavailable today, I found its [listing](https://www.sothebys.com/en/buy/_the-bohemians-after-august-sander-1924-from-the-history-of-photography-remix-3eed). Here, I saw that not only were the style and colour palette similar to the other Ezawas, the description, which is missing from the dataset as a field, also read:
> Kota Ezawa (German, b. 1969).
>
> This piece is final sale and not eligible for return.

With this, I concluded that ChatGPT was wrong in this instance, and that The Bohemians was indeed created by Kota Ezawa.

After identifying variants of a single artist's name, the next step was to make one of them the *canonical form*: a single identity with which all other variants shall be reconciled. All works in the dataset that had been attributed to any of these variants were re-attributed to this canonical form. Typically, I chose the identity with the most amount of works to be the canonical form.
###### *Cell 18*

In [13]:
# Merge variants of "T.L. Solien"
dataset['artist'] = dataset['artist'].replace(['T. L. Solien', 'Solien T.L.'], 'T.L. Solien')

# Merge variants of "Kota Ezawa"
dataset['artist'] = dataset['artist'].replace('After Kota Ezawa', 'Kota Ezawa')

# Merge variants of "John Andre Gundelfinger"
dataset['artist'] = dataset['artist'].replace('John Gundelfinger', 'John Andre Gundelfinger')

# Merge variants of "Richard Woods"
dataset['artist'] = dataset['artist'].replace('RICHARD WOODS', 'Richard Woods')

As I do not have the expertise of an art curator, I was necessarily conservative in discovering links between the various identities. Beyond the clerical variants, I did not make further assumptions about any other pair of names, and it is possible that some pseudonyms still lie undiscovered.

Now that the works have been (sufficiently) correctly attributed, some artists now have more works to their name, which more accurately reflects their perceived popularity in the online gallery. I computed this number for each artist and added them to each of their works as a numerical feature: `works_by_artist`, as I had introduced in the beginning of this section.

Notably, I assigned a value of 0 to the work with the unknown artist. While it is certain that this work was created by someone, the absence of a traceable or recognisable identity removes any potential authorship premium. From a buyer’s perspective, unknown authorship introduces uncertainty and eliminates the possibility of prior recognition, institutional validation, or reputational signalling. Even artists represented by a single work retain a non-zero probability of name recognition, whereas an unknown artist does not. For this reason, I treat the unknown artist as having zero name-based recognition. Any value attributed to this work must therefore arise from other observable characteristics, such as aesthetic qualities, condition, or alignment with prevailing trends.
###### *Cell 19*

In [14]:
artist_popularity = dataset['artist'].value_counts()
dataset.insert(loc = 2, column = 'works_by_artist', value = dataset['artist'].map(artist_popularity))
dataset.loc[725, 'works_by_artist'] = 0
dataset

Unnamed: 0,price,artist,works_by_artist,title,yearCreation,signed,condition,period,movement
0,28500,Tommaso Ottieri,1,Bayreuth Opera,2021,Signed on verso,This work is in excellent condition.,Contemporary,Baroque
1,3000,Pavel Tchelitchew,1,Drawings of the Opera,First Half 20th Century,Signed and titled,Not examined out of frame.No obvious signs of ...,Post-War,Surrealism
2,5000,Leo Gabin,1,Two on Sidewalk,2016,"Signed, titled and dated on verso",This work is in excellent condition.,Contemporary,Abstract
3,5000,Matthias Dornfeld,1,Blumenszene,2010,"Signed, titled and dated on the reverse with t...",This work is in excellent condition.There is m...,Contemporary,Abstract
4,2500,Alexis Marguerite Teplin,1,Feverish Embarkation,2001,Signed on verso,This work is in excellent condition.,Contemporary,Abstract
...,...,...,...,...,...,...,...,...,...
749,680,Jane Kent,4,Miracle Grow #17,2012,Signed and dated on lower right.,Not examined out of frame.No obvious signs of ...,Contemporary,Abstract
750,1275,Gary Bower,1,Rolph Series,1970,[nan],Not examined out of frame.Significant undulati...,Contemporary,Geometric Abstraction
751,680,Jane Kent,4,Untitled,2012,[nan],Not examined out of frame.No apparent imperfec...,Contemporary,Geometric Abstraction
752,1275,T.L. Solien,9,Juniper,1986,[nan],Not examined outside of frame.Pinholes at edge...,Contemporary,Abstract


While `works_by_artist` sought to capture the effect of confidence derived from seeing many works from the same artist, individual artists carry their own reputations too. Just knowing that a work was created by a renowned artist can add to its perceived value. I sought to encode this aforementioned "authorship premium" by adding a column for each artist, denoting whether a work was attributed to them. This allows a model to learn the "value" of an artist's name, independent of the qualities and aesthetics of their works.

However, as it was seen in the preview of the dataset above, and as seen below, the majority of artists appear only once: 318 out of 449.
###### *Cell 20*

In [15]:
artist_popularity = artist_popularity.drop('unknown')
artist_attribution_thresholds = artist_popularity.value_counts().sort_index(ascending = False)
artist_attribution_thresholds

count
17      1
15      1
13      1
9       2
8       3
7       4
6       5
5       5
4      10
3      27
2      72
1     318
Name: count, dtype: int64

Dummy variables are extremely sparse; a column corresponding to an artist with a single work would take the value 1 for exactly one row and 0 everywhere else. As such a feature carries no repeatable signal, its coefficient would be estimated from a single observation and would therefore primarily capture noise rather than a stable effect attributable to the artist’s reputation. Including hundreds of such sparse features would greatly increase the dimensionality of the dataset without providing meaningful predictive power, increasing the risk of overfitting. At the same time, I consider artist indicators to act as tags rather than numerical features. I'm not measuring the *Pavel Tchelitchew*-ness of a work just like I am the price or year of creation. Instead, if a work had been made by Pavel Tchelitchew, the value associated with the tag is added to its predicted price, like a bias. Under this reasoning, one-work artists account for 318 of the 754 works (42%), and excluding these tags from being learned would remove any possibility of modelling name-based effects for a large subset of works.

In the end, I decided to track every single artist (apart from "unknown") in a separate DataFrame.
###### *Cell 21*

In [16]:
artist_attributions = pd.DataFrame(
    {
        artist: (dataset['artist'] == artist).astype(int)
        for artist in artist_popularity.to_dict()
    },
    index=dataset.index
)
artist_attributions

Unnamed: 0,Russell Young,John Fischer,Ruth Bernhard,T.L. Solien,Donald Sultan,Grant Hacking,Ed Ruscha,Richard Bernstein,Cindy Sherman,Robert Indiana,...,Anish Kapoor,Charline von Heyl,Patricia Treib,Guy Pène Du Bois,Jason Bereswill,Lita Albuquerque,Louis Fabien,Kehinde Wiley,Math Bass,John Duff
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
750,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
751,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
752,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


This completes the first set of features that I had introduced in the beginning of the section: dummy encodings for prolific artists. `artist_attributions` is designed to be modular; the columns are ordered in descending artist representation, and leftmost columns represent artists who have the most works. Rather than permanently committing to a fixed subset of artist indicators, a selected subset of its columns will be appended to the dataset. `artist_attribution_thresholds`, as shown in cell 20, is a reference for how many of the columns to include for training. It shows that the first 3 columns represent artists who have 13 or more works. The next 19 columns represented artists who have 5 or more works, and so on. By progressively including more columns and therefore less represented artists, I hoped to find the balance between the comprehensive tracking of artists' values and the noisiness of the resulting artist tags.

## `title`
I acknowledge that the title of an artwork influences a prospective buyer. First impressions matter in a marketplace of many works, and an engaging title would help a particular piece stand out to them, rather than be glossed over. However, most titles are unique to a single work and there is no obvious ordinal or numerical structure. To use title as an input to a linear regression model would require transforming the text into a set of bag-of-words features, which, due to the small sample size and large word space, would result in high sparsity and greatly increase the risk of overfitting. Therefore, I decided to retain `title` for my reference, but exclude it from the set of predictive features.

I created a filter to exclude this column.
###### *Cell 22*

In [17]:
training_features = dataset.columns
training_features = training_features.drop(['artist', 'title'])

pd.concat([dataset[training_features], artist_attributions], axis = 1)

Unnamed: 0,price,works_by_artist,yearCreation,signed,condition,period,movement,Russell Young,John Fischer,Ruth Bernhard,...,Anish Kapoor,Charline von Heyl,Patricia Treib,Guy Pène Du Bois,Jason Bereswill,Lita Albuquerque,Louis Fabien,Kehinde Wiley,Math Bass,John Duff
0,28500,1,2021,Signed on verso,This work is in excellent condition.,Contemporary,Baroque,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3000,1,First Half 20th Century,Signed and titled,Not examined out of frame.No obvious signs of ...,Post-War,Surrealism,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5000,1,2016,"Signed, titled and dated on verso",This work is in excellent condition.,Contemporary,Abstract,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5000,1,2010,"Signed, titled and dated on the reverse with t...",This work is in excellent condition.There is m...,Contemporary,Abstract,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2500,1,2001,Signed on verso,This work is in excellent condition.,Contemporary,Abstract,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,680,4,2012,Signed and dated on lower right.,Not examined out of frame.No obvious signs of ...,Contemporary,Abstract,0,0,0,...,0,0,0,0,0,0,0,0,0,0
750,1275,1,1970,[nan],Not examined out of frame.Significant undulati...,Contemporary,Geometric Abstraction,0,0,0,...,0,0,0,0,0,0,0,0,0,0
751,680,4,2012,[nan],Not examined out of frame.No apparent imperfec...,Contemporary,Geometric Abstraction,0,0,0,...,0,0,0,0,0,0,0,0,0,0
752,1275,9,1986,[nan],Not examined outside of frame.Pinholes at edge...,Contemporary,Abstract,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Since a model would have no need for `artist` or `title`, this DataFrame is closer to what it would be trained on. I will remove more columns from this filter as I explain the further preprocessing steps below.

## `yearCreation`

In [18]:
year_conversion = dataset['yearCreation'].value_counts(dropna = False).to_frame()
year_conversion

Unnamed: 0_level_0,count
yearCreation,Unnamed: 1_level_1
2012,34
1990,28
1989,23
2008,23
1986,21
...,...
1931,1
1954,1
Circa 1991,1
Circa 2001,1


In [19]:
def rule_numerical_year(year) :
    if year.isnumeric() :
        return int(year) + 0.5
    return None

def rule_numerical_year_interval(year) :
    if year.isnumeric() :
        return 1
    return None

year_conversion['year'] = year_conversion.index.map(rule_numerical_year)
year_conversion['year_interval'] = year_conversion.index.map(rule_numerical_year_interval)

unconverted_strings = year_conversion['year'].isna()
year_conversion[~unconverted_strings]

Unnamed: 0_level_0,count,year,year_interval
yearCreation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012,34,2012.5,1.0
1990,28,1990.5,1.0
1989,23,1989.5,1.0
2008,23,2008.5,1.0
1986,21,1986.5,1.0
...,...,...,...
1894,1,1894.5,1.0
1944,1,1944.5,1.0
1941,1,1941.5,1.0
1931,1,1931.5,1.0


In [20]:
def rule_circa_year(year) :
    if not year.startswith('Circa '):
        return None
    year = year[6:]
    if year.isnumeric():
        return int(year) + 0.5
    return None

def rule_circa_year_interval(year) :
    if not year.startswith('Circa '):
        return None
    year = year[6:]
    if year.isnumeric():
        return 5
    return None

year_conversion.loc[unconverted_strings, 'year'] = year_conversion.loc[unconverted_strings].index.map(rule_circa_year)
year_conversion.loc[unconverted_strings, 'year_interval'] = year_conversion.loc[unconverted_strings].index.map(rule_circa_year_interval)

unconverted_strings = year_conversion['year'].isna()
year_conversion[unconverted_strings]

Unnamed: 0_level_0,count,year,year_interval
yearCreation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
[nan],19,,
Second Half 20th Century,17,,
21st Century,7,,
Late 20th Century,7,,
Mid 20th Century,3,,
Late 19th Century,3,,
Early 20th Century,3,,
First Half 20th Century,2,,
19th Century,2,,
20th Century,2,,


In [21]:
def rule_year_range(year) :
    if year.startswith('Circa '):
        year = year[6:]
    start_end = year.replace(' ', '').split('-')
    if not len(start_end) == 2 :
        return None
    if not (start_end[0].isnumeric() and start_end[1].isnumeric()) :
        return None
    return (int(start_end[0]) + int(start_end[1])) / 2 + 0.5

def rule_year_range_interval(year) :
    if year.startswith('Circa '):
        year = year[6:]
    start_end = year.replace(' ', '').split('-')
    if not len(start_end) == 2 :
        return None
    if not (start_end[0].isnumeric() and start_end[1].isnumeric()) :
        return None
    return int(start_end[1]) - int(start_end[0]) + 1

year_conversion.loc[unconverted_strings, 'year'] = year_conversion.loc[unconverted_strings].index.map(rule_year_range)
year_conversion.loc[unconverted_strings, 'year_interval'] = year_conversion.loc[unconverted_strings].index.map(rule_year_range_interval)

unconverted_strings = year_conversion['year'].isna()
year_conversion[unconverted_strings].index

Index(['[nan]', 'Second Half 20th Century ', '21st Century ',
       'Late 20th Century ', 'Mid 20th Century ', 'Late 19th Century ',
       'Early 20th Century ', 'First Half 20th Century ', '19th Century ',
       '20th Century ', 'Second Half 19th Century ', '1961, printed in 2010',
       '3D printed using ABS, PLA plastics, resin, automobile paints, etched brass, dry transfers, acrylic mirror, batik fabric, quartz clock',
       '1998 / 2011', 'Printed 1984'],
      dtype='object', name='yearCreation')

In [22]:
dataset[dataset['yearCreation'] == '1998 / 2011']

Unnamed: 0,price,artist,works_by_artist,title,yearCreation,signed,condition,period,movement
384,650,David Levinthal,1,Barbie 64,1998 / 2011,Signed and numbered in ink verso,This work is in excellent condition with no si...,Contemporary,Pop Art


In [23]:
year_conversion.loc['Second Half 20th Century ', ['year', 'year_interval']] = [1975, 50]
year_conversion.loc['21st Century ', ['year', 'year_interval']] = [2011.11, 22.22]
year_conversion.loc['2022', ['year', 'year_interval']] = [2022.11, 0.22] # By this logic, "2022" should also only span from January the 1st to March the 20th.
year_conversion.loc['Late 20th Century ', ['year', 'year_interval']] = [1983 + 1 / 3, 100 / 3]
year_conversion.loc['Mid 20th Century ', ['year', 'year_interval']] = [1950, 100 / 3]
year_conversion.loc['Late 19th Century ', ['year', 'year_interval']] = [1883 + 1 / 3, 100 / 3]
year_conversion.loc['Early 20th Century ', ['year', 'year_interval']] = [1916 + 2 / 3, 100 / 3]
year_conversion.loc['First Half 20th Century ', ['year', 'year_interval']] = [1925, 50]
year_conversion.loc['19th Century ', ['year', 'year_interval']] = [1850, 100]
year_conversion.loc['20th Century ', ['year', 'year_interval']] = [1950, 100]
year_conversion.loc['Second Half 19th Century ', ['year', 'year_interval']] = [1875, 50]
year_conversion.loc['1961, printed in 2010', ['year', 'year_interval']] = [2010.5, 1]
year_conversion.loc[year_conversion.index.str.contains('3D'), ['year', 'year_interval']] = [2019.5, 1]
year_conversion.loc['1998 / 2011', ['year', 'year_interval']] = [2011.5, 1]
year_conversion.loc['Printed 1984', ['year', 'year_interval']] = [1984.5, 1]
year_conversion.loc['[nan]', ['year', 'year_interval']] = [1911.11, 222.22] # January the 1st, 1800, to March the 20th, 2022.

year_conversion[year_conversion['year'].isna()]

Unnamed: 0_level_0,count,year,year_interval
yearCreation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [24]:
dataset.insert(4, 'year', dataset['yearCreation'].map(year_conversion['year']))
dataset.insert(5, 'year_interval', dataset['yearCreation'].map(year_conversion['year_interval']))
dataset = dataset.drop(columns = ['yearCreation'])
dataset

Unnamed: 0,price,artist,works_by_artist,title,year,year_interval,signed,condition,period,movement
0,28500,Tommaso Ottieri,1,Bayreuth Opera,2021.5,1.0,Signed on verso,This work is in excellent condition.,Contemporary,Baroque
1,3000,Pavel Tchelitchew,1,Drawings of the Opera,1925.0,50.0,Signed and titled,Not examined out of frame.No obvious signs of ...,Post-War,Surrealism
2,5000,Leo Gabin,1,Two on Sidewalk,2016.5,1.0,"Signed, titled and dated on verso",This work is in excellent condition.,Contemporary,Abstract
3,5000,Matthias Dornfeld,1,Blumenszene,2010.5,1.0,"Signed, titled and dated on the reverse with t...",This work is in excellent condition.There is m...,Contemporary,Abstract
4,2500,Alexis Marguerite Teplin,1,Feverish Embarkation,2001.5,1.0,Signed on verso,This work is in excellent condition.,Contemporary,Abstract
...,...,...,...,...,...,...,...,...,...,...
749,680,Jane Kent,4,Miracle Grow #17,2012.5,1.0,Signed and dated on lower right.,Not examined out of frame.No obvious signs of ...,Contemporary,Abstract
750,1275,Gary Bower,1,Rolph Series,1970.5,1.0,[nan],Not examined out of frame.Significant undulati...,Contemporary,Geometric Abstraction
751,680,Jane Kent,4,Untitled,2012.5,1.0,[nan],Not examined out of frame.No apparent imperfec...,Contemporary,Geometric Abstraction
752,1275,T.L. Solien,9,Juniper,1986.5,1.0,[nan],Not examined outside of frame.Pinholes at edge...,Contemporary,Abstract


## `signed`

In [25]:
signed_conversion = dataset['signed'].value_counts(dropna = False).to_frame()
signed_conversion

Unnamed: 0_level_0,count
signed,Unnamed: 1_level_1
[nan],153
Signed lower right,29
Signed verso,15
Signed lower right recto,11
Signed and dated lower right recto,9
...,...
Signed Linda Besse; dated © 2002 (lower left),1
Signed Grant Hacking (lower left),1
Signed FETHEROLF. (lower right); titled (on the stretcher),1
Signed Fetherolf (lower right); titled (on the stretcher),1


In [26]:
signed_conversion['is_signed'] = (signed_conversion.index != '[nan]').astype(int)
signed_conversion

Unnamed: 0_level_0,count,is_signed
signed,Unnamed: 1_level_1,Unnamed: 2_level_1
[nan],153,0
Signed lower right,29,1
Signed verso,15,1
Signed lower right recto,11,1
Signed and dated lower right recto,9,1
...,...,...
Signed Linda Besse; dated © 2002 (lower left),1,1
Signed Grant Hacking (lower left),1,1
Signed FETHEROLF. (lower right); titled (on the stretcher),1,1
Signed Fetherolf (lower right); titled (on the stretcher),1,1


In [27]:
regex_year_range = re.compile(r'(1[8-9]\d{2}|20[0-1]\d|202[0-2])')
regex_2_digit_year = re.compile(r'[’\']\d{2}\b')
def rule_is_dated(value) :
    if 'date' in value.lower() :
        return 1
    if regex_year_range.search(value) :
        return 1
    if regex_2_digit_year.search(value) :
        return 1
    return 0

signed_conversion['is_dated'] = signed_conversion.index.map(rule_is_dated)
signed_conversion

Unnamed: 0_level_0,count,is_signed,is_dated
signed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
[nan],153,0,0
Signed lower right,29,1,0
Signed verso,15,1,0
Signed lower right recto,11,1,0
Signed and dated lower right recto,9,1,1
...,...,...,...
Signed Linda Besse; dated © 2002 (lower left),1,1,1
Signed Grant Hacking (lower left),1,1,0
Signed FETHEROLF. (lower right); titled (on the stretcher),1,1,0
Signed Fetherolf (lower right); titled (on the stretcher),1,1,0


In [28]:
regex_slash = re.compile(r'\d/\d')
def rule_is_numbered(value) :
    if 'number' in value.lower() :
        return 1
    if 'edition' in value.lower() :
        return 1
    if regex_slash.search(value) :
        return 1
    return 0

signed_conversion['is_numbered'] = signed_conversion.index.map(rule_is_numbered)
signed_conversion

Unnamed: 0_level_0,count,is_signed,is_dated,is_numbered
signed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
[nan],153,0,0,0
Signed lower right,29,1,0,0
Signed verso,15,1,0,0
Signed lower right recto,11,1,0,0
Signed and dated lower right recto,9,1,1,0
...,...,...,...,...
Signed Linda Besse; dated © 2002 (lower left),1,1,1,0
Signed Grant Hacking (lower left),1,1,0,0
Signed FETHEROLF. (lower right); titled (on the stretcher),1,1,0,0
Signed Fetherolf (lower right); titled (on the stretcher),1,1,0,0


In [31]:
signed_conversion['is_titled'] = signed_conversion.index.str.lower().str.contains('title').astype(int)
signed_conversion

Unnamed: 0_level_0,count,is_signed,is_dated,is_numbered,is_titled
signed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
[nan],153,0,0,0,0
Signed lower right,29,1,0,0,0
Signed verso,15,1,0,0,0
Signed lower right recto,11,1,0,0,0
Signed and dated lower right recto,9,1,1,0,0
...,...,...,...,...,...
Signed Linda Besse; dated © 2002 (lower left),1,1,1,0,0
Signed Grant Hacking (lower left),1,1,0,0,0
Signed FETHEROLF. (lower right); titled (on the stretcher),1,1,0,0,1
Signed Fetherolf (lower right); titled (on the stretcher),1,1,0,0,1


In [96]:
regex_slash = re.compile(r'\d/\d')
def rule_is_authenticated(value) :
    if 'certifi' in value.lower() :
        return 1
    if 'coa' in value.lower() :
        return 1
    if 'authenti' in value.lower() :
        return 1
    if 'proof' in value.lower() :
        return 1
    if 'estate' in value.lower() :
        return 1
    if 'archive' in value.lower() :
        return 1
    if 'gallery' in value.lower() :
        return 1
    if 'studio' in value.lower() :
        return 1
    if 'house' in value.lower() :
        return 1
    if 'stamp' in value.lower() :
        return 1
    if 'chop' in value.lower() :
        return 1
    if 'seal' in value.lower() :
        return 1
    if 'label' in value.lower() :
        return 1
    return 0

signed_conversion['is_authenticated'] = signed_conversion.index.map(rule_is_authenticated)
signed_conversion

Unnamed: 0_level_0,count,is_signed,is_dated,is_numbered,is_titled,is_authenticated
signed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
[nan],153,0,0,0,0,0
Signed lower right,29,1,0,0,0,0
Signed verso,15,1,0,0,0,0
Signed lower right recto,11,1,0,0,0,0
Signed and dated lower right recto,9,1,1,0,0,0
...,...,...,...,...,...,...
Signed Linda Besse; dated © 2002 (lower left),1,1,1,0,0,0
Signed Grant Hacking (lower left),1,1,0,0,0,0
Signed FETHEROLF. (lower right); titled (on the stretcher),1,1,0,0,1,0
Signed Fetherolf (lower right); titled (on the stretcher),1,1,0,0,1,0


In [86]:
signed_conversion[signed_conversion['is_authenticated'] == 1]

Unnamed: 0_level_0,count,is_signed,is_dated,is_numbered,is_titled,is_authenticated
signed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
The Estate of Richard Bernstein stamp on verso,4,1,0,0,0,1
Signed certificate of authenticity,3,1,0,0,0,1
Signed and numbered on label by the artist,2,1,0,1,0,1
Signed in ink on the photographer's label on the reverse,2,1,0,0,0,1
Signed on label on verso,2,1,0,0,0,1
"Signed, titled and dated on the reverse with the artist's stamp",1,1,1,0,1,1
"Signed and editioned '37 of 99' in pencil by Gary Truman, the photographer's archive manager, and Donna Schulke, the photographer's widow, on the reverse.",1,1,1,1,0,1
"Each edition is accompanied by a colophon, signed, and numbered by the artist and stamped with his seal",1,1,0,1,0,1
Stamped Pilat lower right verso,1,1,0,0,0,1
Stamped Pilat lower right verso,1,1,0,0,0,1


- is_titled - is_inscribed - has_certificate - has_stamp

In [135]:
dataset[dataset['signed'].str.contains('Keller')]

Unnamed: 0,price,artist,works_by_artist,title,year,year_interval,signed,condition,period,movement
404,8000,Walker Evans,1,Passengers on the Subway,1939.5,1.0,The photographer's credit stamp (Keller B) and...,"This print, on double-weight paper with a slig...",Modern,Realism


## `condition`

In [135]:
for i in dataset['condition'].value_counts().items() :
    print(i)

('Excellent condition.', 82)
('The work is in excellent condition, direct from the publisher.', 50)
('This work is in excellent condition, direct from the publisher.', 44)
('This work is in very good condition.Not examined out of frame.No obvious signs of wear to art.', 21)
('This work is in excellent condition.', 18)
('Very good condition', 15)
('This work is in very good condition.Not examined outside of frame.No obvious signs of damage to artwork.', 14)
('The work is in excellent condition.', 13)
('This work is in very good condition.No visible signs of wear to artwork.Not examined out of frame.', 12)
('This work is in very good condition.Artwork not examined outside of frame.No obvious signs of wear to artwork.', 12)
('No obvious signs of wear.', 9)
('No obvious signs of damage or wear.', 8)
('Not examined out of frame.No obvious signs of wear.', 7)
('[nan]', 7)
('Not examined out of frame.No obvious signs of wear to art.', 7)
('This work is in very good condition.Not examined out 

## `period`

In [160]:
dataset['period'].value_counts()

period
Contemporary    414
Post-War        285
Modern           42
19th Century     12
[nan]             1
Name: count, dtype: int64

## `movement`

In [159]:
dataset['movement'].value_counts()

movement
Realism                                  177
Abstract                                 153
Expressionism                            103
Pop Art                                   88
Conceptual                                73
Surrealism                                21
Impressionism                             20
Geometric Abstraction                     19
Minimalism                                18
Abstract Expressionism                    16
Feminist Art                               7
Traditional                                5
Organic/Biomorphic Abstraction             5
Nouveau Réalisme                           4
[nan]                                      4
Post-Minimalism                            4
Post-Impressionism                         4
Social Realism                             4
Photorealism                               4
Modernism                                  3
Performance Art                            3
Street Art                                 3
E

# Statistical analysis
Mean median? Spread kurtosis
Interpretation
# Visualisations
## Correlation between `period` and `year`
What is the most impactful visualisation?