# Setup
## Downloading dependencies
###### *Cell 1*

In [None]:
from IPython.display import clear_output
# If it ain't here, you pip it. https://www.w3schools.com/python/python_ref_modules.asp
!pip install --upgrade kaggle
!pip install --upgrade pandas
clear_output()

## Importing dependencies
###### *Cell 2*

In [None]:
import os
import tkinter as tk
from tkinter import filedialog
import json
from pathlib import Path
import numpy as np
import pandas as pd

## Initialising the Kaggle CLI
### Option #1: New token
If you don't already have a token or have lost the file to your current token, in the settings of your Kaggle account, click on the button 'Generate New Token'. Follow the instructions and copy the alphanumeric string at the top when such a floating dialogue appears. Then, run cell 3 and paste the key when prompted.
###### *Cell 3*

In [None]:
def kaggle_key() :
    '''
    Prompt for a Kaggle API token and, if one is entered, activate and save it.

    Surrounding whitespace is stripped from the pasted text. When more than
    three characters remain, the token is written to api.txt in the working
    directory as `KAGGLE_API_TOKEN {key}`, exported as the KAGGLE_API_TOKEN
    environment variable, and the KAGGLE_USERNAME / KAGGLE_KEY variables are
    removed from the environment.

    Returns:
        True if a token was accepted, False otherwise.
    '''
    # Strip so that a whitespace-only paste is not mistaken for a token and
    # stray padding never ends up in the environment variable.
    key = input('Paste your token here, or leave empty if your token comes in the form of a JSON file: ').strip()
    # Clear the cell output, presumably so the pasted token does not stay on screen.
    clear_output()
    if len(key) <= 3 :
        return False

    # Create api.txt in working directory, where `KAGGLE_API_TOKEN {key}`.
    # NOTE: the token is stored unencrypted; keep api.txt out of version control.
    api_file = Path("api.txt")
    with api_file.open("w", encoding="utf-8") as f:
        f.write('KAGGLE_API_TOKEN ' + key)
    os.environ['KAGGLE_API_TOKEN'] = key
    # Remove the username/key pair so only one kind of credential is set.
    os.environ.pop('KAGGLE_USERNAME', None)
    os.environ.pop('KAGGLE_KEY', None)
    print('The last four characters of your API key are: ' + key[-4:] + '. If you suspect that you have entered something wrong, run cell 4 again. Otherwise, you may move to the next section.')
    return True
    
def legacy_kaggle_key() :
    '''
    Ask the user to pick a JSON credentials file and load what it contains.

    Opens a file dialog. If the chosen file holds a JSON object whose
    'username' and 'key' entries are both strings, they are written to api.txt
    in the working directory (as `KAGGLE_USERNAME ...` and `KAGGLE_KEY ...`
    lines), exported as the KAGGLE_USERNAME and KAGGLE_KEY environment
    variables, and KAGGLE_API_TOKEN is removed from the environment. In every
    other case a message describing the problem is printed and nothing is
    changed.

    Returns:
        None in every case; the outcome is reported through printed messages.

    Raises:
        OSError: if api.txt cannot be written.
    '''
    root = tk.Tk()
    try :
        # Hide the empty root window and keep the dialog above other windows.
        root.withdraw()
        root.call('wm', 'attributes', '.', '-topmost', True)

        file_path = tk.filedialog.askopenfilename(
            title='Find the file with your Kaggle API key...',
            filetypes=(('JSON files', '*.json'), ('All files', '*.*'))
        )
    finally :
        # Always release the Tk root, even if the dialog itself fails.
        root.destroy()

    if not file_path :
        # Case: User closes the dialog without selecting anything
        print('No file selected. Run cell 5 again if you like.')
        return

    # Read and parse the file. Only reading/decoding/parsing failures are
    # treated as "not valid JSON"; json.JSONDecodeError and UnicodeDecodeError
    # are both subclasses of ValueError.
    try :
        with open(file_path, 'r', encoding='utf-8') as f :
            data = json.load(f)
    except (OSError, ValueError) :
        print('This file may not contain valid JSON. To find another file, run this cell again.')
        return

    # Valid JSON need not be an object (it could be a list or a number), and
    # either field may be absent, so look both up defensively.
    username = data.get('username') if isinstance(data, dict) else None
    key = data.get('key') if isinstance(data, dict) else None

    if not (isinstance(username, str) and isinstance(key, str)) :
        # Case: User selects a JSON file, but it's not one that contains their credentials
        print('The \'username\' or \'key\' field is missing or invalid. To find another file, run this cell again.')
        return

    # Case: User selects the correct JSON file that contains their API key
    # Create api.txt in working directory, where `KAGGLE_USERNAME {username}` and `KAGGLE_KEY {key}`.
    # NOTE: the key is stored unencrypted; keep api.txt out of version control.
    api_file = Path("api.txt")
    with api_file.open("w", encoding="utf-8") as f:
        f.write('KAGGLE_USERNAME ' + username + '\nKAGGLE_KEY ' + key)
    os.environ['KAGGLE_USERNAME'] = username
    os.environ['KAGGLE_KEY'] = key
    # Remove the token variable so only one kind of credential is set.
    os.environ.pop('KAGGLE_API_TOKEN', None)
    print('The last four characters of your API key are: ' + key[-4:] + '. If you suspect that this is not an alphanumeric string, find another file by running cell 5 again. Otherwise, you may move to the next section.')

# Restore credentials saved by an earlier run, if any. Each non-blank line of
# api.txt that does not start with '#' has the form `{key} {value}` and is
# exported as the environment variable `key` with value `value`.
# If no credentials are available afterwards, run `kaggle_key()`. If that
# returns `False`, then run `legacy_kaggle_key()`.

api_file = Path('api.txt')

if api_file.exists() :
    try :
        with api_file.open('r', encoding='utf-8') as f :
            for line in f :
                line = line.strip()
                if not line or line.startswith('#') :
                    continue

                parts = line.split(None, 1)  # split on first whitespace
                if len(parts) != 2 :
                    # A name without a value; nothing to export.
                    continue

                env_key, env_value = parts
                os.environ[env_key] = env_value

    except (OSError, UnicodeDecodeError) as e :
        # str(e) is required: concatenating the exception object itself to a
        # string raises TypeError. UnicodeDecodeError covers a file that is
        # not valid UTF-8.
        print('Found api.txt but could not read it:\n' + str(e))
else :
    print('api.txt was not found.')

# Either kind of credential counts; the token takes precedence for the preview.
preview = os.environ.get('KAGGLE_API_TOKEN') or os.environ.get('KAGGLE_KEY')
if preview:
    print('Loaded existing credentials from api.txt. The last four characters of your API key are: ' + preview[-4:] + '. If that looks wrong, run cell 4 or 5.')
else:
    # No usable credentials are set; go through interactive flow
    if not kaggle_key():
        legacy_kaggle_key()

###### *Cell 4*

In [None]:
# Prompt for a token again; kaggle_key() returns False when the text entered
# is too short to be accepted as a key.
if not kaggle_key() :
    print('You probably didn\'t enter a valid key. Run this cell again if you like.')

###### *Cell 5*

In [None]:
# Re-open the file picker to load 'username' and 'key' from a JSON file.
legacy_kaggle_key()

###### *Cell 6*

In [None]:
kaggle_module_description = !pip show kaggle
os.environ['PATH'] = os.environ['PATH'] + kaggle_module_description[-3][10:-13] + 'Scripts;'
!kaggle datasets download flkuhm/art-price-dataset -p dataset -f artDataset.csv

I assume:
- Your kernel is running on Python 3.13, and Windows 11.
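- You had 'tcl/tk and IDLE' checked when this Python environment was installed. In other words, if you were to create and run a cell anywhere in this notebook with the line `import tkinter`, it would run without raising `ModuleNotFoundError`. (`tkinter` ships with the Python installer rather than through pip, so it never appears in the output of `!pip freeze`.)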
- You are not running the kernel with any virtual environment.
# Preparing the dataset
###### *Cell 7*

In [None]:
# Keep the frame exactly as loaded under `raw_dataset`. `dataset` starts out as
# an independent copy, so a change made through one name is never visible
# through the other.
raw_dataset = pd.read_csv('dataset/artDataset.csv')
dataset = raw_dataset.copy()
raw_dataset

The current columns are:
- `Unnamed: 0`: why even lol
- `price`: Numerical
- `artist`: Categorical
- `title`: Not needed?
- `yearCreation`: Numerical, but can be split into a categorical component
- `signed`: Word frequency
- `condition`: Word frequency
- `period`: Categorical, but might line up with `yearCreation`.
- `movement`: Categorical or word frequency

## `Unnamed: 0`
At first glance, the values in this column line up with the values as prescribed by the leftmost index column. There are a variety of parameters when using `pandas.DataFrame.to_csv()` to save a pandas DataFrame to `.csv`. If there is at least one column filled entirely with unique, non-empty values, `index_label` can be used to designate one of them as the index column. Otherwise, `index` can be used to influence whether a new column of indices is created. If an index-like column already exists in the DataFrame, but isn't designated as such, pandas will treat it like any other column, as it could contain real information. As the `Unnamed: 0` column seems to be a common enough phenomenon within datasets uploaded to Kaggle (https://www.kaggle.com/discussions/general/354943), I believe this is what happened in the creation of this dataset.

To find out if `Unnamed: 0` is effectively an index column, I created a filter to find any rows whose `Unnamed: 0` value is different from the index column's.
###### Cell 8

In [None]:
# Select the rows whose `Unnamed: 0` value differs from their index label.
dataset.loc[dataset['Unnamed: 0'] != dataset.index]

There were none. Therefore, all rows had values which corresponded with their indices. Since `Unnamed: 0` is an index column and does not contain any other unique information, I chose to remove it.
###### Cell 9

In [None]:
# errors='ignore' keeps this cell safe to re-run: once the column is gone, a
# second drop would otherwise raise KeyError.
dataset = dataset.drop(columns = ['Unnamed: 0'], errors = 'ignore')

## `price`
First, I establish that these values are being saved as strings, and that for sorting to occur, I must convert them into a numerical representation. I observe that the prices are annotated with "USD". I show that all rows have prices in USD, which precludes any currency conversion. Then, as part of the conversion to a numerical representation, I remove " USD" from all of these values.

Second, I assume two things: that the period is only used as a thousands separator, like in the continental system, and that all prices are natural numbers. I show that there are no other characters in all the strings than `[0-9.]+`. Then, I show that all rows fit into one of two patterns: no period, or a period succeeded by 3 digits. Finally, I acknowledge the edge case where prices are shown to 3 decimal places. However, I believe this is highly unlikely as the US dollar is denominated down to a cent, which is a hundredth of a dollar, and invite the reader to manually review the dataset in case of further doubt. I proceed to remove the periods from all the values, and parse them all as numbers.

In order to demonstrate that the period is only used as a thousands separator, after removing the currency tag, I show that all entries fit into one of two patterns: no period, or a period succeeded by 3 digits.

In [None]:
# Display the price column as loaded, before any conversion.
raw_dataset.loc[:, 'price']

In [None]:
# Literal (non-regex) match. na=False counts a missing price as "not USD"
# instead of leaving NaN in the mask, which cannot be used to select rows.
in_usd = raw_dataset['price'].str.contains('USD', regex=False, na=False)
# Check the claim that every row is priced in USD instead of leaving it to
# visual inspection of the row count.
assert in_usd.all(), 'Some prices are not in USD: ' + repr(raw_dataset.loc[~in_usd, 'price'].head().tolist())
raw_dataset[in_usd]

Of course, within the second group, there could be entries whose prices are shown to three decimal places (i.e., thousandths of a US dollar). However, I believe this is highly unlikely as the US dollar is denominated down to a cent, which is a hundredth of a dollar. In case of any further doubts, you may manually review the dataset on Kaggle or a spreadsheet viewer of your choice. You may find and remove any entry whose price you feel should be reasonably interpreted as being a thousandth of a dollar, and run this section again.

In [None]:
# 1. Remove the " USD" text from the end of the strings
# regex=False treats ' USD' as a fixed string; strip() drops any stray padding.
amount_text = raw_dataset['price'].str.replace(' USD', '', regex=False).str.strip()

# 2. Check the thousands-separator assumption before throwing the periods away.
# Every amount must be digits, optionally followed by groups of a period and
# exactly three digits. Anything else (for example '12.50') would be silently
# mis-scaled by the next step, so stop here instead. Missing prices are let
# through and stay missing.
well_formed = amount_text.str.fullmatch(r'\d+(?:\.\d{3})*', na=True)
if not well_formed.all() :
    raise ValueError('Unexpected price format: ' + repr(amount_text[~well_formed].unique().tolist()[:10]))

# 3. Periods are thousands separators only, so removing them leaves whole dollars
cleaned_prices = amount_text.str.replace('.', '', regex=False)

# 4. Convert the resulting digit strings to a numeric data type
# (pd.to_numeric picks an integer dtype when every value is a whole number)
raw_dataset['price_numeric'] = pd.to_numeric(cleaned_prices)

# Check the results, cheapest first
raw_dataset.sort_values(by=['price_numeric'])