In [1]:
import numpy as np

import pandas as pd
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [2]:
df = pd.read_csv('../artists.csv', index_col='artFinderId')
df.shape

(6820, 12)

In [3]:
df.isna().any().sum()

0

In [4]:
len(df.loc[df.duplicated()])

84

In [5]:
# remove duplicates
df = df.loc[~df.duplicated()]
len(df)

6736

In [6]:
df.dtypes

accepts_commissions             bool
artistName                    object
artist_rating                  int64
country                       object
follows                        int64
is_artist                       bool
is_artist_of_the_day_today      bool
is_represented_by_gallery       bool
products_count                 int64
profession                    object
seller_rating                 object
slug                          object
dtype: object

In [7]:
# Dropped cols are all True and False respectively
df = df.drop(['is_artist', 'is_artist_of_the_day_today'], 1)

df.columns = ['commissions', 'name', 'artist_rating', 'country',
                   'follows', 'representation', 'num_prods',
                   'profession', 'seller_rating', 'slug']

In [8]:
# this col imports as all strings - some values are Null
# change seller_rating 'n' value to '0' and change dtype of col to int64
df.seller_rating = df.seller_rating.replace('n', '0').astype('int64')

In [9]:
df.dtypes

commissions         bool
name              object
artist_rating      int64
country           object
follows            int64
representation      bool
num_prods          int64
profession        object
seller_rating      int64
slug              object
dtype: object

In [10]:
df.describe()
# What is Artist Rating? Some internal designation?
# It ranges from 0-2 so it's not the number of stars on a profile,
# which is presumably Seller Rating.

Unnamed: 0,artist_rating,follows,num_prods,seller_rating
count,6736.0,6736.0,6736.0,6736.0
mean,0.643112,299.111787,73.194329,2.38851
std,0.857652,453.173626,194.346669,2.420166
min,0.0,0.0,3.0,0.0
25%,0.0,62.0,10.0,0.0
50%,0.0,152.0,25.0,0.0
75%,2.0,331.0,68.0,5.0
max,2.0,7889.0,7996.0,5.0


In [11]:
cat_cols = df.select_dtypes(exclude=np.number)

for col in cat_cols[['commissions', 'country',
                     'representation', 'profession']]:
    print(cat_cols[col].value_counts(dropna=False), '\n')

True     6240
False     496
Name: commissions, dtype: int64 

United Kingdom            2225
United States             1025
France                     474
Russia                     325
Ukraine                    289
Italy                      219
Germany                    215
Spain                      151
Canada                     135
Netherlands                120
Bulgaria                   119
Australia                  108
Poland                      97
Lithuania                   95
Serbia                      73
India                       65
Romania                     57
Armenia                     52
Ireland                     51
Czech Republic              47
Belgium                     45
Slovakia                    41
Belarus                     40
Greece                      39
Israel                      32
Portugal                    29
Hungary                     27
Sweden                      26
Mexico                      25
Slovenia                    23
Croatia 

In [25]:
# 8% of artists do NOT accept commissions
(df.commissions.value_counts().values[1] /
    df.commissions.value_counts().values[0]) * 100

# 11% of artists have gallery representation
(df.representation.value_counts().values[1] /
    df.representation.value_counts().values[0]) * 100

10.825929582099375