In [1]:
import pandas as pd
import numpy as np

import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from feature_engineering import functions
from utils import utils

In [4]:
# Import Christies dataset
# The CSV location is kept in its own variable so the read call stays short.
christies_csv_path = './data/data-1589349272172-non-french-christies.csv'
df = pd.read_csv(christies_csv_path)

In [5]:
# assign column names
# Names of the dataset columns that the feature-engineering and image helpers read.
title_desc_col, url_col = 'title', 'image_url'

# Medium and dimensions are both read from the same 'details' column.
medium_col = dimension_col = 'details'


In [7]:
# Run the project's text-based transformer over the medium / dimensions / title columns.
df_transformed_1 = functions.text_based_transformer(
    df,
    medium=medium_col,
    dimensions=dimension_col,
    title=title_desc_col,
)

# Pass the result through functions.is_drawing
# (presumably adds the 'is_drawing' column the model expects - confirm in feature_engineering).
df_transformed_2 = functions.is_drawing(df_transformed_1)

# Each 'details' value is coerced to str before it reaches the helper.
df_transformed_2['long_list_of_things'] = df_transformed_2['details'].apply(
    lambda raw_details: functions.long_list_of_things(str(raw_details))
)

# From here on `df` refers to the feature-enriched frame.
df = df_transformed_2



In [8]:
# Load model
model_filename = "model_3_DecisionTreeClassifier_v2features.pkl"
model_path = './models/{}'.format(model_filename)

# NOTE: pickle.load can execute arbitrary code stored in the file,
# so only load model files that come from a trusted source.
with open(model_path, 'rb') as model_fh:
    model_file = pickle.load(model_fh)

In [9]:
# Check model file: display the unpickled object to see which entries it carries
# (the cells below read its 'model', 'features' and 'labels' keys).
model_file

{'model': DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=10, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=5,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=None, splitter='best'),
 'features': ['long_list_of_things',
  'installation_keyword',
  'print_keyword',
  'drawing_paper_word',
  'drawing_keyword',
  'painting_keyword',
  'photo_keyword',
  'has_3dimensions',
  'furniture_keyword',
  'sculpture_keyword',
  'decorative_obj_keyword',
  'decorative_obj_keyword_med',
  'is_drawing'],
 'labels': ['Drawing',
  'Installation',
  'Painting',
  'Photograph',
  'Print',
  'Sculpture',
  'decorative_object',
  'furniture']}

In [10]:
# Split the loaded bundle into the estimator, its input columns and its output labels.
model, features, labels = (
    model_file[bundle_key] for bundle_key in ('model', 'features', 'labels')
)

# Check all Model Features Exist
# (selecting them raises a KeyError if any feature column is missing from df)
df[features]

Unnamed: 0,long_list_of_things,installation_keyword,print_keyword,drawing_paper_word,drawing_keyword,painting_keyword,photo_keyword,has_3dimensions,furniture_keyword,sculpture_keyword,decorative_obj_keyword,decorative_obj_keyword_med,is_drawing
0,False,False,False,False,False,True,False,False,False,False,False,False,False
1,True,False,False,True,False,False,True,False,False,False,False,False,True
2,False,False,False,False,False,False,False,True,False,False,False,False,False
3,False,True,False,False,False,False,False,True,False,True,False,False,False
4,False,False,False,False,False,True,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
38926,True,True,False,False,True,True,True,False,False,False,False,False,True
38927,True,False,False,False,False,False,False,True,False,True,False,False,False
38928,False,False,False,True,False,True,False,False,False,False,False,False,True
38929,True,False,False,False,True,False,True,True,False,False,False,False,True


In [13]:
# Get Test Results

# Predict on the engineered features. The prediction array is loaded into a
# frame with one column per entry of `labels`, indexed like `df`.
raw_predictions = model.predict(df[features])
y_pred = pd.DataFrame(columns=labels
                      , data=raw_predictions
                      , index=df.index)

# create label for the not classified result: 1 when no label column fired
y_pred['no_classification'] = y_pred.sum(axis=1).eq(0).astype(int)

# Collapse the indicator columns into exactly ONE class name per row while
# keeping df's index, so that df.join(df_classifications) lines up row-for-row.
# Stacking the frame and re-wrapping the non-zero entries in a fresh 0..n-1
# Series yields one entry per non-zero cell: a single row predicted with two
# labels (or a non-default df.index) would shift every later classification
# onto the wrong row.
# idxmax returns the first column holding the row maximum, so for rows with
# several labels the earliest entry of `labels` wins.
# NOTE(review): assumes the predictions are 0/1 indicators - confirm with the model.
results = y_pred.idxmax(axis=1).astype('category')

df_classifications = results.to_frame(name='Classification')

In [24]:
# Attach the predicted class to every row; join matches on the index (left join).
df_classified = df.join(df_classifications, how='left')

In [25]:
# Display the joined frame: original lot columns, engineered features and the predicted Classification.
df_classified

Unnamed: 0,index,sale_date,price,estimate_min,estimate_max,currency,artist,title,size,details,...,drawing_keyword,painting_keyword,photo_keyword,sculpture_keyword,decorative_obj_keyword_med,has_3dimensions,furniture_keyword,decorative_obj_keyword,is_drawing,Classification
0,24173,2010-09-22 00:00:00,16250,15000.0,20000.0,USD,Eberhard Havekost,Untitled \r,13 3/8 x 17 3/8 in. (33.9 x 45.4 cm.),"[""signed and dated Havekost DD 97 (on the reve...",...,False,True,False,False,False,False,False,False,False,Painting
1,24113,2010-09-22 00:00:00,13750,8000.0,12000.0,USD,Vito Acconci,Changing Title\r,30 x 14\u00bd in. (76.2 x 36.8 cm.),"[""signed, titled and dated Changing Title 1969...",...,False,False,True,False,False,False,False,False,True,Drawing
2,24125,2010-09-22 00:00:00,11875,12000.0,18000.0,USD,Lucio Fontana,Untitled \r,5\u00bc x 5 x 4 in. (13.3 x 12.7 x 10.1cm.),"[""incised with initials and dated LF 47 (on th...",...,False,False,False,False,False,True,False,False,False,Sculpture
3,24126,2010-09-22 00:00:00,47500,25000.0,35000.0,USD,Lynda Benglis,Untitled (Shady Grove) \r,34\u00bd x 19\u00bd x 3\u00be in. (87.6 x 49.5...,"[""signed with initials and numbered LB 282 (on...",...,False,False,False,True,False,True,False,False,False,Sculpture
4,24235,2010-11-03 00:00:00,626500,500000.0,700000.0,USD,Marc Chagall,Fleurs dans un vase u carreaux \r,19 5/8 x 16 in. (49.8 x 40.6 cm.),"[""signed Marc Chagall (lower right) \r"",""oil o...",...,False,True,False,False,False,False,False,True,False,decorative_object
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38926,24105,2010-09-22 00:00:00,80500,40000.0,60000.0,USD,Christo,Wrapped Walkways (Project for St. Stephens Gre...,28 x 22 in. (71.1 x 55.8 cm.),"[""signed and dated CHRISTO 1977 (upper right o...",...,True,True,True,False,False,False,False,False,True,Sculpture
38927,24107,2010-09-22 00:00:00,11250,10000.0,15000.0,USD,Kenny Scharf,Phorshine \r,20 x 14 x 14 in. (50.8 x 35.5 x 35.5 cm.),"[""signed, numbered and dated Kenny Scharf 96 3...",...,False,False,False,True,False,True,False,False,False,Sculpture
38928,24132,2010-09-22 00:00:00,74500,30000.0,40000.0,USD,Alexander Calder,Untitled \r,29\u00bd x 43 in. (74.9 x 109.2 cm.),"[""signed and dated Calder 66 (lower right) \r""...",...,False,True,False,False,False,False,False,False,True,Drawing
38929,24109,2010-09-22 00:00:00,25000,10000.0,15000.0,USD,Dario Robleto,"If We Fly Away, Theyll Fly Away... \r",24 x 16 x 16 in. (60.9 x 40.6 x 40. 6 cm.),"[""Bone dust, fabric, metal, silver, lead, viny...",...,True,False,True,False,False,True,False,False,True,no_classification


In [26]:
# Count rows per predicted class (non-null 'index' values), largest class first.
df_cat = (
    df_classified
    .groupby('Classification')[['index']]
    .count()
    .sort_values('index', ascending=False)
)


In [27]:
# Display the number of rows per predicted class, sorted in descending order.
df_cat

Unnamed: 0_level_0,index
Classification,Unnamed: 1_level_1
Painting,17335
Drawing,9590
Sculpture,5725
no_classification,3623
Photograph,798
decorative_object,766
Installation,626
Print,328
furniture,140


In [33]:
## Check out Furniture Examples

clas = 'furniture'

# Keep only the rows predicted as `clas`, then hand them to the image viewer.
is_selected_class = df_classified['Classification'] == clas
utils.view_images_from_url(
    df_classified[is_selected_class],
    url_col,
    subsample=True,
    sample_size=30,
    random_state=8,
)

21782


23196


32203


5117


1872


25366


8028


20884


7282


21878


20394


27094


20905


8692


11450


1082


12639


1824


4628


3840


30724


22917


24264


10885


36802


19367


12637


20081


34739


1336


In [None]:
# Some furniture classifications are wrong: the works are drawings that depict furniture

In [34]:
## Check out Installation Examples

clas = 'Installation'

# Keep only the rows predicted as `clas`, then hand them to the image viewer.
is_selected_class = df_classified['Classification'] == clas
utils.view_images_from_url(
    df_classified[is_selected_class],
    url_col,
    subsample=True,
    sample_size=30,
    random_state=0,
)

22590


17293


28484


18220


28768


2477


13118


30546


35484


12201


35524


31495


5847


3961


16067


33833


12100


28979


16005


32886


11448


10008


34369


23155


35377


31014


11723


22059


2295


13781


In [None]:
# Not great for Installation

In [38]:
## Check out Photograph Examples

clas = 'Photograph'

# Keep only the rows predicted as `clas`, then hand them to the image viewer.
is_selected_class = df_classified['Classification'] == clas
utils.view_images_from_url(
    df_classified[is_selected_class],
    url_col,
    subsample=True,
    sample_size=30,
    random_state=888,
)

38888


38124


13918


2365


472


2229


38922


17630


6573


38231


22722


8154


1565


29703


31195


4977


33940


28626


4350


26451


28983


14818


28394


2363


30884


23437


12228


8767


18291


14553


In [None]:
# Pretty good for Photograph

In [40]:
## Check out decorative_object Examples

clas = 'decorative_object'

# Keep only the rows predicted as `clas`, then hand them to the image viewer.
is_selected_class = df_classified['Classification'] == clas
utils.view_images_from_url(
    df_classified[is_selected_class],
    url_col,
    subsample=True,
    sample_size=50,
    random_state=888,
)

25425


14095


188


30219


5303


13863


3548


38449


29941


11590


22895


24251


37422


35405


2765


6316


29070


34314


8348


36854


14618


32019


24129


6828


29829


5279


35298


29670


8388


19554


35195


26536


34394


25141


27596


5272


410


37675


7697


3507


20177


41


13481


20078


36698


24554


38787


20495


34393


37420


In [None]:
# Some false positives due to the subject matter of the painting/drawing

In [42]:
## Check out Sculpture Examples

clas = 'Sculpture'

# Keep only the rows predicted as `clas`, then hand them to the image viewer.
is_selected_class = df_classified['Classification'] == clas
utils.view_images_from_url(
    df_classified[is_selected_class],
    url_col,
    subsample=True,
    sample_size=50,
    random_state=888,
)

2523


6277


32175


21157


3276


31365


11369


13411


31749


21949


14119


36838


2448


25031


1248


30423


3200


35813


36211


37393


1549


32341


38035


10092


11719


7307


612


15483


3966


35932


4284


36474


12932


19045


28488


7455


21499


38636


15661


34366


8838


23608


28255


12481


13077


5851


32466


25526


29623


21928


In [None]:
# Pretty good for Sculpture

In [43]:
## Check out no_classification Examples

clas = 'no_classification'

# Keep only the rows predicted as `clas`, then hand them to the image viewer.
is_selected_class = df_classified['Classification'] == clas
utils.view_images_from_url(
    df_classified[is_selected_class],
    url_col,
    subsample=True,
    sample_size=50,
    random_state=888,
)

16923


34731


31313


17690


27334


28103


14155


31143


1803


35809


34544


5511


9183


37454


24858


10823


29242


25893


11933


13282


30212


36943


27953


13228


18496


31836


35556


22816


17965


11078


33656


23280


9969


7166


23991


29862


6489


26469


35751


10765


12262


15534


285


27143


21232


24912


31516


9421


38716


14822


In [None]:
# Mixed bag for no_classification

In [44]:
## Check out Drawing Examples

clas = 'Drawing'

# Keep only the rows predicted as `clas`, then hand them to the image viewer.
is_selected_class = df_classified['Classification'] == clas
utils.view_images_from_url(
    df_classified[is_selected_class],
    url_col,
    subsample=True,
    sample_size=50,
    random_state=888,
)

2334


31051


16187


14694


7503


24240


10613


678


33301


3820


32331


13408


37343


17700


32520


16061


4251


34263


1671


23148


11274


28021


30034


27291


37987


30388


5561


17594


38870


25513


136


21389


13884


22126


11314


10785


1551


4110


28623


10109


38228


21808


8876


15983


494


14970


30896


28529


34734


8995


In [None]:
# Not that great for Drawing

In [45]:
## Check out Painting Examples

clas = 'Painting'

# Keep only the rows predicted as `clas`, then hand them to the image viewer.
is_selected_class = df_classified['Classification'] == clas
utils.view_images_from_url(
    df_classified[is_selected_class],
    url_col,
    subsample=True,
    sample_size=50,
    random_state=888,
)

22578


2443


23917


33300


19886


13681


36270


8705


27527


8755


993


24933


36452


5077


25546


11972


20900


7343


11228


6183


8277


24739


22528


34683


37924


5131


15148


14347


8276


28018


15590


11350


8615


8349


15727


14216


17877


18697


13639


32927


28677


22513


26705


19881


25069


35388


34758


28787


26383


29397


In [47]:
# Shape (rows, columns) of the subset whose print_keyword flag equals True.
has_print_keyword = df_classified['print_keyword'].eq(True)
df_classified.loc[has_print_keyword].shape

(546, 30)