## Feature Pipeline

In [2]:
import pandas as pd
import numpy as np

import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from feature_engineering import functions
from utils import utils
from sklearn.pipeline import Pipeline

In [3]:
# Read the random Christie's test extract (named as such in the file name) into a DataFrame
df_random_test = pd.read_csv(
    './data/data-1589268390310-random-christies-test.csv'
)

In [4]:
# Column-name configuration used by the cells below.
# The medium and the dimensions are both read from the same 'details' column.
title_desc_col, url_col = 'title', 'image_url'
medium_col = dimension_col = 'details'


In [6]:
# Step 1: run the text-based transformer, telling it which columns hold the
# medium, the dimensions and the title.
df_transformed_1 = functions.text_based_transformer(
    df_random_test,
    medium=medium_col,
    dimensions=dimension_col,
    title=title_desc_col,
)

# Step 2: apply the is_drawing helper to the result of step 1.
# NOTE(review): presumably this adds the 'is_drawing' column listed in the
# df.columns output further down — confirm in feature_engineering.functions.
df_transformed_2 = functions.is_drawing(df_transformed_1)

# Step 3: every 'details' value is converted to a string and then passed to
# the long_list_of_things helper; the result is stored as a new column.
df_transformed_2['long_list_of_things'] = (
    df_transformed_2['details'].map(str).apply(functions.long_list_of_things)
)

# `df` is another name for the same frame object, not a copy.
df = df_transformed_2



In [7]:
# Inspect which columns are available after the feature transformations above
df.columns

Index(['index', 'sale_date', 'price', 'estimate_min', 'estimate_max',
       'currency', 'artist', 'title', 'size', 'details', 'image_url',
       'timestamp', 'auction_id', 'filename_id', 'filepath', 'filename',
       'long_list_of_things', 'installation_keyword', 'print_keyword',
       'drawing_paper_word', 'drawing_keyword', 'painting_keyword',
       'photo_keyword', 'has_3dimensions', 'furniture_keyword',
       'decorative_obj_keyword', 'is_drawing'],
      dtype='object')

In [8]:
# Display the images referenced by the URL column for a visual check.
# NOTE(review): the numbers printed below look like an unordered subset of row
# indices, so the default presumably subsamples — confirm in utils.view_images_from_url.
utils.view_images_from_url(df, url_col)

26


86


2


55


75


93


16


73


54


95


53


92


78


13


7


30


22


24


33


8


In [9]:
# Load model
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so only point this at model files from a trusted source.
model_filename = "model_1_DecisionTreeClassifier.pkl"

with open('./models/' + model_filename, mode='rb') as pickled_model:
    model_file = pickle.load(pickled_model)

In [10]:
# Check model file: per the output below it is a dict holding the estimator
# under 'model', the input column names under 'features' and the output class
# names under 'labels'.
model_file

{'model': DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=9, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=10,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=None, splitter='best'),
 'features': ['long_list_of_things',
  'installation_keyword',
  'print_keyword',
  'drawing_paper_word',
  'drawing_keyword',
  'painting_keyword',
  'photo_keyword',
  'has_3dimensions',
  'furniture_keyword',
  'decorative_obj_keyword',
  'is_drawing'],
 'labels': ['Drawing',
  'Installation',
  'Painting',
  'Photograph',
  'Print',
  'Sculpture',
  'decorative_object',
  'furniture']}

In [11]:
# Unpack the pickled bundle: the estimator, the input columns passed to it at
# prediction time, and the names of its output classes.
model = model_file['model']
features = model_file['features']
labels = model_file['labels']

# Check all Model Features Exist
# Fail early and name the missing columns explicitly, rather than relying on
# the KeyError raised as a side effect of the selection below.
missing_features = [col for col in features if col not in df.columns]
if missing_features:
    raise KeyError('Model features missing from df: {}'.format(missing_features))

# Display the feature matrix that will be passed to the model.
df[features]

Unnamed: 0,long_list_of_things,installation_keyword,print_keyword,drawing_paper_word,drawing_keyword,painting_keyword,photo_keyword,has_3dimensions,furniture_keyword,decorative_obj_keyword,is_drawing
0,True,False,False,False,False,True,False,False,False,False,False
1,False,False,False,False,False,True,False,False,False,False,False
2,True,False,False,True,True,False,False,False,False,False,True
3,True,False,False,False,True,True,False,False,False,False,True
4,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
95,True,False,False,False,True,True,False,False,False,False,True
96,True,False,False,False,True,True,False,False,False,False,True
97,False,False,False,False,False,False,False,False,False,False,False
98,True,False,False,True,True,False,False,False,False,False,True


In [12]:
# Get Test Results
# Run the model on the feature columns, then wrap the raw prediction array in
# a frame with one column per label, aligned to the rows of `df`.
results = model.predict(df[features])
y_pred = pd.DataFrame(data=results, index=df.index, columns=labels)

In [13]:
# Flag rows for which the model predicted none of the labels.
# Only the label columns are summed: summing the whole row would include
# 'no_classification' itself once it exists, so re-running this cell would
# reset every flagged row back to 0.
y_pred['no_classification'] = (y_pred[labels].sum(axis=1) == 0).astype(int)

In [14]:
# Inspect the one-hot prediction frame, including the 'no_classification' column
y_pred

Unnamed: 0,Drawing,Installation,Painting,Photograph,Print,Sculpture,decorative_object,furniture,no_classification
0,0,0,1,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
95,1,0,0,0,0,0,0,0,0
96,1,0,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,0,1
98,1,0,0,0,0,0,0,0,0


In [18]:
# Collapse the one-hot prediction frame into label names.
# stack() yields a Series indexed by (row index, column name); keeping only the
# non-zero entries leaves the positive label(s) of each row.
x = y_pred.stack()
positive_entries = x[x != 0]

# Keep the original row index rather than a fresh 0..n-1 one, so that each
# label stays attached to its row in `df` even if a row has several positive
# labels (or none).
results = pd.Series(
    pd.Categorical(positive_entries.index.get_level_values(1)),
    index=positive_entries.index.get_level_values(0),
)

# Show the entries from position 60 onwards.
results.iloc[60:]

60    no_classification
61            Sculpture
62            Sculpture
63         Installation
64         Installation
65    no_classification
66            Sculpture
67            Sculpture
68    no_classification
69    no_classification
70    no_classification
71            Sculpture
72    no_classification
73            Sculpture
74    no_classification
75    no_classification
76    decorative_object
77            Sculpture
78            Sculpture
79            Sculpture
80            Sculpture
81    no_classification
82            Sculpture
83            Sculpture
84    no_classification
85            Sculpture
86             Painting
87    no_classification
88              Drawing
89    no_classification
90              Drawing
91         Installation
92            Sculpture
93    no_classification
94                Print
95              Drawing
96              Drawing
97    no_classification
98              Drawing
99            Sculpture
dtype: category
Categories (8, object): 

In [17]:
# Show the images again with subsample=False; the output below lists the
# indices in order starting from 0, so the pictures can be compared against
# the predicted labels for the manual assessment.
utils.view_images_from_url(df, url_col, subsample=False)

0


1


2


3


4


5


6


7


8


9


10


11


12


13


14


15


16


17


18


19


20


21


22


23


24


25


26


27


28


29


30


31


32


33


34


35


36


37


38


39


40


41


42


43


44


45


46


47


48


49


50


51


52


53


54


55


56


57


58


59


60


61


62


63


64


65


66


67


68


69


70


71


72


73


74


75


76


77


78


79


80


81


82


83


84


85


86


87


88


89


90


91


92


93


94


95


96


97


98


99


## Manual Assessment

Accuracy: 0.64

Many bronze sculptures are not classified as sculpture/decorative object.
Drawing misclassified as installation
When there's a little depth to the painting it can be classed as 3D
There's an edge case where a painted wooden sculpture was classified as a painting.

Feedback:
* Need to add 'terracotta' to decorative object


In [19]:
# Print each row's index followed by its raw 'details' text, with two blank
# lines after every entry, to read alongside the manual assessment.
for row_label, details_text in df['details'].items():
    print(row_label, details_text, "", "", sep="\n")

0
["signed and dated Wifredo Lam, 1970 (faintly on the lower right and again on the reverse)\r","oil on canvas\r","18 x 14 7/8 in. (45.7 x 37.7 cm.)\r","Painted in 1970."]


1
["signed and dated Sander 1964 (on the reverse) \r","oil on canvas \r","60 x 54\u00bc in. (152.4 x 137.7 cm.) \r","Painted in 1964. "]


2
["signed in Arabic, signed M.Sabry (lower left) \r","pastel and charcoal on paper \r","17 3/8 x 25 1/8in. (44 x 64cm.) \r","Executed circa late 1950s "]


3
["signed, titled in Japanese and dated YAYOI KUSAMA 1980 (on the reverse); signed and dated Yayoi Kusama 1980 (on the stretcher)\r","acrylic, graphite and fabric collage on canvas \r","51 3/8 x 64in. (130.5 x 162.6cm.) \r","Executed in 1980 "]


4
["c-print on Plexiglas \r","86 x 57\u00bc in. (218.4 x 145.4 cm.) \r","Executed in 2012. This work is number two of two artists proofs aside from an edition of three. ",""]


5
["signed, titled and dated Christo 2002 The Gates (Project for Central Park, New York City) (lower left