## Feature Pipeline

In [1]:
import pandas as pd
import numpy as np

import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from feature_engineering import functions
from utils import utils
from sklearn.pipeline import Pipeline

In [2]:
# Import data: read the Picasso / Christie's test CSV into a DataFrame.
test_csv_path = './data/data-1589261181890-picasso-christies-test.csv'
df_picasso_test = pd.read_csv(test_csv_path)

In [3]:
# Column-name configuration for the listings frame.
# Medium and dimensions are both read from the same 'details' column.
url_col = 'image_url'
title_desc_col = 'title'
dimension_col = 'details'
medium_col = 'details'


In [4]:
# Run the text-based feature transformer over the configured medium,
# dimensions and title columns.
df_transformed_1 = functions.text_based_transformer(
    df_picasso_test,
    medium=medium_col,
    dimensions=dimension_col,
    title=title_desc_col,
)

# presumably adds the 'is_drawing' column seen in df.columns — confirm in functions
df_transformed_2 = functions.is_drawing(df_transformed_1)

# Use the configured column name instead of a hard-coded 'details' so the
# column-name cell stays the single place to adapt to a different dataset.
# NOTE(review): assumes the medium text is the intended input; medium_col and
# dimension_col both point at 'details' today, so the result is unchanged.
df_transformed_2['long_list_of_things'] = df_transformed_2[medium_col].apply(
    lambda details_text: functions.long_list_of_things(str(details_text))
)

# `df` is the frame used by the rest of the notebook.
df = df_transformed_2



In [5]:
# Inspect the columns available on the transformed frame.
df.columns

Index(['index', 'sale_date', 'price', 'estimate_min', 'estimate_max',
       'currency', 'artist', 'title', 'size', 'details', 'image_url',
       'timestamp', 'auction_id', 'filename_id', 'filepath', 'filename',
       'long_list_of_things', 'installation_keyword', 'print_keyword',
       'drawing_paper_word', 'drawing_keyword', 'painting_keyword',
       'photo_keyword', 'has_3dimensions', 'furniture_keyword',
       'decorative_obj_keyword', 'is_drawing'],
      dtype='object')

In [6]:
# Display the listing images referenced by url_col for a visual sanity check.
# NOTE(review): the rendered output lists 20 out-of-order row labels, so the
# default presumably shows a random subsample — confirm in utils.
utils.view_images_from_url(df, url_col)

26


86


2


55


75


93


16


73


54


95


53


92


78


13


7


30


22


24


33


8


In [7]:
# Load model: unpickle the saved model bundle from ./models/.
# NOTE: pickle.load can execute arbitrary code — only load trusted artifacts.
model_filename = "model_1_DecisionTreeClassifier.pkl"
model_path = './models/' + model_filename

with open(model_path, 'rb') as file:
    model_file = pickle.load(file)

In [8]:
# Check model file: display the unpickled bundle. The cells below read its
# 'model', 'features' and 'labels' keys.
model_file

{'model': DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=9, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=10,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=None, splitter='best'),
 'features': ['long_list_of_things',
  'installation_keyword',
  'print_keyword',
  'drawing_paper_word',
  'drawing_keyword',
  'painting_keyword',
  'photo_keyword',
  'has_3dimensions',
  'furniture_keyword',
  'decorative_obj_keyword',
  'is_drawing'],
 'labels': ['Drawing',
  'Installation',
  'Painting',
  'Photograph',
  'Print',
  'Sculpture',
  'decorative_object',
  'furniture']}

In [9]:
# Unpack the bundle: the fitted estimator, the feature columns it is given,
# and the label names used as prediction column headers.
model = model_file['model']
features = model_file['features']
labels = model_file['labels']

# Check all Model Features Exist
# Fail with an explicit message naming the missing columns rather than
# relying on the KeyError raised by the display expression below.
missing_features = [col for col in features if col not in df.columns]
assert not missing_features, (
    "df is missing model features: {}".format(missing_features)
)

df[features]

Unnamed: 0,long_list_of_things,installation_keyword,print_keyword,drawing_paper_word,drawing_keyword,painting_keyword,photo_keyword,has_3dimensions,furniture_keyword,decorative_obj_keyword,is_drawing
0,True,False,False,True,True,False,False,False,False,False,True
1,True,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,True,False,False,False,False,False
3,True,False,False,True,True,True,False,True,False,False,True
4,True,False,False,True,True,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
95,True,False,False,True,True,True,False,False,False,False,True
96,True,False,False,True,True,True,False,False,False,False,True
97,True,False,False,True,True,False,False,False,False,False,True
98,True,False,False,False,False,False,False,True,False,False,False


In [28]:
# Get Test Results
# Predict on the model's feature columns, then wrap the raw output in a frame
# with one column per label, indexed like df.
results = model.predict(df[features])
y_pred = pd.DataFrame(results, index=df.index, columns=labels)

In [31]:
# Flag rows for which none of the model's labels was predicted.
# Only the `labels` columns are summed: summing the whole row would include
# 'no_classification' itself once it exists, so re-running this cell would
# flip the flags. The vectorized sum also avoids a Python call per row.
y_pred['no_classification'] = (y_pred[labels].sum(axis=1) == 0).astype(int)

In [32]:
# Display the prediction frame: label columns plus 'no_classification'.
y_pred

Unnamed: 0,Drawing,Installation,Painting,Photograph,Print,Sculpture,decorative_object,furniture,no_classification
0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
95,1,0,0,0,0,0,0,0,0
96,1,0,0,0,0,0,0,0,0
97,1,0,0,0,0,0,0,0,0
98,0,0,0,0,0,1,0,0,0


In [41]:
# Collapse the one-hot prediction frame into label names.
# stack() yields a Series indexed by (row label, column name); keeping the
# non-zero entries leaves one entry per predicted label.
x = y_pred.stack()
predicted = x[x != 0]

# Keep the original row labels as the index. A fresh 0..n-1 index lines up
# with df only if every row has exactly one non-zero column; a row with
# several predicted labels would shift every later entry.
results = pd.Series(
    pd.Categorical(predicted.index.get_level_values(1)),
    index=predicted.index.get_level_values(0),
)

# Positional peek at the tail of the results.
results.iloc[60:]

60    no_classification
61            Sculpture
62             Painting
63              Drawing
64             Painting
65              Drawing
66            Sculpture
67            Sculpture
68              Drawing
69    no_classification
70            Sculpture
71              Drawing
72         Installation
73    no_classification
74            Sculpture
75             Painting
76             Painting
77              Drawing
78    no_classification
79             Painting
80            Sculpture
81         Installation
82    no_classification
83    no_classification
84             Painting
85            Sculpture
86              Drawing
87            Sculpture
88    no_classification
89                Print
90              Drawing
91    no_classification
92              Drawing
93            Sculpture
94    no_classification
95              Drawing
96              Drawing
97              Drawing
98            Sculpture
99              Drawing
dtype: category
Categories (7, object): 

In [34]:
# Display the images again with subsample=False for the manual assessment.
# The rendered output lists row labels 0-99 in order, so this presumably
# shows every row — confirm in utils.
utils.view_images_from_url(df, url_col, subsample=False)

0


1


2


3


4


5


6


7


8


9


10


11


12


13


14


15


16


17


18


19


20


21


22


23


24


25


26


27


28


29


30


31


32


33


34


35


36


37


38


39


40


41


42


43


44


45


46


47


48


49


50


51


52


53


54


55


56


57


58


59


60


61


62


63


64


65


66


67


68


69


70


71


72


73


74


75


76


77


78


79


80


81


82


83


84


85


86


87


88


89


90


91


92


93


94


95


96


97


98


99


## Manual Assessment

Accuracy: 0.73

Drawings are mislabeled as installations (likely due to the long-list-of-items logic).
The oval-style dishes and the vase-like sculptures are sometimes not classified as sculpture/decorative object.
There's an edge case where a painted wooden sculpture was classified as a painting.

Feedback:
* Need to add 'plate', 'plaque', 'pitcher' as decorative object
* Need to add 'bronze',  'patina', 'earthenware' for sculpture
* Need to add 'Height' for 3D


In [46]:
# Print each row label followed by its raw 'details' text and two blank
# lines, for manual review.
for row_label, details_text in df['details'].items():
    print(row_label, details_text, "", "", sep="\n")

0
["signed Picasso (upper right) \r","pen, brush and brown ink on paper \r","7 3/8 x 3\u00bd in. (18.7 x 8.9 cm.) ",""]


1
["stamped Madoura Plein Feu/Empreinte Originale de Picasso (underneath) \r","partially glazed ceramic plate \r","Length: 20 in. (51 cm.) \r","Conceived on 5 June 1952 and executed in an edition of 250 "]


2
["signed Picasso (lower left); dated 3.7.69 (upper left) \r","oil on corrugated cardboard \r","27\u00be x 19\u00bd in. (70.5 x 49.5 cm.) \r","Painted on 3 July 1969 "]


3
["with the signature Picasso (lower left) and dated 20.9.72 (upper left; recto); dated 21.9.72. 22.24.24 (lower left; verso)\r","felt-tip pen on paper (recto); watercolour, wash and brush and India ink on paper (verso)\r","23\u00bd x 31\u00bd in. (59.7 x 79.7 cm.) \r","Executed between the 20 and 21 September 1972 "]


4
["dated and numbered 8.11.71.IX (upper left) \r","black Cont\u00e9 crayon, colored felt-tip pens and brush and gray wash on paper\r","12 1/8 x 9 5/8 in. (31 x 24.4 cm.) \r",