In [115]:
import pandas as pd
import numpy as np
import sklearn.metrics as sm
import joblib
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score

In [116]:
# Load the complete context-collection export (comma-separated).
df = pd.read_csv('rc_contextCollection_completeSet.csv', sep=',')

In [117]:
# Keep only the rows whose artifact type is 'ceramic'.
# `.copy()` makes the result an independent frame, so the column assignments
# performed on `df_ceramic` afterwards do not trigger SettingWithCopyWarning.
df_ceramic = df[df['artifact'] == 'ceramic'].copy()

In [118]:
# Coerce the preserved-dimension columns to numbers; entries that cannot be
# parsed become NaN (errors='coerce').
# `assign` builds a new frame instead of writing into a filtered slice, so no
# SettingWithCopyWarning is raised and the cell no longer needs `%%capture`
# to hide it.
dimension_cols = ['pres_length', 'pres_width', 'pres_thick', 'pres_diam']
df_ceramic = df_ceramic.assign(
    **{col: pd.to_numeric(df_ceramic[col], errors='coerce') for col in dimension_cols}
)

In [119]:
# Categorical descriptors of a sherd (label-encoded later on).
categ_cols = ['classification', 'form', 'portion', 'fabric_color']

# Numeric measurements and find coordinates.
num_cols = ['pres_length', 'pres_width', 'pres_thick', 'pres_diam', 'latitude', 'longitude']

# Every column used by the model: categoricals first, then numerics.
cols = [*categ_cols, *num_cols]

In [120]:
# Restrict the ceramic records to the modelling columns only.
df2 = df_ceramic.loc[:, cols]

In [150]:
# Fill every missing value with the most frequent value of its column
# (the same strategy is applied to categorical and numeric columns alike).
imp = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imputed_values = imp.fit_transform(df2)

# Re-wrap the raw array so column names and the original row index survive.
df3 = pd.DataFrame(data=imputed_values, index=df2.index, columns=df2.columns)

In [159]:
# Encode all categorical columns with ONE shared LabelEncoder: the values of
# every column are flattened into a single sequence, encoded together, and
# reshaped back, so a given string receives the same code in any column.
df4 = df3[categ_cols]
dt = LabelEncoder()
encoded_values = dt.fit_transform(df4.to_numpy().flatten()).reshape(df4.shape)
df5 = pd.DataFrame(data=encoded_values, columns=df4.columns).reset_index(drop=True)

# Numeric columns, re-indexed from 0 so they line up positionally with df5.
df6 = df3[num_cols].reset_index(drop=True)

In [153]:
# Place encoded categoricals and numeric columns side by side, then cast
# the whole table to float.
bigdata = pd.concat([df5, df6], axis='columns').astype(float)

In [154]:
# Features: every modelling column except `form`.
feature_cols = ['classification', 'portion', 'fabric_color', 'pres_length', 'pres_width',
                'pres_thick', 'pres_diam', 'latitude', 'longitude']
X = bigdata.loc[:, feature_cols]

# Target: the encoded `form` column, kept as a one-column frame.
y = bigdata['form'].to_frame()

In [125]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [126]:
# Hyperparameters intended for the XGBoost classifier.
params = dict(
    n_estimators=438,
    max_depth=18,
    reg_alpha=2,
    reg_lambda=0,
    min_child_weight=2,
    gamma=0,
    learning_rate=0.049135897324318896,
    colsample_bytree=0.45999999999999996,
)

In [127]:
%%capture
model = XGBClassifier(params)
r = model.fit(X_train, y_train)

In [128]:
%%capture
# Predict the encoded `form` label for every held-out row.
# `r` is the object returned by `model.fit(...)`.
y_pred = r.predict(X_test)

In [129]:
# Share of held-out rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7290322580645161

In [130]:
# Target file for the serialized classifier.
filename = "pottery_classifier.joblib.pkl"

In [187]:
# Persist the trained model with maximum compression; the returned list of
# written paths is bound to `_` so the cell shows no output.
_ = joblib.dump(value=model, filename=filename, compress=9)

In [188]:
# Distinct raw `classification` values of the ceramic records (missing values, if any, included).
classification_lst = list(df_ceramic['classification'].unique())
classification_lst

['coarseware',
 'cookware',
 'fineware',
 'commonware',
 'pithos',
 'amphora',
 'roof tile',
 'loomweight',
 'game marker',
 'lamp',
 'statuette',
 'water pipe',
 'unguentarium',
 'lantern',
 'beehive',
 'bone',
 'kiln lining',
 'tile',
 'kiln debris',
 'lithic',
 'glass',
 'hypocaust tile',
 'plaster']

In [189]:
# Distinct raw `portion` values of the ceramic records (missing values included).
p_lst = list(df_ceramic['portion'].unique())
p_lst

['handle',
 'bodysherd',
 'rim',
 'base',
 'profile',
 'shoulder',
 'ringfoot',
 'toe',
 'handle attachment',
 nan,
 'tile',
 'rim and handle',
 'rim fragment',
 'stamped fragment',
 'vertical neck',
 'neck',
 'square cube',
 'trefoil rim',
 'flaring rim',
 'strap handle',
 'partial',
 'hooked rim',
 'foot',
 'rim with profile',
 'incurved rim',
 'ring base',
 'nozzle',
 'false ring foot',
 'rim and handle attachment',
 'beehive',
 'neck with handle',
 'flat base',
 'bodysherd with bowing profile',
 'pitcher fragments',
 'base fragment',
 'rim and profile',
 'bodysherd with profile',
 'toe fragment',
 'handle frag',
 'rim and neck',
 'neck with handle attachment',
 'shoulder bodysherd',
 'horizontal handle',
 'toe and rim',
 'nozzle fragment',
 'rim with handle attach',
 'stamped',
 'several bases',
 'complete',
 'corner',
 'unknown',
 'multiple portions',
 'lug handle',
 'offset shoulder',
 'almond shaped rim',
 'handle with profile',
 'lip',
 'bodysherd handle / rim',
 'rim, base, bo

In [194]:
# Distinct raw `fabric_color` values of the ceramic records (missing values included).
f_color_lst = list(df_ceramic['fabric_color'].unique())
f_color_lst

[nan,
 'looks CS',
 'local orange',
 'CS fabric',
 'no slip',
 'very fine',
 'ringfoot',
 'local fabric',
 'whiteware',
 'orange',
 'hard gritty local',
 'small',
 'fabric looks CS',
 'dark red fabric',
 'red',
 'darb red',
 'tan',
 'brown',
 'pink',
 'dark brown',
 'pink orange',
 'orange/red',
 'orange brown',
 'red brown',
 'black',
 'light brown',
 'tan blslip',
 'white',
 'pink/blslip',
 'brown/grey',
 'grey',
 'orange/grey',
 'pinkish tan',
 'pink tan',
 'pink grey',
 'yellow white',
 'grey/pink',
 'orange/tan',
 'tan orange',
 'local tan',
 'local pink',
 'local brown',
 'exotic',
 'local yellow/white',
 'cream',
 'clear',
 'light orange',
 'flattop',
 'dark',
 'orange CS fabric',
 'gray gritty fabric',
 'pitched local fab',
 'unknown fab',
 'local orange fabric',
 'imported fabric',
 'rectangular',
 'flat everted rim',
 'flatrim',
 'horizontal everted rim',
 'grooved rim',
 'hellenistic black slip?',
 'imitation pergamene form?',
 'folded groove rim',
 'imported',
 'West Cilici

Write the mappings to a file for the program

In [202]:
def _without_nan(values):
    """Return the items of ``values`` as a list, skipping any whose string form is 'nan'."""
    return [item for item in values if str(item) != 'nan']


# Option lists with missing-value placeholders removed.
clf_lst = _without_nan(classification_lst)
portion_lst = _without_nan(p_lst)
fabric_color_lst = _without_nan(f_color_lst)
names_all = _without_nan(dt.classes_)

In [203]:
def _to_builtin(value):
    """Return the plain-Python equivalent of a NumPy scalar; pass anything else through.

    ``repr`` of NumPy scalars can render as ``np.int64(3)`` / ``np.str_('x')``,
    which is not importable from a generated module that never imports NumPy.
    """
    return value.item() if isinstance(value, np.generic) else value


# Label -> integer code for every class known to the shared encoder `dt`.
name_mapping = dict(zip(dt.classes_, dt.transform(dt.classes_)))

# (variable name, value) pairs emitted as `name = <repr>` lines, in this order.
# Values are converted to built-in types only for writing; the in-memory
# objects are left as they were.
ctg_exports = [
    ('name_mapping', {_to_builtin(k): _to_builtin(v) for k, v in name_mapping.items()}),
    ('names_all', [_to_builtin(x) for x in names_all]),
    ('clf_lst', [_to_builtin(x) for x in clf_lst]),
    ('portion_lst', [_to_builtin(x) for x in portion_lst]),
    ('fabric_color_lst', [_to_builtin(x) for x in fabric_color_lst]),
]

# Raw string: the backslashes of the Windows path are taken literally instead
# of relying on `\P`, `\p` and `\c` being unrecognized escape sequences.
# UTF-8 is written explicitly because the output is a Python source file.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
with open(r"C:\Projects\pottery\ctglists.py", "w", encoding="utf-8") as dict_file:
    # Same layout as before: one assignment per line, no trailing newline.
    dict_file.write('\n'.join(f'{name} = {value!r}' for name, value in ctg_exports))

In [135]:
%%capture
# NOTE(review): this cell holds only a disabled copy of the test-set
# prediction call; running it does nothing.
#y_pred = r.predict(X_test)

In [136]:
# Show the held-out feature rows produced by the train/test split.
X_test

Unnamed: 0,classification,portion,fabric_color,pres_length,pres_width,pres_thick,pres_diam,latitude,longitude
8095,35.0,84.0,365.0,8.0,2.0,1.0,10.0,36.177385,32.394768
31,134.0,84.0,426.0,8.0,2.0,1.0,10.0,36.221134,32.436667
1509,35.0,261.0,697.0,8.0,2.0,1.0,10.0,36.223264,32.456546
2980,35.0,84.0,426.0,8.0,2.0,1.0,10.0,36.168415,32.403799
5926,141.0,84.0,426.0,8.0,2.0,1.0,10.0,36.455807,32.447161
...,...,...,...,...,...,...,...,...,...
5685,141.0,551.0,426.0,12.0,2.0,2.0,10.0,36.427175,32.410385
1515,197.0,84.0,349.0,8.0,2.0,1.0,10.0,36.221221,32.465339
7477,197.0,508.0,426.0,8.0,2.0,1.0,10.0,36.175328,32.382141
1070,197.0,84.0,426.0,8.0,2.0,1.0,10.0,36.190525,32.461607


In [137]:
# Hand-built single observation with the same feature columns as X,
# used to try the model on one row.
# NOTE(review): this rebinds `df`, which earlier in the notebook holds the
# frame loaded from the CSV; re-running earlier cells afterwards would see
# this one-row frame instead.
sample_values = {
    'classification': 35.0,
    'portion': 84.0,
    'fabric_color': 365.0,
    'pres_length': 8.0,
    'pres_width': 2.0,
    'pres_thick': 1.0,
    'pres_diam': 10.0,
    'latitude': 36.177385,
    'longitude': 32.394768,
}
df = pd.DataFrame([list(sample_values.values())], columns=list(sample_values))

In [138]:
# Show the one-row sample frame built in the previous cell.
df

Unnamed: 0,classification,portion,fabric_color,pres_length,pres_width,pres_thick,pres_diam,latitude,longitude
0,35.0,84.0,365.0,8.0,2.0,1.0,10.0,36.177385,32.394768


In [139]:
# Predict the encoded label for the single sample row.
# NOTE(review): this overwrites `y_pred`, which previously held the test-set predictions.
y_pred = r.predict(df)

In [140]:
# Show the prediction for the sample row (an encoded label, not a category name).
y_pred

array([35.])