In [44]:
import pandas as pd
import numpy as np
from sklearn.multioutput import MultiOutputClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import cross_validate, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

In [3]:
def labelstring(onehot: np.ndarray) -> str:
    labels = np.array([f'l{i}' for i in range(92)])
    return ' '.join(labels[onehot[0]])

In [4]:
train = np.load('clip_embeddings_train.npz')
X_train = train['embeddings'] 
y_train = train['labels_onehot']

In [40]:
xgb = XGBClassifier(random_state=111)

pipe = Pipeline(steps=[('classifier', MultiOutputClassifier(xgb))])


In [35]:
pipe.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('xgb',
                 MultiOutputClassifier(estimator=XGBClassifier(base_score=None,
                                                               booster=None,
                                                               callbacks=None,
                                                               colsample_bylevel=None,
                                                               colsample_bynode=None,
                                                               colsample_bytree=None,
                                                               early_stopping_rounds=None,
                                                               enable_categorical=False,
                                                               eval_metric=None,
                                                               feature_types=None,
                                                               gamma=None,
                         

In [59]:
search_grid = {
    'classifier__estimator__n_estimators': [100, 200, 500],
    'classifier__estimator__learning_rate': [0.01,0.05,0.1],
    'classifier__estimator__booster': ['gbtree', 'gblinear'],
    'classifier__estimator__gamma': [0, 0.5, 1],
    'classifier__estimator__reg_alpha': [0, 0.5, 1],
    'classifier__estimator__reg_lambda': [0.5, 1, 5],
    'classifier__estimator__scale_pos_weight': [40, 60, 80, 90, 100], #default is 1, lower values than 40 were tested beforehand
    'classifier__estimator__max_delta_step': [5, 7, 10] #default is 0, lower values than 5 were tested beforehand

}

In [60]:
#clf = GridSearchCV(MultiOutputClassifier(XGBClassifier(n_jobs=-1)), params, n_jobs=-1, cv=KFold(n_splits=3), scoring='f1') #GridSearch is too slow
clf = RandomizedSearchCV(estimator=pipe, param_distributions=search_grid, scoring='f1_macro', refit='f1_macro', error_score='raise', cv=KFold(n_splits=3),
                         n_jobs=-1, verbose=1)
clf.fit(X_train, y_train)
print('Best score:', clf.best_score_)
print('Best score:', clf.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Parameters: { "gamma", "max_delta_step" } are not used.

Parameters: { "gamma", "max_delta_step" } are not used.

Parameters: { "gamma", "max_delta_step" } are not used.

Parameters: { "gamma", "max_delta_step" } are not used.

Parameters: { "gamma", "max_delta_step" } are not used.

Parameters: { "gamma", "max_delta_step" } are not used.

Parameters: { "gamma", "max_delta_step" } are not used.

Parameters: { "gamma", "max_delta_step" } are not used.

Parameters: { "gamma", "max_delta_step" } are not used.

Parameters: { "gamma", "max_delta_step" } are not used.

Parameters: { "gamma", "max_delta_step" } are not used.

Parameters: { "gamma", "max_delta_step" } are not used.

Parameters: { "gamma", "max_delta_step" } are not used.

Parameters: { "gamma", "max_delta_step" } are not used.

Parameters: { "gamma", "max_delta_step" } are not used.

Parameters: { "gamma", "max_delta_step" } are not used.

Parameters: { "gamma", "max

In [62]:
print('Best score:', clf.cv_results_)

Best score: {'mean_fit_time': array([ 62.13891244,  10.47386781, 101.21870224, 129.85644579,
       502.63166761,  96.32724778, 247.16782212,  41.09370875,
       523.7465806 , 363.54213142]), 'std_fit_time': array([  0.17350486,   1.53346057,   1.89283739,   8.60227756,
         5.11473678,  12.76480613,  17.60288055,   4.55973921,
        10.38225148, 114.45790044]), 'mean_score_time': array([0.60443568, 0.43466663, 0.66533478, 0.53394667, 0.67622073,
       0.84887513, 0.584946  , 0.50725293, 0.64711428, 0.6163427 ]), 'std_score_time': array([0.08356486, 0.04553418, 0.18219786, 0.04333244, 0.04450806,
       0.19363687, 0.02929463, 0.03107154, 0.02736006, 0.04450605]), 'param_classifier__estimator__scale_pos_weight': masked_array(data=[40, 80, 80, 100, 80, 90, 90, 90, 80, 90],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object), 'param_classifier__estimator__reg_lambda': masked_

In [63]:
prediction_on_train = clf.predict(X_train)

In [64]:
print(f'f1 score: {f1_score(y_train, prediction_on_train, average="macro"):.3f}')
print(clf.score(X_train, y_train))

f1 score: 0.652
0.651736031373183


In [65]:
test = np.load('clip_embeddings_test.npz', allow_pickle=True)
X_test = test['embeddings']
test_img_ids = test['img_id']

testdf = pd.read_csv('test.csv')

In [66]:
labelsdf = pd.read_csv('labels.csv')
testlabels = []

for img_id in testdf['image_id']:
    if img_id in test_img_ids:
        prediction = clf.predict(test['embeddings'][img_id == test_img_ids].reshape(1,-1))
        predicted_labels = labelstring(prediction.astype(bool))
        if len(predicted_labels) == 0:
            testlabels.append('l1')
        else:
            testlabels.append(predicted_labels)
        print('='*40)
        print(img_id)
        print(labelsdf.loc[labelsdf.label_id.isin(testlabels[-1].split(' ')), 'object'])
        print('='*40)
    else:
        # default label for the missing images in our test set
        print('test image', img_id, 'missing from images')
        testlabels.append('l0')


testdf['labels'] = testlabels
testdf.to_csv('kea_submissions/clip_xgboost_cv1.csv', index=False)

img102.jpg
0                                people
1                                 trees
3                              building
5                             buildings
6                                 water
7                                  road
9                                  cars
10                               nature
12                             pavement
15                           lamp posts
20                               bridge
23                               street
24                   town hall of tartu
26                        estonian flag
28    the sculpture of kissing students
32                             blue sky
39                                flags
41                    drone photography
48                             fountain
59                                 park
80                               lights
86                                tents
Name: object, dtype: object
img103.jpg
0         people
1          trees
2          grass
3       building
5 