In [11]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost.sklearn import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score

In [12]:
def labelstring(onehot: np.ndarray) -> str:
    """Render a one-hot label vector as a space-separated string of label ids.

    Label ids have the form ``l<i>``, where ``i`` is the column index of an
    active entry. The number of labels is taken from the width of ``onehot``
    instead of being fixed at 92.

    Parameters
    ----------
    onehot : np.ndarray
        Either a 1-D indicator vector, or a 2-D array whose first row is the
        indicator vector (only row 0 is used). Any dtype is accepted; non-zero
        entries count as active.

    Returns
    -------
    str
        The active label ids in ascending index order, joined by single
        spaces. Empty string when no label is active.
    """
    # Coerce to a boolean mask: a 0/1 integer array used directly as an index
    # would be interpreted as positions to pick rather than as a mask.
    first_row = np.atleast_2d(np.asarray(onehot))[0].astype(bool)
    return ' '.join(f'l{i}' for i in np.flatnonzero(first_row))

In [14]:
# Open the training archive and pull out its 'embeddings' and
# 'labels_onehot' arrays as the model inputs and targets.
train = np.load('clip_embeddings_train.npz')
X_train, y_train = train['embeddings'], train['labels_onehot']

In [22]:
# Base estimators for the voting ensemble. Every one is seeded with
# random_state=1. The first three use class_weight='balanced'; the two
# XGBoost models set scale_pos_weight instead.
clf1 = LogisticRegression(  # NB! Dual False
    dual=False,
    solver='liblinear',
    class_weight='balanced',
    random_state=1,
)
clf2 = RandomForestClassifier(
    n_estimators=50,
    class_weight='balanced',
    random_state=1,
)
# The SVC is wrapped in CalibratedClassifierCV before it joins the ensemble.
clf3 = CalibratedClassifierCV(
    SVC(C=2.0, class_weight='balanced', random_state=1)
)
clf4 = XGBClassifier(
    scale_pos_weight=90,
    max_delta_step=5,
    random_state=1,
)
clf5 = XGBClassifier(
    scale_pos_weight=80,
    random_state=1,
)

In [23]:
# Soft-voting ensemble over the five base estimators.
eclf = VotingClassifier(
    estimators=[
        ('lr', clf1),
        ('rf', clf2),
        ('svc', clf3),
        ('xgb1', clf4),
        ('xgb2', clf5),
    ],
    voting='soft',
)
# Wrap the ensemble for multi-column targets and fit it on the training set.
# The fit call is kept as the last expression so the cell still displays the
# fitted estimator.
final = MultiOutputClassifier(eclf)
final.fit(X_train, y_train)



In [25]:
# Predict on X_train itself, i.e. the same samples `final` was fitted on.
prediction_on_train = final.predict(X_train)

In [26]:
# NOTE: both numbers below are computed on the data the model was fitted on,
# so they describe fit to the training set, not performance on unseen data.
train_macro_f1 = f1_score(y_train, prediction_on_train, average="macro")
print(f'f1 score: {train_macro_f1:.3f}')

# Exact-match accuracy: the fraction of samples for which every label column
# is predicted correctly. This is the value final.score(X_train, y_train)
# returns, but derived from the predictions already in hand rather than
# running a second full prediction pass over the training set.
print(np.mean(np.all(y_train == prediction_on_train, axis=1)))

f1 score: 1.000
1.0


In [27]:
# Read the list of test images; this does not depend on the archive below.
testdf = pd.read_csv('test.csv')

# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code. Only use it on an archive from a trusted source.
test = np.load('clip_embeddings_test.npz', allow_pickle=True)
X_test, test_img_ids = test['embeddings'], test['img_id']

In [28]:
labelsdf = pd.read_csv('labels.csv')

# Fallback label ids written to the submission (values unchanged):
#   - the model predicted no label at all for an image we have an embedding for
#   - the image listed in test.csv has no embedding in the archive
EMPTY_PREDICTION_LABEL = 'l1'
MISSING_IMAGE_LABEL = 'l0'

# Map every image id to the row of its embedding. If an id occurs more than
# once the first row wins; masking with `==` would select several rows and
# flatten them into a single over-wide sample.
row_of_image = {}
for row, known_id in enumerate(test_img_ids.tolist()):
    row_of_image.setdefault(known_id, row)

# Predict once for the whole test matrix instead of once per image. This uses
# the already-loaded X_test: indexing `test['embeddings']` inside the loop
# re-reads the array from the .npz archive on every access.
# Assumes X_test has the same 2-D (samples, features) layout as X_train.
test_predictions = final.predict(X_test).astype(bool) if len(X_test) else None

testlabels = []
for img_id in testdf['image_id']:
    row = row_of_image.get(img_id)
    if row is None:
        # default label for the missing images in our test set
        print('test image', img_id, 'missing from images')
        testlabels.append(MISSING_IMAGE_LABEL)
        continue

    # Keep the (1, n_labels) boolean shape that labelstring expects.
    predicted_labels = labelstring(test_predictions[row:row + 1])
    testlabels.append(predicted_labels or EMPTY_PREDICTION_LABEL)

    print('='*40)
    print(img_id)
    print(labelsdf.loc[labelsdf.label_id.isin(testlabels[-1].split(' ')), 'object'])
    print('='*40)


testdf['labels'] = testlabels
testdf.to_csv('kea_submissions/stacking1.csv', index=False)

img102.jpg
1                  trees
5              buildings
24    town hall of tartu
39                 flags
Name: object, dtype: object
img103.jpg
0    people
1     trees
2     grass
7      road
Name: object, dtype: object
img11.jpg
1        trees
4         snow
5    buildings
Name: object, dtype: object
img113.jpg
10     nature
13    flowers
Name: object, dtype: object
img114.jpg
51    plants
Name: object, dtype: object
img121.jpg
0        people
1         trees
5     buildings
6         water
19       clouds
22         sand
34        sunny
Name: object, dtype: object
img126.jpg
0    people
1     trees
6     water
Name: object, dtype: object
img131.jpg
1    trees
Name: object, dtype: object
img136.jpg
44    glasses
Name: object, dtype: object
img137.jpg
1    trees
4     snow
Name: object, dtype: object
img139.jpg
1    trees
Name: object, dtype: object
img140.jpg
1     trees
22     sand
Name: object, dtype: object
img15.jpg
53    green garbage can
91                 pole
Name: objec