In [1]:
import numpy as np
import os
import pandas as pd

import matplotlib.pyplot as plt
from PIL import Image

# Чтение эмбеддингов

In [2]:
# File with the precomputed embeddings (read with np.load in the next cell).
embeddings_path = 'embeds.npy'
# Directory whose entries are listed to obtain the sample ids.
data_path = 'test_data/test_data/'
# CSV with `Id` and `Category` columns for the labelled samples.
targets_path = 'class_mapping.csv'

In [3]:
# Sample ids are the directory entry names; os.listdir already returns a list.
# NOTE(review): os.listdir gives entries in arbitrary order, and the rows of
# `embeds` are paired with `labels` positionally further down -- confirm that
# the embeddings were written in this same order.
labels = os.listdir(data_path)
embeds = np.load(embeddings_path)

In [4]:
def make_train_test(labels, embeds, answers_path=None):
    """Split embeddings into a labelled (train) and an unlabelled (test) part.

    A sample is "train" when its label appears in the `Id` column of the
    answers CSV; its target is the matching `Category` value. Every other
    sample goes to "test".

    Parameters
    ----------
    labels : iterable
        Sample ids, paired positionally with `embeds`.
    embeds : iterable
        One embedding per label. `zip` is used for pairing, so surplus
        items on either side are silently dropped.
    answers_path : str or path-like, optional
        CSV with `Id` and `Category` columns. Defaults to the notebook-level
        `targets_path`.

    Returns
    -------
    tuple
        `(train, train_ids, targets, test, test_ids)`, all plain lists in
        the order the samples were encountered.
    """
    if answers_path is None:
        # Keep the original behaviour: fall back to the notebook constant.
        answers_path = targets_path
    answers = pd.read_csv(answers_path)

    # Dict lookup replaces the per-sample list scan (`in` + `.index`).
    # setdefault keeps the FIRST category for a duplicated Id, which is
    # what list.index returned.
    category_by_id = {}
    for sample_id, category in zip(answers.Id.to_list(), answers.Category.to_list()):
        category_by_id.setdefault(sample_id, category)

    train, train_ids, targets = [], [], []
    test, test_ids = [], []

    for label, embed in zip(labels, embeds):
        if label in category_by_id:
            train.append(embed)
            train_ids.append(label)
            targets.append(category_by_id[label])
        else:
            test.append(embed)
            test_ids.append(label)

    return train, train_ids, targets, test, test_ids

In [5]:
# Split the embeddings into labelled (train) and unlabelled (test) samples.
train, train_ids, targets, test, test_ids = make_train_test(labels, embeds)

In [6]:
# Turn the Python lists into arrays. The reshape forces a 2-D matrix with
# 512 columns, so an empty list still yields shape (0, 512).
train = np.array(train).reshape((-1, 512))
test = np.array(test).reshape((-1, 512))
# Arrays are needed later for fancy/boolean indexing of labels and ids.
targets = np.array(targets)
test_ids = np.array(test_ids)

## Submit

In [7]:
def make_submission(train, test, targets, preds):
    """Assemble the submission table from known and predicted categories.

    Despite their names, `train` and `test` hold sample ids, not embeddings.

    Parameters
    ----------
    train : iterable
        Ids of the labelled samples, paired positionally with `targets`.
    test : iterable
        Ids of the unlabelled samples, paired positionally with `preds`.
    targets : iterable
        Known categories of the labelled samples.
    preds : iterable
        Predicted categories of the unlabelled samples.

    Returns
    -------
    pandas.DataFrame
        Columns `Id` and `random`; labelled rows first, then predicted rows.
        Pairing uses `zip`, so a longer side is truncated to the shorter one.
    """
    labelled_rows = list(zip(train, targets))
    predicted_rows = list(zip(test, preds))
    return pd.DataFrame(labelled_rows + predicted_rows, columns=['Id', 'random'])

# Модели

## Просто Knn

In [8]:
from sklearn.neighbors import NearestNeighbors

In [9]:
# 1-nearest-neighbour index over the labelled embeddings, Euclidean metric.
neigh = NearestNeighbors(n_neighbors=1, metric='l2')
# NearestNeighbors is unsupervised: the `targets` argument is accepted by
# fit() but not used. Labels are looked up separately via the neighbour index.
neigh.fit(train, targets)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='l2',
                 metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                 radius=1.0)

In [10]:
# For every test embedding, the row index (into `train`) of its single
# nearest labelled sample; flatten() drops the n_neighbors=1 axis.
preds_idx = neigh.kneighbors(test, return_distance=False).flatten()

In [11]:
# Each test sample takes the category of its nearest labelled neighbour.
preds = targets[preds_idx]

In [12]:
# One table with the known train categories followed by the test predictions.
df = make_submission(train_ids, test_ids, targets, preds)

## Финальный результат:

In [13]:
# Write the 1-NN (l2) submission; index=False omits the DataFrame index column.
df.to_csv('simple_knn_l2.csv', index=False)

## Добавляем трейн (не сработало, очень плохо!)

In [14]:
# Rebinds `neigh` to a 1-nearest-neighbour index that uses cosine distance.
neigh = NearestNeighbors(n_neighbors=1, metric='cosine')
# `targets` is accepted by fit() but not used by NearestNeighbors.
neigh.fit(train, targets)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                 radius=1.0)

In [15]:
# Nearest labelled neighbour per test sample, this time together with the
# cosine distance to it (used below as a confidence measure).
distance, preds_idx = neigh.kneighbors(test, return_distance=True)

In [16]:
# kneighbors returns one column per neighbour; with n_neighbors=1 the
# arrays can be flattened to one value per test sample.
distance = distance.flatten()
preds_idx = preds_idx.flatten()

percentiles = [10, 20, 50, 80, 90]

# One plot per distinct nearest-neighbour index: distance percentiles of the
# test samples assigned to that neighbour.
# NOTE(review): the groups are keyed by the train-sample index, whereas the
# following cell groups by class (`targets[preds_idx]`) -- confirm which
# grouping was intended here.
for idx in set(preds_idx):
    # The mask does not depend on the percentile, so build it once per group.
    mask = preds_idx == idx
    # np.percentile accepts a sequence of percentiles and returns them all.
    thresholds = np.percentile(distance[mask], percentiles)
    plt.title(f'{idx}')
    plt.plot(percentiles, thresholds)
    plt.show()

In [17]:
# Cosine-distance cutoff below which a test prediction counts as confident.
# The earlier per-class percentile threshold was always overwritten by this
# fixed value, so only the fixed value is kept.
threshold = 0.1

# Confident test samples, to be appended to the training set with their
# predicted class as the target.
additional_train_path = []
additional_train = []
additional_targets = []

# Remaining test samples that still need a prediction.
new_test_ids = []
new_test = []

# Predicted class of every test sample (category of its nearest neighbour);
# computed once instead of on every loop iteration.
predicted_classes = targets[preds_idx]

for idx in set(predicted_classes):
    mask = predicted_classes == idx
    confident = distance[mask] <= threshold

    additional_train_path.extend(test_ids[mask][confident])
    additional_train.extend(test[mask][confident])
    # One target entry per confident sample of this class.
    additional_targets.extend([idx] * int(confident.sum()))

    new_test_ids.extend(test_ids[mask][~confident])
    new_test.extend(test[mask][~confident])

In [18]:
# Extend the labelled set with the confident test samples.
# The explicit reshape/dtype keep the concatenation valid when nothing was
# confident: a bare np.array([]) is 1-D float64, which cannot be stacked onto
# the 2-D train matrix and would silently turn the targets into floats.
additional_train_arr = np.array(additional_train).reshape((-1, train.shape[1]))
new_train = np.concatenate((train, additional_train_arr))
new_train_ids = np.array(list(train_ids) + list(additional_train_path))
new_targets = np.concatenate(
    (targets, np.array(additional_targets, dtype=targets.dtype))
)

In [19]:
from sklearn.neighbors import KNeighborsClassifier

In [20]:
# 7-nearest-neighbour classifier (Euclidean metric) trained on the original
# labelled samples plus the confident pseudo-labelled test samples.
knn = KNeighborsClassifier(n_neighbors=7, metric='l2')
knn.fit(new_train, new_targets)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='l2',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')

In [None]:
# NOTE(review): KNeighborsClassifier.predict returns class labels, not
# neighbour indices, so the `_idx` suffix of this name is misleading.
new_preds_idx = knn.predict(new_test)

In [None]:
# knn.predict() already returns class labels. Using them as indices into
# new_targets (as was done before) maps each label to an unrelated sample's
# category, or raises for non-integer labels, so take them as-is.
new_preds = new_preds_idx

In [None]:
# Submission table: extended training set (known + pseudo-labels) followed by
# the predictions for the remaining test samples. Rebinds `df`.
df = make_submission(new_train_ids, new_test_ids, new_targets, new_preds)

In [None]:
# Write the pseudo-labelling submission without the DataFrame index column.
df.to_csv('clever_submission.csv', index=False)
# this turned out worst of all