In [1]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

from label_to_cat import LABEL_TO_CAT

# Number of tables every prediction store must contain; checked with an
# assert right after each store is opened.
BATCHES_COUNT = 12091
# Reference row count for the final report: the number of aggregated rows is
# compared against it to count duplicated product ids.
TEST_SIZE = 1768182

# Prediction stores that get combined.  ORDER MATTERS: the aggregation loop
# reads ten consecutive rows per step from the first two stores and a single
# row per step from the remaining ones.
# NOTE(review): presumably the first two files hold per-crop scores and the
# last two hold scores already summed per image -- confirm against the code
# that produced these files.
PATHS = [
    '../input/densenet201_test.h5',
    '../input/predict_probs_resnet50.h5',
    '../input/inc3_test_xDDD.h5',
    '../input/se_inc_test.h5'
]

In [2]:
%%time
# Open every prediction store and make sure it holds the expected number of
# tables.  The stores are only ever read, so they are opened with mode='r':
# pandas' default mode ('a') opens the file writable and silently creates an
# empty file when the path is wrong, which would only surface later as a
# confusing failed table-count assertion.
stores = []
for path in PATHS:
    store = pd.HDFStore(path, mode='r')
    print('{} -- {} tables'.format(path, len(store)))
    assert len(store) == BATCHES_COUNT
    stores.append(store)
# The table keys of the first store are used to address the tables of all stores.
keys = stores[0].keys()
print('Keys size = ', len(keys))

../input/densenet201_test.h5 -- 12091 tables
../input/predict_probs_resnet50.h5 -- 12091 tables
../input/inc3_test_xDDD.h5 -- 12091 tables
../input/se_inc_test.h5 -- 12091 tables
Keys size =  12091
CPU times: user 6min 43s, sys: 3.64 s, total: 6min 46s
Wall time: 6min 47s


In [3]:
# Accumulator state shared with the aggregation loop below:
#   cur_id / cur_sum -- id and running score sum of the product being accumulated
#   ave_preds        -- finished (product id, category) pairs
cur_id = cur_sum = None
ave_preds = []

def select_tables(stores, key):
    """Read the table stored under `key` from every store.

    Returns a list with one entry per store, in the same order as `stores`.
    """
    selected = [one_store.select(key) for one_store in stores]
    assert len(selected) == len(stores)
    return selected

def get_product_id(tables, start):
    """Return the `pr_id` found at positional row `start`, shared by all tables.

    Raises AssertionError when the tables disagree on the id or when no id
    could be read (for example, `tables` is empty).
    """
    agreed_id = None
    for frame in tables:
        candidate = frame.pr_id.iloc[start]
        if agreed_id is not None:
            # Every further table has to report the same product.
            assert agreed_id == candidate
        else:
            agreed_id = candidate
    assert agreed_id is not None
    return agreed_id

def _get_window_sums(tables, start, size):
    """Column-wise sum of `size` consecutive rows of every table.

    Rows are taken by position (`.iloc`), matching how `get_product_id`
    addresses rows.  The bookkeeping columns `pr_id` and `img_num` are removed
    before summing, so only the remaining columns contribute.

    Returns a list with one Series per table, indexed by the remaining column
    labels.  The input tables are not modified.
    """
    return [table.iloc[start:start + size].drop(['pr_id', 'img_num'], axis=1).sum()
            for table in tables]


def get_sums_10crops(tables, start):
    """Per-table column sums over the ten rows beginning at position `start`."""
    return _get_window_sums(tables, start, 10)


def get_sums_1sum(tables, start):
    """Per-table column values of the single row at position `start`."""
    return _get_window_sums(tables, start, 1)
        

# Destination of the submission file.  Its directory is created BEFORE the
# long aggregation loop so that a missing folder cannot throw away the whole
# run at the final `to_csv` call.
SUBMIT_PATH = '../submit/four_alter_x1.csv'
os.makedirs(os.path.dirname(SUBMIT_PATH), exist_ok=True)


def _flush_product(product_id, score_sum):
    """Append the (product id, predicted category) pair of a finished product to `ave_preds`."""
    # idxmax() yields the label of the highest-scoring column; that label is
    # converted to int and shifted by one to index LABEL_TO_CAT.
    cat = LABEL_TO_CAT[int(score_sum.idxmax()) - 1]
    ave_preds.append((int(product_id), cat))


for key in tqdm(keys):
    tables = select_tables(stores, key)
    # The first two tables are consumed ten rows per step, the remaining ones
    # one row per step, so the step count is a tenth of the first table's rows.
    for i in range(tables[0].shape[0] // 10):
        crops_start = 10 * i
        pr_id1 = get_product_id(tables[:2], crops_start)
        pr_id2 = get_product_id(tables[2:], i)
        assert pr_id1 == pr_id2
        product_id = pr_id1

        sums_10crops = get_sums_10crops(tables[:2], crops_start)
        sums_1sum = get_sums_1sum(tables[2:], i)

        # Add up the score vectors of all stores for this step.
        new_sum = sum(sums_10crops + sums_1sum)

        if cur_id == product_id:
            # Same product as in the previous step: keep accumulating.
            assert cur_sum is not None
            cur_sum += new_sum
            continue
        if cur_id is not None:
            # A different product starts here, so the previous one is complete.
            assert cur_sum is not None
            _flush_product(cur_id, cur_sum)

        cur_id = product_id
        cur_sum = new_sum

# The last product is never followed by a different id; flush it explicitly.
if cur_id is not None:
    _flush_product(cur_id, cur_sum)

ave_preds_df = pd.DataFrame(data=ave_preds, columns=['_id','category_id'])
# Only consecutive rows with the same id are merged above, so an id that shows
# up again later produces an additional row.
extra_rows = ave_preds_df.shape[0] - TEST_SIZE
print('Dataframe shape {}; products in test {}'.format(ave_preds_df.shape, TEST_SIZE))
print('Duplicates {}; affect accuracy {}'.format(extra_rows,
                                                float(extra_rows) / TEST_SIZE))
# Keep the first prediction of every id (drop_duplicates default).
ave_no_dupls = ave_preds_df.drop_duplicates(subset='_id', inplace=False)
print('No duplicated shape:', ave_no_dupls.shape)

ave_no_dupls.to_csv(SUBMIT_PATH, index=False)

100%|██████████| 12091/12091 [8:47:45<00:00,  2.62s/it]  


Dataframe shape (1769197, 2); products in test 1768182
Duplicates 1015; affect accuracy 0.0005740359306903928
No duplicated shape: (1768182, 2)
