In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from label_to_cat import LABEL_TO_CAT

# HDF5 prediction stores, one per network. Order matters: the aggregation loop
# treats the first two stores as 10-crop tables and the remaining ones as
# single-row ("1sum") tables.
# NOTE(review): the last path is not under ../input/ like the others -- confirm.
PATHS = [
    '../input/densenet_test.h5',
    '../input/predict_probs_resnet50.h5',
    '../input/inception3_test.h5',
    '../se_inc_test.h5'
]

# Reference count the final report compares the number of predicted rows against.
TEST_SIZE = 1768182

# Number of tables every store is asserted to contain.
BATCHES_COUNT = 12091

In [None]:
%%time
# Open every prediction store and check that each holds the expected number of
# batch tables; the keys of the first store drive the aggregation loop.
stores = []
for path in PATHS:
    # Open read-only: HDFStore's default mode 'a' opens the inputs writable and
    # silently creates an empty file when `path` does not exist.
    store = pd.HDFStore(path, mode='r')
    # Register the handle before validating so it stays reachable through
    # `stores` (and can be closed) even if the assertion below fails.
    stores.append(store)
    print('{} -- {} tables'.format(path, len(store)))
    assert len(store) == BATCHES_COUNT
keys = stores[0].keys()
print('Keys size = ', len(keys))

In [None]:
# Running state of the aggregation loop:
#   ave_preds -- finished (product id, category) pairs
#   cur_id    -- product id currently being accumulated (None before the first group)
#   cur_sum   -- summed scores accumulated so far for cur_id
ave_preds = []
cur_id = cur_sum = None

def select_tables(stores, key):
    """Read the table stored under `key` from every store.

    Args:
        stores: iterable of objects exposing `select(key)` (e.g. pd.HDFStore).
        key: name of the table to read from each store.

    Returns:
        List with one table per store, in the same order as `stores`.

    Raises:
        AssertionError: if a table's shape differs from the first table's shape.
    """
    selected = []
    for store in stores:
        frame = store.select(key)
        # Every table after the first must match the first table's shape.
        assert not selected or frame.shape == selected[0].shape
        selected.append(frame)
    assert len(selected) == len(stores)
    return selected

def get_product_id(tables, start):
    """Return the `pr_id` found at row position `start`, shared by all tables.

    Args:
        tables: tables that each expose a `pr_id` column.
        start: positional row index to look up in every table.

    Returns:
        The product id read from the first table.

    Raises:
        AssertionError: if the tables disagree on the id, or `tables` is empty.
    """
    expected = None
    for frame in tables:
        current = frame.pr_id.iloc[start]
        if expected is not None:
            # Later tables must agree with the id taken from the first one.
            assert expected == current
        else:
            expected = current
    assert expected is not None
    return expected

def get_sums_10crops(tables, start):
    """Sum the ten rows beginning at `start` in each table.

    The 'pr_id' and 'img_num' columns are dropped before summing, so only the
    remaining columns contribute.

    Args:
        tables: tables containing 'pr_id' and 'img_num' columns.
        start: first row of the ten-row slice.

    Returns:
        List with one column-wise sum (a Series) per table, in input order.
    """
    meta_columns = ['pr_id', 'img_num']
    stop = start + 10
    sums = []
    for frame in tables:
        crops = frame[start:stop]
        sums.append(crops.drop(meta_columns, axis=1).sum())
    return sums

def get_sums_1sum(tables, start):
    """Take the single row at `start` from each table as its score vector.

    The 'pr_id' and 'img_num' columns are dropped first; summing the one-row
    slice turns it into a Series indexed by the remaining column names.

    Args:
        tables: tables containing 'pr_id' and 'img_num' columns.
        start: row to read from every table.

    Returns:
        List with one Series per table, in input order.
    """
    # Slice by the `start` argument. The previous code sliced with the global
    # loop variable `i`, which reads the wrong row and breaks when `i` is unset.
    return [table[start : start + 1].drop(['pr_id', 'img_num'], axis=1).sum()
            for table in tables]
        

# Walk every batch table in steps of ten rows. Each step yields one summed
# score vector across all networks; consecutive steps with the same product id
# are accumulated, and a prediction is emitted whenever the product id changes.
for key in tqdm(keys):
    tables = select_tables(stores, key)
    # select_tables asserts that all tables share one shape, so the first table
    # gives the row count for the batch (`res_table` was never defined).
    # Rows beyond the last full group of ten are ignored.
    for i in range(tables[0].shape[0] // 10):
        st = 10 * i
        product_id = get_product_id(tables, st)

        # The first two stores are summed over ten rows, the rest contribute
        # a single row each (split follows the order of PATHS).
        sums_10crops = get_sums_10crops(tables[:2], st)
        sums_1sum = get_sums_1sum(tables[2:], st)

        new_sum = sum(sums_10crops + sums_1sum)

        if cur_id == product_id:
            # Same product as the previous group: keep accumulating.
            assert cur_sum is not None
            cur_sum += new_sum
            continue

        if cur_id is not None:
            # Product changed: emit the prediction for the finished product.
            # The winning column label is converted to int and shifted by one
            # to index LABEL_TO_CAT (presumably labels are 1-based -- confirm).
            assert cur_sum is not None
            cat = LABEL_TO_CAT[int(cur_sum.idxmax()) - 1]
            ave_preds.append((int(cur_id), cat))

        cur_id = product_id
        cur_sum = new_sum

# Emit the prediction for the last product, which no id change flushed.
if cur_id is not None:
    cat = LABEL_TO_CAT[int(cur_sum.idxmax()) - 1]
    ave_preds.append((int(cur_id), cat))
            
# Collect the predictions into a frame, report how its size compares with the
# expected test size, keep one row per product id and write the submission.
ave_preds_df = pd.DataFrame(ave_preds, columns=['_id','category_id'])

# Rows in excess of TEST_SIZE; reported as "Duplicates" below.
surplus_rows = ave_preds_df.shape[0] - TEST_SIZE
print('Dataframe shape {}; products in test {}'.format(ave_preds_df.shape, TEST_SIZE))
print('Duplicates {}; affect accuracy {}'.format(surplus_rows, float(surplus_rows) / TEST_SIZE))

# Keep the first prediction seen for each product id.
ave_no_dupls = ave_preds_df.drop_duplicates(subset='_id')
print('No duplicated shape:', ave_no_dupls.shape)

ave_no_dupls.to_csv('../submit/four_nets_ave.csv', index=False)