In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.stats import gmean

from label_to_cat import LABEL_TO_CAT

# Not referenced in the cells shown here. The value matches the table count
# printed when the prediction stores are opened below -- presumably the number
# of prediction batches per store (TODO confirm).
BATCHES_COUNT = 12091
# Expected number of products in the test set; used at the end of the notebook
# to sanity-check how many prediction rows were produced.
TEST_SIZE = 1768182

In [2]:
# HDF5 files with test-set predictions, one file per model. Judging by the
# file names these are ResNet-50, ResNet-101 and a DenseNet -- confirm against
# the notebooks that produced them.
res50_path = '../input/resnet50_test_corr.h5'
res101_path = '../input/resnet101_test.h5'
dense_path = '../input/dense_test_corr.h5'

In [3]:
%%time
# Open the first model's prediction store and report how many tables it holds.
# The store stays open: the aggregation loop further down reads from it.
store_res50 = pd.HDFStore(res50_path)
print('{} tables in storage'.format(len(store_res50)))

12091 tables in storage
CPU times: user 38.4 s, sys: 856 ms, total: 39.2 s
Wall time: 44.1 s


In [4]:
%%time
# Open the second model's prediction store and report how many tables it holds.
# The store stays open: the aggregation loop further down reads from it.
store_res101 = pd.HDFStore(res101_path)
print('{} tables in storage'.format(len(store_res101)))

12091 tables in storage
CPU times: user 40.1 s, sys: 712 ms, total: 40.8 s
Wall time: 45.8 s


In [5]:
%%time
# Open the third model's prediction store and report how many tables it holds.
# The count must come from `store_dense` itself (not from another store) so
# that a truncated or mismatched file is visible here rather than surfacing
# hours into the aggregation loop.
store_dense = pd.HDFStore(dense_path)
print('{} tables in storage'.format(len(store_dense)))

12091 tables in storage
CPU times: user 40.2 s, sys: 304 ms, total: 40.5 s
Wall time: 40.5 s


In [6]:
# All stores that take part in the ensemble. The first store is the reference:
# get_product_id() reads the product id from tables[0] and checks the others
# against it.
stores = [store_res50, store_res101, store_dense]
# Keys are taken from the first store only; the same key is then selected from
# every store, so all stores are assumed to contain identical keys.
keys = store_res50.keys()

In [None]:
# Running state for the streaming aggregation loop below.
# preds: finished (product id, category) tuples, one per completed product run.
preds = []
# cur_id: id of the product whose rows are currently being accumulated
# (None until the first row has been read).
cur_id = None
# one_product_preds: list with one combined score vector per row seen so far
# for cur_id (None until the first row has been read).
one_product_preds = None

def select_tables(stores, key):
    """Read the table stored under ``key`` from each store.

    Returns a list with one entry per store, in the same order as ``stores``.
    """
    return [store.select(key) for store in stores]
        
def get_product_id(tables, index):
    """Return the ``pr_id`` found at row position ``index``.

    The id is read from the first table; every remaining table must carry the
    same id at that position, otherwise an AssertionError is raised. An empty
    ``tables`` sequence is rejected with an AssertionError as well.
    """
    assert len(tables) > 0
    reference = tables[0]
    product_id = reference.pr_id.iloc[index]
    # Check the other tables one at a time so the first mismatch fails fast.
    for other in tables[1:]:
        candidate = other.pr_id.iloc[index]
        assert candidate == product_id
    return product_id

def get_cat_id(one_pr_preds):
    """Pick the winning class index for one product.

    Parameters
    ----------
    one_pr_preds : sequence of array-like
        One score vector per row of the product, in the order the rows were
        read. All vectors must have the same length.

    Returns
    -------
    Position of the largest entry in the weighted sum of the score vectors.
    Entry ``i`` is weighted by ``1 - 0.3 * i``, i.e. 1.0, 0.7, 0.4, 0.1 for
    the first four entries.

    NOTE(review): the weight turns negative from the fifth entry on, so this
    scheme only makes sense for at most four entries per product -- confirm
    that the data never has more.
    """
    # Work on a private copy so the caller's first vector is never modified.
    averaged = np.array(one_pr_preds[0], dtype=float)
    for i in range(1, len(one_pr_preds)):
        # Read from the argument, not from any module-level accumulator, so
        # the function gives the same answer no matter who calls it.
        averaged = averaged + np.asarray(one_pr_preds[i]) * (1. - 0.3 * i)
    return averaged.argmax()

def get_new_sum_gmean(tables, ind):
    """Combine the scores of row ``ind`` across all tables.

    For each table the single row at position ``ind`` is taken, the
    ``pr_id`` and ``img_num`` columns are dropped, and the remaining columns
    are summed over that one-row slice. The per-table results are then merged
    with a geometric mean.
    """
    per_table_scores = []
    for table in tables:
        single_row = table[ind:ind + 1]
        scores_only = single_row.drop(['pr_id', 'img_num'], axis=1, inplace=False)
        per_table_scores.append(scores_only.sum())
    return gmean(per_table_scores)

def _emit_prediction(product_id, image_preds):
    """Append the (product id, category) row for one finished product to ``preds``."""
    cat = LABEL_TO_CAT[get_cat_id(image_preds)]
    preds.append((int(product_id), cat))


# Stream through every table key. Consecutive rows that share a product id are
# collected in `one_product_preds`; when the id changes, the collected product
# is turned into a prediction and a new collection is started.
for key in tqdm(keys):
    tables = select_tables(stores, key)
    for i in range(tables[0].shape[0]):
        product_id = get_product_id(tables, i)
        new_sum = get_new_sum_gmean(tables, i)

        if cur_id == product_id:
            # Same product as the previous row: keep accumulating.
            assert one_product_preds is not None
            one_product_preds.append(new_sum)
        else:
            # A new product starts here; flush the previous one, if any.
            if cur_id is not None:
                assert one_product_preds is not None
                _emit_prediction(cur_id, one_product_preds)
            cur_id = product_id
            one_product_preds = [new_sum]

# The last product is still pending once the tables are exhausted.
if cur_id is not None:
    _emit_prediction(cur_id, one_product_preds)
            
# Collect the predictions into a frame and compare its size with the expected
# number of test products before writing the submission file.
preds_df = pd.DataFrame(preds, columns=['_id', 'category_id'])
extra_rows = preds_df.shape[0] - TEST_SIZE
print('Dataframe shape {}; products in test {}'.format(preds_df.shape, TEST_SIZE))
print('Duplicates {}; affect accuracy {}'.format(extra_rows,
                                                float(extra_rows) / TEST_SIZE))

# Keep only the first prediction for each product id.
no_dupls = preds_df.drop_duplicates(subset='_id')
print('No duplicated shape: {}; expected rows count {}'.format(no_dupls.shape, TEST_SIZE))

no_dupls.to_csv('../submit/weighted_GMEAN.csv', index=False)

  log_a = np.log(np.array(a, dtype=dtype))
 17%|█▋        | 2061/12091 [52:11<4:14:00,  1.52s/it]