In [1]:
import numpy as np
import pandas as pd
from scipy.stats import gmean
from tqdm import tqdm

from label_to_cat import LABEL_TO_CAT

# NOTE(review): BATCHES_COUNT is not referenced by any other visible cell —
# presumably the number of prediction batches/tables; confirm or remove.
BATCHES_COUNT = 12091
# Expected number of products in the test set; the reporting code compares the
# size of the predictions frame against it.
TEST_SIZE = 1768182

In [None]:
# HDF5 inputs, each opened with pd.HDFStore in its own cell.
# NOTE(review): presumably test-set predictions of three different models
# (inferred from the file names only) — confirm.
res50_path = '../input/resnet50_test_corr.h5'
res101_path = '../input/resnet101_test.h5'
dense_path = '../input/dense_test_corr.h5'

In [None]:
%%time
# Open the store behind res50_path and report how many tables it contains.
# NOTE(review): pd.HDFStore is opened with its default mode here; since the
# file is only read, consider passing mode='r' explicitly — confirm.
store_res50 = pd.HDFStore(res50_path)
print('{} tables in storage'.format(len(store_res50)))

In [None]:
%%time
# Open the store behind res101_path and report how many tables it contains.
# NOTE(review): pd.HDFStore is opened with its default mode here; since the
# file is only read, consider passing mode='r' explicitly — confirm.
store_res101 = pd.HDFStore(res101_path)
print('{} tables in storage'.format(len(store_res101)))

In [None]:
%%time
# Open the store behind dense_path and report how many tables it contains.
store_dense = pd.HDFStore(dense_path)
# Report the size of store_dense itself (not store_res101) so a mismatch in
# table counts between the three inputs is visible right away.
print('{} tables in storage'.format(len(store_dense)))

In [None]:
# The three stores are read side by side; the table keys to iterate over are
# taken from the first store only.
# NOTE(review): this assumes every store exposes the same keys with rows in
# the same order — get_product_id asserts the product ids agree row by row.
stores = [store_res50, store_res101, store_dense]
keys = store_res50.keys()

In [None]:
# Module-level state for the streaming group-by in the loop below.
# Finished (product id, category) pairs, appended whenever a product ends.
preds = []
# Product id of the group currently being accumulated (None before the first row).
cur_id = None
# Summed score vectors collected so far for cur_id, one entry per row.
one_product_preds = None

def select_tables(stores, key):
    """Read the table stored under ``key`` from every store.

    Parameters
    ----------
    stores : iterable
        Objects exposing ``select(key)`` (e.g. ``pd.HDFStore``).
    key : str
        Table key to read from each store.

    Returns
    -------
    list
        One table per store, in the same order as ``stores``.
    """
    # Return the collected tables; without it the caller receives None.
    return [store.select(key) for store in stores]
        
def get_product_id(tables, index):
    """Return the ``pr_id`` found at row position ``index``.

    The value is read from the first table; every remaining table must carry
    the same ``pr_id`` at that position, otherwise an ``AssertionError`` is
    raised. An empty ``tables`` sequence also fails the assertion.
    """
    assert len(tables) > 0
    reference, others = tables[0], tables[1:]
    product_id = reference.pr_id.iloc[index]
    for other in others:
        candidate = other.pr_id.iloc[index]
        assert candidate == product_id
    return product_id

def get_cat_id(one_pr_preds):
    """Return the position of the best-scoring class for one product.

    ``one_pr_preds`` is a sequence of equal-length score vectors (the caller
    passes one vector per row of the product). The vectors are combined with a
    geometric mean taken across the sequence (axis 0), and the index of the
    largest combined score is returned.

    Uses ``gmean`` from ``scipy.stats``, imported in the notebook's top import
    cell so this works on a fresh kernel.
    """
    # axis=0 reduces over the vectors, leaving one combined score per class.
    return gmean(one_pr_preds, axis=0).argmax()

# Streaming group-by: walk every table key and every row in order, collecting
# consecutive rows that share a product id. The state (cur_id,
# one_product_preds, preds) is not reset between keys, so a product whose rows
# continue into the next table is still treated as one group.
for key in tqdm(keys): 
    tables = select_tables(stores, key)
    for i in range(tables[0].shape[0]):
        product_id = get_product_id(tables, i)
        # Row i of each table with the 'pr_id'/'img_num' columns dropped; .sum()
        # over the one-row slice yields a Series, and the outer sum() adds these
        # Series across tables.
        new_sum = sum([table[i : i +1].drop(['pr_id', 'img_num'], axis=1, inplace=False).sum()
                           for table in tables])
        if cur_id == product_id:
            # Same product as the previous row: collect and move on.
            assert one_product_preds is not None
            one_product_preds.append(new_sum)
            continue
        elif cur_id is not None:
            # Product changed: finalise the previous product before starting
            # a new group.
            assert one_product_preds is not None
            cat = LABEL_TO_CAT[get_cat_id(one_product_preds)]
            preds.append((int(cur_id), cat))
            one_product_preds = None
        
        # Start a new group (reached on the very first row and on every
        # product change).
        cur_id = product_id
        one_product_preds = [new_sum]
        
# Flush the last group, which no later row can trigger.
if cur_id is not None:
    cat = LABEL_TO_CAT[get_cat_id(one_product_preds)]
    preds.append((int(cur_id), cat))
            
# Assemble the predictions frame and compare its size with the expected
# number of test products.
preds_df = pd.DataFrame(preds, columns=['_id', 'category_id'])
print('Dataframe shape {}; products in test {}'.format(preds_df.shape, TEST_SIZE))

# Rows in excess of TEST_SIZE are reported as duplicates, together with their
# share of the test set.
surplus_rows = len(preds_df) - TEST_SIZE
print('Duplicates {}; affect accuracy {}'.format(surplus_rows, float(surplus_rows) / TEST_SIZE))

# Keep a single row per '_id' and re-check the row count.
no_dupls = preds_df.drop_duplicates(subset='_id')
print('No duplicated shape: {}; expected rows count {}'.format(no_dupls.shape, TEST_SIZE))

no_dupls.to_csv('../submit/product_level_GMEAN.csv', index=False)

In [2]:
# Toy input for trying out gmean: three Series holding 1..4, 5..8 and 9..12.
x = [pd.Series(list(range(first, first + 4))) for first in (1, 5, 9)]
x

[0    1
 1    2
 2    3
 3    4
 dtype: int64, 0    5
 1    6
 2    7
 3    8
 dtype: int64, 0     9
 1    10
 2    11
 3    12
 dtype: int64]

In [5]:
from scipy.stats import gmean

In [16]:
# Geometric mean across the three Series (axis=0 reduces over the list),
# giving one value per position.
gmean(x, axis=0)

array([ 3.5568933 ,  4.93242415,  6.13579244,  7.26848237])