In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import h5py
import cv2
from scipy.stats import rankdata
from tqdm import tqdm
from pathlib import Path
from uco.data_loader import pivot_df

In [5]:
original_dir = Path('../data/original')

In [6]:
def get_df():
    df = pd.read_csv('../data/raw/sample_submission.csv')
    df = pivot_df(df).reset_index()[['Image']]
    return df

In [150]:
# Save prediction ranks based on segmentation models
with h5py.File(original_dir / 'avg-seg-predictions.h5', 'r') as f:
    # group_names = [k for k in f.keys() if k != 'average']
    group_names = ['average']
    for c in tqdm(list(range(4))):
        df = get_df()
        for k in group_names:
            preds = f[k][:, c, :, :]
            totals = np.sum(np.square(preds), axis=1)
            totals = np.sum(totals, axis=1)
            ranks = rankdata(totals)
            df[k] = ranks
        df.set_index('Image', inplace=True)
        df['mean'] = df.apply(lambda row: row.mean(), axis=1)
        df['rank'] = rankdata(df['mean'])
        df.to_csv(original_dir / f'segmentation-ranks-c{c}.csv')

100%|██████████| 4/4 [00:20<00:00,  5.10s/it]


In [151]:
# Save prediction ranks based on classification models
with h5py.File(original_dir / 'avg-clas-predictions.h5', 'r') as f:
    k = 'average'
    for c in range(4):
        df = get_df()
        preds = f[k][:, c]
        ranks = rankdata(preds)
        df['rank'] = ranks
        df.to_csv(original_dir / f'classification-ranks-c{c}.csv', index=False)

In [152]:
df = get_df()
n_total = np.array([df.shape[0]] * 4)
target_counts = np.array([1864, 1508, 1982, 2382])
clas_cutoff = n_total - target_counts
seg_cutoff = n_total - (target_counts * [0.67, 0.91, 0.625, 0.87]).astype(np.uint32)
print(clas_cutoff, seg_cutoff)

[1834 2190 1716 1316] [2450 2326 2460 1626]


In [153]:
for c in range(4):
    df_seg  = pd.read_csv(original_dir / f'segmentation-ranks-c{c}.csv')
    df_clas = pd.read_csv(original_dir / f'classification-ranks-c{c}.csv')
    
    avg = rankdata((df_seg['rank'] + df_clas['rank']) / 2)
    df[str(c)] = avg > seg_cutoff[c]
df.to_csv(original_dir / 'selected.csv', index=False)

In [154]:
df.sum(axis=0)

Image    002f507.jpg0035ae9.jpg0038327.jpg004f759.jpg00...
0                                                     1248
1                                                     1373
2                                                     1238
3                                                     2073
dtype: object

In [155]:
df.shape

(3698, 5)

In [156]:
df['count'] = df.sum(axis=1)
df.head()

Unnamed: 0,Image,0,1,2,3,count
0,002f507.jpg,False,False,True,False,1
1,0035ae9.jpg,False,False,True,True,2
2,0038327.jpg,False,False,False,True,1
3,004f759.jpg,False,True,False,True,2
4,005ba08.jpg,False,False,True,False,1


In [157]:
df_empty = df.loc[df['count'] == 0, :]
df_empty.shape

(68, 6)