In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree top-down and print the full path of every file,
# in the order os.walk reports them.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Any results you write to the current directory are saved as output.

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import shutil
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import gc

from starter_eda_model_funcs import MultiOutputDataGenerator
from helper import to_one_hot, preview_data_aug
from preprocessing import perform_preprocessing
from flow_kaggle import generators_from_prep, generator_wrapper
from tensorflow.keras import backend as K
from tensorflow.keras.models import load_model


# Test submission code from https://www.kaggle.com/ipythonx/keras-grapheme-gridmask-augmix-in-efficientnet
# Test data generator
def test_batch_generator(df, batch_size, SIZE, PAD, HEIGHT=137, WIDTH=236):
    """Yield preprocessed image batches and their ids from a test dataframe.

    Args:
        df: Table whose first column holds the image ids and whose remaining
            columns hold the flattened pixel values of one HEIGHT x WIDTH
            image per row.
        batch_size: Maximum number of rows per yielded batch; the last batch
            may be smaller.
        SIZE: Side length of the square images written into each batch;
            forwarded to ``crop_resize`` as target height and width.
        PAD: Padding forwarded to ``crop_resize``.
        HEIGHT: Height of the stored raw images.
        WIDTH: Width of the stored raw images.

    Yields:
        Tuples ``(X_batch, names_batch)`` where ``X_batch`` is a float64
        array of shape ``(n, SIZE, SIZE, 1)`` and ``names_batch`` holds the
        ``n`` matching values from the first column of ``df``.
    """
    num_imgs = len(df)

    for batch_start in range(0, num_imgs, batch_size):
        batch_stop = min(num_imgs, batch_start + batch_size)
        curr_batch_size = batch_stop - batch_start

        names_batch = df.iloc[batch_start:batch_stop, 0].values
        # Invert the pixel values (255 - value) after restoring the 2-D shape.
        imgs_batch = 255 - df.iloc[batch_start:batch_stop, 1:].values.reshape(-1, HEIGHT, WIDTH).astype(np.uint8)
        X_batch = np.zeros((curr_batch_size, SIZE, SIZE, 1))

        for j in range(curr_batch_size):
            raw = imgs_batch[j]
            peak = raw.max()
            if peak > 0:
                # Stretch the intensities so the brightest pixel becomes 255.
                img = (raw * (255.0 / peak)).astype(np.uint8)
            else:
                # Completely blank image: nothing to stretch. Dividing by the
                # zero peak would produce inf/NaN values before the uint8 cast.
                # NOTE(review): confirm crop_resize copes with an all-zero image.
                img = raw
            # presumably returns a (SIZE, SIZE) array -- verify against preprocessing.crop_resize
            img = crop_resize(img, orig_height=HEIGHT, orig_width=WIDTH, target_height=SIZE, target_width=SIZE, pad=PAD)
            X_batch[j] = img[:, :, np.newaxis]

        yield X_batch, names_batch

VERBOSE = True


# locate the parquet files
TEST = [
    "../input/bengaliai-cv19/test_image_data_0.parquet",
    "../input/bengaliai-cv19/test_image_data_1.parquet",
    "../input/bengaliai-cv19/test_image_data_2.parquet",
    "../input/bengaliai-cv19/test_image_data_3.parquet",
]
# Fail early, and say which files are missing, instead of relying on an
# `assert` that is stripped when Python runs with -O.
missing_test_files = [fn for fn in TEST if not os.path.exists(fn)]
if missing_test_files:
    raise FileNotFoundError(
        "Test parquet file not found: " + ", ".join(missing_test_files)
    )

# One ensemble member per (roi, pad) pair: the prediction loop loads
# `model-best-<roi>.h5` and feeds it square inputs of side roi + 2 * pad.
roi_options = [32, 64, 96, 128]
pad_options = [5, 5, 15,  21]
# Reduced ensemble for quick runs:
# roi_options = [32, 64]
# pad_options = [5, 5]
batch_size = 256

from preprocessing import bbox, crop_resize
# combine models to an ensemble
# Running sum of the predicted score vectors over all ensemble members,
# keyed by submission row id ("<image id>_<component>").
probs = {}

# (row-id suffix, index into the list returned by model.predict) pairs.
# The order here is the order in which the three rows of an image are first
# inserted into `probs`.
OUTPUT_COMPONENTS = (
    ("consonant_diacritic", 2),
    ("grapheme_root", 0),
    ("vowel_diacritic", 1),
)

# zip() would silently drop the surplus entries of the longer list.
if len(roi_options) != len(pad_options):
    raise ValueError("roi_options and pad_options must have the same length")

# --- LOAD MODEL -> PREDICT -> CLEANUP -> repeat ---#
for roi_size, padding in zip(roi_options, pad_options):
    model_fn = f'/kaggle/input/models/model-best-{roi_size}.h5'
    print(f"Loading {model_fn}...")
    model = load_model(model_fn)

    # The network input is square: the ROI plus `padding` on every side.
    preprocess_args = dict(
        image_width=roi_size + padding*2,
        image_height=roi_size + padding*2,
        padding=padding,
    )
    print("Preprocess arguments:")
    print(preprocess_args)

    # iterate over the test sets
    for fname in tqdm(TEST):
        test_ = pd.read_parquet(fname)
        test_gen = test_batch_generator(
            test_,
            batch_size=batch_size,
            SIZE=preprocess_args['image_width'],
            PAD=preprocess_args['padding'],
        )

        for batch_x, batch_name in test_gen:
            # verbose=0 keeps per-batch progress output from interleaving
            # with the tqdm bar.
            batch_predict = model.predict(batch_x, verbose=0)
            for idx, image_id in enumerate(batch_name):
                for suffix, output_idx in OUTPUT_COMPONENTS:
                    key = f"{image_id}_{suffix}"
                    scores = batch_predict[output_idx][idx]
                    if key in probs:
                        probs[key] += scores
                    else:
                        # Copy so the accumulator owns its memory rather
                        # than being a view into this batch's output array.
                        probs[key] = scores.copy()

        # Release the parquet table before the next file is read.
        del test_
        gc.collect()

    # cleanup: drop the model reference before clearing the Keras session so
    # its memory can be reclaimed before the next model is loaded
    del model
    K.clear_session()
    gc.collect()

# --- COMBINE PREDICTIONS --- #
# Replace every accumulated score vector by the index of its largest entry.
# The scores were summed over the models; the arg-max of a sum is the same
# as the arg-max of the corresponding mean.
probs.update({row_key: np.argmax(scores) for row_key, scores in probs.items()})

# Two-column submission table; rows follow the insertion order of `probs`.
submission_data = {
    'row_id': list(probs),
    'target': list(probs.values()),
}
df_sample = pd.DataFrame(submission_data, columns=['row_id', 'target'])

df_sample.to_csv('submission.csv', index=False)
print(df_sample.head())

/kaggle/input/bengaliai-cv19/train.csv
/kaggle/input/bengaliai-cv19/train_image_data_2.parquet
/kaggle/input/bengaliai-cv19/test_image_data_1.parquet
/kaggle/input/bengaliai-cv19/class_map.csv
/kaggle/input/bengaliai-cv19/train_image_data_3.parquet
/kaggle/input/bengaliai-cv19/test_image_data_2.parquet
/kaggle/input/bengaliai-cv19/train_image_data_1.parquet
/kaggle/input/bengaliai-cv19/test_image_data_0.parquet
/kaggle/input/bengaliai-cv19/test.csv
/kaggle/input/bengaliai-cv19/train_image_data_0.parquet
/kaggle/input/bengaliai-cv19/sample_submission.csv
/kaggle/input/bengaliai-cv19/test_image_data_3.parquet
/kaggle/input/models/model-best-128.h5
/kaggle/input/models/model-best-64.h5
/kaggle/input/models/model-best-96.h5
/kaggle/input/models/model-best-32.h5
Loading /kaggle/input/models/model-best-32.h5...


  0%|          | 0/4 [00:00<?, ?it/s]

Preprocess arguments:
{'image_width': 42, 'image_height': 42, 'padding': 5}


100%|██████████| 4/4 [00:11<00:00,  2.95s/it]


Loading /kaggle/input/models/model-best-64.h5...


  0%|          | 0/4 [00:00<?, ?it/s]

Preprocess arguments:
{'image_width': 74, 'image_height': 74, 'padding': 5}


100%|██████████| 4/4 [00:12<00:00,  3.10s/it]


                       row_id  target
0  Test_0_consonant_diacritic       0
1        Test_0_grapheme_root       3
2      Test_0_vowel_diacritic       0
3  Test_1_consonant_diacritic       0
4        Test_1_grapheme_root      93
