# Prediction check using trained model

- Set up
- Prediction : binary classification prediction
- (Optional) Save the predicted images

## 1. Set up

In [None]:
import os
import json
import glob
import scipy
import numpy as np

# import preprocess func from processor module
from processor import preprocess
from processor import DataSet

from keras.models import model_from_json
from keras.models import model_from_yaml
from keras.utils import np_utils
from keras.preprocessing import image

import pandas as pd
import glob

In [None]:
BASE_DATA_DIR=''

MODEL_NAME = ""
CATEGORY_DICT_CSV=os.path.join(BASE_DATA_DIR, "178_dict.csv")

TEST_DATA_DIRS = [ os.path.join(BASE_DATA_DIR, "valid") + "/*"]

In [None]:
from category import load_category_dict
from category import lookup_index
from category import category_matcher

In [None]:
catdict = load_category_dict(CATEGORY_DICT_CSV)
target_index = lookup_index(catdict, '')
hamburg_pred = category_matcher(target_index)

In [None]:
class DataTest(object):
    '''
    Helper that collects labelled test images and prepares them for prediction.
    '''
    def __init__(self):
        # DataSet is imported from the processor module; it turns files into model input.
        self.dataSet = DataSet()

    def get_data_paths(self, dirs):
        '''
        input : iterable of directories (each is substituted into a glob pattern,
                so an entry may itself contain wildcards)
        output : list of the *.jpg paths matched for every entry, in the order of dirs
        '''
        pattern = os.path.normpath("{}/*.jpg")
        return [path for elem in dirs for path in glob.glob(pattern.format(elem))]

    def chunked(self, iterable, n):
        '''
        input : sliceable sequence and chunk size n
        output : list of consecutive slices holding at most n items each
        '''
        pieces = []
        for start in range(0, len(iterable), n):
            pieces.append(iterable[start:start + n])
        return pieces

    def preprocess_data(self, file_paths, pred):
        '''
        input : image paths and a callable that maps one path to its label
        output : (dataset built by DataSet.files_to_dataset, list of labels, the given paths)
        '''
        test_labels = [pred(path) for path in file_paths]
        test_data = self.dataSet.files_to_dataset(file_paths)
        return test_data, test_labels, file_paths

    def get_N_sample(self, file_paths, N):
        '''
        input : image paths and the number of samples N
        output : N paths drawn at random without replacement
        '''
        import random
        chosen = random.sample(range(len(file_paths)), N)
        return [file_paths[position] for position in chosen]

In [None]:
from modelutils import load_model
class Model(object):
    '''
    Thin wrapper around the trained model returned by load_model.
    '''
    def __init__(self, model_prefix):
        # load_model is imported from the project's modelutils module.
        self.model = load_model(model_prefix)

    def predict_raw(self, data_chunk, batch_size=32):
        '''
        input : data to predict on and the batch size handed to the model
        output : whatever the wrapped model's predict() returns, unmodified
        '''
        return self.model.predict(data_chunk, batch_size)
    

In [None]:
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

In [None]:
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [None]:
from visualize import plot_image_list

### Load model and create test dataset

In [None]:
%%time

MODEL = Model(MODEL_NAME)

In [None]:
TEST_DATASET = DataTest()
DATA_PATHS = TEST_DATASET.get_data_paths(TEST_DATA_DIRS)

Check with a small dataset.

In [None]:
data_paths = TEST_DATASET.get_data_paths(TEST_DATA_DIRS)
samples = TEST_DATASET.get_N_sample(DATA_PATHS, 3)

test_data, test_label, test_path = TEST_DATASET.preprocess_data(
    samples, hamburg_pred)

In [None]:
MODEL.predict_raw(test_data)

In [None]:
test_label

## 2. Prediction Check

In [None]:
%%time

chunked_paths = TEST_DATASET.chunked(DATA_PATHS, 3000)

prediction_col = []
test_labels = []
test_paths = []

for chunk in chunked_paths:
    test_data_chunk, test_label_chunk, test_path_chunk = TEST_DATASET.preprocess_data(chunk,hamburg_pred)
    test_labels.extend(test_label_chunk)
    test_paths.extend(test_path_chunk)
    
    prediction = MODEL.predict_raw(test_data_chunk, batch_size=32)
    prediction_col.extend(prediction)

In [None]:
prediction_col[0:5]

In [None]:
RESULT_ONEVS = pd.DataFrame({
    'ans' : test_labels,
    'class0': [pair[0] for pair in prediction_col],
    'class1' : [pair[1] for pair in prediction_col],
    'filepaths' : test_paths
})

In [None]:
RESULT_ONEVS[0:5]

In [None]:
# Make sure the directory for intermediate results exists (no-op when it already does).
os.makedirs("trained_model/intermediate", exist_ok=True)

In [None]:
# os.remove("trained_model/intermediate/onevs_2class.dat")
RESULT_ONEVS.to_pickle("trained_model/intermediate/onevs_2class_weakcollect.dat")

In [None]:
# RESULT_ONEVS = pd.read_pickle("trained_model/intermediate/onevs.dat")

In [None]:
RESULT_ONEVS.where(RESULT_ONEVS["ans"] == True).dropna()

In [None]:
labels = RESULT_ONEVS.where(RESULT_ONEVS["ans"] == True).dropna()

In [None]:
labels = labels.sort_values(by="class1")

In [None]:
labels[0:20]

Misclassified

In [None]:
# plot_image_list(labels[0:10]["filepaths"].values)

In [None]:
highconf = RESULT_ONEVS.sort_values(by="class1", ascending=False)

In [None]:
# plot_image_list(highconf[0:10]['filepaths'].values)

In [None]:
highconf[0:10]

In [None]:
highconf_miss = highconf.where(highconf["ans"] == False).dropna()

In [None]:
highconf_miss[0:10]

In [None]:
# plot_image_list(highconf_miss[0:40]['filepaths'].values)

In [None]:
highconf_miss[10:20]

### keep 1 by 3 result

In [None]:
prediction_col[0:5]

In [None]:
RESULT_ONEVS = pd.DataFrame({
    'ans' : test_labels,
    'class0': [pair[0] for pair in prediction_col],
    'class1' : [pair[1] for pair in prediction_col],
    'filepaths' : test_paths
})

In [None]:
RESULT_ONEVS[0:5]

In [None]:
# Make sure the directory for intermediate results exists (no-op when it already does).
os.makedirs("trained_model/intermediate", exist_ok=True)

In [None]:
# os.remove("trained_model/intermediate/onevs_2class.dat")
RESULT_ONEVS.to_pickle("trained_model/intermediate/onevs_2class.dat")

In [None]:
# RESULT_ONEVS = pd.read_pickle("trained_model/intermediate/onevs.dat")

In [None]:
RESULT_ONEVS.where(RESULT_ONEVS["ans"] == True).dropna()

In [None]:
labels = RESULT_ONEVS.where(RESULT_ONEVS["ans"] == True).dropna()

In [None]:
labels = labels.sort_values(by="class1")

In [None]:
labels[0:20]

In [None]:
# checkimg_df(labels, 0)

Misclassified

In [None]:
# plot_image_list(labels[0:10]["filepaths"].values)

In [None]:
highconf = RESULT_ONEVS.sort_values(by="class1", ascending=False)

In [None]:
# plot_image_list(highconf[0:10]['filepaths'].values)

In [None]:
highconf[0:10]

In [None]:
highconf_miss = highconf.where(highconf["ans"] == False).dropna()

In [None]:
highconf_miss[0:10]

In [None]:
# plot_image_list(highconf_miss[0:40]['filepaths'].values)

In [None]:
RESULT_ONEVS.sort_values(by="class1", ascending=False)[0:100]

In [None]:
RESULT_ONEVS.sort_values(by="class1", ascending=False)[0:100][1:2]

In [None]:
# RESULT_ONEVS stores the scores in the class0/class1 columns; rank by the class1 score.
RESULT_ONEVS.sort_values(by="class1", ascending=False)[1:2]['filepaths']

In [None]:
from IPython.display import Image, display
def checkimg(fpath):
    '''
    Show the image stored at fpath in the notebook output.
    '''
    picture = Image(fpath)
    display(picture)

In [None]:
# checkimg(RESULT_ONEVS.sort_values(by="class1", ascending=False)[0:100][1:2]['filepaths'].values[0])

In [None]:
RESULT_ONEVS.sort_values(by="class1", ascending=False)[0:100][1:2]['filepaths'].values[0]

In [None]:
preds = RESULT_ONEVS.where(RESULT_ONEVS['class0']< RESULT_ONEVS['class1']).dropna()

In [None]:
preds.where(preds['class1'] > 0.8).dropna()

In [None]:
# checkimg(preds.where(preds['class1'] > 0.8).dropna().iloc[1]["filepaths"])

In [None]:
preds.where(preds['class1'] > 0.9).dropna()

In [None]:
tmp = preds.where(preds['class1'] > 0.9).dropna()

In [None]:
tmp[0:12]

In [None]:
# checkimg(tmp.iloc[10]["filepaths"])

In [None]:
def checkimg_df(df, loc):
    '''
    Show the image referenced by the "filepaths" column of the row at
    integer position loc of df.
    '''
    row = df.iloc[loc]
    checkimg(row["filepaths"])

In [None]:
# Mean class1 score over the samples whose answer label is True.
RESULT_ONEVS.where(RESULT_ONEVS["ans"] == True).dropna()['class1'].mean()

In [None]:
# Mean class1 score over the samples whose answer label is False.
RESULT_ONEVS.where(RESULT_ONEVS["ans"] == False).dropna()["class1"].mean()

In [None]:
plt.figure()

In [None]:
# RESULT_ONEVS.where(RESULT_ONEVS["ans"] == True).dropna()['predictions'].astype(float).plot()

In [None]:
# RESULT_ONEVS.where(RESULT_ONEVS["ans"] == False).dropna()['predictions'].astype(float).plot()

In [None]:
# First 100 rows whose class1 score exceeds 0.8.
RESULT_ONEVS.where(RESULT_ONEVS["class1"] > 0.8).dropna()[0:100]

In [None]:
# Count of rows with class1 score > 0.8 whose answer label is True.
RESULT_ONEVS.where(RESULT_ONEVS["class1"] > 0.8).dropna().where(RESULT_ONEVS["ans"] == True).count()

In [None]:
# Count of rows with class1 score > 0.8 whose answer label is False.
RESULT_ONEVS.where(RESULT_ONEVS["class1"] > 0.8).dropna().where(RESULT_ONEVS["ans"] == False).count()

In [None]:
# Count of rows with class1 score > 0.5 whose answer label is True.
RESULT_ONEVS.where(RESULT_ONEVS["class1"] > 0.5).dropna().where(RESULT_ONEVS["ans"] == True).count()

In [None]:
# Count of rows with class1 score > 0.5 whose answer label is False.
RESULT_ONEVS.where(RESULT_ONEVS["class1"] > 0.5).dropna().where(RESULT_ONEVS["ans"] == False).count()

In [None]:
RESULT_ONEVS.count()

## Evaluate data.

This code duplicates the master branch's `prediction_with_trained_model.ipynb`. Check the setup code of that file, too.

In [None]:
CHECK_DATA_DIR = ''

In [None]:
class DataCheck(object):
    '''
    Data preparation for images that are only run through the model
    (no answer labels are produced).
    '''
    def __init__(self):
        pass

    def get_data_paths(self, dirs):
        '''
        input : a single directory (substituted into a glob pattern)
        output : list of the *.jpg paths found directly inside it
        '''
        return glob.glob(os.path.normpath("{}/*.jpg").format(dirs))

    def chunked(self, iterable, n):
        '''
        input : sliceable sequence and chunk size n
        output : list of consecutive slices holding at most n items each
        '''
        return [iterable[x:x + n] for x in range(0, len(iterable), n)]

    def preprocess_data(self, file_paths):
        '''
        input : image paths
        output : (float32 array stacking the preprocessed images, list of the paths)

        An empty input yields an empty array instead of raising.
        '''
        test_paths = list(file_paths)

        # NOTE(review): scipy.misc.imread is not shipped by recent SciPy
        # releases -- confirm the SciPy version this notebook is pinned to.
        images = [preprocess(scipy.misc.imread(path)) for path in test_paths]

        # Axis order is kept exactly as preprocess() returns it.
        test_data = np.array(images).astype(np.float32)

        return test_data, test_paths

    def get_N_sample(self, file_paths, N):
        '''
        input : image paths and the number of samples N
        output : N paths drawn at random without replacement
        '''
        import random
        index = random.sample(range(len(file_paths)), N)
        return [file_paths[i] for i in index]

In [None]:
CHECK_DATASET = DataCheck()
DATA_PATHS = CHECK_DATASET.get_data_paths(CHECK_DATA_DIR)

In [None]:
len(DATA_PATHS)

In [None]:
%%time

chunked_paths = CHECK_DATASET.chunked(DATA_PATHS, 3000)

pred_list = []
file_paths = []

for chunk in chunked_paths:
    check_data_chunk, file_path_chunk = CHECK_DATASET.preprocess_data(chunk)
    file_paths.extend(file_path_chunk)
    
    prediction = MODEL.predict_raw(check_data_chunk, batch_size=32)
    pred_list.append(prediction)

In [None]:
len(pred_list)

In [None]:
pred_list[0].shape

In [None]:
prediction.T[0, 1:3]

In [None]:
# plt.hist(prediction)

In [None]:
np.where(prediction > 0.5)

In [None]:
prediction[831]

In [None]:
# Merge the outputs of every chunk so that positions in `flatten` line up
# with `file_paths`, which accumulates the paths of all chunks.
all_predictions = np.concatenate(pred_list, axis=0)
flatten = all_predictions.reshape([all_predictions.shape[0]])

In [None]:
np.where(flatten > 0.5)

In [None]:
file

In [None]:
np.where(flatten > 0.5)[0]

In [None]:
files = np.asarray(file_paths)[np.where(flatten > 0.5)]

In [None]:
len(files)

In [None]:
# plot_image_list(files)

In [None]:
files = np.asarray(file_paths)[np.where(flatten > 0.7)]

In [None]:
len(files)

In [None]:
# plot_image_list(files)

In [None]:
files = np.asarray(file_paths)[np.where(flatten > 0.9)]

In [None]:
# plot_image_list(files)

In [None]:
# checkimg(np.asarray(file_paths)[np.where(flatten > 0.5)][4])

### Obsolete trial-and-error code

In [None]:
files = RESULT_ONEVS.sort_values(by="predictions", ascending=False)[0:100]['filepaths']

In [None]:
labels = map(hamburg_pred, DATA_PATHS)

In [None]:
hambrugpaths = [f for label, f in zip(labels, DATA_PATHS) if label]

In [None]:
len(hambrugpaths)

In [None]:
hambrugpaths[0]

In [None]:
# checkimg(hambrugpaths[116])

In [None]:
RESULT_ONEVS.where(RESULT_ONEVS["ans"] == True).dropna()['filepaths'].iloc[0]

In [None]:
# checkimg(files.iloc[12])

In [None]:
# checkimg(RESULT_ONEVS.where(RESULT_ONEVS["ans"] == True).dropna()['filepaths'].iloc[1])

In [None]:
data_paths = TEST_DATASET.get_data_paths(TEST_DATA_DIRS)
samples = TEST_DATASET.get_N_sample(DATA_PATHS, 3)

test_data, test_label, test_path = TEST_DATASET.preprocess_data(
    samples, hamburg_pred)

In [None]:
RESULT_ONEVS.where(RESULT_ONEVS["ans"] == False).dropna()["predictions"].mean()

In [None]:
# Accuracy
np.sum( RESULT_SINGLE['ans'] == RESULT_SINGLE['predictions'] ) / len(RESULT_SINGLE)

In [None]:
# draw_cofusion_matrix(RESULT_SINGLE, CATEGORY_DICT)

### Check the misclassified images.

In [None]:
truthdf = RESULT_ONEVS.where(RESULT_ONEVS["ans"] == True).dropna()

In [None]:
truthdf[0:5]

In [None]:
# checkimg(truthdf[0:5]['filepaths'].values[0])

In [None]:
# checkimg(truthdf[0:5]['filepaths'].values[1])

In [None]:
def get_cat_dict(path):
    '''
    input : path (or file-like object) of a CSV that has "idx" and "name" columns
    output : dict {idx: name}; when an idx occurs on several rows the name of
             its first row is kept
    '''
    catdf = pd.read_csv(path)
    # Single pass over the de-duplicated rows instead of re-filtering the
    # whole frame once per row.
    first_rows = catdf.drop_duplicates(subset='idx', keep='first')
    return dict(zip(first_rows['idx'], first_rows['name']))

In [None]:
cat_dict = get_cat_dict(CATEGORY_DICT_CSV)
cat_dict_pair = (CATEGORY_DICT, cat_dict)
cat_dict

In [None]:
def get_key_from_value(dictpair, idx):
    '''
    input : pair of dictionaries and a class index; the first dictionary maps
            category keys to class indices, the second maps int(category key)
            to category names
    output : category name of the first key in the first dictionary whose
             value equals idx
    raises : ValueError when no key of the first dictionary maps to idx
    '''
    key_to_index = dictpair[0]
    keys = list(key_to_index.keys())
    position = list(key_to_index.values()).index(idx)
    return dictpair[1][int(keys[position])]

In [None]:
get_key_from_value((CATEGORY_DICT, cat_dict), 3)

In [None]:
CATEGORY_DICT["100"]

In [None]:
def df_filtered_by_ans(df, dictpair, idx, num_display):
    '''
    input : result dataframe (columns "ans", "predictions", "filepaths"),
            category dictionary pair, index of the answer class,
            number of images shown per figure
    output : one figure per predicted class, showing up to num_display images
             whose answer is idx and whose prediction is that class
    '''
    ans_key = get_key_from_value(dictpair, idx)

    plt.ion()
    idx_to_idxname_dict = dictpair[0]

    for pred_idx in range(len(idx_to_idxname_dict)):
        pred_key = get_key_from_value(dictpair, pred_idx)

        # Rows answered as idx and predicted as pred_idx.
        selected = df.where(
            df['ans'] == idx
        ).dropna().where(
            df['predictions'] == pred_idx
        ).dropna()
        miss_img_paths = list(selected['filepaths'])

        plt.figure(figsize=(10, 17))
        gs1 = gridspec.GridSpec(1, num_display)
        gs1.update(wspace=0.025, hspace=0.05)

        print("ans:{0} - pred:{1}".format(ans_key, pred_key))
        for slot in range(num_display):
            ax1 = plt.subplot(gs1[slot])
            plt.axis('on')
            ax1.set_xticklabels([])
            ax1.set_yticklabels([])
            ax1.set_aspect('equal')

            # Fewer matching images than slots: leave the remaining axes empty.
            if slot >= len(miss_img_paths):
                continue

            img_path = miss_img_paths[slot]
            try:
                plt.imshow(plt.imread(img_path))
            except (OSError, ValueError) as err:
                # Keep plotting the other images, but say which one failed.
                print("could not show {0}: {1}".format(img_path, err))
        plt.show()

In [None]:
# df_filtered_by_ans(RESULT_SINGLE, (CATEGORY_DICT, cat_dict), 1, 5)

## Prediction Check : Multi classes case

In [None]:
%%time

chunked_paths = TEST_DATASET.chunked(DATA_PATHS, 4000)

PREDICTIONS_MULTI = []
test_labels = []
test_paths = []

for chunk in chunked_paths:
    test_data_chunk, test_label_chunk, test_path_chunk = TEST_DATASET.preprocess_data(chunk,CATEGORY_DICT)
    test_labels.extend(test_label_chunk)
    test_paths.extend(test_path_chunk)
    
    prediction = MODEL.predict_multiclass(test_data_chunk, batch_size=32)
    PREDICTIONS_MULTI.extend(prediction)

In [None]:
matched = [ ans in pred for ans,pred in zip(test_labels, PREDICTIONS_MULTI) ]
print( sum(matched) / len(matched) )

In [None]:
def count_predicted_num(predictions):
    '''
    input : multiclass predictions (one collection of classes per sample)
    output : list of (class, number of times it was predicted), ordered by class
    '''
    from collections import Counter

    tally = Counter()
    for sublist in predictions:
        for elem in sublist:
            tally[elem] += 1

    return sorted(tally.most_common(), key=lambda pair: pair[0])

In [None]:
count_predicted_num(PREDICTIONS_MULTI)

In [None]:
def correct_ratio_in_preds(predictions, test_labels, catdictpair=None):
    '''
    input : multiclass predictions, answer labels, and optionally the category
            dictionary pair expected by get_key_from_value (defaults to the
            notebook-level cat_dict_pair)
    output : list of (category name, ratio) where ratio is the share of the
             samples predicted as that class whose answer is among the
             sample's predictions
    '''
    if catdictpair is None:
        # get_key_from_value indexes a pair of dictionaries, not a single dict.
        catdictpair = cat_dict_pair

    correct_pred_ratio = []

    for pred_cls, pred_num in count_predicted_num(predictions):
        matched_num = sum(
            1
            for ans, pred in zip(test_labels, predictions)
            if (ans in pred) and (pred_cls in pred)
        )

        correct_pred_ratio.append(
            (get_key_from_value(catdictpair, pred_cls), matched_num / pred_num)
        )

    return correct_pred_ratio

In [None]:
correct_ratio_in_preds(PREDICTIONS_MULTI, test_labels)

In [None]:
np.mean([len(preds) for preds in PREDICTIONS_MULTI])

In [None]:
np.max([len(preds) for preds in PREDICTIONS_MULTI])

In [None]:
matched_tupple = [ (idx, ans, pred) 
               for idx, ans,pred in zip(range(len(test_labels)), test_labels, PREDICTIONS_MULTI)
               if (ans in pred)
              ]

In [None]:
missed_tupple = [ (idx, ans, pred) 
               for idx, ans,pred in zip(range(len(test_labels)), test_labels, PREDICTIONS_MULTI)
               if (ans not in pred)
              ]


In [None]:
print(len(matched_tupple), len(missed_tupple))

In [None]:
def show_missmatch(missedtups, filepaths, catdictpair, cattarget, num_display):
    '''
    input : list of missed data tuples (idx, label, [predicted classes]),
            file paths indexed by idx, category dictionary pair, index of the
            class to inspect, number of images to display
    output : for each shown sample, a printed "ans - pred" line and a plot of
             the image; stops early with a message when fewer than
             num_display samples were missed for the class
    '''
    ans_key = get_key_from_value(catdictpair, cattarget)

    plt.ion()

    targets = [(idx, lab, preds) for idx, lab, preds in missedtups if lab == cattarget]

    for i in range(num_display):
        if len(targets) <= i:
            print("too small missed: {0}, {1}".format(len(targets), i))
            return

        idx, _lab, preds = targets[i]
        pcat_names = [get_key_from_value(catdictpair, pcat) for pcat in preds]
        print("ans:{0} - pred:{1}".format(ans_key, ", ".join(pcat_names)))

        miss_img_path = filepaths[idx]

        plt.axis('on')
        try:
            plt.imshow(plt.imread(miss_img_path))
        except (OSError, ValueError) as err:
            # Keep going with the next sample, but say which image failed.
            print("could not show {0}: {1}".format(miss_img_path, err))
        plt.show()

In [None]:
# show_missmatch(missed_tupple, test_paths, (CATEGORY_DICT, cat_dict), 0, 10)

In [None]:
# show_missmatch(missed_tupple, test_paths, (CATEGORY_DICT, cat_dict), 15, 10)

## 4. Save the predicted images

In [None]:
# def save_predicted_image(org_data_path, save_base_dir, result):
#     '''
#     input : 
#         org_data_path - test data dir
#         , save_base_dir - save dir
#         , result - sngle prediction result dataframe
#     output : copy the images to save dir with {ans, pred} pairs
#     '''
#     import shutil
    
#     categories = [elem.split("/")[-1] for elem in glob.glob(org_data_path)]
#     os.mkdir(save_base_dir) if not os.path.isdir(save_base_dir) else None
    
#     #Create directories for saving
#     for category in categories:
#         ans_category_dir = "{0}/{1}".format(save_base_dir, category)
#         os.mkdir(ans_category_dir) if not os.path.isdir(ans_category_dir) else None
        
#         for i_category in categories:
#             ans_pred_category_dir = "{0}/{1}".format(ans_category_dir, i_category)
#             os.mkdir(ans_pred_category_dir) if not os.path.isdir(ans_pred_category_dir) else None
    
#     #Save the images
#     for ans_v in range(len(CATEGORY_DICT)):
#         ans_df = result[result['ans'] == ans_v]
#         ans_k = get_key_from_value(CATEGORY_DICT,ans_v)
    
#         for pred_v in range(len(CATEGORY_DICT)):
#             pred_df = ans_df[ans_df['predictions'] == pred_v]
#             pred_k = get_key_from_value(CATEGORY_DICT,pred_v)
        
#             save_dir = "{0}/{1}/{2}".format(save_base_dir, ans_k, pred_k)
        
#             for filepath in pred_df['filepaths']:
#                 filename = filepath.split("/")[-1]
#                 shutil.copyfile(filepath, "{0}/{1}".format(save_dir, filename))

In [None]:
# %%time

# save_predicted_image(
#     "./data/test/*", "./pred_result/", RESULT_SINGLE
# )

Below codes are copied from 

In [None]:
users = [
]

In [None]:
from akagi.data_sources import RedshiftDataSource
from cooking_log_image_fetcher import Fetcher
from scipy.misc import imread
import os


def build_condition(users):
    """
    Format the condition template once per entry of `users` and join the
    results with ' or '.

    Each entry is expanded as keyword arguments to str.format, so it has to
    be a mapping.  An empty `users` yields an empty string.
    """
    template = """
            (
            )
        """
    return ' or '.join(template.format(**row) for row in users)


def fetch_cooking_log_thumbnail(users):
    with RedshiftDataSource.for_query(
        """
        ) as ds:

        for row in ds:
            upload_id, _, user_id = int(row[0]), int(row[1]), int(row[2])
            body = Fetcher.fetch_thumbnail(user_id, upload_id, env='production')
            image = imread(body)
            yield body, image, upload_id

In [None]:
import os
import json

In [None]:
# Read the AWS credentials from the local JSON key file.
with open('awskeys.txt', 'r') as keyfile:
    dic = json.load(keyfile)
AWS_KEY_ID = dic["AWS_KEY_ID"]
AWS_SECRET_KEY = dic["AWS_SECRET_KEY"]



In [None]:
output_dir = ''

In [None]:
# Stop early when either key is None, then publish both keys through the
# process environment.
assert (AWS_KEY_ID is not None) and (AWS_SECRET_KEY is not None), "SET your keys."
os.environ['AWS_ACCESS_KEY_ID'] = AWS_KEY_ID
os.environ['AWS_SECRET_ACCESS_KEY'] = AWS_SECRET_KEY

# Write every fetched thumbnail to <output_dir>/<upload_id>.jpg.
for body, image, upload_id in fetch_cooking_log_thumbnail(users):

    output_filename = "%s/%s.jpg" % (output_dir, upload_id)
    # Create the target directory on demand; exist_ok makes repeated calls harmless.
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)

    with open(output_filename, 'wb') as f:
        # body exposes getvalue(); presumably an in-memory bytes buffer -- TODO confirm
        f.write(body.getvalue())

## Evaluate with data.

In [None]:
CHECK_DATA_DIR = ''

In [None]:
DATA_PATHS = list(glob.glob(os.path.normpath("{}/*.jpg").format(CHECK_DATA_DIR)))

In [None]:
dataset = DataSet()

In [None]:
check_data = dataset.files_to_dataset(DATA_PATHS)

In [None]:
arr = MODEL.predict_raw(check_data)
arr.shape

In [None]:
arr[0:4]

In [None]:
RESULT_CHECK = pd.DataFrame({
    'class0': [pair[0] for pair in arr],
    'class1' : [pair[1] for pair in arr],
    'filepaths' : DATA_PATHS
})

In [None]:
RESULT_CHECK.sort_values(by="class1", ascending=False)

In [None]:
high = RESULT_CHECK.sort_values(by="class1", ascending=False)

In [None]:
# plot_image_list(high.where(high["class1"] > 0.5).dropna()["filepaths"].values)

In [None]:
# plot_image_list(high.where(high["class1"] > 0.7).dropna()["filepaths"].values)

In [None]:
%%time

chunked_paths = CHECK_DATASET.chunked(DATA_PATHS, 3000)

pred_list = []
file_paths = []

for chunk in chunked_paths:
    # DataCheck.preprocess_data takes the paths only and returns (data, paths).
    check_data_chunk, file_path_chunk = CHECK_DATASET.preprocess_data(chunk)
    file_paths.extend(file_path_chunk)

    prediction = MODEL.predict_raw(check_data_chunk, batch_size=32)
    pred_list.append(prediction)

In [None]:
len(pred_list)

In [None]:
pred_list[0].shape

In [None]:
prediction[0:3, :]

In [None]:
[max(arg) for arg in prediction[0:30, :]]

In [None]:
maxs = [max(arg) for arg in prediction[:, :]]

In [None]:
len(maxs)

In [None]:
# plt.plot(maxs)

In [None]:
# plt.hist(maxs)

In [None]:
# Stack the outputs of every chunk so the rows line up with `file_paths`,
# which accumulates the paths of all chunks.
pred_df = pd.DataFrame(np.concatenate(pred_list, axis=0))

In [None]:
len(file_paths)

In [None]:
pred_df[0:3]

In [None]:
pred_df['files'] = file_paths

In [None]:
pred_df.shape

In [None]:
pred_df.iloc[0][0:178]

In [None]:
pred_df.iloc[0][178]

In [None]:
# Take the maximum over the 178 score columns only; by now the frame also
# carries the string-valued 'files' column.
pred_df['maxval'] = pred_df.iloc[:, 0:178].max(axis=1)

In [None]:
pred_df['argmax'] = pred_df.iloc[:, 0:178].idxmax(axis=1)

In [None]:
pred_df.iloc[0]

In [None]:
from visualize import plot_image_list

In [None]:
pred_df.shape

In [None]:
pred_df.columns[180]

In [None]:
tmp = pred_df.where(pred_df['argmax'] == 0).dropna()
tmp

In [None]:
len(tmp)

In [None]:
pred_df[[1, 3, 5]]

In [None]:
def print_top_5(df, catdictpair):
    '''
    input : prediction result df (score columns followed by the three
            bookkeeping columns files, maxval, argmax), category dictionary pair
    output : prints one line per row: the five highest "name:score" pairs
             joined by commas, then ":" and the row's file path
    '''
    score_columns = df.shape[1] - 3  # everything except files, maxval, argmax
    for position, file_path in enumerate(df['files'].values):
        scores = df.iloc[position, 0:score_columns].astype(float)
        best = scores.nlargest(5)
        respairs = [
            "{0}:{1:.3f}".format(get_key_from_value(catdictpair, label), value)
            for label, value in best.items()
        ]
        print(":".join((",".join(respairs), file_path)))


In [None]:
def show_df_by_category(df, catdictpair, cattarget):
    '''
    input : prediction result df, category dictionary pair, index of the class to show
    output : prints the category name and the top-5 scores of every row whose
             argmax equals cattarget, then plots the images of those rows
    '''
    cat_name = get_key_from_value(catdictpair, cattarget)

    is_target = df['argmax'] == cattarget
    catdf = df.where(is_target).dropna()

    print("category: {}".format(cat_name))
    print_top_5(catdf, catdictpair)
    plot_image_list(catdf['files'].values)
    

In [None]:
from IPython.display import Image, display
def checkimg(fpath):
    '''
    Show the image stored at fpath in the notebook output.
    '''
    picture = Image(fpath)
    display(picture)

In [None]:
# show_df_by_category(pred_df, cat_dict_pair, 18)

In [None]:
# show_df_by_category(pred_df, cat_dict_pair, 1)

In [None]:
# show_df_by_category(pred_df, cat_dict_pair, 3)

In [None]:
# show_df_by_category(pred_df, cat_dict_pair, 7)

In [None]:
# show_df_by_category(pred_df, cat_dict_pair, 8)

In [None]:
# show_df_by_category(pred_df, cat_dict_pair, 9)

In [None]:
# show_df_by_category(pred_df, cat_dict_pair, 10)

In [None]:
# show_df_by_category(pred_df, cat_dict_pair, 11)

In [None]:
# show_df_by_category(pred_df, cat_dict_pair, 12)

In [None]:
# show_df_by_category(pred_df, cat_dict_pair, 13)

In [None]:
# show_df_by_category(pred_df, cat_dict_pair, 14)

In [None]:
# show_df_by_category(pred_df, cat_dict_pair, 15)

In [None]:
RESULT_DIST = './result_classified_07'

In [None]:
threshold = 0.7

filtered_df = pred_df.where( 
            pred_df['maxval'] >= threshold
        ).dropna()



In [None]:

filtered_df.shape

In [None]:
def get_catidx_from_value(catdic, idx):
    '''
    input : category dictionary (category key -> class index) and a class index
    output : int value of the first key whose value equals idx
    raises : ValueError when no key maps to idx
    '''
    keys = list(catdic.keys())
    position = list(catdic.values()).index(idx)
    return int(keys[position])

In [None]:
import shutil

In [None]:
# Copy every confidently classified image into a directory named after its
# predicted category.
for i, row in filtered_df.iterrows():
    cat = row['argmax']
    catname = get_key_from_value(cat_dict_pair, cat)
    cat_path = os.path.join(RESULT_DIST, catname)
    # Paths stay str throughout, so source and destination use the same path type.
    os.makedirs(cat_path, exist_ok=True)
    orgpath = row['files']
    shutil.copyfile(orgpath, os.path.join(cat_path, os.path.basename(orgpath)))


In [None]:
pred_df.shape

### obsolete

In [None]:
%%time

chunked_paths = CHECK_DATASET.chunked(DATA_PATHS, 3000)

PREDICTIONS_SINGLE = []
file_paths = []

for chunk in chunked_paths:
    check_data_chunk, file_path_chunk = CHECK_DATASET.preprocess_data(chunk,CATEGORY_DICT)
    file_paths.extend(file_path_chunk)
    
    prediction = MODEL.predict_oneclass(check_data_chunk, batch_size=32)
    PREDICTIONS_SINGLE.extend(prediction)

In [None]:
RESULT_SINGLE = pd.DataFrame({
    'predictions' : PREDICTIONS_SINGLE,
    'filepaths' : file_paths
})

In [None]:
# Copy every confidently classified image into a directory named after the
# integer category key; the readable name goes into category_name.txt.
for i, row in filtered_df.iterrows():
    cat = row['argmax']
    catname = get_key_from_value(cat_dict_pair, cat)
    catint = get_catidx_from_value(CATEGORY_DICT, cat)
    cat_path = os.path.join(RESULT_DIST, str(catint))
    if not os.path.isdir(cat_path):
        os.makedirs(cat_path)
        # A text-mode handle takes str; the encoding is declared on open().
        with open(os.path.join(cat_path, 'category_name.txt'), 'w', encoding='utf-8') as f:
            f.write(catname)
    orgpath = row['files']
    shutil.copyfile(orgpath, os.path.join(cat_path, os.path.basename(orgpath)))
