### This notebook identifies duplicated images, calculates image stats of the leftover set, and finally saves the usable images into a new csv file
---
[1. Loading training images](#step1)

[2. Find duplicates via phash](#step2)

[3. Create the deduplicated csv file](#step3)

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
!which python

In [None]:
import sys
from collections import Counter
from collections.abc import Iterable
from typing import List, Union, Tuple
from functools import reduce
from pathlib import Path
from operator import add

from miniutils.progress_bar import parallel_progbar, progbar
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
sys.path.append("../..")
from src.data import make_one_hot
from src.train_with_template import load_training_data
from src.image import get_image_with_id, plot_rgb, plot_rgby, open_numpy
from src.data import DataPaths, label_to_string

#### 1. Loading training paths<a id='step1'/>

In [None]:
# Gather the image files of both image folders, ordered by file stem
image_paths = sorted(
    [
        *DataPaths.TRAIN_COMBINED_IMAGES.glob("*"),
        *DataPaths.TRAIN_COMBINED_IMAGES_HPAv18.glob("*"),
    ],
    key=lambda path: path.stem,
)

# Stack both label files, order the rows by Id and attach the paths positionally;
# the rows and the paths line up only because both are sorted the same way
label_frames = [pd.read_csv(DataPaths.TRAIN_LABELS), pd.read_csv(DataPaths.TRAIN_LABELS_HPAv18)]
df = pd.concat(label_frames).sort_values(by=['Id']).assign(image_paths=image_paths)

In [None]:
# Sanity check: every row must be paired with the file whose stem equals its Id
path_stems = [path.stem for path in df['image_paths'].values]
assert np.all(df['Id'].values == path_stems)

#### 2. Find duplicates via phash<a id='step2'/>

In [None]:
import imagehash
from PIL import Image
from miniutils.progress_bar import parallel_progbar, progbar

In [None]:
def calculate_phash(data):
    """Compute the perceptual hash (phash) of a single image.

    Takes one tuple argument so that it can be handed to a mapping helper.

    Args:
        data: ``(name, image_path)`` pair; ``image_path`` is opened with
            ``open_numpy`` and its ``.px`` attribute is converted to a PIL image.

    Returns:
        ``(name, bits)`` where ``bits`` is the ``hash`` array of the
        ``imagehash.phash`` result, flattened to one dimension.
    """
    image_name, path = data
    pixels = open_numpy(path).px
    hash_bits = imagehash.phash(Image.fromarray(pixels)).hash
    return image_name, hash_bits.flatten()

In [None]:
# Pair every image Id with its file path ...
names = df['Id']
image_paths = df['image_paths']
names_and_image_paths = zip(names, image_paths)
# ... and compute one phash per pair through miniutils' parallel_progbar
names_and_phashes = parallel_progbar(calculate_phash, names_and_image_paths)

Calculate phash similarities

In [None]:
# Unzip the (name, phash) results into two parallel tuples
names, phashes = zip(*names_and_phashes)

In [None]:
# Similarity of two hashes = fraction of hash bits on which they agree.
# Pairs are recorded only when the similarity is strictly above this value.
SIMILARITY_THRESHOLD = 0.75

# Stack all hashes into one 2-D array once, instead of letting numpy convert
# the tuple of arrays again on every iteration of the outer loop.
phash_matrix = np.asarray(phashes)

phash_df_data = []
for name, phash in progbar(zip(names, phashes)):
    # Number of agreeing bits against every image, divided by the hash length
    similarities = (len(phash) - np.logical_xor(phash, phash_matrix).sum(axis=1)) / len(phash)
    # Visit only the candidates above the threshold (in ascending index order,
    # i.e. the same order a full scan would produce)
    for match_index in np.flatnonzero(similarities > SIMILARITY_THRESHOLD):
        name_of_image_compared = names[match_index]
        # Skip the comparison of an image with itself
        if name != name_of_image_compared:
            phash_df_data.append({
                "original_name": name,
                "compared_image_name": name_of_image_compared,
                "similarity": similarities[match_index]
            })

Save the phash similarity df

In [None]:
# Build the pair table with explicit columns, so that an empty result (no pair
# above the threshold) still produces a CSV with a header row that can be read
# back and sorted by 'similarity'.
similarity_df = pd.DataFrame(
    phash_df_data,
    columns=["original_name", "compared_image_name", "similarity"],
)
similarity_df.to_csv("phash_sim.csv", index=False)

In [None]:
# Reload the saved pair table and order it from most to least similar
similarity_df = pd.read_csv("phash_sim.csv")
sorted_similarity_df = similarity_df.sort_values(by='similarity', ascending=False)

In [None]:
# Show the first rows; the frame is sorted by descending similarity, so these are the closest matches
sorted_similarity_df.head()

In [None]:
# Count how many recorded pairs share each similarity value
sorted_similarity_df['similarity'].value_counts()

Plot some of the images that have a high phash similarity

In [None]:
def get_samples_with_similarity(df, similarity):
    """Return a boolean mask of the rows whose 'similarity' equals ``similarity``.

    Args:
        df: DataFrame with a 'similarity' column.
        similarity: value to compare against (exact equality).

    Returns:
        Boolean Series aligned with ``df``, usable as ``df[mask]``.
    """
    return df['similarity'] == similarity


def get_samples_with_similarity_above_and_equal(df, similarity):
    """Return a boolean mask of the rows whose 'similarity' is >= ``similarity``.

    Args:
        df: DataFrame with a 'similarity' column.
        similarity: inclusive lower bound.

    Returns:
        Boolean Series aligned with ``df``, usable as ``df[mask]``.
    """
    return df['similarity'] >= similarity

In [None]:
# df_with_samples = sorted_similarity_df[get_samples_with_similarity(sorted_similarity_df, 0.875)]
# for i, (name1, name2) in enumerate(zip(df_with_samples['original_name'].values.tolist(),
#                                       df_with_samples['compared_image_name'].values.tolist())):
#     plot_rgby(get_image_with_id(name1))
#     plot_rgby(get_image_with_id(name2))
#     plt.show()
#     if i == 5:
#         break

Get rid of all the images that have a similarity score of 0.93750 and above

In [None]:
# Keep only the pairs whose similarity is at least 0.93750
is_near_duplicate = get_samples_with_similarity_above_and_equal(sorted_similarity_df, 0.93750)
filtered_similarity_df = sorted_similarity_df[is_near_duplicate]

In [None]:
# (number of pairs kept by the similarity filter, number of columns)
filtered_similarity_df.shape

In [None]:
# Names from the 'original_name' column that contain no "-".
# NOTE(review): `filtered_names` is re-initialised to [] in the next cell, so
# this value is never used -- confirm whether this cell can be dropped.
filtered_names = [
    original_name
    for original_name in filtered_similarity_df['original_name'].values
    if "-" not in original_name
]

In [None]:
# For every near-duplicate pair, choose the image name that will be dropped.
# A name containing "-" is treated as a Kaggle sample (see the flags below).
# NOTE(review): the pair table is built by comparing every image with every
# other image, so each pair presumably appears in both orders; for two images
# of the same source that puts BOTH names into filtered_names -- confirm that
# dropping both copies is intended.
filtered_names = []
name_pairs = zip(filtered_similarity_df['original_name'].values.tolist(),
                 filtered_similarity_df['compared_image_name'].values.tolist())
for name1, name2 in name_pairs:
    sample_1_is_from_kaggle = "-" in name1
    sample_2_is_from_kaggle = "-" in name2
    if sample_2_is_from_kaggle and not sample_1_is_from_kaggle:
        # Only the compared image is a Kaggle sample: drop that one
        filtered_names.append(name2)
    else:
        # Both Kaggle, only the first one Kaggle, or neither: drop the first image
        filtered_names.append(name1)

In [None]:
# Collapse repeated names; the list comes from a set, so its order is not meaningful
unique_filtered_names = list(set(filtered_names))

In [None]:
# Number of distinct image names collected in filtered_names
len(unique_filtered_names)

#### 3. Create the deduplicated csv file<a id='step3'/>

In [None]:
# Re-read both label files from disk (this replaces the earlier `df`, which
# carried the extra 'image_paths' column)
label_csv_paths = (DataPaths.TRAIN_LABELS, DataPaths.TRAIN_LABELS_HPAv18)
df = pd.concat([pd.read_csv(csv_path) for csv_path in label_csv_paths])

In [None]:
# Keep only the rows whose Id was not marked for removal. `isin` does a hashed
# lookup instead of scanning the whole `filtered_names` list once per row.
df_without_dupes = df[~df['Id'].isin(filtered_names)]

In [None]:
# (rows, columns) left after the marked Ids were removed
df_without_dupes.shape

In [None]:
# Write the deduplicated labels to disk, without the index column
df_without_dupes.to_csv(DataPaths.TRAIN_LABELS_ALL_NO_DUPES, index=False)

Just a helper function to load training data; it is used later on, so here we check that it works with the new deduplicated csv file.

In [None]:
# Load from both image folders, using the deduplicated label file written above
combined_image_roots = [
    str(DataPaths.TRAIN_COMBINED_IMAGES),
    str(DataPaths.TRAIN_COMBINED_IMAGES_HPAv18),
]
X, y, y_one_hot = load_training_data(
    root_image_paths=combined_image_roots,
    root_label_paths=str(DataPaths.TRAIN_LABELS_ALL_NO_DUPES),
)

In [None]:
# Load from the first image folder only, with its original label file;
# this rebinds X, y and y_one_hot
training_data = load_training_data(
    root_image_paths=str(DataPaths.TRAIN_COMBINED_IMAGES),
    root_label_paths=str(DataPaths.TRAIN_LABELS),
)
X, y, y_one_hot = training_data

In [None]:
# First element of X as returned by load_training_data
X[0]

In [None]:
# First element of y as returned by load_training_data
y[0]

In [None]:
# First element of y_one_hot as returned by load_training_data
y_one_hot[0]

In [None]:
# Open the first sample with open_numpy and plot its .px data with plot_rgby
plot_rgby(open_numpy(X[0]).px)