In [None]:
import os
import json
import requests # request img from web
import shutil # save img locally
from IPython.display import Image
import pandas as pd

from imagededup.methods import CNN

In [None]:
# Project root and the data directory below it.
HOME_DIR_PATH = os.path.join("/", "app")
DATA_DIR_PATH = os.path.join(HOME_DIR_PATH, "data")

# One sub-directory of the data directory per processing stage.
RAW_DIR_PATH, INTERIM_DIR_PATH, PROCESSED_DIR_PATH = (
  os.path.join(DATA_DIR_PATH, stage_name)
  for stage_name in ("raw", "interim", "processed")
)

In [None]:
# Parse the raw image metadata file (images.json) into a Python object.
images_json_file_name = "images.json"
images_json_file_path = os.path.join(RAW_DIR_PATH, images_json_file_name)

with open(images_json_file_path, "rb") as f:
  images_json = json.load(f)


In [None]:
# Turn the parsed metadata into a table, then key it by image id.
images_raw_df = pd.DataFrame(images_json)
images_df = images_raw_df.set_index("image_id")

# Preview the first rows.
images_df.head()

In [None]:
# Create the directory that holds the downloaded images (one sub-directory per
# group is created inside it by download_images_from_group).
# exist_ok=True keeps this cell re-runnable when the directory already exists.

images_dir_path = os.path.join(INTERIM_DIR_PATH, "images")
os.makedirs(images_dir_path, exist_ok=True)

In [None]:
# Collect the distinct group ids found in the metadata table.
groups = images_df["group_id"].unique().tolist()
print("Len of groups: ", len(groups))
# Preview the first five ids.
groups[:5]

In [None]:
def download_images_from_group(df, group_id, timeout=30):
  """Download every image of a group into ``<images_dir_path>/<group_id>/``.

  Rows of ``df`` whose ``group_id`` column equals ``group_id`` are fetched from
  their ``image_url`` and written as ``<index label>.jpg``. Responses with a
  status other than 200 are skipped and reported on stdout.

  Args:
    df: DataFrame with ``group_id`` and ``image_url`` columns; the index
      labels are used as file names.
    group_id: Group to download; also used as the sub-directory name.
    timeout: Seconds handed to ``requests.get`` so that an unresponsive
      server cannot block the notebook indefinitely.

  Raises:
    requests.RequestException: If a request fails or times out.
  """
  images_group_dir_path = os.path.join(images_dir_path, group_id)
  os.makedirs(images_group_dir_path, exist_ok=True)

  images_group_df = df[df["group_id"] == group_id]

  for image_id, row in images_group_df.iterrows():
    image_url = row["image_url"]

    # The context manager releases the streamed connection even when the
    # image is skipped or writing the file fails.
    with requests.get(image_url, stream=True, timeout=timeout) as res:
      if res.status_code != 200:
        print(f"Skipping image {image_id}: HTTP {res.status_code} for {image_url}")
        continue

      image_file_path = os.path.join(images_group_dir_path, f"{image_id}.jpg")

      # Let urllib3 undo gzip/deflate transfer encoding while streaming.
      res.raw.decode_content = True
      with open(image_file_path, "wb") as f:
        shutil.copyfileobj(res.raw, f)

In [None]:
# Download the images of the first group.
download_images_from_group(images_df, groups[0])

In [None]:
# Directory for the per-group duplicate-search result files (<group_id>.json).
# exist_ok=True keeps this cell re-runnable when the directory already exists.
results_dir_path = os.path.join(PROCESSED_DIR_PATH, "results")
os.makedirs(results_dir_path, exist_ok=True)

In [None]:
# Instantiate imagededup's CNN method once; it is passed to find_duplicates below.
encoder = CNN()

In [None]:
def find_duplicates(encoder, group_id, min_similarity_threshold=0.85):
  """Run the duplicate search on the images of one group.

  The encoder is pointed at ``<images_dir_path>/<group_id>`` and asked to
  write its findings, including similarity scores, to
  ``<results_dir_path>/<group_id>.json``.

  Args:
    encoder: Object exposing ``find_duplicates(image_dir=...,
      min_similarity_threshold=..., scores=..., outfile=...)``, such as the
      ``CNN`` instance created in this notebook.
    group_id: Group whose image directory is scanned; also the stem of the
      results file.
    min_similarity_threshold: Minimum similarity for two images to be
      reported as duplicates; forwarded unchanged to the encoder.

  Returns:
    Whatever ``encoder.find_duplicates`` returns, so callers can use the
    result without re-reading the results file.
  """
  images_group_dir_path = os.path.join(images_dir_path, group_id)
  results_file_path = os.path.join(results_dir_path, group_id + ".json")

  return encoder.find_duplicates(
    image_dir=images_group_dir_path,
    min_similarity_threshold=min_similarity_threshold,
    scores=True,
    outfile=results_file_path,
  )

In [None]:
# Search the first group's images for duplicates; results go to its results file.
find_duplicates(encoder, groups[0])

In [None]:
def load_results(group_id):
  """Read back the stored duplicate-search results of a group.

  Args:
    group_id: Group whose ``<group_id>.json`` file in ``results_dir_path``
      is read.

  Returns:
    The parsed JSON content of that file.
  """
  results_file_path = os.path.join(results_dir_path, group_id + ".json")

  with open(results_file_path, "rb") as results_file:
    return json.load(results_file)

In [None]:
# Load the stored results of the first group and show them as the cell output.
results = load_results(groups[0])
results

In [None]:
# Display the images of each result
def display_duplicate_images(group_id, first_n=5):
  """Show base images together with the duplicates found for them.

  Loads the stored results of ``group_id`` and, for the first ``first_n``
  entries, displays the base image followed by each of its duplicates and
  the duplicate's similarity score.

  Args:
    group_id: Group whose results file and image directory are used.
    first_n: Maximum number of base images to show; ``None`` shows all of
      them.
  """
  results = load_results(group_id)
  images_group_dir_path = os.path.join(images_dir_path, group_id)

  # Honour first_n: only the truncated list is rendered, not the full mapping.
  results_items = list(results.items())[:first_n]

  for base_image, duplicate_images_list in results_items:
    print("##########################")
    print(f"Base image: {base_image}")
    display(Image(filename=os.path.join(images_group_dir_path, base_image), width=200, height=200))

    # Each entry is indexed as [file name, similarity]; the similarity is
    # printed as a percentage (presumably a 0..1 score - confirm with the encoder).
    for index, image in enumerate(duplicate_images_list, start=1):
      image_name, similarity = image[0], image[1]
      image_file_path = os.path.join(images_group_dir_path, image_name)
      print(f"Duplicate image {index} (similarity: {similarity * 100:.2f}): {image_name}")
      display(Image(filename=image_file_path, width=200, height=200))
    print("##########################")

In [None]:
# Render the base images and their duplicates for the first group.
display_duplicate_images(groups[0])

In [None]:
def delete_images_from_group(group_id):
  """Delete the downloaded image directory of a group, including its files.

  Args:
    group_id: Group whose sub-directory of ``images_dir_path`` is removed.

  Raises:
    ValueError: If ``group_id`` does not resolve to a location strictly
      inside ``images_dir_path`` (e.g. ``""``, ``"."``, ``".."`` or an
      absolute path).
    FileNotFoundError: If the group directory does not exist.
  """
  images_group_dir_path = os.path.join(images_dir_path, group_id)

  # rmtree is irreversible, so refuse any id that would make it remove the
  # whole images directory or something outside of it.
  images_root = os.path.abspath(images_dir_path)
  target_path = os.path.abspath(images_group_dir_path)
  if target_path == images_root or os.path.commonpath([images_root, target_path]) != images_root:
    raise ValueError(
      f"Refusing to delete {target_path!r}: not a group directory inside {images_root!r}"
    )

  shutil.rmtree(images_group_dir_path)

In [None]:
# Remove the first group's downloaded images from disk.
delete_images_from_group(groups[0])