In [1]:
import os
import json
import requests # request img from web
import shutil # save img locally
from IPython.display import Image
import pandas as pd

from imagededup.methods import CNN

  from .autonotebook import tqdm as notebook_tqdm
  warn(f"Failed to load image Python extension: {e}")


In [2]:
# Project root and its data directory.
HOME_DIR_PATH = os.path.join("/", "app")
DATA_DIR_PATH = os.path.join(HOME_DIR_PATH, "data")

# One sub-directory of DATA_DIR_PATH per data stage.
RAW_DIR_PATH, INTERIM_DIR_PATH, PROCESSED_DIR_PATH = [
    os.path.join(DATA_DIR_PATH, stage_name)
    for stage_name in ("raw", "interim", "processed")
]

In [3]:
# Parse the raw images.json file into a Python object.
images_json_file_name = "images.json"
images_json_file_path = os.path.join(RAW_DIR_PATH, images_json_file_name)

with open(images_json_file_path, "rb") as images_json_file:
    images_json = json.load(images_json_file)


In [4]:
# Tabulate the parsed JSON, keyed by image id, and preview the first rows.
images_df = (
    pd.DataFrame(images_json)
    .set_index("image_id")
)
images_df.head()

Unnamed: 0_level_0,group_id,image_url
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2224086878288,https://i.travelapi.com/hotels/57000000/569800...
2,2224086878288,https://i.travelapi.com/hotels/57000000/569800...
3,2224086878288,https://i.travelapi.com/hotels/57000000/569800...
4,2224086878288,https://i.travelapi.com/hotels/57000000/569800...
5,2224086878288,https://i.travelapi.com/hotels/57000000/569800...


In [5]:
# Downloaded images live under the interim data directory.
images_dir_path = os.path.join(INTERIM_DIR_PATH, "images")
os.makedirs(name=images_dir_path, exist_ok=True)  # no error if it already exists

In [6]:
# Distinct group ids found in images_df, as a plain Python list.
groups = pd.unique(images_df["group_id"]).tolist()
print("Len of groups: ", len(groups))
groups[:5]

Len of groups:  365


['2224086878288',
 '20234819560905',
 '39095384769302',
 '59718437804789',
 '74551318319906']

In [9]:
def download_images_from_group(df, group_id, timeout=30):
    """Download every image of one group into ``<images_dir_path>/<group_id>/``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by image id, with ``group_id`` and ``image_url`` columns.
    group_id : str
        Group whose images are downloaded; also the name of the target
        sub-directory (created if missing).
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled server cannot block the
        cell forever. Defaults to 30 seconds.

    Notes
    -----
    * Each image is written as ``<image_id>.jpg``; the extension is fixed and
      not derived from the response.
    * Responses whose status code is not 200 are skipped and reported with a
      printed message instead of being dropped silently.
    * Exceptions raised by ``requests`` (connection errors, timeouts) are not
      caught and abort the download of the remaining images.
    """
    images_group_dir_path = os.path.join(images_dir_path, str(group_id))
    os.makedirs(images_group_dir_path, exist_ok=True)

    images_group_df = df[df["group_id"] == group_id]

    for image_id, image_url in images_group_df["image_url"].items():
        # The context manager releases the streamed connection even when the
        # body is skipped or the copy fails part-way.
        with requests.get(image_url, stream=True, timeout=timeout) as res:
            if res.status_code != 200:
                print(f"Skipping image {image_id}: HTTP {res.status_code} for {image_url}")
                continue

            # f-string formatting also works when the index holds non-string ids.
            image_file_path = os.path.join(images_group_dir_path, f"{image_id}.jpg")

            # Let urllib3 undo gzip/deflate transfer encoding while streaming.
            res.raw.decode_content = True
            with open(image_file_path, "wb") as f:
                shutil.copyfileobj(res.raw, f)

In [10]:
# Fetch the images of the first group only.
download_images_from_group(df=images_df, group_id=groups[0])

In [11]:
# Duplicate-detection results are written under the processed data directory.
results_dir_path = os.path.join(PROCESSED_DIR_PATH, "results")
os.makedirs(name=results_dir_path, exist_ok=True)  # no error if it already exists

In [12]:
# Instantiate imagededup's CNN encoder; its log line reports a MobileNet v3
# model pretrained on ImageNet and sliced at the GAP layer.
encoder = CNN()

2023-01-10 14:55:10,584: INFO Initialized: MobileNet v3 pretrained on ImageNet dataset sliced at GAP layer


In [13]:
def find_duplicates(encoder, group_id, min_similarity_threshold=0.85):
    """Run duplicate detection on one group's images and save the result.

    Parameters
    ----------
    encoder
        Object exposing ``find_duplicates(image_dir=..., min_similarity_threshold=...,
        scores=..., outfile=...)``; the notebook passes an imagededup ``CNN``.
    group_id : str
        Group to analyse. Images are read from ``<images_dir_path>/<group_id>/``
        and the result is written to ``<results_dir_path>/<group_id>.json``.
    min_similarity_threshold : float, optional
        Forwarded to the encoder. Defaults to 0.85, the value this notebook
        previously hard-coded.

    Returns
    -------
    Whatever ``encoder.find_duplicates`` returns. Previously this value was
    bound to an unused local and discarded.
    """
    images_group_dir_path = os.path.join(images_dir_path, group_id)
    results_file_path = os.path.join(results_dir_path, group_id + ".json")

    return encoder.find_duplicates(
        image_dir=images_group_dir_path,
        min_similarity_threshold=min_similarity_threshold,
        scores=True,  # keep the similarity score next to each duplicate
        outfile=results_file_path,
    )

In [14]:
# Detect duplicates within the first group; the trailing semicolon keeps the
# cell from echoing any return value.
find_duplicates(encoder=encoder, group_id=groups[0]);

2023-01-10 14:55:15,548: INFO Start: Image encoding generation
2023-01-10 14:55:26,173: INFO End: Image encoding generation
2023-01-10 14:55:26,978: INFO Start: Calculating cosine similarities...
2023-01-10 14:55:26,986: INFO End: Calculating cosine similarities.
2023-01-10 14:55:27,001: INFO Start: Saving duplicates as json!
2023-01-10 14:55:27,051: INFO End: Saving duplicates as json!


In [15]:
def load_results(group_id):
    """Return the parsed contents of ``<results_dir_path>/<group_id>.json``.

    ``group_id`` is the group whose saved results file is read.
    """
    saved_results_path = os.path.join(results_dir_path, group_id + ".json")
    with open(saved_results_path, "rb") as results_file:
        return json.load(results_file)

In [16]:
# Read back the saved results of the first group and show them.
results = load_results(group_id=groups[0])
results

{'1.jpg': [['108.jpg', 0.8954224586486816],
  ['119.jpg', 0.9999999403953552],
  ['127.jpg', 0.9999999403953552],
  ['145.jpg', 0.8954224586486816],
  ['176.jpg', 0.8954224586486816],
  ['192.jpg', 0.861783504486084],
  ['194.jpg', 0.861783504486084],
  ['202.jpg', 0.861783504486084],
  ['24.jpg', 0.861783504486084],
  ['25.jpg', 0.861783504486084],
  ['28.jpg', 0.8954224586486816],
  ['5.jpg', 0.8954224586486816],
  ['64.jpg', 0.9999999403953552],
  ['87.jpg', 0.9999999403953552]],
 '10.jpg': [['154.jpg', 1.000000238418579],
  ['158.jpg', 0.8560693264007568],
  ['178.jpg', 1.000000238418579],
  ['2.jpg', 0.8560693264007568],
  ['35.jpg', 1.000000238418579],
  ['48.jpg', 0.8560693264007568],
  ['56.jpg', 1.000000238418579],
  ['62.jpg', 0.8560693264007568],
  ['83.jpg', 0.8560693264007568]],
 '100.jpg': [['146.jpg', 0.9999997615814209],
  ['78.jpg', 0.9999997615814209],
  ['8.jpg', 0.9999997615814209],
  ['94.jpg', 0.9999997615814209]],
 '101.jpg': [['128.jpg', 0.8809066414833069],
  [

In [None]:
# display images of each result
def display_duplicate_images(group_id, first_n=5):
    """Show the first ``first_n`` base images of a group with their duplicates.

    The saved results of ``group_id`` are loaded, and for each of the first
    ``first_n`` entries the base image is rendered, followed by every image
    listed as its duplicate together with the similarity score as a percentage.
    A separator line is printed before each base image and after each duplicate.
    """
    saved_results = load_results(group_id)
    group_dir = os.path.join(images_dir_path, group_id)

    for base_image, duplicate_entries in list(saved_results.items())[:first_n]:
        print("##########################")
        print(f"Base image: {base_image}")
        base_image_path = os.path.join(group_dir, base_image)
        display(Image(filename=base_image_path, width=200, height=200))

        for position, entry in enumerate(duplicate_entries, start=1):
            # Each entry is indexed as [file name, similarity score].
            file_name = entry[0]
            score = entry[1]
            duplicate_path = os.path.join(group_dir, file_name)
            print(f"Duplicate image {position} (similarity: {score * 100:.2f}): {file_name}")
            display(Image(filename=duplicate_path, width=200, height=200))
            print("##########################")

In [None]:
# Render the duplicate preview for the first group.
display_duplicate_images(group_id=groups[0])

In [None]:
# def delete_images_from_group(group_id):
#     images_group_dir_path = os.path.join(images_dir_path, group_id)
#     shutil.rmtree(images_group_dir_path)

In [None]:
# delete_images_from_group(groups[0])