In [None]:
%run ../commons/imports_cleaner.ipynb
%run ../commons/helper_db.ipynb
%run ../commons/helper_contents.ipynb

In [None]:
# GENERAL CONFIGS
# Names of the database and of the temporary table processed by this notebook.
DB_NAME = "blp_dataset"
TABLE_NAME = "products_tmp_amazon"

# Pass the constant instead of repeating the literal, so the name lives in one place.
# `create_database` and `conn` are expected to come from the helper notebooks run above.
create_database(DB_NAME)

# Buffered cursor: result sets are fetched eagerly, so further statements can be
# executed on the same cursor while iterating over earlier results.
cursor = conn.cursor(buffered=True)

## First step: clear all repeated images

Using the Perceptual Hashing (pHash) algorithm, we detect and remove all images that have equal hashes.
We also update the database, so that we keep control over how the Dark Marketplace (DM) data is distributed. This can also help with future work.

Database records that do not have a valid image will not be used in training. We will only use products that have a title and a description as well as an image (and, conversely, only images that have accompanying text), in order to maintain a balance between the results of the two networks.

By keeping the database rows that have no image and merely setting a flag to indicate their status, we leave open the possibility of using this data in training, if necessary.

Also, we will copy the images that are not repeated themselves but whose texts are repeated to a separate folder, so we can still use those files if necessary.

In [None]:
## CLEAR REPEATED IMAGES
# Every entry directly inside MAIN_FOLDER is perceptually hashed; the first image
# seen for a given hash is kept and any later image with the same hash is deleted.
MAIN_FOLDER = r"D:\images\_amazon"
folder = Path(MAIN_FOLDER)
p_img_hashes = {}  # perceptual hash -> file name of the image kept for that hash
error_files = []   # names of entries that could not be hashed or removed

# os.listdir returns entries in arbitrary order; sorting makes the surviving
# copy of each duplicate group the same on every run.
for image in sorted(os.listdir(folder)):
    image_path = folder / image

    try:
        # The context manager guarantees the file handle is released before the
        # file is possibly removed below.
        with Image.open(image_path) as opened_image:
            image_hash = imagehash.phash(opened_image)

        if image_hash in p_img_hashes:
            os.remove(image_path)
        else:
            p_img_hashes[image_hash] = image
    except Exception:
        # Best effort: unreadable/non-image entries (or failed removals) are
        # recorded and skipped. KeyboardInterrupt is intentionally not caught.
        error_files.append(image)

In [None]:
def del_image(image_path):
    """Delete the file located at ``image_path``.

    Args:
        image_path: Path (string or path-like) of the file to delete.

    Raises:
        OSError: Propagated from the operating system, e.g. ``FileNotFoundError``
            when the file does not exist.
    """
    # os.unlink is the same operation as os.remove under its Unix name.
    os.unlink(image_path)

In [None]:
def copy_image(name):
    """Move the image ``<name>.jpeg`` from MAIN_FOLDER to the excluded-products folder.

    The file is copied to ``D:\\images\\excluded_product`` and then deleted from
    MAIN_FOLDER. Nothing happens when the source image does not exist.

    Args:
        name: File stem of the image (without the ``.jpeg`` extension).
    """
    file_name = f"{name}.jpeg"
    # Build paths with pathlib joins and a raw string; the previous non-raw
    # literals relied on invalid escape sequences such as "\i", "\e" and "\{".
    url_image = Path(MAIN_FOLDER) / file_name
    dst_dir = Path(r"D:\images\excluded_product") / file_name

    if url_image.exists():
        # Make sure the destination folder exists so the copy cannot fail on it.
        dst_dir.parent.mkdir(parents=True, exist_ok=True)
        copyfile(url_image, dst_dir)
        del_image(url_image)

In [None]:
# CLEAR REPEATED TEXTS
# Find every description shared by more than one row. COUNT(description) does not
# count NULLs, so rows without a description are never reported as duplicates.
# TABLE_NAME is a trusted notebook constant, so interpolating it is safe; the
# description values themselves are always passed as bound parameters.
cursor.execute(
    "SELECT description, COUNT(description) "
    f"FROM {TABLE_NAME} "
    "GROUP BY description HAVING COUNT(description) > 1")

repeated = cursor.fetchall()

for description, _occurrences in repeated:
    # ORDER BY id makes the surviving row deterministic: the lowest id is kept.
    cursor.execute(
        f"SELECT id FROM {TABLE_NAME} WHERE description = %s ORDER BY id",
        (description,))
    matching_ids = [row[0] for row in cursor.fetchall()]
    if not matching_ids:
        continue

    kept_id, duplicate_ids = matching_ids[0], matching_ids[1:]
    print(f"Product {kept_id} saved")

    for duplicate_id in duplicate_ids:
        print(f"Product {duplicate_id}")
        cursor.execute(f"DELETE FROM {TABLE_NAME} WHERE id = %s", (duplicate_id,))

    # One commit per duplicate group instead of one per deleted row.
    conn.commit()

## Next step: join the data

In [None]:
# Copy every row of the temporary oxygen table into the final `products` table.
cursor.execute("SELECT id, name, description, market_name, seller_name, illegal FROM products_tmp_oxygen")
items = cursor.fetchall()

DEST_FOLDER = r"D:\images\illegal"
dest = Path(DEST_FOLDER)

error = []

# Loop-invariant statement, built once.
query = "INSERT INTO products (name, description, market_name, seller_name, illegal) VALUES (%s, %s, %s, %s, %s)"

try:
    for source_id, name, description, market_name, seller_name, _illegal in items:
        # NOTE(review): the selected `illegal` column is ignored and every row is
        # inserted with "t" - presumably because all oxygen products are illegal; confirm.
        values = (name, description, market_name, seller_name, "t")
        cursor.execute(query, values)

        # Disabled image-copy step, kept for reference:
        #     product_id = cursor.lastrowid
        #
        #     url_image = Path(MAIN_FOLDER) / f"{source_id}.jpeg"
        #     dst_dir = dest / f"{product_id}.jpeg"
        #
        #     if url_image.exists():
        #         copyfile(url_image, dst_dir)
        #         del_image(url_image)
        #     else:
        #         error.append(source_id)

    # Single commit: either all rows are joined or none, so a failed run can be
    # repeated without inserting duplicates.
    conn.commit()
except Exception:
    conn.rollback()
    raise