In [43]:
# imports y constantes

# imports
from app.s3 import S3
from app.functions import add_metadata
import fastdup
import sqlite3
import shutil
import ijson
import os 
import json

# constants

# Paths of the temporary working files
PATH_IMAGES = "trash/img"
PATH_REPORT = "trash/reports/"
PATH_JSON = "trash/s3/"
PATH_DB = "trash/db/"

# Names of the db tables, ordered as [origin, alternative]
TABLE_NAME = ["origin", "alternative"]

# SQLite queries; {0} is filled with a table name through str.format
CREATE_TABLE = "CREATE TABLE IF NOT EXISTS {0} (file_name VARCHAR(1024), ref VARCHAR(256))"
INSERT = "INSERT INTO {0} (file_name, ref) VALUES (?, ?)"
# NOTE(review): {1} places the file name directly inside the SQL text, so a name
# containing a single quote breaks the statement; binding it with `?` (as INSERT
# does) would be safer.
SELECT = "SELECT ref FROM {0} WHERE file_name = '{1}'"


In [45]:
# Request payload that drives the notebook. How the visible cells use it:
#   bucket                      -> passed to S3(...)
#   path_origin_file /
#   path_alternative_file       -> S3 keys of the JSON files downloaded and iterated item by item
#   path_origin_img /
#   path_alternative_img        -> S3 prefixes joined with each image name before download
#   path_report                 -> not read by any cell visible here
#   img_per_object              -> upper bound on the images taken from each JSON object
#   setting.*_file_name_imgs    -> key of each JSON object that holds its list of image names
#   setting.ref_*               -> key of each JSON object that holds its reference value
#   setting.*_search_parameter  -> not read by any cell visible here
response = {
    "bucket": "hydrahi4ai",

    "path_origin_file": "data/json/New_collector_20240112_172137.success.json",
    "path_alternative_file": "data/json/Myntra__Marianfer_Cruz_20240111_095149.success.json",

    "path_origin_img": "ajio-myntra/origin/20240112/",
    "path_alternative_img": "ajio-myntra/alternative/20240110/",

    "path_report": "ajio-myntra/reports/matchin_ajio_myntra_1.json",

    "img_per_object": 2,
    
    "setting": {
        "origin_file_name_imgs": "product_images",
        "alternative_file_name_imgs": "product_images",
        
        "ref_origin": "sku",
        "ref_alternative": "sku",
        
        "origin_search_parameter": {
            "brand": "U.S. POLO ASSN"
        },
        "alternative_search_parameter": {
            "brand": "U.S. POLO ASSN"
        }
    }
}

In [46]:
# Pull the top-level request fields into module-level names.
(
    bucket,
    path_origin_file,
    path_alternative_file,
    path_origin_img,
    path_alternative_img,
    img_per_object,
) = (
    response[key]
    for key in (
        "bucket",
        "path_origin_file",
        "path_alternative_file",
        "path_origin_img",
        "path_alternative_img",
        "img_per_object",
    )
)

# Same for the per-source field names nested under "setting".
(
    origin_file_name_imgs,
    alternative_file_name_imgs,
    ref_origin,
    ref_alternative,
) = (
    response["setting"][key]
    for key in (
        "origin_file_name_imgs",
        "alternative_file_name_imgs",
        "ref_origin",
        "ref_alternative",
    )
)

In [47]:
# S3 client for the requested bucket and the fastdup instance (work dir: trash/fastdup/)
s3 = S3(bucket)
fd = fastdup.create("trash/fastdup/")

# S3 keys of the scraped-data JSON files, ordered as [origin, alternative]
s3_json_path = [path_origin_file, path_alternative_file]

# S3 prefixes of the images, in the same [origin, alternative] order
s3_img_path = [path_origin_img, path_alternative_img]

# SQLite database file (built from PATH_DB instead of repeating the literal)
# and the text file that lists the local path of every downloaded image
db_name = os.path.join(PATH_DB, "db_filename.sqlite")
filename_images = "trash/fastdup/path_images.txt"

# sqlite3.connect() and open(..., "w") do not create missing parent directories,
# so make sure they exist before the following cells use these paths.
os.makedirs(os.path.dirname(db_name), exist_ok=True)
os.makedirs(os.path.dirname(filename_images), exist_ok=True)

# Per-source JSON keys, in the same [origin, alternative] order:
# the key holding the image names and the key holding the reference value
field_name_file_images = [origin_file_name_imgs, alternative_file_name_imgs]
field_name_ref = [ref_origin, ref_alternative]


In [None]:
# Build the SQLite lookup (image file name -> reference) for both sources,
# download the referenced images and record their local paths for fastdup.
with sqlite3.connect(db_name) as con:

    # Text file receiving the local path of each downloaded image, one per line
    with open(filename_images, "w", encoding="utf-8") as file:

        for i, path in enumerate(s3_json_path):

            # Download the scraped-data JSON file
            json_path = s3.download_file(PATH_JSON, path)

            # Open the file and stream its items with ijson instead of loading it whole.
            # The handle is deliberately NOT named `json`: at notebook top level that
            # would rebind the global `json` module name to this file object and make
            # the later `json.dump(...)` call fail.
            with open(json_path, "r", encoding="utf-8") as json_file:

                # Table that stores the image names together with their reference
                con.execute(CREATE_TABLE.format(TABLE_NAME[i]))

                for obj in ijson.items(json_file, "item"):

                    ref = obj[field_name_ref[i]]
                    # Slicing already caps at the list length, so no explicit min() is needed
                    images_name = obj[field_name_file_images[i]][:img_per_object]

                    for image in images_name:

                        # Skip falsy entries (e.g. empty strings)
                        if image:

                            path_s3_img = os.path.join(s3_img_path[i], image)
                            con.execute(INSERT.format(TABLE_NAME[i]), (image, ref))
                            path_local_img = s3.download_file(PATH_IMAGES, path_s3_img)
                            file.write(path_local_img + "\n")

            # The JSON file is no longer needed once its rows are indexed
            os.remove(json_path)
            con.commit()


In [None]:
# Analyse the downloaded images with fastdup
fd.run(
    filename_images,
    threshold=0.5,
    overwrite=True,
    high_accuracy=True,
)

# Ask fastdup which images it flagged as invalid
invalid_img_s3: list = fd.invalid_instances()["filename"].to_list()

In [None]:
# When fastdup reported invalid images, pass each one through add_metadata
# and analyse the whole image list again.
if invalid_img_s3:
    for damaged_file in invalid_img_s3:
        add_metadata(damaged_file)

    fd.run(filename_images, threshold=0.5, overwrite=True, high_accuracy=True)

similarity = fd.similarity()

# Reduce both path columns to the bare file name
for col_name in ("filename_from", "filename_to"):
    similarity[col_name] = similarity[col_name].apply(os.path.basename)

In [41]:
# Build {origin_ref: {alternative_ref: distance}} from the fastdup similarity
# rows and write it out as a JSON report.
matches = {}

# The table name comes from a fixed literal; the file name is bound as a `?`
# parameter so names containing quotes neither break the query nor inject SQL.
select_ref = "SELECT ref FROM {0} WHERE file_name = ?"

with sqlite3.connect(db_name) as con:

    for row_index, row in similarity.iterrows():

        # Only keep pairs that go from an origin image ...
        code_ref_origin = con.execute(
            select_ref.format("origin"), (row["filename_from"],)
        ).fetchone()
        if not code_ref_origin:
            continue

        # ... to an alternative image
        code_ref_alternative = con.execute(
            select_ref.format("alternative"), (row["filename_to"],)
        ).fetchone()
        if not code_ref_alternative:
            continue

        # fetchone() returns a 1-tuple; a later row for the same pair overwrites
        # the stored distance. float() keeps the value serialisable by json even
        # if the column holds a numpy scalar type.
        origin_matches = matches.setdefault(code_ref_origin[0], {})
        origin_matches[code_ref_alternative[0]] = float(row["distance"])

with open("trash/reports/test.json", "w", encoding="utf-8") as file:

    json.dump(matches, file, indent=4)




In [None]:
# for name in ["fastdup", "img", "reports", "s3", "db"]:
#     path = f"trash/{name}"
#     shutil.rmtree(path)
#     os.makedirs(path)