In [32]:
import asyncio
import os
from concurrent.futures import ThreadPoolExecutor
from typing import Union

import boto3
import fastdup
import pandas as pd
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, validator

# FastAPI application object; routes are registered on it with decorators.
app = FastAPI()

# Load the variables declared in the local .env file into the process environment.
load_dotenv(".env")

# S3 client built with the access key pair read from the environment.
s3 = boto3.client(
    "s3",
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
)

In [33]:
class Matching_images(BaseModel):
    """Payload describing one image-matching job.

    The field names mirror the keys that ``matching_images`` reads from its
    input.
    """

    # Name of the S3 bucket that holds the JSON files and the images.
    bucket: str
    # S3 key of the origin JSON file (``matching_images`` requires a ``.json`` key).
    path_origin_file: str
    # S3 key of the alternative JSON file (same ``.json`` requirement).
    path_alternative_file: str
    # S3 prefix under which the origin images are looked up.
    path_origin_img: str
    # S3 prefix under which the alternative images are looked up.
    path_alternative_img: str
    # Number of images to take per product; ``matching_images`` special-cases 0.
    img_per_object: Union[int, None]

In [34]:
# Scratch locations and column name used by the image-matching endpoint.
_JSON_SCRATCH_DIR = "../trash/s3/json/"
_FASTDUP_WORK_DIR = "../trash/fastdup/"
_FASTDUP_INPUT_LIST = "../trash/fastdup/address_files_s3.txt"
_IMAGES_FIELD = "product_images"


def _require_bucket(bucket):
    """Raise a 404 ``HTTPException`` unless ``bucket`` is listed by ``s3.list_buckets``."""
    if not bucket:
        raise HTTPException(status_code=404, detail="Debes ingresar el nombre del Bucket")

    available = [entry["Name"] for entry in s3.list_buckets()["Buckets"]]
    if bucket not in available:
        raise HTTPException(status_code=404, detail=f"El nombre del Bucket '{bucket}' esta mal escrito o no existe.")


def _require_json_object(bucket, key, label):
    """Raise a 404 ``HTTPException`` unless ``key`` is a ``.json`` key that ``head_object`` can reach."""
    if not key:
        raise HTTPException(status_code=404, detail=f"{label}: No se puede dejar vacio este atributo.")
    if not isinstance(key, str):
        # The label is kept in its own variable so the builtin ``type`` stays callable here.
        raise HTTPException(status_code=404, detail=f"{label}: El parametro debe de ser de tipo str no de {type(key)}")

    extension = key.split(".")[-1]
    if extension != "json":
        raise HTTPException(status_code=404, detail=f"{label}: El archivo '{key}' debe de ser se tipo json no de '{extension}'")

    try:
        s3.head_object(Bucket=bucket, Key=key)
    except Exception as error:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        raise HTTPException(status_code=404, detail=f"{label}: El archivo '{key}' no existe o esta mal escrito") from error


def _require_image_prefix(bucket, prefix, label):
    """Raise a 404 ``HTTPException`` unless at least one object is listed under ``prefix``."""
    if not prefix:
        raise HTTPException(status_code=404, detail=f"{label}: Debes ingresar la direccion de las imagenes.")

    listing = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
    if "Contents" not in listing:
        raise HTTPException(status_code=404, detail=f"{label}: La ruta ingresada no existe o esta mal escrita.")


def _read_json_from_s3(bucket, key, label):
    """Download ``key`` into the scratch directory, load it with pandas and delete the local copy."""
    os.makedirs(_JSON_SCRATCH_DIR, exist_ok=True)
    # The label prefix keeps the two downloads apart when both keys share a base name.
    local_path = os.path.join(_JSON_SCRATCH_DIR, f"{label}_{key.split('/')[-1]}")
    try:
        s3.download_file(bucket, key, local_path)
        return pd.read_json(local_path)
    finally:
        if os.path.exists(local_path):
            os.remove(local_path)


def _image_uris(bucket, prefix, image_lists, img_per_object):
    """Build one ``s3://`` line per image, taking at most ``img_per_object`` images per product.

    ``img_per_object`` of 0 or ``None`` means no cap. Entries of ``image_lists``
    that are not lists/tuples are skipped.
    """
    uris = []
    for images in image_lists:
        if not isinstance(images, (list, tuple)):
            continue
        # The cap is applied to this product's own images, not to the number of products.
        if img_per_object and img_per_object > 0:
            images = images[:img_per_object]
        uris.extend(f"s3://{bucket}/{prefix}{name}\n" for name in images if name)
    return uris


def _cross_catalog_matches(similarity, origin_image_lists):
    """Keep the pairs that go from an origin product's image to a non-origin image.

    For every origin product the pairs whose ``filename_from`` belongs to that
    product and whose ``filename_to`` does not are collected; afterwards every
    pair whose ``filename_to`` is an image of any origin product is removed.
    Returns an empty frame with the columns of ``similarity`` when nothing matches.
    """
    matches = []
    origin_names = []
    for product_images in origin_image_lists:
        if not isinstance(product_images, (list, tuple)):
            continue
        origin_names.extend(product_images)
        from_product = similarity["filename_from"].isin(product_images)
        to_product = similarity["filename_to"].isin(product_images)
        hits = similarity[from_product & ~to_product]
        if len(hits) != 0:
            matches.append(hits)

    if not matches:
        return similarity.iloc[0:0].copy()

    # Single concat instead of growing a frame inside the loop.
    result = pd.concat(matches).reset_index(drop=True)
    return result[~result["filename_to"].isin(origin_names)]


@app.post("/matching/image/")
async def matching_images(matching_data: Matching_images):
    """Find similar images between the origin and the alternative product files.

    Args:
        matching_data: A ``Matching_images`` instance or a plain dict with the
            keys ``bucket``, ``path_origin_file``, ``path_alternative_file``,
            ``path_origin_img``, ``path_alternative_img`` and ``img_per_object``.

    Returns:
        pandas.DataFrame: rows of the fastdup similarity table whose
        ``filename_from`` belongs to an origin product and whose ``filename_to``
        is not an image of any origin product. Both filename columns hold the
        bare file name.

    Raises:
        HTTPException: status 404 when the bucket, a JSON key or an image prefix
            is missing or cannot be found, or when no image is left to compare.
    """
    # dict() accepts a plain dict as well as a pydantic model, which iterates
    # as (field, value) pairs.
    data = dict(matching_data)
    bucket = data.get("bucket")
    path_origin_file = data.get("path_origin_file")
    path_alternative_file = data.get("path_alternative_file")
    path_origin_img = data.get("path_origin_img")
    path_alternative_img = data.get("path_alternative_img")
    img_per_object = data.get("img_per_object")

    # Validate the request in the same order as before: bucket, JSON keys, image prefixes.
    _require_bucket(bucket)
    for key, label in ((path_origin_file, "origin-file"), (path_alternative_file, "alternative-file")):
        _require_json_object(bucket, key, label)
    for prefix, label in ((path_origin_img, "origin-image"), (path_alternative_img, "alternative-image")):
        _require_image_prefix(bucket, prefix, label)

    # Load both JSON files into DataFrames.
    df_origin = _read_json_from_s3(bucket, path_origin_file, "origin")
    df_alternative = _read_json_from_s3(bucket, path_alternative_file, "alternative")

    # Collect the S3 address of every image that fastdup has to read.
    input_dir = []
    for prefix, frame in ((path_origin_img, df_origin), (path_alternative_img, df_alternative)):
        input_dir.extend(_image_uris(bucket, prefix, frame[_IMAGES_FIELD].to_list(), img_per_object))
    if not input_dir:
        raise HTTPException(status_code=404, detail="No se encontraron imagenes para comparar.")

    # fastdup receives the addresses through a text file; the file is removed
    # even when the run fails.
    os.makedirs(_FASTDUP_WORK_DIR, exist_ok=True)
    with open(_FASTDUP_INPUT_LIST, "w", encoding="utf8") as listing:
        listing.writelines(input_dir)
    try:
        fd = fastdup.create(_FASTDUP_WORK_DIR)
        fd.run(_FASTDUP_INPUT_LIST, threshold=0.5, overwrite=True, high_accuracy=True)
        similarity = fd.similarity()
    finally:
        if os.path.exists(_FASTDUP_INPUT_LIST):
            os.remove(_FASTDUP_INPUT_LIST)

    # Reduce both filename columns to the bare file name so they can be
    # compared with the names stored in the JSON files.
    for col_name in ("filename_from", "filename_to"):
        similarity[col_name] = similarity[col_name].apply(lambda location: location.split("/")[-1])

    # NOTE(review): the DataFrame is returned as-is because the notebook cells
    # index the result as a DataFrame; confirm how it should be serialized when
    # the function is served over HTTP.
    return _cross_catalog_matches(similarity, df_origin[_IMAGES_FIELD])

In [None]:
# Sample payload used to call the endpoint function directly from the notebook.
request = {
    "bucket": "hydrahi4ai",
    "path_origin_file": "ajio-myntra/origin/20231214/New_collector_20231214_154733.success.json",
    "path_alternative_file": "ajio-myntra/alternative/20231219/Myntra__Marianfer_Cruz_20231219_195028.success.json",
    "path_origin_img": "ajio-myntra/origin/20231214/",
    "path_alternative_img": "ajio-myntra/alternative/20231219/",
    "img_per_object": 0
}

# `matching_images` is declared with `async def`, so calling it only creates a
# coroutine object. Run it to completion on a private event loop in a worker
# thread: `asyncio.run` cannot be used on a thread whose loop is already
# running, which is the case for the notebook kernel's main thread.
with ThreadPoolExecutor(max_workers=1) as executor:
    similarity_test = executor.submit(asyncio.run, matching_images(request)).result()

In [10]:
# Load the origin and the alternative JSON exports into DataFrames.
origin = pd.read_json("../data/json/New_collector_20231214_154733.success.json")
alternative = pd.read_json("../data/json/Myntra__Marianfer_Cruz_20231213_143830.success.json")

# Strip everything up to the last "/" from both filename columns.
for column in ("filename_from", "filename_to"):
    similarity_test[column] = similarity_test[column].map(
        lambda location: location.rsplit("/", 1)[-1]
    )

In [30]:
# Analysis of which images were reported as similar.

# Image-name lists of the origin products. Entries that are not lists are
# skipped because `Series.isin` only accepts list-like input.
origin_image_lists = [
    images for images in origin["product_images"] if isinstance(images, (list, tuple))
]

# Step 1: for every origin product keep the pairs that start at one of its
# images and end at an image that is not its own.
matches = []
for product_images in origin_image_lists:
    from_product = similarity_test["filename_from"].isin(product_images)
    to_product = similarity_test["filename_to"].isin(product_images)
    search = similarity_test[from_product & ~to_product]
    if len(search) != 0:
        matches.append(search)

# One concat instead of growing the frame inside the loop. With no match the
# result still carries the similarity columns, so the filter below cannot fail
# on a missing "filename_to" column.
if matches:
    result = pd.concat(matches).reset_index(drop=True)
else:
    result = similarity_test.iloc[0:0].copy()

# Step 2: drop the pairs that end at an image of any origin product.
origin_image_names = [name for images in origin_image_lists for name in images]
result = result[~result["filename_to"].isin(origin_image_names)].reset_index(drop=True)

# Step 3: look up the SKUs on both sides of every remaining pair.
# `ref_origin` / `ref_alternative` hold the lookup of the last pair once the
# loop ends; `sku_matches` keeps the lookup of every pair.
sku_records = []
for filename_origin, filename_alternative in zip(result["filename_from"], result["filename_to"]):
    index_origin = origin["product_images"].apply(
        lambda images: isinstance(images, (list, tuple)) and filename_origin in images
    )
    ref_origin = origin.loc[index_origin, "sku"]

    index_alternative = alternative["product_images"].apply(
        lambda images: isinstance(images, (list, tuple)) and filename_alternative in images
    )
    ref_alternative = alternative.loc[index_alternative, "sku"]

    sku_records.append(
        {
            "filename_from": filename_origin,
            "filename_to": filename_alternative,
            "sku_origin": ref_origin.to_list(),
            "sku_alternative": ref_alternative.to_list(),
        }
    )

sku_matches = pd.DataFrame(
    sku_records,
    columns=["filename_from", "filename_to", "sku_origin", "sku_alternative"],
)

In [31]:
# Display the filtered similarity pairs held in `result`.
result

Unnamed: 0,from,to,distance,filename_from,index_x,error_code_from,is_valid_from,fd_index_from,filename_to,index_y,error_code_to,is_valid_to,fd_index_to
0,107,938,0.961787,j_lq5dbaxrzw1czqfic.c055587471519cfa3f5265a4ddb4202bf3d91720.file_lq5dc5wi27f1l05yti.[_].jpg,107,VALID,True,107,j_lqcqq8vk11jwo3y4nk.158f76d140a861f1ccc2227cf45917d28ba50000.file_lqcr8x402evb2db4f9.[_].webp,938,VALID,True,938
1,167,878,0.971031,j_lq5dbaxrzw1czqfic.c055587471519cfa3f5265a4ddb4202bf3d91720.file_lq5dcenv25h6q9ze75.[_].jpg,167,VALID,True,167,j_lqcqq8vk11jwo3y4nk.c236d247062a358a67be586abfce62926dffa65f.file_lqcr72zs23f1u2nn07.[_].webp,878,VALID,True,878
2,501,830,0.750323,j_lq5dbaxrzw1czqfic.c055587471519cfa3f5265a4ddb4202bf3d91720.file_lq5dceo313ucnh0h1g.[_].jpg,501,VALID,True,501,j_lqcqq8vk11jwo3y4nk.d5490618f6563a2efae63455af7404faa1c81e3b.file_lqcr0izpfm0j3q8n0.[_].webp,830,VALID,True,830
3,500,830,0.736375,j_lq5dbaxrzw1czqfic.c055587471519cfa3f5265a4ddb4202bf3d91720.file_lq5dceo3r1xnx980l.[_].jpg,500,VALID,True,500,j_lqcqq8vk11jwo3y4nk.d5490618f6563a2efae63455af7404faa1c81e3b.file_lqcr0izpfm0j3q8n0.[_].webp,830,VALID,True,830
