In [1]:
import os
from dotenv import load_dotenv
import psycopg2
import pandas as pd

# Load settings through python-dotenv before reading them from the environment.
load_dotenv()

# Keyword arguments later unpacked into psycopg2.connect(**conn_params).
# Only the port has a fallback value; the other settings are None when unset.
conn_params = dict(
    dbname=os.getenv("DB_NAME"),
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    host=os.getenv("DB_HOST"),
    port=os.getenv("DB_PORT", "5432"),
)

In [2]:
# Ids used in earlier runs, kept for reference (presumably entity ids — TODO confirm):
# 68599
# 27346
# 295

import pandas as pd
import psycopg2

entity_id = 72067

# psycopg2's connection context manager only commits / rolls back the
# transaction on exit; it does NOT close the connection. Close it explicitly
# so re-running this cell does not pile up open connections.
conn = psycopg2.connect(**conn_params)
try:
    with conn, conn.cursor() as cur:
        # Step 1: look up the creator_id stored on the entity row.
        cur.execute("SELECT creator_id FROM entity WHERE id = %s", (entity_id,))
        row = cur.fetchone()
        if row is None or row[0] is None:
            raise ValueError(f"No se encontró creator_id para entity.id = {entity_id}")
        creator_id = row[0]

        # Step 2: fetch every file row that shares that creator_id.
        cur.execute("SELECT * FROM file WHERE creator_id = %s", (creator_id,))
        rows = cur.fetchall()
        columns = [desc[0] for desc in cur.description]
finally:
    conn.close()

files_df = pd.DataFrame(rows, columns=columns)
print(f"creator_id: {creator_id} | filas encontradas en file: {len(files_df)}")
files_df.head(50)

creator_id: 85891 | filas encontradas en file: 6


Unnamed: 0,id,name,file_type_id,bin,creation_date,creator_id,key_path
0,2956827,EDWARD COLE_PELAYO_license_selfie,5,"[b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ...",2025-08-07 16:20:25.430161,85891,
1,2956828,EDWARD COLE_PELAYO_license_back,5,"[b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ...",2025-08-07 16:20:25.983139,85891,
2,2956829,EDWARD COLE_PELAYO_license,5,"[b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ...",2025-08-07 16:20:26.596045,85891,
3,2956830,2025 Certificate of Good Standing.pdf,1,"[b'%', b'P', b'D', b'F', b'-', b'1', b'.', b'7...",2025-08-07 16:21:13.505032,85891,
4,2956831,Joe Passport.pdf,1,"[b'%', b'P', b'D', b'F', b'-', b'1', b'.', b'3...",2025-08-07 16:24:05.899560,85891,
5,2956832,Erin Conrad Passport.pdf,1,"[b'%', b'P', b'D', b'F', b'-', b'1', b'.', b'3...",2025-08-07 16:28:37.228136,85891,


In [3]:
import re
import mimetypes
from pathlib import Path
import pandas as pd

# Directory (relative to the current working directory) where the extracted
# files are written; it is created when this cell runs if it does not exist.
OUTPUT_DIR = Path("files_bajados")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Characters that are not allowed in Windows file names. The raw string carries
# redundant backslashes; they only add "\" to the character class built below.
INVALID_WIN_CHARS = r"<>:\\/\|\?\*\""
_invalid_chars_pattern = re.compile(f"[{re.escape(INVALID_WIN_CHARS)}]+")

# Device names reserved by Windows. They stay reserved when followed by an
# extension (e.g. "NUL.txt"), so the check below looks at the part of the
# name that precedes the first dot. Built once instead of on every call.
_RESERVED_WIN_NAMES = frozenset(
    {"CON", "PRN", "AUX", "NUL"}
    | {f"COM{i}" for i in range(1, 10)}
    | {f"LPT{i}" for i in range(1, 10)}
)


def sanitize_filename(name: str) -> str:
    """Return ``name`` made safe to use as a Windows file name.

    Each run of invalid characters is replaced by a single ``_``, surrounding
    whitespace is stripped and internal whitespace runs collapse to one space.
    If the part before the first dot is a reserved device name (``CON``,
    ``NUL``, ``COM1`` ... case-insensitive), the result is prefixed with ``_``.

    Args:
        name: Candidate file name.

    Returns:
        The sanitized name, or ``""`` when ``name`` is not a string or is blank.
    """
    if not isinstance(name, str) or not name.strip():
        return ""

    cleaned = _invalid_chars_pattern.sub("_", name).strip()
    cleaned = re.sub(r"\s+", " ", cleaned)

    # "CON", "con.txt" and "NUL.tar.gz" all address the device on Windows.
    device_part = cleaned.split(".", 1)[0].rstrip()
    if device_part.upper() in _RESERVED_WIN_NAMES:
        cleaned = f"_{cleaned}"
    return cleaned

def ensure_unique_path(base_path: Path) -> Path:
    """Return a path next to ``base_path`` that does not exist yet.

    ``base_path`` itself is returned when it is free. Otherwise ``_1``, ``_2``,
    ... is inserted between the stem and the suffix until an unused name is
    found. Nothing is created on disk.
    """
    candidate = base_path
    attempt = 0
    while candidate.exists():
        attempt += 1
        candidate = base_path.with_name(f"{base_path.stem}_{attempt}{base_path.suffix}")
    return candidate


def guess_extension_from_row(row: pd.Series) -> str:
    """Derive a file extension from metadata fields of ``row``.

    The first non-blank string among ``content_type``, ``mime_type``,
    ``mimetype`` and ``mime`` is mapped through ``mimetypes.guess_extension``.
    If that yields nothing, a non-blank string in ``extension`` is used, with a
    leading dot added when missing.

    Returns:
        The extension including its leading dot, or ``""`` if none was found.
    """
    def text_field(column):
        # Stripped string value of the column, or None if absent / not a string / blank.
        if column not in row:
            return None
        value = row[column]
        if isinstance(value, str):
            value = value.strip()
            return value or None
        return None

    mime_candidates = (
        text_field(column)
        for column in ("content_type", "mime_type", "mimetype", "mime")
    )
    mime_type = next((found for found in mime_candidates if found), None)
    if mime_type is not None:
        by_mime = mimetypes.guess_extension(mime_type, strict=False)
        if by_mime:
            return by_mime

    declared = text_field("extension")
    if declared is None:
        return ""
    return declared if declared.startswith(".") else "." + declared

def guess_extension_from_bytes(binary_bytes: bytes) -> str:
    """Guess a file extension from the leading magic bytes of ``binary_bytes``.

    Returns:
        One of ``.jpg``, ``.png``, ``.gif``, ``.pdf``, ``.zip``, ``.rar``,
        ``.7z``, or ``""`` when no known signature matches.
    """
    # Common magic signatures, checked in order.
    signatures = (
        (b"\xFF\xD8\xFF", ".jpg"),
        (b"\x89PNG\r\n\x1a\n", ".png"),
        ((b"GIF87a", b"GIF89a"), ".gif"),
        (b"%PDF", ".pdf"),
        (b"PK\x03\x04", ".zip"),  # docx/xlsx/pptx are zip containers as well
        (b"Rar!\x1a\x07\x00", ".rar"),
        (b"7z\xBC\xAF'\x1C", ".7z"),
    )
    for prefix, extension in signatures:
        if binary_bytes.startswith(prefix):
            return extension
    return ""


# Columns probed, in this order, for a human-readable file name.
_NAME_COLUMNS = ("filename", "file_name", "name", "original_name", "original_filename", "title")

# A name only counts as "already has an extension" when it ends with a dot
# followed by 1-8 alphanumerics. A bare "any dot in the name" test would treat
# names such as "J. Smith license" as having an extension and save them
# without one.
_NAME_EXTENSION_RE = re.compile(r"\.[A-Za-z0-9]{1,8}\Z")


def _coerce_to_bytes(value):
    """Return ``value`` as ``bytes``, or ``None`` when it is not binary data."""
    if isinstance(value, memoryview):
        return value.tobytes()
    if isinstance(value, (bytes, bytearray)):
        return bytes(value)
    # None, NaN and any other type are treated as "no content".
    return None


def _pick_base_name(row, fallback_index):
    """Return the first non-blank name-like field of ``row``, else ``file_<id>``."""
    for key in _NAME_COLUMNS:
        if key in row and isinstance(row[key], str) and row[key].strip():
            return row[key].strip()
    identifier = row["id"] if "id" in row and pd.notna(row["id"]) else fallback_index
    return f"file_{identifier}"


saved_records = []
skipped_rows = 0

if "bin" not in files_df.columns:
    raise KeyError("La columna 'bin' no existe en files_df. Verifica el DataFrame.")

for idx, row in files_df.iterrows():
    binary_bytes = _coerce_to_bytes(row["bin"])
    if binary_bytes is None:
        skipped_rows += 1
        continue

    base_name = sanitize_filename(_pick_base_name(row, idx))

    # Extension resolution order: name > magic signature > row metadata > ".bin".
    if _NAME_EXTENSION_RE.search(base_name):
        file_name = base_name
    else:
        extension = (
            guess_extension_from_bytes(binary_bytes)
            or guess_extension_from_row(row)
            or ".bin"
        )
        file_name = base_name + extension

    # Never overwrite: an existing name gets a numeric suffix instead.
    out_path = ensure_unique_path(OUTPUT_DIR / file_name)
    out_path.write_bytes(binary_bytes)

    saved_records.append({
        "index": idx,
        "id": row.get("id", None),
        "creator_id": row.get("creator_id", None),
        "path": str(out_path),
        "size_bytes": len(binary_bytes),
    })

result_df = pd.DataFrame(saved_records)
print(f"Guardados: {len(saved_records)} | Omitidos: {skipped_rows} | Carpeta: {OUTPUT_DIR.resolve()}")
result_df.head(50)

Guardados: 6 | Omitidos: 0 | Carpeta: C:\Users\felip\OneDrive\Escritorio\connectionAgent\files_bajados


Unnamed: 0,index,id,creator_id,path,size_bytes
0,0,2956827,85891,files_bajados\EDWARD COLE_PELAYO_license_selfi...,423564
1,1,2956828,85891,files_bajados\EDWARD COLE_PELAYO_license_back_...,468778
2,2,2956829,85891,files_bajados\EDWARD COLE_PELAYO_license_1.jpg,426781
3,3,2956830,85891,files_bajados\2025 Certificate of Good Standin...,368738
4,4,2956831,85891,files_bajados\Joe Passport_1.pdf,3756573
5,5,2956832,85891,files_bajados\Erin Conrad Passport_1.pdf,1058424


In [4]:
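# Ids used in earlier runs, kept for reference
# (presumably entity ids, given the commented-out assignment below — TODO confirm):
# 68599
# 27346
# 295
# 72067
# entity_id = 72023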

In [5]:
import psycopg2
import pandas as pd

# psycopg2's connection context manager only commits / rolls back the
# transaction on exit; it does NOT close the connection. Close it explicitly
# so re-running this cell does not pile up open connections.
conn = psycopg2.connect(**conn_params)
try:
    with conn, conn.cursor() as cur:
        # KYB checks recorded for the entity, newest first.
        cur.execute(
            """
            SELECT id, entity_id, kyb_provider_id, kyb_provider_data, creation_date
            FROM com_kyb_check
            WHERE entity_id = %s
            ORDER BY creation_date DESC
            """,
            (entity_id,)
        )
        rows = cur.fetchall()
        columns = [d[0] for d in cur.description]
finally:
    conn.close()

kyb_df = pd.DataFrame(rows, columns=columns)
kyb_df.head(10)

Unnamed: 0,id,entity_id,kyb_provider_id,kyb_provider_data,creation_date
0,7746,72067,KYCP,"{'id': 3302, 'uid': 'N4OZZD', 'result': 'Ok', ...",2025-08-07 16:31:42.045835
1,7744,72067,ALLOY,{'_links': {'self': {'href': '/v1/journeys/J-D...,2025-08-07 16:18:10.031352


In [6]:
# Select the ALLOY check by provider id instead of by row position, which
# silently breaks as soon as the number or order of checks changes. kyb_df is
# sorted by creation_date descending, so the first match is the newest one.
alloy_payloads = kyb_df.loc[kyb_df["kyb_provider_id"] == "ALLOY", "kyb_provider_data"]
if alloy_payloads.empty:
    raise ValueError("No ALLOY row found in kyb_df")

events = alloy_payloads.iloc[0]['_embedded']['events']

# Walk the events from last to first and take the token of the first
# 'completed_evaluation' found; None when there is no such event.
# .get() tolerates events that carry no 'type' key.
evaluation_token = next(
    (
        event['evaluation_token']
        for event in reversed(events)
        if event.get('type') == 'completed_evaluation'
    ),
    None,
)
print(evaluation_token)

L-zeShTGEsXIciRVrIqIqX


In [7]:
import requests

def get_alloy_evaluation(evtoken, timeout=30):
    """Fetch one evaluation from the Alloy API by its evaluation token.

    The HTTP basic-auth credentials are read from the environment variables
    ``ALLOY_WORKFLOW_TOKEN`` and ``ALLOY_WORKFLOW_SECRET`` instead of being
    embedded in the notebook. Credentials that were ever committed in source
    should be considered exposed and rotated.

    Args:
        evtoken: Evaluation token appended to the evaluations endpoint URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response, or ``None`` when ``evtoken`` is empty or the
        server answers with an HTTP error status (the error is printed).

    Raises:
        RuntimeError: If either credential environment variable is missing.
    """
    # An empty token would only produce a pointless request to ".../None".
    if not evtoken:
        print("Error: no evaluation token provided")
        return None

    username = os.getenv("ALLOY_WORKFLOW_TOKEN")
    password = os.getenv("ALLOY_WORKFLOW_SECRET")
    if not username or not password:
        raise RuntimeError(
            "Set ALLOY_WORKFLOW_TOKEN and ALLOY_WORKFLOW_SECRET in the environment (or .env file)"
        )

    base_url = "https://api.alloy.co/v1/evaluations"
    url = f"{base_url}/{evtoken}"
    try:
        response = requests.get(
            url,
            auth=(username, password),
            headers={
                'Content-Type': 'application/json',
                'Accept': 'application/json'
            },
            # Without a timeout, requests can block indefinitely on a stalled connection.
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as e:
        print(f"Error HTTP: {e}")
        return None

In [8]:
# Fetch the evaluation for the token printed by the previous cell; as the last
# expression of the cell, the returned value is rendered in full as output.
# NOTE(review): the rendered response includes business details — consider
# clearing the output before sharing the notebook.
get_alloy_evaluation(evaluation_token)

{'status_code': 201,
 'error': None,
 'timestamp': 1754584315583,
 'evaluation_token': 'L-zeShTGEsXIciRVrIqIqX',
 'entity_token': 'B-NHj45QpIRYs0feAi0u46',
 'parent_entity_token': None,
 'external_entity_id': '72067b',
 'application_token': '1sCQ66OIXtBqbyGJigyAF8EfQGFglcK2',
 'application_version_id': 3,
 'champion_challenger_id': None,
 'summary': {'result': 'success',
  'score': 1,
  'tags': ['Representative Match',
   'Business Address Matched',
   'FEIN Found',
   'Business Name Matched',
   'Secretary of State Match'],
  'outcome_reasons': [],
  'outcome': 'Approved',
  'services': {'Middesk': 'executed'},
  'alloy_fraud_score': None},
 'supplied': {'type': 'business.updated',
  'data': {'object': {'object': 'business',
    'id': '11fc4a1a-dddc-4f0f-8146-5a2cc78ae396',
    'external_id': 'B-NHj45QpIRYs0feAi0u46',
    'unique_external_id': None,
    'name': 'Cactus Communications, Inc.',
    'created_at': '2025-08-07T16:31:27.644Z',
    'updated_at': '2025-08-07T16:31:53.973Z',
  