## Agentic KYB Evaluation

In [1]:
import os
from dotenv import load_dotenv
import psycopg2
import pandas as pd

# Pull variables from the default .env file into the process environment.
load_dotenv()

# Keyword arguments for psycopg2.connect(), each read from an environment
# variable; only the port has a fallback value.
conn_params = dict(
    dbname=os.getenv("DB_NAME"),
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    host=os.getenv("DB_HOST"),
    port=os.getenv("DB_PORT", "5432"),
)

In [2]:
# Load settings from .env.local in addition to whatever is already in the
# environment.
# NOTE(review): python-dotenv's load_dotenv() defaults to override=False, so a
# variable that is already set (by the shell or by an earlier load_dotenv()
# call) keeps its existing value. If .env.local is meant to take precedence,
# pass override=True — confirm intent before changing.
load_dotenv('.env.local')

# Rebuild the connection settings so they reflect the environment after the
# second load.
conn_params = {
    "dbname": os.getenv("DB_NAME"),
    "user": os.getenv("DB_USER"),
    "password": os.getenv("DB_PASSWORD"),
    "host": os.getenv("DB_HOST"),
    "port": os.getenv("DB_PORT", "5432"),
}

# Smoke-test the connection with a small sample query.
# NOTE(review): dwh.contractor contains personal columns (names, e-mail, phone
# — see the column list printed below); clear this cell's output before
# sharing or committing the notebook.
try:
    # psycopg2's `with connection:` block only commits or rolls back the
    # transaction; it does NOT close the connection. Close it explicitly so
    # re-running this cell does not accumulate open database sessions.
    conn = psycopg2.connect(**conn_params)
    try:
        with conn:
            with conn.cursor() as cur:
                cur.execute("SELECT * FROM dwh.contractor LIMIT 10")
                rows = cur.fetchall()
                columns = [desc[0] for desc in cur.description]
    finally:
        conn.close()

    print("✅ Connection successful!")
    print(f"Query returned {len(rows)} rows")
    print(f"Columns: {columns}")

    # Create DataFrame for better visualization
    test_df = pd.DataFrame(rows, columns=columns)
    print("\nFirst few rows:")
    print(test_df.head())

except Exception as e:
    # Best-effort check: report the problem instead of stopping the notebook.
    print(f"❌ Connection failed: {e}")


✅ Connection successful!
Query returned 10 rows
Columns: ['id_contractor', 'cod_contractor', 'first_name', 'last_name', 'full_name', 'dat_birth', 'num_age', 'cod_residence_country_region', 'des_residence_country_region', 'cod_residence_country', 'des_residence_country', 'des_residence_city', 'cod_nationality_country_region', 'des_nationality_country_region', 'cod_nationality_country', 'des_nationality_country', 'is_us_person', 'des_gender', 'des_email', 'des_phone', 'des_legal_name', 'des_trade_name', 'des_contractor_type', 'is_tos_accepted', 'cod_external_kyc_check', 'des_kyc_check_status', 'dat_last_kyc_check', 'num_contracts_in_active_state', 'has_active_contract', 'has_compensation_in_last_months', 'has_compensation_in_last_month', 'des_wallet_status', 'dat_first_compensation', 'is_receiving_compensation_wallet', 'amt_monthly_compensation_usd', 'has_card', 'has_ontop_pay', 'has_positive_card_balance', 'amt_card_balance', 'dat_card_activation', 'has_positive_wallet_balance', 'amt_wa

In [3]:
# Load the CSV data and join with additional document requests
import json

# Load the exported records from Datos_Db.csv (path is relative to the
# notebook's working directory).
csv_df = pd.read_csv('Datos_Db.csv')

print("📊 CSV Data Overview:")
print(f"Total records: {len(csv_df)}")
print(f"Columns: {list(csv_df.columns)}")
print(f"Unique entity_ids: {csv_df['entity_id'].nunique()}")
print(f"Decisions: {csv_df['decision'].value_counts().to_dict()}")

# Distinct entity ids from the CSV; `.unique()` returns them in first-seen order.
entity_ids = csv_df['entity_id'].unique()
print(f"\nEntity IDs to query: {entity_ids}")

# Tuple form for a SQL IN clause. The values are converted to built-in Python
# ints because psycopg2 cannot adapt numpy.int64 query parameters.
entity_ids_tuple = tuple(int(entity_id) for entity_id in entity_ids)


📊 CSV Data Overview:
Total records: 11
Columns: ['id', 'job_id', 'entity_id', 'status', 'decision', 'risk_score', 'justification', 'llm_analysis', 'llm_model', 'components', 'created_at', 'updated_at']
Unique entity_ids: 11
Decisions: {'Manual Review': 8, 'Accepted': 3}

Entity IDs to query: [72067 73245 73260 73256 73298 73103 73366 73369 73390 73279 73373]


In [None]:
# Query for additional document requests for the entity_ids.
#
# Runs two queries on a single connection: a column listing for the document
# request table, then a lookup of request rows for the entity ids taken from
# the CSV. Any failure is reported with a message instead of being raised.
try:
    # psycopg2's `with connection:` block only commits or rolls back the
    # transaction; it does NOT close the connection, so close it explicitly.
    conn = psycopg2.connect(**conn_params)
    try:
        with conn:
            with conn.cursor() as cur:
                # First, let's check the structure of the document request table
                cur.execute("""
                    SELECT column_name, data_type 
                    FROM information_schema.columns 
                    WHERE table_schema = 'raw_data' 
                    AND table_name = 'raw_com_client_onboarding_document_request'
                """)
                columns_info = cur.fetchall()

                # Label matches the schema-qualified table that is actually queried.
                print("📋 Table Structure for raw_data.raw_com_client_onboarding_document_request:")
                for col_name, data_type in columns_info:
                    print(f"  - {col_name}: {data_type}")

                print("\n" + "="*60)

                # Convert numpy int64 to regular Python ints for psycopg2 compatibility
                entity_ids_list = [int(entity_id) for entity_id in entity_ids]
                print(f"Entity IDs to query (converted to int): {entity_ids_list}")

                # Query for document requests for our entity_ids; the Python
                # list is passed as a single array parameter for ANY(%s).
                cur.execute("""
                    SELECT 
                        entity_id,
                        document_description,
                        document_title,
                        created_at,
                        status
                    FROM raw_data.raw_com_client_onboarding_document_request 
                    WHERE entity_id = ANY(%s)
                """, (entity_ids_list,))

                doc_requests = cur.fetchall()
                doc_columns = [desc[0] for desc in cur.description]
    finally:
        conn.close()

    # Always build the frame (empty when nothing matched) so `doc_df` is
    # defined regardless of the query result.
    doc_df = pd.DataFrame(doc_requests, columns=doc_columns)

    # Compare built-in ints on both sides: entity_id is the first selected column.
    entities_with_requests = {row[0] for row in doc_requests}
    entities_in_csv = set(entity_ids_list)
    no_requests = entities_in_csv - entities_with_requests

    print("\n📊 Document Request Results:")
    # len(doc_requests) counts rows (one per request), not entities, so report both.
    print(
        f"Found {len(doc_requests)} document request rows "
        f"for {len(entities_with_requests)} entities"
    )

    if doc_requests:
        print("\nDocument requests by entity:")
        print(doc_df)

        # Check which entities from our CSV have document requests
        print("\n🔍 Analysis:")
        print(f"Entities in CSV: {len(entities_in_csv)}")
        print(f"Entities with document requests: {len(entities_with_requests)}")
        print(f"Entities with BOTH CSV data AND document requests: {len(entities_in_csv & entities_with_requests)}")
        print(f"Entities with CSV data but NO document requests: {len(no_requests)}")

        # Show which entities don't have document requests
        if no_requests:
            print(f"\nEntities without document requests: {sorted(no_requests)}")
    else:
        print("❌ No document requests found for any of the entity_ids")
except Exception as e:
    # Best-effort cell: report the problem instead of stopping the notebook.
    print(f"❌ Query failed: {e}")


📋 Table Structure for raw.com_client_onboarding_document_request:
  - document_file_id: integer
  - entity_id: integer
  - user_id: integer
  - id: integer
  - written_answer: character varying
  - type: character varying
  - status: character varying
  - document_description: character varying
  - document_title: character varying
  - updated_at: timestamp without time zone
  - created_at: timestamp without time zone

Entity IDs to query (converted to int): [72067, 73245, 73260, 73256, 73298, 73103, 73366, 73369, 73390, 73279, 73373]

📊 Document Request Results:
Found 0 entities with document requests
❌ No document requests found for any of the entity_ids
