In [309]:
from load_data import load_clients
import phonenumbers
from pycountry import pycountry
import re
from sklearn.metrics import confusion_matrix
import numpy as np
from datetime import datetime
from countryinfo import CountryInfo
from email_validator import validate_email, EmailNotValidError
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderUnavailable, GeocoderTimedOut
import pgeocode
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from xgboost import XGBClassifier
import numpy as np
from datetime import timedelta
import ollama

In [310]:
def is_high_school_graduation_year_valid(graduation_year: int, birth_date: str, min_age: int = 16, max_age: int = 21) -> bool:
    """
    Check if the graduation year is reasonable given a person's birth date.

    Args:
        graduation_year (int): The year the person graduated high school.
        birth_date (str): Birth date in "YYYY-MM-DD" format.
        min_age (int): Minimum expected age at graduation (default: 16)
        max_age (int): Maximum expected age at graduation (default: 23)

    Returns:
        bool: True if the graduation year is within a reasonable age range.
    """
    try:
        birth_year = datetime.strptime(birth_date, "%Y-%m-%d").year
        age_at_graduation = graduation_year - birth_year
        return min_age <= age_at_graduation <= max_age, "High School Graduation Inconsistent"
    except ValueError:
        print("Invalid birth date format. Use YYYY-MM-DD.")
        return False, "Invalid Birth Date Format. Use YYYY-MM-DD"

In [311]:
def generate_wealth_summary_text(client_profile):
    currency = client_profile['client_profile']['currency']
    aum = client_profile['client_profile']['aum']
    inheritance = client_profile['client_profile']['inheritance_details']
    real_estate_list = client_profile['client_profile']["real_estate_details"]
    return (currency, aum, inheritance, real_estate_list)

In [312]:

def get_alpha_3_country_code(country_name):
    try:
        country = pycountry.countries.lookup(country_name)
        return country.alpha_3  # This is the passport-style code
    except LookupError:
        return None

from countryinfo import CountryInfo

def get_nationality(country_name):
    try:
        country = CountryInfo(country_name)
        return country.info().get('demonym', 'Unknown')
    except:
        return 'Unknown'



    
def check_multiple_country_const(client):
    country = client['passport']['country']
    country_code = client['passport']['country_code']
    nationality = client['passport']['nationality']

    #get international country code as written on passport with country name

    country_code_passport = get_alpha_3_country_code(country)
    if country_code != country_code_passport:
        return False, "Country Code Mismatch"
    
    #nationality_passport = get_nationality(country)
    #if nationality != nationality_passport:
    #    return False, "Nationality Mismatch"
    return True, None

In [313]:
def could_be_valid_email(email):
    try:
        validate_email(email, check_deliverability=False)
        return True
    except EmailNotValidError:
        return False

def compare_strings_ignore_spaces(str1, str2):
    clean1 = str1.replace(" ", "")
    clean2 = str2.replace(" ", "")
    return clean1 == clean2

def check_name(client):
    flags = True
    if client['passport']['first_name'] != client['account_form']['first_name']:
        flags = False
    if client['passport']['last_name'] != client['account_form']['last_name']:
        flags = False
    if client['passport']['middle_name'] != client['account_form']['middle_name']:
        flags = False
    return flags, "Passport Name Mismatch" 

def check_full_name(client):
    flags = True
    if compare_strings_ignore_spaces((client['passport']['first_name']+client['passport']['middle_name']+client['passport']['last_name']), client['account_form']['name']) == False:
        flags = False

    if compare_strings_ignore_spaces((client['passport']['first_name']+client['passport']['middle_name']+client['passport']['last_name']), client['client_profile']['name']) == False:
        flags = False
    return flags, "Full Name Mismatch"

def check_gender(client):
    flags = True
    if client['passport']['gender'] != client['client_profile']['gender']:
        flags = False
    return flags, "Passport Gender Mismatch"

def check_passport_number(client):
    flags = True
    if client['passport']['passport_number'] != client['account_form']['passport_number']:
        flags = False
    return flags, "Passport Number Mismatch"

def check_passport_expiry_date(client):
    flags = True
    # Convert string to datetime object (adjust format if needed)
    expiry_date = datetime.strptime(client['passport']['passport_expiry_date'], "%Y-%m-%d").date()
    today = datetime.strptime("2021-04-10", "%Y-%m-%d").date() #2025-04-1  ?
    if expiry_date < today:
        flags = False
    return flags, "Passport Expiry Date Invalid"

def check_email_validity(client):
    flags = True
    if could_be_valid_email(client['client_profile']['email_address']) == False:
        flags = False
    return flags, "Email Address Invalid"

def check_phone_number(client):
    flags = True
    if client['account_form']['phone_number'] != client['client_profile']['phone_number']:
        flags = False
    return flags, "Phone Number Mismatch"

def check_email_address(client):
    flags = True
    if client['account_form']['email_address'] != client['client_profile']['email_address']:
        flags = False
    return flags, "Email Address Mismatch"

def check_passport_dates(client):
    messages = []
    passport_birth_date = datetime.strptime(client['passport']['birth_date'], "%Y-%m-%d").date()
    client_birth_date = datetime.strptime(client['client_profile']['birth_date'], "%Y-%m-%d").date()
    passport_issued_date = datetime.strptime(client['passport']['passport_issue_date'], "%Y-%m-%d").date()
    client_issued_date = datetime.strptime(client['client_profile']['passport_issue_date'], "%Y-%m-%d").date()
    passport_expiry_date = datetime.strptime(client['passport']['passport_expiry_date'], "%Y-%m-%d").date()
    client_expiry_date = datetime.strptime(client['client_profile']['passport_expiry_date'], "%Y-%m-%d").date()
    if passport_expiry_date < passport_issued_date:
        messages.append("Passport Issued Date Invalid")
    if passport_birth_date != client_birth_date:
        messages.append("Passport Birth Date Mismatch")
    if passport_issued_date != client_issued_date:
        messages.append("Passport Issued Date Mismatch")
    if passport_expiry_date != client_expiry_date:
        messages.append("Passport Expiry Date Mismatch")
    if passport_issued_date < passport_birth_date:
        messages.append("Passport Issued Date Mismatch")
    return messages


In [314]:
def mrz_check(client):
    flags = True
    if (flags == True) and (client['passport']['last_name'].upper() not in client['passport']['passport_mrz'][0] or client['passport']['first_name'].upper() not in client['passport']['passport_mrz'][0] or client['passport']['middle_name'].upper() not in client['passport']['passport_mrz'][0] or client['passport']['country_code'].upper() not in client['passport']['passport_mrz'][0]):
        flags = False
    if (flags == True) and ((len(client['passport']['middle_name']) == 0) and not (client['passport']['passport_mrz'][0].index(client['passport']['country_code'].upper()) < client['passport']['passport_mrz'][0].index(client['passport']['last_name'].upper()))): #no middle name
        flags = False
    if (flags == True) and ((len(client['passport']['middle_name']) != 0) and not (client['passport']['passport_mrz'][0].index(client['passport']['country_code'].upper()) < client['passport']['passport_mrz'][0].index(client['passport']['last_name'].upper()) < client['passport']['passport_mrz'][0].index(client['passport']['middle_name'].upper()))):
        flags = False
    if (flags == True) and (client['passport']['passport_number']+client['passport']['country_code']+client['passport']['birth_date'].replace('-','')[2:] not in client['passport']['passport_mrz'][1]):
        flags = False
    return flags, "MRZ Mismatch"

In [315]:
def currency_match(client):
    flags = True
    if client['account_form']['currency'] != client['client_profile']['currency']:
        flags = False
    return flags, "Currency Mismatch"

In [316]:
def domicile_validator(client):
    """
    Validates if the country of domicile in client profile corresponds to the country of domicile in account form
    """
    return client['account_form']['country_of_domicile'] == client['client_profile']['country_of_domicile'], "Country of Domicile Mismatch"

def address_validator(client):
    """
    Validates if the address in client profile corresponds to the address in account form
    """
    return client['account_form']['address'] == client['client_profile']['address'], "Address Mismatch"

# #_nomi_cache = {}

# def get_country_code(country_name):
#     """
#     Convert a full country name (e.g., 'Spain') to ISO Alpha-2 code (e.g., 'ES').

#     Returns:
#         str or None: ISO Alpha-2 code or None if not found.
#     """
#     try:
#         return pycountry.countries.lookup(country_name).alpha_2
#     except LookupError:
#         return None

# def get_nomi_instance(country_code):
#     """
#     Return a cached pgeocode.Nominatim instance for the given country.
#     """
#     if country_code not in _nomi_cache:
#         _nomi_cache[country_code] = pgeocode.Nominatim(country_code)
#     return _nomi_cache[country_code]

# def validate_postal_code_for_client(client):
#     """
#     Validates if the postal code exists in the provided city and country.

#     Parameters:
#         client (dict): a client info.

#     Returns:
#         bool: True if postal code matches the city in any listed country, else False.
#     """

#     # client_profile (dict): Must include 'address' and 'country_of_domicile'.
#     client_profile = client['client_profile']
#     country_names = [name.strip() for name in client_profile.get('country_of_domicile', '').split(',')]
#     address = client_profile.get('address', {})
#     city = address.get('city', '').lower()
#     postal_code = address.get('postal code', '')

#     for country_name in country_names:
#         country_code = get_country_code(country_name)
#         if not country_code:
#             continue  # Skip invalid countries

#         nomi = get_nomi_instance(country_code)
        # postal_info = nomi.query_postal_code(postal_code)['postal_code']

#         if postal_info is None:
#             continue

#         # matched_cities = [c.strip().lower() for c in postal_info.place_name.split(',')]
#         if postal_info == postal_code:
#             return True  # Valid match

#     return False  # No match found

In [317]:
def check_age(client):
    try:
        birth_date = datetime.strptime(client['client_profile']['birth_date'], "%Y-%m-%d")
        age = (datetime.now()-timedelta(5) - birth_date).days // 365
        return age >=18, "Age less than 18"
    except ValueError:
        print("Invalid date format. Use YYYY-MM-DD.")
        return None, "Invalid Date Format. Use YYYY-MM-DD"

In [318]:
def check_higher_education(client):
    try:
        if client['client_profile']['higher_education'] is None or len(client['client_profile']['higher_education']) == 0:
            return True, None
        else:
            birth_date = datetime.strptime(client['client_profile']['birth_date'], "%Y-%m-%d")
            if client['client_profile']['higher_education'][0]['graduation_year'] > client['client_profile']['secondary_school']['graduation_year'] and client['client_profile']['higher_education'][0]['graduation_year'] - int(birth_date.year) >17:
                return True, None
            else:
                return False, "Graduation Years Inconsistent"
    except ValueError:
        print("Invalid date format. Use YYYY-MM-DD.")
        return False, "Invalid Date Format. Use YYYY-MM-DD"

In [319]:
def check_employment_history(client):
    try:
        if client['client_profile']['employment_history'] is None or len(client['client_profile']['employment_history']) == 0:
            return True, None
        else:
            birth_date = datetime.strptime(client['client_profile']['birth_date'], "%Y-%m-%d")
            if client['client_profile']['employment_history'][0]['start_year']  - int(birth_date.year) >16:
                return True, None
            else:
                return False, "Employment History Years Inconsistent"
    except ValueError:
        print("Invalid date format. Use YYYY-MM-DD.")
        return False

In [320]:
def check_gender_not_null(client):
    if client["passport"]["gender"] is None or client["passport"]["gender"] == "":
        return False, "Gender Not Specified"
    return True, None

def check_mandate_not_null(client):
    if client["client_profile"]["type_of_mandate"] is None or client["client_profile"]["type_of_mandate"] == "":
        return False, "Type of Mandate Not Specified"
    return True, None

In [321]:
def check_empty_risk_profile(client):
    if client["client_profile"]["investment_risk_profile"] is None or client["client_profile"]["investment_risk_profile"] == "":
        return False, "Investment Risk Profile Not Specified"
    return True, None

In [322]:
def check_properties_sum_to_aum(client):
    try:
        if client["client_profile"]["real_estate_details"] is None or len(client["client_profile"]["real_estate_details"]) == 0 and client["client_profile"]["aum"]["real_estate_value"]>0:
            return False, "Real Estate Value Mismatch"
        else:
            value = [prop["property value"] for prop in client["client_profile"]["real_estate_details"]]
            value = sum(value)
            if value == client["client_profile"]["aum"]["real_estate_value"]:
                return True, None
            else:
                return False, "Real Estate Value Mismatch"
    except ValueError:
        print("Invalid date format. Use YYYY-MM-DD.")
        return False, "Invalid Date Format. Use YYYY-MM-DD"

In [323]:
def clean_string(s):
    """Basic cleaning: lowercase and strip whitespace."""
    if not isinstance(s, str):
        return ""
    return s.lower().strip()

In [324]:
def check_secondary_const(client): 
    secondary = client["client_profile"]['secondary_school']['name']
    secondary = clean_string(secondary)
    if secondary not in clean_string(client["client_description"]["Education Background"]):
        return False, "Secondary School Name Mismatch"
    return True, None

In [325]:
def check_grad_const(client):
    grad_year_secondary = str(client["client_profile"]['secondary_school']['graduation_year'])
    grad_year_secondary = clean_string(grad_year_secondary)
    if grad_year_secondary not in clean_string(client["client_description"]["Education Background"]):
        flags = False, "Secondary School Graduation Year Mismatch"
    return True, None

In [326]:
def check_higher_education_const(client):
    
    flags = True
    list_of_education = []
    for higher in client["client_profile"]['higher_education']:
        list_of_education.append((higher['university'], higher['graduation_year']))
    for uni, uni_graduation_year in list_of_education:
        if clean_string(uni) not in clean_string(client["client_description"]["Education Background"]):
            return False, "University info Mismatch"
        if str(uni_graduation_year) not in clean_string(client["client_description"]["Education Background"]):
            return False, "University info Mismatch"
    return True, None
    

In [327]:
def missing_any_inheritance_detail(client):
    # inheritance details missing
    if client['client_profile']['aum']['inheritance'] != 0:
        for key, value in client['client_profile']['inheritance_details'].items():
            if value == '':
                return False, "Missing Inheritance Details"
    return True, None

In [328]:
def missing_psprt_num_client_profile(client):
    # passport number missing in client_profile
    if client['client_profile']['passport_number'] is None or client['client_profile']['passport_number'] == '':
        return False, "Missing Passport Number in Client Profile"
    return True, None

In [329]:
def missing_phone_num_client_profile(client):
    # phone number missing in client_profile
    if client['client_profile']['phone_number'] is None or client['client_profile']['phone_number'] == '':
        return False, "Missing Phone Number in Client Profile"
    return True, None

In [330]:
def missing_any_address_detail_client_profile(client):
    # any address info missing in client_profile
    for key, value in client['client_profile']['address'].items():
        if value is None or value == '':
            return False, "Missing Address Details in Client Profile"
    return True, None

In [331]:
def missing_any_employment_detail_client_profile(client):
    # any employment history missing in client_profile
    if client['client_profile']['employment_history'] != []:
        for job in client['client_profile']['employment_history']:
            for key, value in job.items():
                if value == '':
                    return False, "Missing Employment History Details in Client Profile"
    return True, None

In [332]:
def missing_education_background(client):
    # education background missing in client_profile (should be the check for any note missing in client_description from my point of view)
    if client['client_description']['Education Background'] == '':
        return False, "Missing Education Background in Client Profile"
    return True, None

In [333]:
def missing_any_psprt_info(client):
    # any info except middle name is missing in passport info
    for key, value in client['passport'].items():
        if key != 'middle_name' and value == '':   
            return False, "Missing Passport Details"
    return True, None

In [334]:
def missing_info_account_form(client):
    # any info except middle name and address is missing in account_form
    for key, value in client['account_form'].items():
        if key != 'address' and key != 'middle_name' and value == '':   
            return False, "Missing Account Form Details"
    return True, None

In [335]:
def missing_any_address_info_account_form(client):
    # any address info missing in account_form
    for key, value in client['account_form']['address'].items():
        if value is None or value == '':
            return False, "Missing Address Details in Account Form"
    return True, None

In [336]:
def check_inheritance_profession(client):
    currency, aum, inheritance, real_estate_list = generate_wealth_summary_text(client)
    if inheritance != {}:  
        if clean_string(inheritance['profession']) not in clean_string(client["client_description"]["Wealth Summary"]):
            return False, 'Inheritance Information does not match Wealth Summary'
    return True, None

In [337]:
def check_inheritance_year(client):
    currency, aum, inheritance, real_estate_list = generate_wealth_summary_text(client)
    if inheritance != {}:  
        if str(inheritance['inheritance year']) not in clean_string(client["client_description"]["Wealth Summary"]):
            return False, 'Inheritance Information does not match Wealth Summary'
    return True, None

In [338]:
def check_inheritance_relationship(client):
    currency, aum, inheritance, real_estate_list = generate_wealth_summary_text(client)
    if inheritance != {}:  
        if clean_string(inheritance['relationship']) not in clean_string(client["client_description"]["Wealth Summary"]):
            return False, 'Inheritance Information does not match Wealth Summary'
    return True, None

In [339]:
def check_inheritance_amount(client):
    currency, aum, inheritance, real_estate_list = generate_wealth_summary_text(client)
    if aum['inheritance'] != 0:  
        if (str(aum['inheritance']) + ' '+ str(currency).lower()) not in clean_string(client["client_description"]["Wealth Summary"]):
            return False, 'Inheritance amount does not match Wealth Summary'
    return True, None

In [340]:
def check_savings_amount(client):
    currency, aum, inheritance, real_estate_list = generate_wealth_summary_text(client)
    if aum['savings'] != 0:  
        if (str(aum['savings']) + ' '+ str(currency).lower()) not in clean_string(client["client_description"]["Wealth Summary"]):
            return False, 'Savings amount does not match Wealth Summary'
    return True, None

In [341]:
def check_client_nones(client):
    # Check if any of the required fields are None or empty
    if client is None:
        return False, "Missing docs"
    required_fields = [
        'passport', 'account_form', 'client_profile', 'client_description'
    ]
    for field in required_fields:
        if field not in client or client[field] is None or (isinstance(client[field], dict) and not client[field]):
            return False, f"Missing {field} information"
    return True, None

In [342]:
###Add his ID
def check_all_flags(client):

    flag = True
    error_messages = []
    #infos = {'client_id': client['client_id']}
    out, message = check_client_nones(client)
    if out==False:
        flag = False
        error_messages.append(message)
        return flag, error_messages

    out, message = is_high_school_graduation_year_valid(client['client_profile']['secondary_school']['graduation_year'], client['client_profile']['birth_date'])
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_multiple_country_const(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = check_name(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = check_full_name(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = check_gender(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_passport_number(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = check_passport_expiry_date(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_email_validity(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = check_phone_number(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_email_address(client)
    if out==False:
        flag = False
        error_messages.append(message)

    messages = check_passport_dates(client)
    if len(messages) != 0:
        flag = False
        for k in messages:
            error_messages.append(k)
    
    out, message = mrz_check(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = currency_match(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = domicile_validator(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = address_validator(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_age(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = check_higher_education(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_employment_history(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_gender_not_null(client)
    if out==False:
        flag = False
        error_messages.append(message)
     
    out, message = check_mandate_not_null(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_empty_risk_profile(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = check_properties_sum_to_aum(client)
    if out==False:
        flag = False
        error_messages.append(message)   
    
    out, message = check_secondary_const(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_grad_const(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_higher_education_const(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = missing_any_inheritance_detail(client)
    if out==False:
        flag = False
        error_messages.append(message)
        
    out, message = missing_psprt_num_client_profile(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = missing_phone_num_client_profile(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = missing_any_address_detail_client_profile(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = missing_any_employment_detail_client_profile(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = missing_education_background(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = missing_any_psprt_info(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = missing_info_account_form(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = missing_any_address_info_account_form(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_inheritance_profession(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_inheritance_year(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = check_inheritance_relationship(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = check_inheritance_amount(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = check_savings_amount(client)
    if out==False:
        flag = False
        error_messages.append(message)

    


    return flag, error_messages
            


        



In [343]:
def parse_boolean_answer(answer):
    # Normalize and strip whitespace/newlines
    answer_clean = str(answer).strip().lower()
    
    # Check if the cleaned answer starts with 'false' (can catch 'false\n', 'false.', etc.)
    if 'false' in answer_clean:
        return False
    elif 'true' in answer_clean:
        return True
    else:
        raise ValueError(f"Unexpected answer format: {answer_clean}")
    
    return True

In [344]:
def check_family_background(client,model):
    input_text = 'marital status:'+ str(client["client_profile"]["marital_status"])+ ". "+ "Family Background:" + str(client["client_description"]["Family Background"])
    response = ollama.chat(
        model=model,
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a logical AI assistant. Based on the user's input, check if the marital status "
                    "is coherent with the family background. Only reply with 'True' if they are coherent, otherwise reply with 'False'."
                )
            },
            {"role": "user", "content": input_text}
        ],
        options={"num_predict": 5, "temperature": 0.0})
    return response['message']['content']

In [345]:
def check_all_backgrounds(client, model):
    
    if not parse_boolean_answer(check_family_background(client,model)):
        return False, "Family Background Check Failed"
    #if not parse_boolean_answer(check_education_background(client, model)):
    #    return False, "Education Background Check Failed"
    #if not parse_boolean_answer(check_work_background(client, model)):
    #    return False, "Work Background Check Failed"
    #if not parse_boolean_answer(check_wealth_background(client, model)):
    #    return False, "Wealth Background Check Failed"
    #if not parse_boolean_answer(check_client_summary(client, model)):
    #    return False, "Client Summary Check Failed"
    return True, None

In [346]:
clients = load_clients("clients_eval.pkl")
flags_preds = []
client_errors = []

for client in clients:
    flag, error_messages = check_all_flags(client)
    flags_preds.append(flag)
    client_errors.append(error_messages)
    
    


In [347]:
#client_labels = []
#for client in clients:
#    if client['label']['label'] == 'Reject':
#        client_labels.append(False)
#    elif client['label']['label'] == 'Accept':
#        client_labels.append(True)

In [348]:
client_errors

[[],
 ['Passport Name Mismatch',
  'Graduation Years Inconsistent',
  'University info Mismatch'],
 [],
 [],
 [],
 [],
 ['Passport Expiry Date Invalid',
  'Passport Issued Date Mismatch',
  'Passport Expiry Date Mismatch',
  'Passport Issued Date Mismatch'],
 ['Investment Risk Profile Not Specified'],
 ['Graduation Years Inconsistent',
  'University info Mismatch',
  'Inheritance Information does not match Wealth Summary',
  'Inheritance Information does not match Wealth Summary'],
 ['Passport Name Mismatch', 'MRZ Mismatch'],
 [],
 ['High School Graduation Inconsistent',
  'Passport Name Mismatch',
  'Full Name Mismatch',
  'MRZ Mismatch',
  'Inheritance Information does not match Wealth Summary',
  'Inheritance Information does not match Wealth Summary'],
 [],
 [],
 ['Missing docs'],
 ['Country Code Mismatch',
  'Graduation Years Inconsistent',
  'University info Mismatch'],
 [],
 ['Passport Name Mismatch',
  'Full Name Mismatch',
  'Phone Number Mismatch',
  'MRZ Mismatch'],
 [],
 ['

In [349]:
import pandas as pd

all_errors = sorted(set(err for sublist in client_errors for err in sublist))

data = []
for errors in client_errors:
    row = [1 if error in errors else 0 for error in all_errors]
    data.append(row)

df = pd.DataFrame(data, columns=all_errors)
df.index.name = "Client"

print(df)


        Address Mismatch  Country Code Mismatch  Country of Domicile Mismatch  \
Client                                                                          
0                      0                      0                             0   
1                      0                      0                             0   
2                      0                      0                             0   
3                      0                      0                             0   
4                      0                      0                             0   
...                  ...                    ...                           ...   
995                    0                      0                             0   
996                    0                      0                             0   
997                    0                      0                             0   
998                    0                      0                             0   
999                    0    

In [350]:
#load eval data
clients_eval = load_clients("clients_eval.pkl")
model = 'phi:latest'
family_background = []
education_background = []
work_background = []
wealth_background = []
client_summ = []

def flag_clients(clients):
    flags_preds = []
    client_errors = []

    for client in clients:
        flag, error_messages = check_all_flags(client)
        flags_preds.append(flag)
        client_errors.append(error_messages)

    return flags_preds, client_errors

#remaining_indexes = []
#for i in range(len(flags_preds)):
#    if flags_preds[i] != client_labels[i] or client_labels[i] == flags_preds[i] == True:
#        remaining_indexes.append(i)
#print("Remaining indexes length: ", len(remaining_indexes))

#remaining_clients = []
#for i in remaining_indexes:
#    remaining_clients.append(clients[i])

#save the remaining clients to a new file
#import pickle
#with open('remaining_clients.pkl', 'wb') as f:
#    pickle.dump(remaining_clients, f)
    




        


In [351]:
def infer_llama(client, flags_preds, client_errors):
    for i, client in enumerate(clients):
        if flags_preds[i] == False:
            continue
        out, msg = check_all_backgrounds(client, model)
        if out == False:
            flags_preds[i] = False
            client_errors[i].append(msg)
    return flags_preds, client_errors

clients = load_clients("clients_eval.pkl")
flags_preds, client_errors = flag_clients(clients)
flags_preds, client_errors = infer_llama(clients, flags_preds, client_errors)

In [355]:
num_clients = 1000
#create a dataframe with client_0 to client_9999 and values Accept and Reject if the flags_preds is True or False
df = pd.DataFrame({
    'Client': [f'client_{i}' for i in range(num_clients)],
    'Label': ['Accept' if flags_preds[i] else 'Reject' for i in range(num_clients)]
})

#save it as csv with ; as delimiter
df.to_csv('client_labels.csv', sep=';', index=False, header=False)

In [None]:
#save the client_errors to a new file
import pickle
with open('client_errors.pkl', 'wb') as f:
    pickle.dump(client_errors, f)

['High School Graduation Inconsistent', 'Passport Name Mismatch', 'Full Name Mismatch', 'MRZ Mismatch', 'Inheritance Information does not match Wealth Summary', 'Inheritance Information does not match Wealth Summary']


In [354]:
from sklearn.metrics import confusion_matrix
import numpy as np
array_flags = np.array(flags_preds)
#array_labels = np.array(client_labels)
#confusion_matrix(np.array(flags_preds), np.array(client_labels))