In [36]:
from load_data import load_clients
import phonenumbers
from pycountry import pycountry
import re
from sklearn.metrics import confusion_matrix
import numpy as np
from datetime import datetime
from countryinfo import CountryInfo
from email_validator import validate_email, EmailNotValidError
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderUnavailable, GeocoderTimedOut
import pgeocode
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from xgboost import XGBClassifier
import numpy as np
from datetime import timedelta

In [4]:
def is_high_school_graduation_year_valid(graduation_year: int, birth_date: str, min_age: int = 16, max_age: int = 21) -> bool:
    """
    Check if the graduation year is reasonable given a person's birth date.

    Args:
        graduation_year (int): The year the person graduated high school.
        birth_date (str): Birth date in "YYYY-MM-DD" format.
        min_age (int): Minimum expected age at graduation (default: 16)
        max_age (int): Maximum expected age at graduation (default: 23)

    Returns:
        bool: True if the graduation year is within a reasonable age range.
    """
    try:
        birth_year = datetime.strptime(birth_date, "%Y-%m-%d").year
        age_at_graduation = graduation_year - birth_year
        return min_age <= age_at_graduation <= max_age, "High School Graduation Inconsistent"
    except ValueError:
        print("Invalid birth date format. Use YYYY-MM-DD.")
        return False, "Invalid Birth Date Format. Use YYYY-MM-DD"

In [37]:
def check_multiple_country_consistency(codes_str, countries_str, nationalities_str):
    # Split input strings by commas and strip whitespace
    codes = [c.strip() for c in codes_str.split(',')]
    countries_list = [c.strip() for c in countries_str.split(',')]
    nationalities = [n.strip() for n in nationalities_str.split(',')]

    # Ensure all lists have the same length
    if not (len(codes) == len(countries_list) == len(nationalities)):
        return False, "Input lists are inconsistent in length"

    for i, (code, country_name, nationality) in enumerate(zip(codes, countries_list, nationalities)):
        try:
            # 1. Validate country code
            country = countries.get(alpha_2=code.upper())
            if not country:
                return False, "Invalid country code"

            # 2. Check if country name matches code
            if country.name.lower() != country_name.lower():
                return False, "Country name does not match country code"

            # 3. Check nationality using CountryInfo
            info = CountryInfo(country.name)
            expected_nationality = info.info().get("demonym", None)

            if not expected_nationality:
                return False, "Expected nationality not found"

            if nationality.lower() != expected_nationality.lower():
                return False, "Nationality does not match expected value"

        except KeyError:
            return False, "Country name not recognized"
        except Exception:
            return False, "Unexpected error occurred during validation"

    return True, "no error"

In [5]:
def check_multiple_country_consistency(codes_str, countries_str, nationalities_str):
    # Split input strings by commas and strip whitespace
    codes = [c.strip() for c in codes_str.split(',')]
    countries = [c.strip() for c in countries_str.split(',')]
    nationalities = [n.strip() for n in nationalities_str.split(',')]
    results = []

    # Ensure all lists have the same length
    if not (len(codes) == len(countries) == len(nationalities)):
        results.append("Input Lists Inconsistent Length (codes, countries, nationalities)")
        return results

    for i, (code, country_name, nationality) in enumerate(zip(codes, countries, nationalities)):
        try:
            # 1. Validate country code
            country = pycountry.countries.get(alpha_2=code.upper())
            if not country:
                results.append("Country Code Not Found")
                continue

            # 2. Check if country name matches code
            if country.name.lower() != country_name.lower():
                results.append("Mismatch: Code And Country Name Mismatch")
                continue

            # 3. Check nationality using CountryInfo
            info = CountryInfo(country.name)
            expected_nationality = info.info().get("demonym", None)

            if not expected_nationality:
                results.append("Could Not Find Nationality for Country")
                

            if nationality.lower() != expected_nationality.lower():
                results.append("Nationality Does Not Match Expected Nationality for Country")

        except KeyError:
            results.append("Database does not contain Country Name")
        except Exception as e:
            results.append(f"[{i}]Unexpected Error: {str(e)}")

    return results

In [6]:
def could_be_valid_email(email):
    try:
        validate_email(email, check_deliverability=False)
        return True
    except EmailNotValidError:
        return False

def compare_strings_ignore_spaces(str1, str2):
    clean1 = str1.replace(" ", "")
    clean2 = str2.replace(" ", "")
    return clean1 == clean2

def check_name(client):
    flags = True
    if client['passport']['first_name'] != client['account_form']['first_name']:
        flags = False
    if client['passport']['last_name'] != client['account_form']['last_name']:
        flags = False
    if client['passport']['middle_name'] != client['account_form']['middle_name']:
        flags = False
    return flags, "Passport Name Mismatch" 

def check_full_name(client):
    flags = True
    if compare_strings_ignore_spaces((client['passport']['first_name']+client['passport']['middle_name']+client['passport']['last_name']), client['account_form']['name']) == False:
        flags = False

    if compare_strings_ignore_spaces((client['passport']['first_name']+client['passport']['middle_name']+client['passport']['last_name']), client['client_profile']['name']) == False:
        flags = False
    return flags, "Full Name Mismatch"

def check_gender(client):
    flags = True
    if client['passport']['gender'] != client['client_profile']['gender']:
        flags = False
    return flags, "Passport Gender Mismatch"

def check_passport_number(client):
    flags = True
    if client['passport']['passport_number'] != client['account_form']['passport_number']:
        flags = False
    return flags, "Passport Number Mismatch"

def check_passport_expiry_date(client):
    flags = True
    # Convert string to datetime object (adjust format if needed)
    expiry_date = datetime.strptime(client['passport']['passport_expiry_date'], "%Y-%m-%d").date()
    today = datetime.strptime("2021-04-10", "%Y-%m-%d").date() #2025-04-1  ?
    if expiry_date < today:
        flags = False
    return flags, "Passport Expiry Date Invalid"

def check_email_validity(client):
    flags = True
    if could_be_valid_email(client['client_profile']['email_address']) == False:
        flags = False
    return flags, "Email Address Invalid"

def check_phone_number(client):
    flags = True
    if client['account_form']['phone_number'] != client['client_profile']['phone_number']:
        flags = False
    return flags, "Phone Number Mismatch"

def check_email_address(client):
    flags = True
    if client['account_form']['email_address'] != client['client_profile']['email_address']:
        flags = False
    return flags, "Email Address Mismatch"

def check_passport_dates(client):
    messages = []
    passport_birth_date = datetime.strptime(client['passport']['birth_date'], "%Y-%m-%d").date()
    client_birth_date = datetime.strptime(client['client_profile']['birth_date'], "%Y-%m-%d").date()
    passport_issued_date = datetime.strptime(client['passport']['passport_issue_date'], "%Y-%m-%d").date()
    client_issued_date = datetime.strptime(client['client_profile']['passport_issue_date'], "%Y-%m-%d").date()
    passport_expiry_date = datetime.strptime(client['passport']['passport_expiry_date'], "%Y-%m-%d").date()
    client_expiry_date = datetime.strptime(client['client_profile']['passport_expiry_date'], "%Y-%m-%d").date()
    if passport_expiry_date < passport_issued_date:
        messages.append("Passport Issued Date Invalid")
    if passport_birth_date != client_birth_date:
        messages.append("Passport Birth Date Mismatch")
    if passport_issued_date != client_issued_date:
        messages.append("Passport Issued Date Mismatch")
    if passport_expiry_date != client_expiry_date:
        messages.append("Passport Expiry Date Mismatch")
    if passport_issued_date < passport_birth_date:
        messages.append("Passport Issued Date Mismatch")
    return messages


In [7]:
def mrz_check(client):
    flags = True
    if (flags == True) and (client['passport']['last_name'].upper() not in client['passport']['passport_mrz'][0] or client['passport']['first_name'].upper() not in client['passport']['passport_mrz'][0] or client['passport']['middle_name'].upper() not in client['passport']['passport_mrz'][0] or client['passport']['country_code'].upper() not in client['passport']['passport_mrz'][0]):
        flags = False
    if (flags == True) and ((len(client['passport']['middle_name']) == 0) and not (client['passport']['passport_mrz'][0].index(client['passport']['country_code'].upper()) < client['passport']['passport_mrz'][0].index(client['passport']['last_name'].upper()))): #no middle name
        flags = False
    if (flags == True) and ((len(client['passport']['middle_name']) != 0) and not (client['passport']['passport_mrz'][0].index(client['passport']['country_code'].upper()) < client['passport']['passport_mrz'][0].index(client['passport']['last_name'].upper()) < client['passport']['passport_mrz'][0].index(client['passport']['middle_name'].upper()))):
        flags = False
    if (flags == True) and (client['passport']['passport_number']+client['passport']['country_code']+client['passport']['birth_date'].replace('-','')[2:] not in client['passport']['passport_mrz'][1]):
        flags = False
    return flags, "MRZ Mismatch"

In [11]:
def currency_match(client):
    flags = True
    if client['account_form']['currency'] != client['client_profile']['currency']:
        flags = False
    return flags, "Currency Mismatch"

In [9]:
def domicile_validator(client):
    """
    Validates if the country of domicile in client profile corresponds to the country of domicile in account form
    """
    return client['account_form']['country_of_domicile'] == client['client_profile']['country_of_domicile'], "Country of Domicile Mismatch"

def address_validator(client):
    """
    Validates if the address in client profile corresponds to the address in account form
    """
    return client['account_form']['address'] == client['client_profile']['address'], "Address Mismatch"

# #_nomi_cache = {}

# def get_country_code(country_name):
#     """
#     Convert a full country name (e.g., 'Spain') to ISO Alpha-2 code (e.g., 'ES').

#     Returns:
#         str or None: ISO Alpha-2 code or None if not found.
#     """
#     try:
#         return pycountry.countries.lookup(country_name).alpha_2
#     except LookupError:
#         return None

# def get_nomi_instance(country_code):
#     """
#     Return a cached pgeocode.Nominatim instance for the given country.
#     """
#     if country_code not in _nomi_cache:
#         _nomi_cache[country_code] = pgeocode.Nominatim(country_code)
#     return _nomi_cache[country_code]

# def validate_postal_code_for_client(client):
#     """
#     Validates if the postal code exists in the provided city and country.

#     Parameters:
#         client (dict): a client info.

#     Returns:
#         bool: True if postal code matches the city in any listed country, else False.
#     """

#     # client_profile (dict): Must include 'address' and 'country_of_domicile'.
#     client_profile = client['client_profile']
#     country_names = [name.strip() for name in client_profile.get('country_of_domicile', '').split(',')]
#     address = client_profile.get('address', {})
#     city = address.get('city', '').lower()
#     postal_code = address.get('postal code', '')

#     for country_name in country_names:
#         country_code = get_country_code(country_name)
#         if not country_code:
#             continue  # Skip invalid countries

#         nomi = get_nomi_instance(country_code)
        # postal_info = nomi.query_postal_code(postal_code)['postal_code']

#         if postal_info is None:
#             continue

#         # matched_cities = [c.strip().lower() for c in postal_info.place_name.split(',')]
#         if postal_info == postal_code:
#             return True  # Valid match

#     return False  # No match found

In [10]:
def check_age(client):
    try:
        birth_date = datetime.strptime(client['client_profile']['birth_date'], "%Y-%m-%d")
        age = (datetime.now()-timedelta(5) - birth_date).days // 365
        return age >=18, "Age less than 18"
    except ValueError:
        print("Invalid date format. Use YYYY-MM-DD.")
        return None, "Invalid Date Format. Use YYYY-MM-DD"

In [12]:
def check_higher_education(client):
    try:
        if client['client_profile']['higher_education'] is None or len(client['client_profile']['higher_education']) == 0:
            return True, None
        else:
            birth_date = datetime.strptime(client['client_profile']['birth_date'], "%Y-%m-%d")
            if client['client_profile']['higher_education'][0]['graduation_year'] > client['client_profile']['secondary_school']['graduation_year'] and client['client_profile']['higher_education'][0]['graduation_year'] - int(birth_date.year) >17:
                return True, None
            else:
                return False, "Graduation Years Inconsistent"
    except ValueError:
        print("Invalid date format. Use YYYY-MM-DD.")
        return False, "Invalid Date Format. Use YYYY-MM-DD"

In [13]:
def check_employment_history(client):
    try:
        if client['client_profile']['employment_history'] is None or len(client['client_profile']['employment_history']) == 0:
            return True, None
        else:
            birth_date = datetime.strptime(client['client_profile']['birth_date'], "%Y-%m-%d")
            if client['client_profile']['employment_history'][0]['start_year']  - int(birth_date.year) >16:
                return True, None
            else:
                return False, "Employment History Years Inconsistent"
    except ValueError:
        print("Invalid date format. Use YYYY-MM-DD.")
        return False

In [14]:
def check_gender_not_null(client):
    if client["passport"]["gender"] is None or client["passport"]["gender"] == "":
        return False, "Gender Not Specified"
    return True, None

def check_mandate_not_null(client):
    if client["client_profile"]["type_of_mandate"] is None or client["client_profile"]["type_of_mandate"] == "":
        return False, "Type of Mandate Not Specified"
    return True, None

In [15]:
def check_empty_risk_profile(client):
    if client["client_profile"]["investment_risk_profile"] is None or client["client_profile"]["investment_risk_profile"] == "":
        return False, "Investment Risk Profile Not Specified"
    return True, None

In [16]:
def check_properties_sum_to_aum(client):
    try:
        if client["client_profile"]["real_estate_details"] is None or len(client["client_profile"]["real_estate_details"]) == 0 and client["client_profile"]["aum"]["real_estate_value"]>0:
            return False, "Real Estate Value Mismatch"
        else:
            value = [prop["property value"] for prop in client["client_profile"]["real_estate_details"]]
            value = sum(value)
            if value == client["client_profile"]["aum"]["real_estate_value"]:
                return True, None
            else:
                return False, "Real Estate Value Mismatch"
    except ValueError:
        print("Invalid date format. Use YYYY-MM-DD.")
        return False, "Invalid Date Format. Use YYYY-MM-DD"

In [65]:
###Add his ID
def check_all_flags(client):

    flag = True
    error_messages = []
    #infos = {'client_id': client['client_id']}

    out, message = is_high_school_graduation_year_valid(client['client_profile']['secondary_school']['graduation_year'], client['client_profile']['birth_date'])
    if out==False:
        flag = False
        error_messages.append(message)
    
    # out, message = check_multiple_country_consistency(client["passport"]["country_code"],client["passport"]["country"],client["passport"]["nationality"])
    # if out==False:
    #     flag = False
    #     error_messages.append(message)

    out, message = check_name(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = check_full_name(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = check_gender(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_passport_number(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = check_passport_expiry_date(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_email_validity(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = check_phone_number(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_email_address(client)
    if out==False:
        flag = False
        error_messages.append(message)

    messages = check_passport_dates(client)
    if len(messages) != 0:
        flag = False
        for k in messages:
            error_messages.append(k)
    
    out, message = mrz_check(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = currency_match(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = domicile_validator(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = address_validator(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_age(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = check_higher_education(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_employment_history(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_gender_not_null(client)
    if out==False:
        flag = False
        error_messages.append(message)
     
    out, message = check_mandate_not_null(client)
    if out==False:
        flag = False
        error_messages.append(message)
    
    out, message = check_empty_risk_profile(client)
    if out==False:
        flag = False
        error_messages.append(message)

    out, message = check_properties_sum_to_aum(client)
    if out==False:
        flag = False
        error_messages.append(message)   
        
    return flag, error_messages
            


        



In [None]:
def check_all_backgrounds(clients, model):
    
    

In [66]:
clients = load_clients("clients.pkl")
flags_preds = []
client_errors = []

for client in clients:
    flag, error_messages = check_all_flags(client)
    flags_preds.append(flag)
    client_errors.append(error_messages)
    
    


In [59]:
client_labels = []
for client in clients:
    if client['label']['label'] == 'Reject':
        client_labels.append(False)
    elif client['label']['label'] == 'Accept':
        client_labels.append(True)

In [69]:
client_errors

[[],
 ['Passport Name Mismatch', 'Phone Number Mismatch'],
 ['Passport Name Mismatch', 'Full Name Mismatch'],
 ['MRZ Mismatch'],
 ['MRZ Mismatch'],
 [],
 [],
 [],
 [],
 ['MRZ Mismatch'],
 [],
 [],
 [],
 [],
 ['High School Graduation Inconsistent'],
 [],
 ['MRZ Mismatch'],
 ['High School Graduation Inconsistent'],
 ['Passport Name Mismatch', 'Country of Domicile Mismatch'],
 ['High School Graduation Inconsistent', 'Type of Mandate Not Specified'],
 [],
 [],
 ['High School Graduation Inconsistent', 'Country of Domicile Mismatch'],
 [],
 [],
 [],
 ['High School Graduation Inconsistent'],
 [],
 [],
 [],
 [],
 ['High School Graduation Inconsistent'],
 ['Phone Number Mismatch', 'Graduation Years Inconsistent'],
 ['Phone Number Mismatch', 'Graduation Years Inconsistent'],
 ['Phone Number Mismatch', 'Graduation Years Inconsistent'],
 [],
 [],
 ['High School Graduation Inconsistent'],
 [],
 [],
 ['High School Graduation Inconsistent',
  'Phone Number Mismatch',
  'MRZ Mismatch'],
 [],
 ['Passpo

In [68]:
import pandas as pd

all_errors = sorted(set(err for sublist in client_errors for err in sublist))

data = []
for errors in client_errors:
    row = [1 if error in errors else 0 for error in all_errors]
    data.append(row)

df = pd.DataFrame(data, columns=all_errors)
df.index.name = "Client"

print(df)


        Address Mismatch  Age less than 18  Country of Domicile Mismatch  \
Client                                                                     
0                      0                 0                             0   
1                      0                 0                             0   
2                      0                 0                             0   
3                      0                 0                             0   
4                      0                 0                             0   
...                  ...               ...                           ...   
9995                   0                 0                             0   
9996                   0                 0                             0   
9997                   0                 0                             0   
9998                   0                 0                             0   
9999                   0                 0                             0   

        Cur

In [None]:
def parse_boolean_answers(answer_list):
    parsed_flags = []
    for answer in answer_list:
        # Normalize and strip whitespace/newlines
        answer_clean = str(answer).strip().lower()
        
        # Check if the cleaned answer starts with 'false' (can catch 'false\n', 'false.', etc.)
        if 'false' in answer_clean:
            parsed_flags.append(False)
        elif 'true' in answer_clean:
            parsed_flags.append(True)
        else:
            
            parsed_flags.append(False)
    
    return parsed_flags

In [None]:
#load eval data
clients_eval = load_clients("clients_eval.pkl")

def flag_clients(clients):
    flags_preds = []
    client_errors = []

    for client in clients:
        flag, error_messages = check_all_flags(client)
        flags_preds.append(flag)
        client_errors.append(error_messages)

    return flags_preds, client_errors


def infer_llama(client, flags_preds, client_errors):
    for i, client in enumerate(clients):
        if flags_preds[i] == False:
            continue

        

        


In [70]:
from sklearn.metrics import confusion_matrix
import numpy as np
array_flags = np.array(flags_preds)
array_labels = np.array(client_labels)
confusion_matrix(np.array(flags_preds), np.array(client_labels))

array([[3621,    0],
       [1371, 5008]], dtype=int64)