# 1- Import Libraries

In [1]:
from load_data import load_clients
import phonenumbers
import pycountry
import re
from sklearn.metrics import confusion_matrix
import numpy as np
from datetime import datetime
from countryinfo import CountryInfo
from email_validator import validate_email, EmailNotValidError
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderUnavailable, GeocoderTimedOut
import pgeocode
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from xgboost import XGBClassifier
import numpy as np
from datetime import timedelta


# 2- Load Data

In [2]:
clients = load_clients("clients.pkl")


In [3]:
print(clients[0])

{'passport': {'first_name': 'Freja', 'middle_name': 'Katrine', 'last_name': 'Christensen', 'gender': 'F', 'country': 'Denmark', 'country_code': 'DNK', 'nationality': 'Danish', 'birth_date': '2002-04-18', 'passport_number': 'UE2130779', 'passport_mrz': ['P<DNKCHRISTENSEN<<FREJA<KATRINE<<<<<<<<<<<<<<', 'UE2130779DNK020418<<<<<<<<<<<<<<<<<<<<<<<<<<<'], 'passport_issue_date': '2017-05-11', 'passport_expiry_date': '2027-05-10'}, 'client_profile': {'name': 'Freja Katrine Christensen', 'address': {'city': 'Aalborg', 'street name': 'Strøget', 'street number': 57, 'postal code': '2044'}, 'country_of_domicile': 'Denmark', 'birth_date': '2002-04-18', 'nationality': 'Danish', 'passport_number': 'UE2130779', 'passport_issue_date': '2017-05-11', 'passport_expiry_date': '2027-05-10', 'gender': 'F', 'phone_number': '53 11 20 42', 'email_address': 'freja.christensen@yousee.dk', 'marital_status': 'single', 'secondary_school': {'name': 'Holstebro Gymnasium', 'graduation_year': 2022}, 'higher_education': 

# 3- Define check functions

In [4]:
error_messages = ['High School Graduation Inconsistent', 'Inconsistent country code, country name or nationality', 'Invalid Phone Number', 'Invalid Postal Code']

In [5]:
def is_high_school_graduation_year_valid(graduation_year: int, birth_date: str, min_age: int = 16, max_age: int = 21) -> bool:
    """
    Check if the graduation year is reasonable given a person's birth date.

    Args:
        graduation_year (int): The year the person graduated high school.
        birth_date (str): Birth date in "YYYY-MM-DD" format.
        min_age (int): Minimum expected age at graduation (default: 16)
        max_age (int): Maximum expected age at graduation (default: 23)

    Returns:
        bool: True if the graduation year is within a reasonable age range.
    """
    try:
        birth_year = datetime.strptime(birth_date, "%Y-%m-%d").year
        age_at_graduation = graduation_year - birth_year
        return min_age <= age_at_graduation <= max_age
    except ValueError:
        print("Invalid birth date format. Use YYYY-MM-DD.")
        return False

In [6]:
def check_multiple_country_consistency(codes_str, countries_str, nationalities_str):
    # Split input strings by commas and strip whitespace
    codes = [c.strip() for c in codes_str.split(',')]
    countries = [c.strip() for c in countries_str.split(',')]
    nationalities = [n.strip() for n in nationalities_str.split(',')]

    # Ensure all lists have the same length
    if not (len(codes) == len(countries) == len(nationalities)):
        return False, "Input lists must be of the same length"

    results = []

    for i, (code, country_name, nationality) in enumerate(zip(codes, countries, nationalities)):
        try:
            # 1. Validate country code
            country = pycountry.countries.get(alpha_2=code.upper())
            if not country:
                results.append((False, f"[{i}] Invalid country code: {code}"))
                continue

            # 2. Check if country name matches code
            if country.name.lower() != country_name.lower():
                results.append((False, f"[{i}] Mismatch: code '{code}' is for '{country.name}', not '{country_name}'"))
                continue

            # 3. Check nationality using CountryInfo
            info = CountryInfo(country.name)
            expected_nationality = info.info().get("demonym", None)

            if not expected_nationality:
                results.append((False, f"[{i}] Could not find nationality for '{country.name}'"))
                continue

            if nationality.lower() != expected_nationality.lower():
                results.append((False, f"[{i}] Nationality '{nationality}' does not match expected '{expected_nationality}' for {country.name}"))
                continue

            # All good for this triplet
            results.append((True, f"[{i}] All fields are consistent and valid"))

        except KeyError:
            results.append((False, f"[{i}] CountryInfo does not recognize '{country_name}'"))
        except Exception as e:
            results.append((False, f"[{i}] Error: {str(e)}"))

    return results

In [7]:
def could_be_valid_email(email):
    try:
        validate_email(email, check_deliverability=False)
        return True
    except EmailNotValidError:
        return False

def compare_strings_ignore_spaces(str1, str2):
    clean1 = str1.replace(" ", "")
    clean2 = str2.replace(" ", "")
    return clean1 == clean2

def william_flags(client):
    flags = True
    if client['passport']['first_name'] != client['account_form']['first_name']:
        flags = False
    if client['passport']['last_name'] != client['account_form']['last_name']:
        flags = False
    if client['passport']['middle_name'] != client['account_form']['middle_name']:
        flags = False
    if compare_strings_ignore_spaces((client['passport']['first_name']+client['passport']['middle_name']+client['passport']['last_name']), client['account_form']['name']) == False:
        flags = False
    if compare_strings_ignore_spaces((client['passport']['first_name']+client['passport']['middle_name']+client['passport']['last_name']), client['client_profile']['name']) == False:
        flags = False
    if client['passport']['gender'] != client['client_profile']['gender']:
        flags = False

    if client['passport']['passport_number'] != client['account_form']['passport_number']:
        flags = False
    
    # Convert string to datetime object (adjust format if needed)
    expiry_date = datetime.strptime(client['passport']['passport_expiry_date'], "%Y-%m-%d").date()
    today = datetime.strptime("2021-04-10", "%Y-%m-%d").date() #2025-04-1  ?
    if expiry_date < today:
        flags = False
    
    if could_be_valid_email(client['client_profile']['email_address']) == False:
        flags = False
    if client['account_form']['email_address'] != client['client_profile']['email_address']:
        flags = False   
    if client['account_form']['phone_number'] != client['client_profile']['phone_number']:
        flags = False 
    passport_birth_date = datetime.strptime(client['passport']['birth_date'], "%Y-%m-%d").date()
    client_birth_date = datetime.strptime(client['client_profile']['birth_date'], "%Y-%m-%d").date()
    passport_issued_date = datetime.strptime(client['passport']['passport_issue_date'], "%Y-%m-%d").date()
    client_issued_date = datetime.strptime(client['client_profile']['passport_issue_date'], "%Y-%m-%d").date()
    passport_expiry_date = datetime.strptime(client['passport']['passport_expiry_date'], "%Y-%m-%d").date()
    client_expiry_date = datetime.strptime(client['client_profile']['passport_expiry_date'], "%Y-%m-%d").date()
    if passport_expiry_date < passport_issued_date:
        flags = False
    if passport_birth_date != client_birth_date:
        flags = False
    if passport_issued_date != client_issued_date:
        flags = False
    if passport_expiry_date != client_expiry_date:
        flags = False
    if passport_issued_date < passport_birth_date:
        flags = False
    return flags

In [8]:
def mrz_check(client):
    flags = True
    if (flags == True) and (client['passport']['last_name'].upper() not in client['passport']['passport_mrz'][0] or client['passport']['first_name'].upper() not in client['passport']['passport_mrz'][0] or client['passport']['middle_name'].upper() not in client['passport']['passport_mrz'][0] or client['passport']['country_code'].upper() not in client['passport']['passport_mrz'][0]):
        flags = False
    if (flags == True) and ((len(client['passport']['middle_name']) == 0) and not (client['passport']['passport_mrz'][0].index(client['passport']['country_code'].upper()) < client['passport']['passport_mrz'][0].index(client['passport']['last_name'].upper()))): #no middle name
        flags = False
    if (flags == True) and ((len(client['passport']['middle_name']) != 0) and not (client['passport']['passport_mrz'][0].index(client['passport']['country_code'].upper()) < client['passport']['passport_mrz'][0].index(client['passport']['last_name'].upper()) < client['passport']['passport_mrz'][0].index(client['passport']['middle_name'].upper()))):
        flags = False
    if (flags == True) and (client['passport']['passport_number']+client['passport']['country_code']+client['passport']['birth_date'].replace('-','')[2:] not in client['passport']['passport_mrz'][1]):
        flags = False
    return flags

In [9]:
def currency_match(client):
    flags = True
    if client['account_form']['currency'] != client['client_profile']['currency']:
        flags = False
    return flags

In [10]:
def domicile_validator(client):
    """
    Validates if the country of domicile in client profile corresponds to the country of domicile in account form
    """
    return client['account_form']['country_of_domicile'] == client['client_profile']['country_of_domicile']

def address_validator(client):
    """
    Validates if the address in client profile corresponds to the address in account form
    """
    return client['account_form']['address'] == client['client_profile']['address']

_nomi_cache = {}

def get_country_code(country_name):
    """
    Convert a full country name (e.g., 'Spain') to ISO Alpha-2 code (e.g., 'ES').

    Returns:
        str or None: ISO Alpha-2 code or None if not found.
    """
    try:
        return pycountry.countries.lookup(country_name).alpha_2
    except LookupError:
        return None

def get_nomi_instance(country_code):
    """
    Return a cached pgeocode.Nominatim instance for the given country.
    """
    if country_code not in _nomi_cache:
        _nomi_cache[country_code] = pgeocode.Nominatim(country_code)
    return _nomi_cache[country_code]

def validate_postal_code_for_client(client):
    """
    Validates if the postal code exists in the provided city and country.

    Parameters:
        client (dict): a client info.

    Returns:
        bool: True if postal code matches the city in any listed country, else False.
    """

    # client_profile (dict): Must include 'address' and 'country_of_domicile'.
    client_profile = client['client_profile']
    country_names = [name.strip() for name in client_profile.get('country_of_domicile', '').split(',')]
    address = client_profile.get('address', {})
    city = address.get('city', '').lower()
    postal_code = address.get('postal code', '')

    for country_name in country_names:
        country_code = get_country_code(country_name)
        if not country_code:
            continue  # Skip invalid countries

        nomi = get_nomi_instance(country_code)
        postal_info = nomi.query_postal_code(postal_code)['postal_code']

        if postal_info is None:
            continue

        # matched_cities = [c.strip().lower() for c in postal_info.place_name.split(',')]
        if postal_info == postal_code:
            return True  # Valid match

    return False  # No match found


In [11]:
def check_age(client):
    try:
        birth_date = datetime.strptime(client['client_profile']['birth_date'], "%Y-%m-%d")
        age = (datetime.now()-timedelta(5) - birth_date).days // 365
        return age >=18
    except ValueError:
        print("Invalid date format. Use YYYY-MM-DD.")
        return None

In [12]:
def check_higher_education(client):
    try:
        if client['client_profile']['higher_education'] is None or len(client['client_profile']['higher_education']) == 0:
            return True
        else:
            birth_date = datetime.strptime(client['client_profile']['birth_date'], "%Y-%m-%d")
            if client['client_profile']['higher_education'][0]['graduation_year'] > client['client_profile']['secondary_school']['graduation_year'] and client['client_profile']['higher_education'][0]['graduation_year'] - int(birth_date.year) >17:
                return True
            else:
                return False
    except ValueError:
        print("Invalid date format. Use YYYY-MM-DD.")
        return False

In [13]:
def check_employment_history(client):
    try:
        if client['client_profile']['employment_history'] is None or len(client['client_profile']['employment_history']) == 0:
            return True
        else:
            birth_date = datetime.strptime(client['client_profile']['birth_date'], "%Y-%m-%d")
            if client['client_profile']['employment_history'][0]['start_year']  - int(birth_date.year) >16:
                return True
            else:
                return False
    except ValueError:
        print("Invalid date format. Use YYYY-MM-DD.")
        return False

In [14]:
def check_gender_not_null(client):
    if client["passport"]["gender"] is None or client["passport"]["gender"] == "":
        return False
    return True

def check_mandate_not_null(client):
    if client["client_profile"]["type_of_mandate"] is None or client["client_profile"]["type_of_mandate"] == "":
        return False
    return True

In [15]:
def check_empty_risk_profile(client):
    if client["client_profile"]["investment_risk_profile"] is None or client["client_profile"]["investment_risk_profile"] == "":
        return False
    return True

In [16]:
def check_properties_sum_to_aum(client):
    try:
        if client["client_profile"]["real_estate_details"] is None or len(client["client_profile"]["real_estate_details"]) == 0 and client["client_profile"]["aum"]["real_estate_value"]>0:
            return False
        else:
            value = [prop["property value"] for prop in client["client_profile"]["real_estate_details"]]
            value = sum(value)
            if value == client["client_profile"]["aum"]["real_estate_value"]:
                return True
            else:
                return False
    except ValueError:
        print("Invalid date format. Use YYYY-MM-DD.")
        return False

# 4- Flag the dataset

In [17]:
flags_preds = []

for client in clients:
    flags = True
    if not check_multiple_country_consistency(client["passport"]["country_code"],client["passport"]["country"],client["passport"]["nationality"]) or not is_high_school_graduation_year_valid(client['client_profile']['secondary_school']['graduation_year'], client['client_profile']['birth_date']) or not validate_postal_code_for_client(client) or not domicile_validator(client) or not address_validator(client) or not currency_match(client) or not mrz_check(client) or not william_flags(client) or not check_age(client) or not check_higher_education(client) or not check_employment_history(client) or not check_gender_not_null(client) or not check_empty_risk_profile(client) or not check_properties_sum_to_aum(client):
        flags = False
    flags_preds.append(flags)

In [18]:
client_labels = []
for client in clients:
    if client['label']['label'] == 'Reject':
        client_labels.append(False)
    elif client['label']['label'] == 'Accept':
        client_labels.append(True)

# 5- Reducing training to mistakes

In [19]:
remaining_indexes = []
for i in range(len(flags_preds)):
    if flags_preds[i] != client_labels[i] or client_labels[i] == flags_preds[i] == True:
        remaining_indexes.append(i)
print("Remaining indexes length: ", len(remaining_indexes))

Remaining indexes length:  6404


In [20]:
#Only select the wrong indexes
remaining_clients = [clients[j] for j in remaining_indexes]

In [21]:
#save the remaining clients to a new file
import pickle
with open("remaining_clients.pkl", "wb") as f:
    pickle.dump(remaining_clients, f)

# 6- Encode Data

In [22]:
import pickle
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
from load_updated_data import load_clients
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from datetime import datetime, timedelta
from typing import List, Dict, Any, Tuple



def data_to_df(full_data:list):
    dfs = []
    for client in full_data:
        keep = [
            ['gender', 'country_code', 'birth_date'],
            ['country_of_domicile', 'nationality', 'marital_status','real_estate_details', 'investment_risk_profile',
             'higher_education', 'employment_history', 'investment_risk_profile', 'investment_horizon', 'investment_experience',
             'type_of_mandate', 'preferred_markets', 'currency'],
            [],
            [],
            ['label']
                ]
        keys=client.keys()
        res={}
        for key, subkeys in zip(keys, keep):
            d = {subkey: client[key][subkey] for subkey in subkeys }
            res = res|d
        #Convert label to 1 or 0 if accepted or rejected
        if res['label']== 'Reject':
            res['label'] = 0
        elif res['label'] == 'Accept':
            res['label'] = 1
        dfs.append(res)
        
        
    return pd.DataFrame.from_records(dfs)



country_code_encoder = LabelEncoder()
country_dom_encoder = LabelEncoder()
marital_status_encoder = LabelEncoder()
investment_experience_encoder = LabelEncoder()
currency_encoder = LabelEncoder()
gender_encoder = LabelEncoder()
nationality_encoder = LabelEncoder()
irp_encoder = LabelEncoder()
ih_encoder = LabelEncoder()
mandate_encoder = LabelEncoder()
pref_markets_encoder=MultiLabelBinarizer()

def encode_gender(df:pd.DataFrame,encoder:LabelEncoder)-> pd.DataFrame:
    encoder.fit(df['gender'])
    df['gender'] = encoder.transform(df['gender'])
    return df

def encode_nationality(df:pd.DataFrame,encoder:LabelEncoder)-> pd.DataFrame:
    encoder.fit(df['nationality'])
    df['nationality'] = encoder.transform(df['nationality'])
    return df

def encode_irp(df:pd.DataFrame,encoder:LabelEncoder)-> pd.DataFrame:
    encoder.fit(df['investment_risk_profile'])
    df['investment_risk_profile'] = encoder.transform(df['investment_risk_profile'])
    return df

def encode_ih(df:pd.DataFrame,encoder:LabelEncoder)-> pd.DataFrame:
    encoder.fit(df['investment_horizon'])
    df['investment_horizon'] = encoder.transform(df['investment_horizon'])
    return df

def encode_mandate(df:pd.DataFrame,encoder:LabelEncoder)-> pd.DataFrame:
    encoder.fit(df['type_of_mandate'])
    df['type_of_mandate'] = encoder.transform(df['type_of_mandate'])
    return df

def encode_pref_markets(df:pd.DataFrame,encoder:MultiLabelBinarizer)-> pd.DataFrame:
    encoded=encoder.fit_transform(df['preferred_markets'].to_list())
    temp_df = pd.DataFrame.from_records(encoded)
    temp_df.rename(columns = {i: f'pref_markets_{i}' for i in range(temp_df.shape[1])},inplace=True)
    return pd.concat([df,temp_df],axis=1)

def encode_country_code(df: pd.DataFrame,encoder:LabelEncoder) -> pd.DataFrame:
    encoder.fit(df['country_code'])
    df['country_code'] = encoder.transform(df['country_code'])
    return df

def get_age(birth_date: str) -> int:
    try:
        birth_date = datetime.strptime(birth_date, "%Y-%m-%d")
        age = (datetime.now()-timedelta(5) - birth_date).days // 365
        return age
    except ValueError:
        print("Invalid date format. Use YYYY-MM-DD.")
        return None

def encode_age(df: pd.DataFrame) -> pd.DataFrame:
    df['age'] = df['birth_date'].apply(get_age)
    return df

def encode_country_of_domicile(df: pd.DataFrame,encoder:LabelEncoder) -> pd.DataFrame:
    encoder.fit(df['country_of_domicile'])
    df['country_of_domicile'] = encoder.transform(df['country_of_domicile'])
    return df

def encode_marital_status(df: pd.DataFrame,encoder:LabelEncoder) -> pd.DataFrame:
    encoder.fit(df['marital_status'])
    df['marital_status'] = encoder.transform(df['marital_status'])
    return df

def encode_investment_experience(df: pd.DataFrame,encoder:LabelEncoder) -> pd.DataFrame:
    encoder.fit(df['investment_experience'])
    df['investment_experience'] = encoder.transform(df['investment_experience'])
    return df

def encode_currency(df: pd.DataFrame,encoder:LabelEncoder) -> pd.DataFrame:
    encoder.fit(df['currency'])
    df['currency'] = encoder.transform(df['currency'])
    return df

def get_higher_ed(higher_ed: list) -> int:
    if higher_ed is None or len(higher_ed) == 0:
        return 0
    else:
        return 1

def encode_higher_education(df: pd.DataFrame) -> pd.DataFrame:
    df['higher_education'] = df['higher_education'].apply(get_higher_ed)
    return df

def encode(x:pd.DataFrame)-> pd.DataFrame:
    x= encode_gender(x,gender_encoder)
    x = encode_nationality(x,nationality_encoder)
    x = encode_irp(x,irp_encoder)
    x = encode_ih(x, ih_encoder)
    x = encode_mandate(x,mandate_encoder)
    x = encode_pref_markets(x, pref_markets_encoder)
    x = encode_country_code(x,country_code_encoder)
    x = encode_age(x)
    x = encode_country_of_domicile(x,country_dom_encoder)
    x = encode_marital_status(x,marital_status_encoder)
    x = encode_investment_experience(x, investment_experience_encoder)
    x = encode_currency(x,currency_encoder)
    x = encode_higher_education(x)
    return x


def calculate_effective_experience(jobs: List[Tuple[int, int]]) -> int:
    """
    Calculates effective work experience in years, avoiding double-counting overlapping years.

    Args:
        jobs: List of (start_year, end_year) tuples. None as end_year means the job is current.

    Returns:
        Total number of unique working years across all jobs.
    """
    worked_years = set()
    for start, end in jobs:
        if start is None:
            continue
        end = end if end is not None else 2025
        worked_years.update(range(start, end))
    return len(worked_years)

def extract_numeric_features(full_data: List[Dict[str, Any]]) -> pd.DataFrame:
    """
    Extracts numeric features from a list of client dictionaries.

    Args:
        full_data: List of dictionaries, each representing a client with nested data.

    Returns:
        A pandas DataFrame containing engineered numeric features.
    """

    aum_vec = [np.sum(list(client['client_profile']['aum'].values())) for client in full_data]
    property_value_vec = [client['client_profile']['aum']['real_estate_value'] for client in full_data]
    property_count_vec = [len(client['client_profile']['real_estate_details']) for client in full_data]
    inheritance_vec = [client['client_profile']['aum']['inheritance'] for client in full_data]
    savings_vec = [client['client_profile']['aum']['savings'] for client in full_data]
    job_count_vec = [len(client['client_profile']['employment_history']) for client in full_data]

    # Compute property-to-cash ratio with zero-division handling
    property_to_cash_vec = [
        prop_value / (total_value - prop_value) if (total_value - prop_value) != 0 else -float('inf')
        for prop_value, total_value in zip(property_value_vec, aum_vec)
    ]
    max_ratio = np.max(property_to_cash_vec)
    property_to_cash_vec = [max_ratio if value == -float("inf") else value for value in property_to_cash_vec]

    # Inheritance / (inheritance + savings), safe against zero-division
    inheritance_to_cash_vec = [
        inheritance / (inheritance + saving) if (inheritance + saving) != 0 else 0
        for inheritance, saving in zip(inheritance_vec, savings_vec)
    ]

    # Salary & experience metrics
    current_salary_vec = []
    max_salary_vec = []
    total_work_experience_vec = []
    effective_work_experience_vec = []

    for client in full_data:
        emp_hist = client['client_profile']['employment_history']
        salary = 0
        max_salary = 0

        if len(emp_hist) == 0:
            total_work_experience = 0
            effective_work_experience = 0

        else:
            year_history = []
            min_start = np.inf
            max_end = -np.inf
            for job in emp_hist:
                if max_salary < job['salary']:
                    max_salary = job['salary']

                start, end = job['start_year'], job['end_year']
                year_history.append((start, end))

                if end is None:
                    salary += job['salary']
                    end = 2025

                if start < min_start:
                    min_start = start
                if end > max_end:
                    max_end = end

            total_work_experience = max_end - min_start
            effective_work_experience = calculate_effective_experience(year_history)

        total_work_experience_vec.append(total_work_experience)
        effective_work_experience_vec.append(effective_work_experience)
        current_salary_vec.append(salary)
        max_salary_vec.append(max_salary)

    # Savings per active work year
    saving_per_annum_vec = [
        saving / work_exp if work_exp != 0 else saving
        for saving, work_exp in zip(savings_vec, effective_work_experience_vec)
    ]

    # Current salary compared to max salary seen in career
    salary_to_max_salary_vec = [
        salary / max_salary if max_salary != 0 else 0
        for salary, max_salary in zip(current_salary_vec, max_salary_vec)
    ]

    # Construct final DataFrame
    df_numeric = pd.DataFrame({
        'aum': aum_vec,
        'property_value': property_value_vec,
        'num_properties': property_count_vec,
        'inheritance_value': inheritance_vec,
        'savings_value': savings_vec,
        'num_jobs': job_count_vec,
        'current_salary': current_salary_vec,
        'max_salary': max_salary_vec,
        'property_to_cash_ratio': property_to_cash_vec,
        'inheritance_to_cash_ratio': inheritance_to_cash_vec,
        'total_work_experience': total_work_experience_vec,
        'effective_work_experience': effective_work_experience_vec,
        'saving_per_annum': saving_per_annum_vec,
        'salary_to_max_salary_ratio': salary_to_max_salary_vec
    })

    return df_numeric

def data_for_ML(data:list) -> pd.DataFrame:
    x = extract_numeric_features(data)
    y = encode(data_to_df(data))
    return pd.concat([y,x],axis=1)

In [23]:
remaining_clients = load_clients("remaining_clients.pkl")
remaining_clients_df = data_for_ML(remaining_clients)

# 7- Training function

In [24]:
def train_xgb_model(df, feature_cols, target_col):
    """
    Trains an XGBoost classifier using selected features from a preprocessed DataFrame.

    Parameters:
    - df: pandas DataFrame with preprocessed features (e.g., already multi-hot encoded)
    - feature_cols: list of column names to use as features
    - target_col: column name of the target

    Returns:
    - model: trained XGBClassifier
    - feature_cols: list of features used (for inference)
    """
    
    X_train = df[feature_cols]
    y_train = df[target_col]
    #print datatypes for X_train and y_train
    #Save to a txt all X_train datatypes


    # Train/test split just for validation (optional)

    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train, y_train)

    return model, feature_cols


In [25]:
def prepare_features(df, feature_names):
    return df[feature_names]



In [27]:
features = ['gender', 'country_code', 'age', 'country_of_domicile',
       'nationality', 'marital_status', 'higher_education',
       'investment_risk_profile', 'investment_horizon',
       'investment_experience', 'type_of_mandate',
       'currency', 'pref_markets_0', 'pref_markets_1',
       'pref_markets_2', 'pref_markets_3', 'pref_markets_4', 'pref_markets_5',
       'pref_markets_6', 'pref_markets_7', 'pref_markets_8', 'pref_markets_9',
        'aum', 'property_value', 'num_properties', 'inheritance_value',
       'savings_value', 'num_jobs', 'current_salary', 'max_salary',
       'property_to_cash_ratio', 'inheritance_to_cash_ratio',
       'total_work_experience', 'effective_work_experience',
       'saving_per_annum', 'salary_to_max_salary_ratio']

target = 'label'
model, feature_names = train_xgb_model(remaining_clients_df, features, target)



Parameters: { "use_label_encoder" } are not used.



# 7- CV

In [52]:

#import stratifykfold
from sklearn.model_selection import StratifiedKFold
def cross_validate_xgb(df, feature_cols, target_col, cv=5, scoring='accuracy', return_train_score=False):
    """
    Performs cross-validation on an XGBoost classifier.

    Parameters:
    - df: pandas DataFrame with preprocessed features
    - feature_cols: list of columns used as input features
    - target_col: name of the target column
    - cv: number of cross-validation folds
    - scoring: scoring metric (e.g., 'accuracy', 'f1', 'roc_auc')
    - return_train_score: if True, returns training scores too

    Returns:
    - scores: dictionary of CV scores (mean, std, and optionally train/test scores)
    """
    X = df[feature_cols]
    y = df[target_col]

    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

    scores = cross_validate(
        model,
        X,
        y,
        cv=StratifiedKFold(n_splits=cv, shuffle=True),
        #cv=cv,
        scoring=scoring,
        return_train_score=return_train_score,
        n_jobs=-1,


        

    )
    #print confusion matrix for each fold
    for i, (train_index, test_index) in enumerate(StratifiedKFold(n_splits=cv, shuffle=True, random_state=42).split(X, y)):
        model.fit(X.iloc[train_index], y.iloc[train_index])
        y_pred = model.predict(X.iloc[test_index])
        cm = confusion_matrix(y.iloc[test_index], y_pred)
        print(f"Fold {i+1} Confusion Matrix:\n{cm}\n")
    

    #stratify cross validation
    
    
    print(f"Mean {scoring}: {np.mean(scores['test_score']):.4f} ± {np.std(scores['test_score']):.4f}")
    return scores

In [53]:
cross_validate_xgb(remaining_clients_df, features, target, cv=5, scoring='accuracy', return_train_score=True)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fold 1 Confusion Matrix:
[[ 59 220]
 [ 36 966]]

Fold 2 Confusion Matrix:
[[ 54 225]
 [ 29 973]]



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fold 3 Confusion Matrix:
[[ 56 223]
 [ 34 968]]

Fold 4 Confusion Matrix:
[[ 58 222]
 [ 28 973]]

Fold 5 Confusion Matrix:
[[ 54 225]
 [ 45 956]]

Mean accuracy: 0.7962 ± 0.0047


Parameters: { "use_label_encoder" } are not used.



{'fit_time': array([0.37877536, 0.38628435, 0.37777591, 0.3707757 , 0.39279509]),
 'score_time': array([0.00950885, 0.00851107, 0.00850844, 0.00799966, 0.00699949]),
 'test_score': array([0.80093677, 0.79234973, 0.79937549, 0.79937549, 0.7890625 ]),
 'train_score': array([0.96096037, 0.94671091, 0.95959399, 0.96135077, 0.95589383])}

# x- Generate confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np
array_flags = np.array(flags_preds)
array_labels = np.array(client_labels)
confusion_matrix(np.array(flags_preds), np.array(client_labels))

array([[3578,    0],
       [1414, 5008]], dtype=int64)