In [None]:
# Load IPython's autoreload extension in mode 2 so that edits to imported
# project modules (e.g. `utils`) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Default figure size. NOTE(review): not referenced anywhere else in the visible
# part of this notebook — confirm whether it is still needed.
figsize=(14, 4)
import os
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from utils import file, plot, data, stat, model_metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, AdaBoostClassifier, GradientBoostingRegressor
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score, ConfusionMatrixDisplay, confusion_matrix, accuracy_score, mean_squared_error, mean_absolute_error, r2_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from imblearn.over_sampling import SMOTE
from itertools import product
import xgboost as xgb
import time
# Render every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Input files are read from data/prod, two directory levels up (relative path).
data_folder = os.path.join("../..", "data/prod")

file_name_collection, file_name_konto = "Collection_data.csv", "konto_data_trimmed.csv"
file_path_collection, file_path_konto = (
    os.path.join(data_folder, csv_name)
    for csv_name in (file_name_collection, file_name_konto)
)

## Data preprocessing

In [None]:
# Maximum context window for each case, i.e. how many of the most recent konto rows
# before the case are kept. Set to -1 for the full window (all information before the case).
max_context_window = -1

# ------------------------------------------------------------
# LOAD DATA
# ------------------------------------------------------------
konto_frame = file.load_konto_data(file_path_konto)
collection_frame = file.load_collection_data(file_path_collection)


# ------------------------------------------------------------
# DEFINE COLLECTION TARGETS
# ------------------------------------------------------------
# Builds `target_frame`: one row per Collectionid holding the dates, balance and
# LGD target of each collection case, restricted to cases usable for training.

# Keep only collection rows whose person also appears in the konto data.
ids_in_kont = konto_frame["PersonId"].unique()
collection_frame = collection_frame[collection_frame["PersonId"].isin(ids_in_kont)]
# Take the last row of every Collectionid (in the frame's current row order) as the
# case's target record.
# NOTE(review): presumably collection_frame is ordered chronologically so that this
# is the latest snapshot of each case — confirm in file.load_collection_data.
target_frame = (
    collection_frame[
        [
            "Collectionid",
            "PersonId",
            "AccountId",
            "CollectionOpenedDate",
            "CollectionClosedDate",
            "BalanceSentAmt",
            "MonthInDCA",
            "CumulativeLossAmt",
        ]
    ]
    .groupby("Collectionid")
    .tail(1)
    .set_index("Collectionid")
)
# Close cases when they go to monitoring ("overvåk").
# Per Collectionid, take the earliest YearMonth among rows with MonthsInZCOV == 0 and
# use the end of that month as ZCOVDate.
# NOTE(review): assumes MonthsInZCOV == 0 marks the month a case enters ZCOV — confirm.
zcov_frame = collection_frame[collection_frame["MonthsInZCOV"] == 0].copy()
zcov_frame = zcov_frame.sort_values("YearMonth").drop_duplicates("Collectionid")
zcov_frame["ZCOVDate"] = zcov_frame["YearMonth"] + pd.offsets.MonthEnd(0)
target_frame["ZCOVDate"] = target_frame.index.map(
    zcov_frame.set_index("Collectionid")["ZCOVDate"]
)
# Where a case has no closed date but does have a ZCOVDate, use the ZCOVDate as the
# closed date and record that this substitution happened.
target_frame["ClosedDateSetByZCOVDate"] = 0
cond = target_frame["CollectionClosedDate"].isna() & target_frame["ZCOVDate"].notna()
target_frame.loc[cond, "CollectionClosedDate"] = target_frame.loc[cond, "ZCOVDate"]
target_frame.loc[cond, "ClosedDateSetByZCOVDate"] = 1

# Flag whether the case is still open (no closed date even after the ZCOV substitution)
target_frame["IsOpen"] = (target_frame["CollectionClosedDate"].isna()).astype(int)

# Flag whether the case was opened within the dataset period (on or after 2023-09-01)
target_frame["CollectionOpenedAfter202309"] = (
    target_frame["CollectionOpenedDate"] >= pd.Timestamp("2023-09-01")
).astype(int)

# Compute the number of days the case has been open. Note that ZCOVDate is set to the
# last day of the month
target_frame["DurationDays"] = (
    pd.to_datetime(target_frame["CollectionClosedDate"])
    - pd.to_datetime(target_frame["CollectionOpenedDate"])
).dt.days

# Define the loss. Loss below 0 is set to 0. Loss above the balance sent is set to the
# balance. Both conditions are evaluated on the raw CumulativeLossAmt.
target_frame["Loss"] = (
    target_frame["CumulativeLossAmt"]
    .mask(target_frame["CumulativeLossAmt"] < 0, 0)
    .mask(
        target_frame["CumulativeLossAmt"] > target_frame["BalanceSentAmt"],
        target_frame["BalanceSentAmt"],
    )
)

# Compute LGD as a share of the balance. Rows with a zero balance produce inf/NaN
# here; they are removed by the filter below.
target_frame["LGD"] = target_frame["Loss"] / target_frame["BalanceSentAmt"]

# Remove all cases that either have no balance, were opened before the dataset period,
# or are not closed
mask_filter = (
    (target_frame["BalanceSentAmt"] != 0)
    & (target_frame["CollectionOpenedAfter202309"] == 1)
    & (target_frame["IsOpen"] == 0)
)
target_frame = target_frame[mask_filter]


# ------------------------------------------------------------
# CREATE DATASET WITH KONTODATA FOR EACH COLLECTION
# ------------------------------------------------------------
# For every collection case, collect the konto rows of the same person and account
# dated strictly before the case was opened, and tag them with the Collectionid.
rows = []
for idx, row in target_frame.iterrows():
    mask = (
        (konto_frame["PersonId"] == row["PersonId"])
        & (konto_frame["AccountId"] == row["AccountId"])
        & (konto_frame["YearMonth"] < row["CollectionOpenedDate"])
    )
    # Order the history chronologically instead of trusting the loader's row order:
    # `tail(max_context_window)` below and the "last" aggregations further down are
    # only meaningful when the newest month comes last. The stable sort keeps the
    # original relative order of rows that share a YearMonth.
    temp = konto_frame[mask].sort_values("YearMonth", kind="stable")
    if max_context_window != -1:
        # Keep only the most recent `max_context_window` rows before the case.
        temp = temp.tail(max_context_window)
    # `idx` is the Collectionid (index of target_frame); assign() returns a new frame.
    rows.append(temp.assign(Collectionid=idx))
if not rows:
    # pd.concat([]) would raise an opaque "No objects to concatenate".
    raise ValueError(
        "target_frame is empty after filtering; there are no collection cases to build features for."
    )
feature_frame = pd.concat(rows, ignore_index=True)
feature_frame = data.reorder_column(feature_frame, "Collectionid", 3)


# ------------------------------------------------------------
# ADD FEATURES
# ------------------------------------------------------------
# Numeric risk level (0-5) assigned to each GeneralStatusDesc value, listed per level.
# Statuses that are not listed map to NaN.
_statuses_by_risk_level = {
    0: [
        "Normal",
        "Ikke aktivert",
        "Avsluttet",
        "Avsluttet av kunde",
    ],
    1: [
        "Faktura forfalt",
        "Purring",
        "Under avslutning",
        "Under avslutning manglende KYC",
        "Sperret",
        "Sperret, propagert",
        "Eget misbruk",
        "Avsluttet inkasso, oppgjort",
    ],
    2: [
        "Purring forfalt",
        "Purring med kortsperre",
        "Spesialengasjement, manuell behandling",
        "Betalingsplan, nedbetaling",
    ],
    3: ["Inkassovarsel"],
    4: ["Overført inkassobyrå"],
    5: [
        "Inkasso, overført overvåk",
        "Avsluttet inkasso, med tap",
        "Gjeldsordning ikke DCA Offentlig",
        "Gjeldsordning ikke DCA Privat",
        "Konkursbo Privat",
    ],
}
status_map = {
    status: level
    for level, statuses in _statuses_by_risk_level.items()
    for status in statuses
}
feature_frame["RisikoStatus"] = feature_frame["GeneralStatusDesc"].map(status_map)
feature_frame = data.reorder_column(feature_frame, "RisikoStatus", 10)

# Replace the numeric product id with the product name; ids not listed become NaN.
product_map = {
    1: "SB1 GOLD MC",
    2: "Sparebank 1 Platinum MC",
    4: "SH BUSINESS VISA",
    7: "LOfavør Mastercard",
    8: "SB1 UNG MC",
    30: "SH GOLD MC",
    34: "SB1 EXTRA MC",
    38: "LOfavør Mastercard Ung",
    40: "SpareBank 1 Mastercard Elite",
}
feature_frame["ProductId"] = feature_frame["ProductId"].map(product_map)
feature_frame.rename(columns={"ProductId": "Productname"}, inplace=True)

# Integer-encode gender and age group. The astype(int) fails if any value is missing
# from the corresponding code table.
_gender_codes = {"F": 0, "M": 1}
_age_group_codes = {
    "0 - 24": 0,
    "25 - 34": 1,
    "35 - 44": 2,
    "45 - 54": 3,
    "55 - 64": 4,
    "> 64": 5,
}
feature_frame["Gender"] = feature_frame["Gender"].map(_gender_codes).astype(int)
feature_frame["AgeGroup"] = feature_frame["AgeGroup"].map(_age_group_codes).astype(int)

# After encoding, the gender column is 1 for "M", so it is renamed to "Male".
col_rename_dict = {"Gender": "Male"}
feature_frame.rename(columns=col_rename_dict, inplace=True)


# ------------------------------------------------------------
# AGGREGATE EACH COLLECTION-SEQUENCE INTO A SINGLE ROW
# ------------------------------------------------------------
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` with undefined results replaced by 0.

    Division by zero yields NaN for 0/0 and +/-inf for x/0. A plain ``fillna(0)``
    only catches the NaN case, so an infinite value could reach the feature matrix,
    where non-finite values are rejected by scikit-learn's input validation.
    """
    ratio = numerator / denominator
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)


# Spend-category columns: summed per collection, turned into shares of total turnover
# further down, and then dropped.
cols_to_share_by_turnover = [
    "Last_Airlines",
    "Last_Amusement and Entertainment",
    "Last_Automobile / Vehicle Rental",
    "Last_Business Services",
    "Last_Clothing Stores",
    "Last_Contracted Services",
    "Last_Government Services",
    "Last_Hotels",
    "Last_Includes all lodging merchants",
    "Last_Mail Order / Telephone Order Providers",
    "Last_Miscellaneous Stores",
    "Last_Others",
    "Last_Professional Services and Membership Organizations",
    "Last_Repair Services",
    "Last_Retail Stores",
    "Last_Service Providers",
    "Last_Transportation",
    "Last_Utilities",
    "Last_Wholesale Distributors and Manufacturers",
]
# Columns taken from the last row of each sequence, followed by the category sums.
agg_dict = {
    "PersonId": "last",
    "Productname": "last",
    "DistributorId": "last",
    "AgeGroup": "last",
    "Male": "last",
    "MonthsSinceAccountCreatedNum": "last",
    "BalanceAmt": "last",
    "CreditLimitAmt": "last",
    "OverdueAmt": "last",
    "StatementClosingBalanceAmt": "last",
    **{category: "sum" for category in cols_to_share_by_turnover},
}

# Row-level helper columns are added to a copy so feature_frame itself stays untouched.
feature_copy = feature_frame.copy()
# Turnover / balance as a share of the credit limit. The "* -1" flips the sign;
# presumably the amounts are stored as negative numbers — TODO confirm.
# NOTE(review): a CreditLimitAmt of 0 gives inf/NaN here; not guarded in the original.
feature_copy["AndelTurnoverAvKredittgrense"] = (
    feature_copy["TurnoverAmt"] / feature_copy["CreditLimitAmt"] * -1
)
feature_copy["AndelBalanseAvKredittgrense"] = (
    feature_copy["BalanceAmt"] / feature_copy["CreditLimitAmt"] * -1
)
# Over-limit flag: interest-earning lending above the credit limit in that row.
feature_copy["Overtrekk"] = (
    feature_copy["InterestEarningLendingAmt"] > feature_copy["CreditLimitAmt"]
).astype(int)

# One groupby, reused for every per-collection statistic below. All results are indexed
# by Collectionid, so column assignment aligns them exactly like an index join.
per_collection = feature_copy.groupby("Collectionid")
agg_frame = per_collection.agg(agg_dict)

# Number of months with data
agg_frame["NumberOfMonths"] = per_collection["YearMonth"].nunique()
# Number of accounts. Each sequence was built from the single AccountId of its case,
# so this is currently the same for every collection.
# TODO: handle that a person can have several accounts; add a merge-accounts step.
agg_frame["NumberOfAccounts"] = per_collection["AccountId"].nunique()
# Mean of the risk statuses
agg_frame["RisikoStatusSnitt"] = per_collection["RisikoStatus"].mean()
# Sum of absolute row-to-row changes in risk status
agg_frame["SumRisikoStatusEndringer"] = per_collection["RisikoStatus"].apply(
    lambda x: x.diff().abs().sum()
)
# Average risk-status change per month
agg_frame["SnittRisikoStatusEndringerPerMåned"] = (
    agg_frame["SumRisikoStatusEndringer"] / agg_frame["NumberOfMonths"]
)
# Share of rows in each risk group
risk_counts = per_collection["RisikoStatus"].value_counts().unstack(fill_value=0)
risk_props = risk_counts.div(risk_counts.sum(axis=1), axis=0)
risk_props.columns = [f"RisikoStatus_{col}_share" for col in risk_props.columns]
agg_frame = agg_frame.join(risk_props)
# Mean balance
agg_frame["BalanceAmtMean"] = per_collection["BalanceAmt"].mean()
# Standard deviation of the balance (0 where it is undefined, e.g. a single row)
agg_frame["BalanceAmtStd"] = per_collection["BalanceAmt"].std().fillna(0)

# Number of credit limit increases
agg_frame["AntallCreditLimitIncrease"] = per_collection["CreditLimitIncreaseFlag"].sum()
# Number of times sent to debt collection
agg_frame["AntallCollectionFlag"] = per_collection["CollectionFlag"].sum()
# Average number of times sent to debt collection per month
agg_frame["SnittCollectionFlagPerMåned"] = (
    agg_frame["AntallCollectionFlag"] / agg_frame["NumberOfMonths"]
)
# Total turnover
agg_frame["SumTurnover"] = per_collection["TurnoverAmt"].sum()
# Average turnover per month
agg_frame["SnittTurnover"] = agg_frame["SumTurnover"] / agg_frame["NumberOfMonths"]
# Total number of transactions
agg_frame["TurnoverNumSum"] = per_collection["TurnoverNum"].sum()
# Average number of transactions per month
agg_frame["SnittTurnoverNum"] = (
    agg_frame["TurnoverNumSum"] / agg_frame["NumberOfMonths"]
)
# Average transaction size
agg_frame["AverageTransactionSize"] = _safe_ratio(
    agg_frame["SumTurnover"], agg_frame["TurnoverNumSum"]
)
# Total domestic turnover and its share of the total
agg_frame["SumTurnoverDom"] = per_collection["TurnoverDomAmt"].sum()
agg_frame["ShareOfTurnoverIsDomestic"] = _safe_ratio(
    agg_frame["SumTurnoverDom"], agg_frame["SumTurnover"]
)

# Withdrawal sum: fund transfers + ATM cash + counter cash
agg_frame["SumFundtransfer"] = per_collection["FundtransferAmt"].sum()
agg_frame["SumCashAtm"] = per_collection["CashAtmAmt"].sum()
agg_frame["SumCashCounter"] = per_collection["CashCounterAmt"].sum()
agg_frame["WithdrawalSum"] = (
    agg_frame["SumFundtransfer"] + agg_frame["SumCashAtm"] + agg_frame["SumCashCounter"]
)

# Withdrawals proportional to turnover
agg_frame["WithdrawalPropOfTurnover"] = _safe_ratio(
    agg_frame["WithdrawalSum"], agg_frame["SumTurnover"]
)

# Total expenditure as a proportion of the credit limit.
# NOTE(review): deliberately left unguarded as in the original; a CreditLimitAmt of 0
# yields inf/NaN. Decide which fill value is appropriate before guarding it.
agg_frame["ExpenditureAsShareOfCreditlimit"] = (
    agg_frame["SumTurnover"] + agg_frame["WithdrawalSum"]
) / agg_frame["CreditLimitAmt"]

# Mean turnover share of the credit limit
agg_frame["SnittTurnoverAndelAvKredittgrense"] = per_collection[
    "AndelTurnoverAvKredittgrense"
].mean()
# Last balance as a share of the credit limit
agg_frame["LastAndelBalanseAvKredittgrense"] = per_collection[
    "AndelBalanseAvKredittgrense"
].last()
# Number of over-limit rows
agg_frame["AntallOvertrekk"] = per_collection["Overtrekk"].sum()
# Average over-limit rows per month
agg_frame["SnittOvertrekkPerMåned"] = (
    agg_frame["AntallOvertrekk"] / agg_frame["NumberOfMonths"]
)

# Overdue amount relative to the size of the statement
agg_frame["OverdueShareOfDebt"] = _safe_ratio(
    agg_frame["OverdueAmt"], agg_frame["StatementClosingBalanceAmt"]
)

# Mean of the payment-overdue flag
agg_frame["SnittPaymentOverDueFlag"] = per_collection["PaymentOverDueFlag"].mean()
# Mean of the revolving flag
agg_frame["SnittRevolvingFlag"] = per_collection["RevolvingFlag"].mean()

# Express every spend category as a share of total turnover.
for col in cols_to_share_by_turnover:
    string = re.sub(r"^Last_", "", col)
    string = f"{string}_share_of_Turnover"
    agg_frame[string] = _safe_ratio(agg_frame[col], agg_frame["SumTurnover"])


# Drop intermediate sums and the raw category sums, then one-hot encode the two
# categorical columns. reset_index() turns Collectionid back into a regular column.
cols_to_drop = [
    "SumTurnoverDom",
    "SumFundtransfer",
    "SumCashAtm",
    "SumCashCounter",
] + cols_to_share_by_turnover
agg_frame = agg_frame.drop(cols_to_drop, axis=1)
agg_frame_encoded = pd.get_dummies(
    agg_frame, columns=["Productname", "DistributorId"]
).reset_index()


# ------------------------------------------------------------
# MERGE TARGET VALUE INTO FEATURE-FRAME
# ------------------------------------------------------------
# Attach the LGD target to the aggregated features and index the result by Collectionid.
feature_target_frame = agg_frame_encoded.merge(
    target_frame.reset_index()[["Collectionid", "LGD"]],
    on="Collectionid",
    how="left",
).set_index("Collectionid", drop=True)

# Preview only: a cell displays just its last expression, so the sorted head has to be
# that expression. Showing ten rows also avoids rendering the whole frame.
feature_target_frame.sort_values(by="PersonId", ascending=True).head(10)

## Split data

In [None]:
# Features: every column except the person identifier and the target.
X = feature_target_frame.drop(columns=["PersonId", "LGD"])
# Cast boolean columns (e.g. the one-hot dummies) to integers; other columns are kept.
bool_columns = [name for name, dtype in X.dtypes.items() if dtype == "bool"]
X = X.astype({name: int for name in bool_columns})

# Target: loss given default.
y = feature_target_frame["LGD"]

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
def _scaled(make_estimator):
    """Return a factory that builds a fresh StandardScaler -> estimator pipeline.

    The pipeline steps are named "scaler" and "clf".
    """

    def build():
        return Pipeline([("scaler", StandardScaler()), ("clf", make_estimator())])

    return build


# Registry of zero-argument factories; every call returns a new, unfitted model.
classifiers = {
    # Tree-based models
    "rf": lambda: RandomForestClassifier(n_estimators=100, random_state=42),
    "gb": lambda: GradientBoostingClassifier(n_estimators=100, random_state=42),
    "ada": lambda: AdaBoostClassifier(n_estimators=100, random_state=42),
    "dt": lambda: DecisionTreeClassifier(random_state=42),
    # Models that should be scaled
    "lr": _scaled(lambda: LogisticRegression(max_iter=2000, random_state=42)),
    "svc": _scaled(lambda: SVC(random_state=42)),
    "knn": _scaled(lambda: KNeighborsClassifier(n_neighbors=5)),
}

def train_all_classifiers(X_train, y_train, clf_options=None):
    """Fit every requested classifier for each of the three decision stages, plus the regressor.

    Stages (each trained on SMOTE-resampled data):
      * ``eq0`` - is LGD exactly 0?  (all training rows)
      * ``eq1`` - is LGD exactly 1?  (all training rows)
      * ``tie`` - 1 vs 0, trained only on rows whose LGD is exactly 0 or 1.

    A GradientBoostingRegressor is fitted on rows with 0 < LGD < 1 when there are more
    than 10 of them, otherwise on all training rows.

    Parameters
    ----------
    X_train, y_train : training features and LGD target.
    clf_options : iterable of keys into ``classifiers``; defaults to all keys.

    Returns
    -------
    (trained_models, training_times) : two dicts keyed ``"<stage>_<clf_name>"`` plus
        ``"regressor"``, holding the fitted models and their fit times in seconds.
    """
    if clf_options is None:
        clf_options = list(classifiers.keys())

    sm = SMOTE(random_state=42)
    trained_models = {}
    training_times = {}
    print("Training classifiers...")
    print("=" * 80)

    # (key prefix, banner, LGD value treated as the positive class, restrict to LGD in {0, 1})
    stage_specs = (
        ("eq0", "Training eq0 classifiers (LGD = 0)...", 0, False),
        ("eq1", "\nTraining eq1 classifiers (LGD = 1)...", 1, False),
        ("tie", "\nTraining tie-breaker classifiers...", 1, True),
    )
    for prefix, banner, positive_value, extremes_only in stage_specs:
        print(banner)
        print("-" * 80)
        if extremes_only:
            keep = (y_train == 0) | (y_train == 1)
            X_stage, y_stage = X_train[keep], y_train[keep]
        else:
            X_stage, y_stage = X_train, y_train
        labels = (y_stage == positive_value).astype(int)
        # Balance the binary labels once per stage; all classifiers share the result.
        X_balanced, y_balanced = sm.fit_resample(X_stage, labels)
        for clf_name in clf_options:
            started = time.time()
            model = classifiers[clf_name]()
            model.fit(X_balanced, y_balanced)
            elapsed = time.time() - started
            model_key = f"{prefix}_{clf_name}"
            trained_models[model_key] = model
            training_times[model_key] = elapsed
            print(f"  {prefix}_{clf_name:10s}: {elapsed:.3f}s")

    # Regression on the interior of the LGD range
    print("\nTraining regression model...")
    print("-" * 80)
    interior = (y_train > 0) & (y_train < 1)
    reg = GradientBoostingRegressor(
        n_estimators=30,
        max_depth=2,
        learning_rate=0.1,
        subsample=0.7,
        max_features=0.3,
        random_state=42,
    )
    started = time.time()
    if interior.sum() > 10:
        reg.fit(X_train[interior], y_train[interior])
    else:
        # Too few interior rows: fall back to the full training set.
        reg.fit(X_train, y_train)
    elapsed = time.time() - started
    trained_models["regressor"] = reg
    training_times["regressor"] = elapsed
    print(f"  regressor  : {elapsed:.3f}s")

    total_time = sum(training_times.values())
    print("=" * 80)
    print(f"Total training time: {total_time:.3f}s")
    print("=" * 80)

    return trained_models, training_times

def predict_with_pretrained_models(X_test, trained_models, clf_eq0_name, clf_eq1_name, clf_tie_name):
    """Combine already-fitted stage models into one LGD prediction per row.

    Routing per row, based on the two binary votes:
      * eq0 says 1, eq1 says 0 -> 0
      * eq0 says 0, eq1 says 1 -> 1
      * both say 0             -> regressor output, clipped to [0, 1]
      * both say 1             -> output of the tie-breaker classifier

    Parameters
    ----------
    X_test : rows to predict; must support boolean-mask row selection.
    trained_models : dict holding ``"eq0_<name>"``, ``"eq1_<name>"``, ``"tie_<name>"``
        and ``"regressor"`` entries.
    clf_eq0_name, clf_eq1_name, clf_tie_name : names selecting the model of each stage.

    Returns
    -------
    numpy array of length ``len(X_test)``; entries stay NaN only if a vote is neither 0 nor 1.
    """
    # Resolve all four models up front so a missing key fails before any prediction runs.
    zero_model = trained_models[f"eq0_{clf_eq0_name}"]
    one_model = trained_models[f"eq1_{clf_eq1_name}"]
    tie_model = trained_models[f"tie_{clf_tie_name}"]
    regressor = trained_models["regressor"]

    vote_zero = zero_model.predict(X_test)
    vote_one = one_model.predict(X_test)

    final_pred = np.full(len(X_test), np.nan)
    final_pred[(vote_zero == 1) & (vote_one == 0)] = 0
    final_pred[(vote_zero == 0) & (vote_one == 1)] = 1

    undecided = (vote_zero == 0) & (vote_one == 0)
    if undecided.any():
        final_pred[undecided] = np.clip(regressor.predict(X_test[undecided]), 0, 1)

    contested = (vote_zero == 1) & (vote_one == 1)
    if contested.any():
        final_pred[contested] = tie_model.predict(X_test[contested])

    return final_pred

def grid_search_classifiers(X_train, y_train, X_test, y_test, clf_options=None):
    """Evaluate every (eq0, eq1, tie-breaker) classifier combination on the test set.

    All models are trained once via ``train_all_classifiers``; each of the
    ``len(clf_options) ** 3`` combinations is then scored by routing the test rows
    through ``predict_with_pretrained_models`` and ``model_metrics.get_metrics``.
    A combination that raises is reported and skipped (best effort).

    Parameters
    ----------
    X_train, y_train : training features and LGD target.
    X_test, y_test : held-out features and LGD target used for scoring.
    clf_options : iterable of keys into ``classifiers``; defaults to all keys.

    Returns
    -------
    DataFrame with one row per successful combination, sorted by "Accuracy within 1%"
    in descending order. Empty if no combination succeeded.
    """
    if clf_options is None:
        clf_options = list(classifiers.keys())

    trained_models, training_times = train_all_classifiers(
        X_train, y_train, clf_options
    )
    results = []
    combinations = list(product(clf_options, repeat=3))

    print(f"\nTesting {len(combinations)} combinations...")
    for i, (eq0, eq1, tie) in enumerate(combinations):
        # Transient progress line; "\r" lets the next print overwrite it.
        print(
            f"Testing {i+1}/{len(combinations)}: eq0={eq0}, eq1={eq1}, tie={tie}",
            end="\r",
        )
        start_time = time.time()
        try:
            y_pred = predict_with_pretrained_models(X_test, trained_models, eq0, eq1, tie)

            # Training cost attributed to this combination: its three classifiers
            # plus the shared regressor.
            combo_training_time = (
                training_times[f"eq0_{eq0}"]
                + training_times[f"eq1_{eq1}"]
                + training_times[f"tie_{tie}"]
                + training_times["regressor"]
            )

            metrics = {
                "Classifier equal 0": eq0,
                "Classifier equal 1": eq1,
                "Tie breaker": tie,
                **model_metrics.get_metrics(y_pred, y_test),
            }
            # Note: this timing covers prediction and metric computation.
            elapsed = time.time() - start_time

            metrics["Prediction time"] = elapsed
            metrics["Training time"] = combo_training_time
            metrics["Total time"] = elapsed + combo_training_time

            print(
                f"✓ [{i+1}/{len(combinations)}] eq0={eq0}, eq1={eq1}, tie={tie} - pred:{elapsed:.2f}s train:{combo_training_time:.2f}s"
            )
            results.append(metrics)
        except Exception as e:
            # Deliberate best effort: report the failure and move on to the next combination.
            elapsed = time.time() - start_time
            print(
                f"✗ [{i+1}/{len(combinations)}] eq0={eq0}, eq1={eq1}, tie={tie} - Error after {elapsed:.2f}s: {e}"
            )

    results_frame = pd.DataFrame(results)
    if results_frame.empty:
        # Sorting by a column that does not exist would raise a KeyError and hide the
        # per-combination errors printed above.
        print("No combination produced metrics; returning an empty result frame.")
        return results_frame
    return results_frame.sort_values("Accuracy within 1%", ascending=False)


def train_and_predict(
    X_train, y_train, X_test, clf_eq0_name, clf_eq1_name, clf_tie_name
):
    """Train one specific classifier combination and return its LGD predictions.

    Fits the ``eq0`` (LGD == 0), ``eq1`` (LGD == 1) and tie-breaker (1 vs 0, trained on
    rows with LGD in {0, 1}) classifiers on SMOTE-resampled data, plus the regressor,
    and routes ``X_test`` through ``predict_with_pretrained_models`` so that this
    function and the grid search share a single prediction rule.

    Parameters
    ----------
    X_train, y_train : training features and LGD target.
    X_test : rows to predict.
    clf_eq0_name, clf_eq1_name, clf_tie_name : keys into ``classifiers`` for each stage.

    Returns
    -------
    numpy array with one prediction per row of ``X_test``.
    """
    sm = SMOTE(random_state=42)

    def _fit_balanced(features, labels, clf_name):
        """Resample (features, labels) with SMOTE and fit a fresh ``clf_name`` model."""
        X_res, y_res = sm.fit_resample(features, labels)
        model = classifiers[clf_name]()
        model.fit(X_res, y_res)
        return model

    # LGD = 0
    clf_eq0 = _fit_balanced(X_train, (y_train == 0).astype(int), clf_eq0_name)

    # LGD = 1
    clf_eq1 = _fit_balanced(X_train, (y_train == 1).astype(int), clf_eq1_name)

    # Tie-breaker: only rows whose LGD is exactly 0 or 1
    mask = (y_train == 0) | (y_train == 1)
    clf_tie = _fit_balanced(
        X_train[mask], (y_train[mask] == 1).astype(int), clf_tie_name
    )

    # Regression on the interior (0 < LGD < 1); falls back to all rows when there are
    # 10 or fewer interior rows.
    middle_mask = (y_train > 0) & (y_train < 1)
    reg = GradientBoostingRegressor(
        n_estimators=30,
        max_depth=2,
        learning_rate=0.1,
        subsample=0.7,
        max_features=0.3,
        random_state=42,
    )
    if middle_mask.sum() > 10:
        reg.fit(X_train[middle_mask], y_train[middle_mask])
    else:
        reg.fit(X_train, y_train)

    # Prediction: reuse the shared routing instead of duplicating it here.
    trained_models = {
        f"eq0_{clf_eq0_name}": clf_eq0,
        f"eq1_{clf_eq1_name}": clf_eq1,
        f"tie_{clf_tie_name}": clf_tie,
        "regressor": reg,
    }
    return predict_with_pretrained_models(
        X_test, trained_models, clf_eq0_name, clf_eq1_name, clf_tie_name
    )

In [None]:
results = grid_search_classifiers(
    X_train, y_train, X_test, y_test
)
# to_csv does not create missing directories, so make sure the output folder exists;
# otherwise the whole grid search is lost to an OSError at the very end.
os.makedirs("results", exist_ok=True)
# NOTE(review): the ":" in the file name is not a valid file-name character on Windows;
# kept unchanged so existing readers of this file keep working.
results.to_csv(f"results/CombinationMetricsWithContext:{max_context_window}.csv", index=False)
results

In [None]:
# Plot the top 20 combinations for each listed metric.
# NOTE(review): the tuples are presumably (column, ascending) pairs — confirm in
# utils.model_metrics.plot_top_n_by_columns.
ranking_columns = [("Accuracy within 1%", False), ("Mean difference", True)]
model_metrics.plot_top_n_by_columns(results, ranking_columns, n=20)

In [None]:
# Refit one hand-picked combination (SVC for both the LGD = 0 and LGD = 1 stages,
# AdaBoost as tie-breaker) and inspect its predictions on the test set.
y_pred = train_and_predict(
    X_train,
    y_train,
    X_test,
    clf_eq0_name="svc",
    clf_eq1_name="svc",
    clf_tie_name="ada",
)
model_metrics.print_bin_frequencies(y_pred, y_test)
model_metrics.print_metrics(y_pred, y_test)
model_metrics.plot_bins(y_pred, y_test, bins=30)