In [None]:
%load_ext autoreload
%autoreload 2

# Default figure size as a (width, height) tuple — presumably inches for matplotlib.
# NOTE(review): not referenced in any of the cells visible here; confirm it is still needed.
figsize=(14, 4)
import os
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from utils import file, plot, data, stat, SKNN
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score, ConfusionMatrixDisplay, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from imblearn.over_sampling import SMOTE
import xgboost as xgb
# Never truncate the column list when a DataFrame is rendered.
pd.set_option("display.max_columns", None)

# Both input CSV files are read from the same data folder (relative path).
data_folder = os.path.join("../../../..", "data/prod")
file_name_collection, file_name_konto = "Collection_data.csv", "konto_data_trimmed.csv"
file_path_collection, file_path_konto = (
    os.path.join(data_folder, csv_name)
    for csv_name in (file_name_collection, file_name_konto)
)

In [None]:
# Maximum context window for each case, i.e. how many of the last konto rows
# before the case are kept. Set to -1 to use the full window, meaning all
# information available before the case.
max_context_window = -1

# ------------------------------------------------------------
# LOAD DATA
# ------------------------------------------------------------
# Konto data is loaded first, then the collection data.
konto_frame = file.load_konto_data(file_path_konto)
collection_frame = file.load_collection_data(file_path_collection)


# ------------------------------------------------------------
# DEFINE COLLECTION TARGETS
# ------------------------------------------------------------
# Builds target_frame: one row per collection case (indexed by Collectionid)
# with closing date, duration, loss and LGD, restricted to closed cases that
# were opened inside the dataset and have a non-zero balance.

# Keep only collection rows whose PersonId also occurs in the konto data.
ids_in_kont = konto_frame["PersonId"].unique()
collection_frame = collection_frame[collection_frame["PersonId"].isin(ids_in_kont)]
# Take the last row of every Collectionid group, in the frame's existing row order.
# NOTE(review): presumably the rows are chronological so this is the latest
# snapshot of each case — confirm against file.load_collection_data.
target_frame = (
    collection_frame[
        [
            "Collectionid",
            "PersonId",
            "AccountId",
            "CollectionOpenedDate",
            "CollectionClosedDate",
            "BalanceSentAmt",
            "MonthInDCA",
            "CumulativeLossAmt",
        ]
    ]
    .groupby("Collectionid")
    .tail(1)
    .set_index("Collectionid")
)
# Closes cases when they go to monitoring (ZCOV).
# Per case, the earliest (by YearMonth) row with MonthsInZCOV == 0 is kept and
# its YearMonth is moved to the end of that month to form ZCOVDate.
zcov_frame = collection_frame[collection_frame["MonthsInZCOV"] == 0].copy()
zcov_frame = zcov_frame.sort_values("YearMonth").drop_duplicates("Collectionid")
zcov_frame["ZCOVDate"] = zcov_frame["YearMonth"] + pd.offsets.MonthEnd(0)
target_frame["ZCOVDate"] = target_frame.index.map(
    zcov_frame.set_index("Collectionid")["ZCOVDate"]
)
# Cases without a closed date but with a ZCOVDate get ZCOVDate as their closed
# date; ClosedDateSetByZCOVDate marks exactly those cases with 1.
target_frame["ClosedDateSetByZCOVDate"] = 0
cond = target_frame["CollectionClosedDate"].isna() & target_frame["ZCOVDate"].notna()
target_frame.loc[cond, "CollectionClosedDate"] = target_frame.loc[cond, "ZCOVDate"]
target_frame.loc[cond, "ClosedDateSetByZCOVDate"] = 1

# Flags whether the case is still open (no closed date even after the ZCOV step)
target_frame["IsOpen"] = (target_frame["CollectionClosedDate"].isna()).astype(int)

# Flags whether the case was opened within the dataset (on or after 2023-09-01)
target_frame["CollectionOpenedAfter202309"] = (
    target_frame["CollectionOpenedDate"] >= pd.Timestamp("2023-09-01")
).astype(int)

# Computes the number of days the case has been open. Note that ZCOVDate is set to the last day of the month
target_frame["DurationDays"] = (
    pd.to_datetime(target_frame["CollectionClosedDate"])
    - pd.to_datetime(target_frame["CollectionOpenedDate"])
).dt.days

# Defines loss. Loss below 0 is set to 0. Loss above the balance sent is set to the balance.
# Both conditions are evaluated on the original CumulativeLossAmt column.
target_frame["Loss"] = (
    target_frame["CumulativeLossAmt"]
    .mask(target_frame["CumulativeLossAmt"] < 0, 0)
    .mask(
        target_frame["CumulativeLossAmt"] > target_frame["BalanceSentAmt"],
        target_frame["BalanceSentAmt"],
    )
)

# Computes LGD as a share of the balance.
# Rows with BalanceSentAmt == 0 are removed by the filter below.
target_frame["LGD"] = target_frame["Loss"] / target_frame["BalanceSentAmt"]

# Removes all cases that either have no balance, were opened before the dataset, or are not closed
mask_filter = (
    (target_frame["BalanceSentAmt"] != 0)
    & (target_frame["CollectionOpenedAfter202309"] == 1)
    & (target_frame["IsOpen"] == 0)
)
target_frame = target_frame[mask_filter]


# ------------------------------------------------------------
# CREATE DATASET WITH KONTODATA FOR EACH COLLECTION
# ------------------------------------------------------------
# For every collection case, collect the konto rows of the same person and
# account that are dated strictly before the case was opened, and tag them
# with the case's Collectionid.
#
# The row positions of each (PersonId, AccountId) pair are computed once, so
# the loop does not re-scan the whole konto_frame for every single case.
konto_positions = konto_frame.groupby(["PersonId", "AccountId"], sort=False).indices

rows = []
for case in target_frame.itertuples():
    positions = konto_positions.get((case.PersonId, case.AccountId))
    if positions is None:
        continue  # no konto rows at all for this person/account
    history = konto_frame.iloc[positions]
    history = history[history["YearMonth"] < case.CollectionOpenedDate]
    if max_context_window != -1:
        # NOTE(review): tail() keeps the last rows in konto_frame order; that is
        # only "the most recent months" if konto_frame is sorted by YearMonth
        # within each account — confirm against file.load_konto_data.
        history = history.tail(max_context_window)
    if history.empty:
        continue  # nothing before the opening date; an empty frame contributes no rows
    # Copy before adding the column so the assignment never writes into a slice.
    history = history.copy()
    history["Collectionid"] = case.Index  # target_frame is indexed by Collectionid
    rows.append(history)

if not rows:
    # pd.concat([]) would raise an unspecific "No objects to concatenate".
    raise ValueError(
        "No konto rows found before CollectionOpenedDate for any collection case"
    )
feature_frame = pd.concat(rows, ignore_index=True)
feature_frame = data.reorder_column(feature_frame, "Collectionid", 3)


# ------------------------------------------------------------
# ADD FEATURES
# ------------------------------------------------------------
# Account status descriptions grouped by the ordinal risk level (0-5) they map to.
statuses_by_risk = {
    0: ["Normal", "Ikke aktivert", "Avsluttet", "Avsluttet av kunde"],
    1: [
        "Faktura forfalt",
        "Purring",
        "Under avslutning",
        "Under avslutning manglende KYC",
        "Sperret",
        "Sperret, propagert",
        "Eget misbruk",
        "Avsluttet inkasso, oppgjort",
    ],
    2: [
        "Purring forfalt",
        "Purring med kortsperre",
        "Spesialengasjement, manuell behandling",
        "Betalingsplan, nedbetaling",
    ],
    3: ["Inkassovarsel"],
    4: ["Overført inkassobyrå"],
    5: [
        "Inkasso, overført overvåk",
        "Avsluttet inkasso, med tap",
        "Gjeldsordning ikke DCA Offentlig",
        "Gjeldsordning ikke DCA Privat",
        "Konkursbo Privat",
    ],
}
# Flat lookup: status description -> risk level.
status_map = {
    status: risk for risk, statuses in statuses_by_risk.items() for status in statuses
}
# Descriptions missing from status_map become NaN in RisikoStatus.
feature_frame["RisikoStatus"] = feature_frame["GeneralStatusDesc"].map(status_map)
feature_frame = data.reorder_column(feature_frame, "RisikoStatus", 10)

# Product id -> product name; the column is renamed to "Productname" further down.
product_map = {
    1: "SB1 GOLD MC",
    2: "Sparebank 1 Platinum MC",
    4: "SH BUSINESS VISA",
    7: "LOfavør Mastercard",
    8: "SB1 UNG MC",
    30: "SH GOLD MC",
    34: "SB1 EXTRA MC",
    38: "LOfavør Mastercard Ung",
    40: "SpareBank 1 Mastercard Elite",
}
gender_codes = {"F": 0, "M": 1}
age_group_codes = {
    "0 - 24": 0,
    "25 - 34": 1,
    "35 - 44": 2,
    "45 - 54": 3,
    "55 - 64": 4,
    "> 64": 5,
}
feature_frame["ProductId"] = feature_frame["ProductId"].map(product_map)
# astype(int) raises if a value is missing from the code tables above.
feature_frame["Gender"] = feature_frame["Gender"].map(gender_codes).astype(int)
feature_frame["AgeGroup"] = feature_frame["AgeGroup"].map(age_group_codes).astype(int)

# Gender is now 0/1 with M == 1, hence the column name "Male".
col_rename_dict = {"Gender": "Male"}
feature_frame = feature_frame.rename(
    columns={"ProductId": "Productname", **col_rename_dict}
)

# NOTE(review): not referenced in the cells visible here — presumably the
# columns that are not treated as sequence features; confirm.
non_seq_cols = [
    "YearMonth",
    "PeriodId",
    "PersonId",
    "AccountId",
    "Productname",
    "DistributorId",
    "AgeGroup",
    "Male",
    "Kommunenavn",
    "GeneralStatusDesc",
    "MonthsSinceAccountCreatedNum",
    "MonthsInZCOVNum",
]

# Drop every column whose name starts with one of these prefixes.
dropped_prefixes = ("SumL3_", "SumL12_", "Last_")
has_dropped_prefix = feature_frame.columns.str.startswith(dropped_prefixes)
feature_frame = feature_frame.loc[:, ~has_dropped_prefix]
feature_frame

In [None]:
# Split on case level so that all rows of one collection case land on the same side.
unique_ids = feature_frame["Collectionid"].unique()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)

in_train = feature_frame["Collectionid"].isin(train_ids)
in_test = feature_frame["Collectionid"].isin(test_ids)
X_train, X_test = feature_frame.loc[in_train], feature_frame.loc[in_test]

# Targets are looked up by Collectionid, so they follow the order of train_ids / test_ids.
y_train, y_test = target_frame.loc[train_ids], target_frame.loc[test_ids]

# Binary label: 1 when LGD is exactly 0, otherwise 0.
y_train_eq0 = y_train["LGD"].eq(0).astype(int)
y_test_eq0 = y_test["LGD"].eq(0).astype(int)

In [None]:
def balance_dataset(
    X: pd.DataFrame, y: pd.Series, seed: int = 42
) -> tuple[pd.DataFrame, pd.Series]:
    """Downsample the larger class so both classes have the same number of cases.

    Parameters
    ----------
    X : pd.DataFrame
        Feature rows; must contain a "Collectionid" column identifying the case
        each row belongs to.
    y : pd.Series
        Binary labels indexed by Collectionid (1 means LGD == 0, 0 means LGD != 0).
    seed : int, default 42
        Seed for the sampling. As before, the global NumPy RNG is reseeded with
        it, so cells that run afterwards see the same RNG state as they used to.

    Returns
    -------
    tuple[pd.DataFrame, pd.Series]
        The rows of ``X`` belonging to the kept cases (in ``X``'s row order) and
        the matching labels, ordered as: label-1 cases first, then label-0 cases.

    Notes
    -----
    The previous version always sampled from the label-1 class and therefore
    raised ``ValueError`` whenever that class was the smaller one. Now the
    larger class, whichever it is, is sampled down to the size of the smaller.
    """
    ids_eq0 = y[y == 1].index.to_numpy()  # LGD=0
    ids_neq0 = y[y == 0].index.to_numpy()  # LGD!=0

    np.random.seed(seed)
    if len(ids_eq0) >= len(ids_neq0):
        # Same draw as the original implementation for this case.
        ids_eq0 = np.random.choice(ids_eq0, size=len(ids_neq0), replace=False)
    else:
        ids_neq0 = np.random.choice(ids_neq0, size=len(ids_eq0), replace=False)

    balanced_ids = np.concatenate([ids_eq0, ids_neq0])
    X_bal = X[X["Collectionid"].isin(balanced_ids)]
    y_bal = y.loc[balanced_ids]
    return X_bal, y_bal
# Only the training cases are balanced; the test set is passed through unchanged.
X_train_bal, y_train_bal = balance_dataset(X_train, y_train_eq0)
X_test_bal = X_test
y_test_bal = y_test_eq0

In [None]:
# Value passed as k to the project's sequence classifier — presumably the
# number of neighbours; confirm in utils.SKNN.
n_neighbors = 20
sknn = SKNN.SequenceKNeighborsClassifier(k=n_neighbors)
# Rows are tied to their case through the Collectionid column.
sknn.fit(X_train_bal, y_train_bal, id_col="Collectionid")

In [None]:
# Predictions for the test rows — presumably one label per Collectionid.
# NOTE(review): y_test_bal follows the order of test_ids (it comes from
# target_frame.loc[test_ids]). Confirm that sknn.predict returns its predictions
# in that same case order, because the metrics further down pair them by position.
y_pred = sknn.predict(X_test_bal)

In [None]:
# Side-by-side view of true and predicted labels; shows the first 40 rows.
df_compare = pd.DataFrame({"y_true": y_test_bal, "y_pred": y_pred})
df_compare.head(40)

In [None]:
# Text summary of the classification metrics on the test set (which was not balanced).
print(classification_report(y_test_bal, y_pred))

In [None]:
# Fix the label order so the matrix is always 2x2 and lines up with the
# hard-coded tick labels below, even if a class is absent from both
# y_test_bal and y_pred.
cm = confusion_matrix(y_test_bal, y_pred, labels=[0, 1])

# Plot with seaborn on an explicit Axes instead of the implicit current figure.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    xticklabels=["Pred 0", "Pred 1"],
    yticklabels=["Actual 0", "Actual 1"],
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
# Column 1 of predict_proba is used as the score for label 1 (LGD == 0).
class_probabilities = sknn.predict_proba(X_test_bal)
y_pred_proba = class_probabilities[:, 1]

# Compute ROC AUC
auc = roc_auc_score(y_test_bal, y_pred_proba)
print("ROC AUC:", auc)