# Baseline
I tilfeller der CumulativeLossAmt er negativ, har kunden betalt for mye. I disse tilfellene settes tapet til 0 (gjelder 82 saker). Når tapet er høyere enn beløpet som ble sendt til inkasso (BalanceSentAmt), antar jeg – foreløpig ubekreftet – at det bokførte tapet også inkluderer renter. I disse tilfellene settes tapet lik BalanceSentAmt (gjelder 140 saker).

For saker med status «Avsluttet inkasso, oppgjort» må man sjekke om det er betalt inn helt på slutten, for det finnes rader (f.eks. rad 334) der kunden betaler inn siste måned, men tapssummen likevel blir stående.

In [None]:
%load_ext autoreload
%autoreload 2

# Default figure size (width, height); not referenced in the cells visible here.
figsize=(14, 4)
import os
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from utils import file, plot, data, stat
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score, ConfusionMatrixDisplay, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from imblearn.over_sampling import SMOTE
import xgboost as xgb
# Show every column when a DataFrame is rendered.
pd.set_option("display.max_columns", None)

# Input files live in the production data folder, addressed relative to the
# notebook's working directory.
data_folder = os.path.join("../../../..", "data/prod")
file_name_collection = "Collection_data.csv"
file_name_konto = "konto_data_trimmed.csv"
file_path_collection = os.path.join(data_folder, file_name_collection)
file_path_konto = os.path.join(data_folder, file_name_konto)

### Load data

In [None]:
# Load the account (konto) history first, then the collection cases, using the
# project's own loader helpers.
konto_frame = file.load_konto_data(file_path_konto)
collection_frame = file.load_collection_data(file_path_collection)

In [None]:
# Spot-check: display every collection row belonging to PersonId 1359.
collection_frame[collection_frame["PersonId"] == 1359]

### Create target frame

In [None]:
# Keep only collection cases whose person also appears in the account data.
ids_in_kont = konto_frame["PersonId"].unique()
collection_frame = collection_frame[collection_frame["PersonId"].isin(ids_in_kont)]

# One row per collection case: the last row of every Collectionid group, taken
# in the order the rows appear in collection_frame.
target_frame = (
    collection_frame[
        [
            "Collectionid",
            "PersonId",
            "AccountId",
            "CollectionOpenedDate",
            "CollectionClosedDate",
            "BalanceSentAmt",
            "MonthInDCA",
            "CumulativeLossAmt",
        ]
    ]
    .groupby("Collectionid")
    .tail(1)
    .set_index("Collectionid")
)

# Close cases when they are moved to monitoring ("overvåk"): the earliest
# YearMonth with MonthsInZCOV == 0 supplies the date, rolled to the month end.
zcov_frame = collection_frame[collection_frame["MonthsInZCOV"] == 0].copy()
zcov_frame = zcov_frame.sort_values("YearMonth").drop_duplicates("Collectionid")
zcov_frame["ZCOVDate"] = zcov_frame["YearMonth"] + pd.offsets.MonthEnd(0)
target_frame["ZCOVDate"] = target_frame.index.map(
    zcov_frame.set_index("Collectionid")["ZCOVDate"]
)
# Only cases without a closed date inherit the ZCOV date; the flag records
# which rows were filled this way.
target_frame["ClosedDateSetByZCOVDate"] = 0
cond = target_frame["CollectionClosedDate"].isna() & target_frame["ZCOVDate"].notna()
target_frame.loc[cond, "CollectionClosedDate"] = target_frame.loc[cond, "ZCOVDate"]
target_frame.loc[cond, "ClosedDateSetByZCOVDate"] = 1

# Flag whether the case is still open.
target_frame["IsOpen"] = (target_frame["CollectionClosedDate"].isna()).astype(int)

# Flag whether the case was opened on or after 2023-09-01 (within the dataset).
target_frame["CollectionOpenedAfter202309"] = (
    target_frame["CollectionOpenedDate"] >= pd.Timestamp("2023-09-01")
).astype(int)

# Number of days the case was open. Note that ZCOVDate is set to the last day
# of the month.
target_frame["DurationDays"] = (
    pd.to_datetime(target_frame["CollectionClosedDate"])
    - pd.to_datetime(target_frame["CollectionOpenedDate"])
).dt.days

# Define the loss: losses below 0 become 0, losses above the balance sent are
# capped at the balance sent.
target_frame["Loss"] = (
    target_frame["CumulativeLossAmt"]
    .mask(target_frame["CumulativeLossAmt"] < 0, 0)
    .mask(
        target_frame["CumulativeLossAmt"] > target_frame["BalanceSentAmt"],
        target_frame["BalanceSentAmt"],
    )
)

# LGD as a share of the balance sent.
target_frame["LGD"] = target_frame["Loss"] / target_frame["BalanceSentAmt"]

# Drop cases that have no balance, were opened before the dataset starts, or
# are not closed.
mask_filter = (
    (target_frame["BalanceSentAmt"] != 0)
    & (target_frame["CollectionOpenedAfter202309"] == 1)
    & (target_frame["IsOpen"] == 0)
)
# Explicit copy: the columns added below must be written to an independent
# frame, not to a boolean-indexed selection of the unfiltered one (avoids
# pandas' chained-assignment warning).
target_frame = target_frame[mask_filter].copy()

# Target: the case spent more than 1 month with the debt collection agency.
target_frame["LastsMoreThan1MonthInDCA"] = (target_frame["MonthInDCA"] > 1).astype(int)

# Target: the case lasted more than 4 weeks.
target_frame["LastsMoreThan4Weeks"] = (target_frame["DurationDays"] > 28).astype(int)

# Targets describing where LGD falls: above 0, below 1, or strictly between.
target_frame["LGD>0"] = (target_frame["LGD"] > 0).astype(int)
target_frame["LGD<1"] = (target_frame["LGD"] < 1).astype(int)
target_frame["0<LGD<1"] = (
    (target_frame["LGD"] < 1) & (target_frame["LGD"] > 0)
).astype(int)

# Column the classification is performed on.
target_col = "LGD>0"
target_frame

In [None]:
# Class balance of the selected classification target.
target_frame[target_col].value_counts()

### Create feature frame
For each Collectionid, collect all konto_frame rows for the same person and account from months before the collection case was opened.

In [None]:
# For every collection case, pull the account history of the same person and
# account that lies strictly before the case was opened, tagged with the case id.
# NOTE: konto_frame is scanned once per case, so this scales with
# (number of cases) x (number of account rows).
rows = [
    konto_frame[
        (konto_frame["PersonId"] == case["PersonId"])
        & (konto_frame["AccountId"] == case["AccountId"])
        & (konto_frame["YearMonth"] < case["CollectionOpenedDate"])
    ].assign(Collectionid=collection_id)
    for collection_id, case in target_frame.iterrows()
]

# Stack the per-case histories and move the case id to column position 3.
feature_frame = data.reorder_column(
    pd.concat(rows, ignore_index=True), "Collectionid", 3
)
feature_frame


### Engineer features

In [None]:
# Account statuses grouped by the numeric risk level they are encoded as
# (presumably ordered by severity - confirm with the data owner).
risk_tiers = {
    0: (
        "Normal",
        "Ikke aktivert",
        "Avsluttet",
        "Avsluttet av kunde",
    ),
    1: (
        "Faktura forfalt",
        "Purring",
        "Under avslutning",
        "Under avslutning manglende KYC",
        "Sperret",
        "Sperret, propagert",
        "Eget misbruk",
        "Avsluttet inkasso, oppgjort",
    ),
    2: (
        "Purring forfalt",
        "Purring med kortsperre",
        "Spesialengasjement, manuell behandling",
        "Betalingsplan, nedbetaling",
    ),
    3: ("Inkassovarsel",),
    4: ("Overført inkassobyrå",),
    5: (
        "Inkasso, overført overvåk",
        "Avsluttet inkasso, med tap",
        "Gjeldsordning ikke DCA Offentlig",
        "Gjeldsordning ikke DCA Privat",
        "Konkursbo Privat",
    ),
}
status_map = {
    status: level for level, statuses in risk_tiers.items() for status in statuses
}

# Numeric risk level per row; statuses missing from the map become NaN.
feature_frame["RisikoStatus"] = feature_frame["GeneralStatusDesc"].map(status_map)
feature_frame = data.reorder_column(feature_frame, "RisikoStatus", 10)

# Product ids are replaced by their product names.
product_map = {
    1: "SB1 GOLD MC",
    2: "Sparebank 1 Platinum MC",
    4: "SH BUSINESS VISA",
    7: "LOfavør Mastercard",
    8: "SB1 UNG MC",
    30: "SH GOLD MC",
    34: "SB1 EXTRA MC",
    38: "LOfavør Mastercard Ung",
    40: "SpareBank 1 Mastercard Elite",
}
feature_frame["ProductId"] = feature_frame["ProductId"].map(product_map)

# Gender becomes a 0/1 indicator and the age bands become ordinal codes.
# The integer cast raises if a value is not covered by the lookup.
gender_codes = {"F": 0, "M": 1}
age_group_codes = {
    "0 - 24": 0,
    "25 - 34": 1,
    "35 - 44": 2,
    "45 - 54": 3,
    "55 - 64": 4,
    "> 64": 5,
}
feature_frame["Gender"] = feature_frame["Gender"].map(gender_codes).astype(int)
feature_frame["AgeGroup"] = feature_frame["AgeGroup"].map(age_group_codes).astype(int)

# Rename the recoded columns to reflect their new content.
col_rename_dict = {"Gender": "Male"}
feature_frame = feature_frame.rename(columns={"ProductId": "Productname"}).rename(
    columns=col_rename_dict
)
feature_frame

### Combine accounts

### Aggregate into one feature row per Collectionid
- Product, siste
- Distributør, siste
- Aldersgruppe, siste
- Male, siste
- Risiko-status snitt
- Risiko-status antall av hvert tall 
- Måneder siden konto åpnet, siste
- Balanse, siste
- Snittbalanse
- Balanse standardavvik
- Kredittgrense, siste
- Antall økninger i kredittgrense
- Antall collection-flag = 1
- Andel collection-flag = 1
- Sum turnover
- Turnover som en andel av kredittgrense
- Turnover som en andel av balanse
- Snitt turnover per num (altså sum av turnover delt på sum av num)
- Snitt turnoverNum
- TurnoverDom som en andel av Turnover
- Sum FundtransferAmt, CashAtmAmt og CashCounterAmt: WithdrawalAmt
- Turnover + Withdrawal som en andel av kredittgrense
- OverdueAmt som en andel av StatementClosingBalanceAmt
- Andel PaymentOverDueFlag = 1
- Andel RevolvingFlag = 1
- Andel av alle kategoriene på TurnoverAmt
- Antall rader
- Antall måneder siden forrige inkasso-sak

In [None]:
def _ratio_or_zero(numerator, denominator):
    """Divide two aligned Series and map undefined results to 0.

    ``fillna(0)`` on its own only covers 0/0 (NaN). A non-zero numerator over a
    zero denominator yields +/-inf, which is mapped to 0 here as well so the
    ratio features stay finite.
    """
    ratio = numerator / denominator
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)


# Spending categories: summed per case and later expressed as a share of the
# total turnover.
cols_to_share_by_turnover = [
    "Last_Airlines",
    "Last_Amusement and Entertainment",
    "Last_Automobile / Vehicle Rental",
    "Last_Business Services",
    "Last_Clothing Stores",
    "Last_Contracted Services",
    "Last_Government Services",
    "Last_Hotels",
    "Last_Includes all lodging merchants",
    "Last_Mail Order / Telephone Order Providers",
    "Last_Miscellaneous Stores",
    "Last_Others",
    "Last_Professional Services and Membership Organizations",
    "Last_Repair Services",
    "Last_Retail Stores",
    "Last_Service Providers",
    "Last_Transportation",
    "Last_Utilities",
    "Last_Wholesale Distributors and Manufacturers",
]

# "last" keeps the final value seen for the case; category amounts are summed.
agg_dict = {
    "PersonId": "last",
    "Productname": "last",
    "DistributorId": "last",
    "AgeGroup": "last",
    "Male": "last",
    "MonthsSinceAccountCreatedNum": "last",
    "BalanceAmt": "last",
    "CreditLimitAmt": "last",
    "OverdueAmt": "last",
    "StatementClosingBalanceAmt": "last",
    **{col: "sum" for col in cols_to_share_by_turnover},
}

feature_copy = feature_frame.copy()
by_case = feature_copy.groupby("Collectionid")

# One row per Collectionid (the group key becomes the index).
agg_frame = by_case.agg(agg_dict)

# Number of months with data.
agg_frame["NumberOfMonths"] = by_case["YearMonth"].nunique()
# Number of accounts.
agg_frame["NumberOfAccounts"] = by_case["AccountId"].nunique()
# Mean risk status.
agg_frame["RisikoStatusSnitt"] = by_case["RisikoStatus"].mean()
# Sum of absolute row-to-row changes in risk status.
# TODO: handle that one person can have several accounts; add a merge-accounts step.
agg_frame["SumRisikoStatusEndringer"] = by_case["RisikoStatus"].apply(
    lambda status: status.diff().abs().sum()
)
# Mean risk-status change per month.
agg_frame["SnittRisikoStatusEndringerPerMåned"] = (
    agg_frame["SumRisikoStatusEndringer"] / agg_frame["NumberOfMonths"]
)

# Share of rows in each risk level.
risk_counts = by_case["RisikoStatus"].value_counts().unstack(fill_value=0)
risk_props = risk_counts.div(risk_counts.sum(axis=1), axis=0)
risk_props.columns = [f"RisikoStatus_{col}_share" for col in risk_props.columns]
agg_frame = agg_frame.join(risk_props)

# Mean balance and its standard deviation (0 when it cannot be computed,
# e.g. for a single row).
agg_frame["BalanceAmtMean"] = by_case["BalanceAmt"].mean()
agg_frame["BalanceAmtStd"] = by_case["BalanceAmt"].std().fillna(0)

# Number of credit-limit increases.
agg_frame["AntallCreditLimitIncrease"] = by_case["CreditLimitIncreaseFlag"].sum()
# Number of rows flagged as collection, and the same per month.
agg_frame["AntallCollectionFlag"] = by_case["CollectionFlag"].sum()
agg_frame["SnittCollectionFlagPerMåned"] = (
    agg_frame["AntallCollectionFlag"] / agg_frame["NumberOfMonths"]
)

# Total and mean monthly turnover.
agg_frame["SumTurnover"] = by_case["TurnoverAmt"].sum()
agg_frame["SnittTurnover"] = agg_frame["SumTurnover"] / agg_frame["NumberOfMonths"]
# Total and mean monthly number of transactions.
agg_frame["TurnoverNumSum"] = by_case["TurnoverNum"].sum()
agg_frame["SnittTurnoverNum"] = agg_frame["TurnoverNumSum"] / agg_frame["NumberOfMonths"]
# Mean transaction size.
agg_frame["AverageTransactionSize"] = _ratio_or_zero(
    agg_frame["SumTurnover"], agg_frame["TurnoverNumSum"]
)

# Domestic turnover and its share of the total turnover.
agg_frame["SumTurnoverDom"] = by_case["TurnoverDomAmt"].sum()
agg_frame["ShareOfTurnoverIsDomestic"] = _ratio_or_zero(
    agg_frame["SumTurnoverDom"], agg_frame["SumTurnover"]
)

# Withdrawals: fund transfers + ATM cash + counter cash.
agg_frame["SumFundtransfer"] = by_case["FundtransferAmt"].sum()
agg_frame["SumCashAtm"] = by_case["CashAtmAmt"].sum()
agg_frame["SumCashCounter"] = by_case["CashCounterAmt"].sum()
agg_frame["WithdrawalSum"] = (
    agg_frame["SumFundtransfer"] + agg_frame["SumCashAtm"] + agg_frame["SumCashCounter"]
)

# Withdrawals in proportion to turnover.
agg_frame["WithdrawalPropOfTurnover"] = _ratio_or_zero(
    agg_frame["WithdrawalSum"], agg_frame["SumTurnover"]
)

# Total expenditure as a proportion of the (last) credit limit.
# NOTE(review): unguarded as originally written - a zero or missing credit
# limit yields inf/NaN here; decide whether it should also map to 0.
agg_frame["ExpenditureAsShareOfCreditlimit"] = (
    agg_frame["SumTurnover"] + agg_frame["WithdrawalSum"]
) / agg_frame["CreditLimitAmt"]

# Mean per-row turnover as a share of the credit limit. The sign is flipped;
# presumably the amounts are stored as negative numbers - TODO confirm.
feature_copy["AndelTurnoverAvKredittgrense"] = (
    feature_copy["TurnoverAmt"] / feature_copy["CreditLimitAmt"] * -1
)
agg_frame["SnittTurnoverAndelAvKredittgrense"] = feature_copy.groupby("Collectionid")[
    "AndelTurnoverAvKredittgrense"
].mean()

# Last balance as a share of the credit limit (sign flipped as above).
feature_copy["AndelBalanseAvKredittgrense"] = (
    feature_copy["BalanceAmt"] / feature_copy["CreditLimitAmt"] * -1
)
agg_frame["LastAndelBalanseAvKredittgrense"] = feature_copy.groupby("Collectionid")[
    "AndelBalanseAvKredittgrense"
].last()

# Number of rows where interest-earning lending exceeds the credit limit
# (overdraft), and the same per month.
feature_copy["Overtrekk"] = (
    feature_copy["InterestEarningLendingAmt"] > feature_copy["CreditLimitAmt"]
).astype(int)
agg_frame["AntallOvertrekk"] = feature_copy.groupby("Collectionid")["Overtrekk"].sum()
agg_frame["SnittOvertrekkPerMåned"] = (
    agg_frame["AntallOvertrekk"] / agg_frame["NumberOfMonths"]
)

# Overdue amount relative to the statement closing balance.
agg_frame["OverdueShareOfDebt"] = _ratio_or_zero(
    agg_frame["OverdueAmt"], agg_frame["StatementClosingBalanceAmt"]
)

# Share of rows with a payment-overdue flag / revolving flag.
agg_frame["SnittPaymentOverDueFlag"] = by_case["PaymentOverDueFlag"].mean()
agg_frame["SnittRevolvingFlag"] = by_case["RevolvingFlag"].mean()

# Each spending category as a share of the total turnover.
for col in cols_to_share_by_turnover:
    share_name = re.sub(r"^Last_", "", col)
    share_name = f"{share_name}_share_of_Turnover"
    agg_frame[share_name] = _ratio_or_zero(agg_frame[col], agg_frame["SumTurnover"])

# Drop the intermediate sums that were only needed to build the ratios.
cols_to_drop = [
    "SumTurnoverDom",
    "SumFundtransfer",
    "SumCashAtm",
    "SumCashCounter",
] + cols_to_share_by_turnover
agg_frame = agg_frame.drop(cols_to_drop, axis=1)

# One-hot encode the categorical columns and expose Collectionid as a column.
agg_frame_encoded = pd.get_dummies(
    agg_frame, columns=["Productname", "DistributorId"]
).reset_index()

In [None]:
# Attach the chosen target column to every aggregated feature row, then index
# the result by Collectionid.
target_lookup = target_frame.reset_index()[["Collectionid", target_col]]
feature_target_frame = agg_frame_encoded.merge(
    target_lookup, how="left", on="Collectionid"
).set_index("Collectionid", drop=True)

# Preview the ten rows with the lowest PersonId.
feature_target_frame.sort_values(by="PersonId", ascending=True).head(10)

In [None]:
# Separate features from the target; PersonId is an identifier, not a feature.
cols_to_drop = ["PersonId", target_col]
X = feature_target_frame.drop(columns=cols_to_drop)
# Convert boolean dummy columns to 0/1 integers.
X = X.apply(lambda x: x.astype(int) if x.dtype == "bool" else x)
y = feature_target_frame[target_col]

# Split BEFORE oversampling. Resampling the full data set first would put
# synthetic points interpolated from test rows into the training set and
# leave synthetic rows in the test set, which inflates the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the minority class on the training fold only; the test fold keeps
# its original rows and class balance.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)
X_train, y_train = X_res, y_res

In [None]:
def _positive_class_scores(model, X_test, y_pred):
    """Return the best available ranking scores for the ROC curve.

    Prefers ``predict_proba`` (second column), then ``decision_function``, and
    only falls back to the hard predictions when the model offers neither.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X_test)[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(X_test)
    return y_pred


def _top_feature_importances(model, X_train, top_n_features):
    """Return the ``top_n_features`` most important features, or ``None``.

    Uses ``feature_importances_`` when the model has it, otherwise the absolute
    values of the first row of ``coef_``. ``None`` is returned when the model
    exposes neither or when ``X_train`` (the source of the names) is missing.
    """
    if hasattr(model, "feature_importances_"):
        importances = model.feature_importances_
    elif hasattr(model, "coef_"):
        importances = np.abs(model.coef_[0])
    else:
        return None
    if X_train is None:
        return None
    return (
        pd.DataFrame({"Feature": X_train.columns, "Importance": importances})
        .sort_values(by="Importance", ascending=False)
        .head(top_n_features)
    )


def plot_model_evaluation_sns(
    model, X_test, y_test, X_train=None, top_n_features=10, model_name="Model"
):
    """Plot confusion matrix, top feature importances and ROC curve side by side.

    Args:
        model: Fitted classifier with a ``predict`` method.
        X_test: Features to evaluate on.
        y_test: True labels for ``X_test``.
        X_train: Training features; only its column names are used, to label
            the feature importances. When ``None`` the panel shows a placeholder.
        top_n_features: Number of features shown in the importance panel.
        model_name: Prefix used in the three panel titles.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # --- Predict ---
    y_pred = model.predict(X_test)
    y_proba = _positive_class_scores(model, X_test, y_pred)

    # --- Compute metrics ---
    cm = confusion_matrix(y_test, y_pred)
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = roc_auc_score(y_test, y_proba)

    # --- Feature importances ---
    feat_importance_df = _top_feature_importances(model, X_train, top_n_features)

    # --- Plot subplots ---
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    cm_ax, importance_ax, roc_ax = axes

    # Panel 1: confusion matrix as heatmap
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=cm_ax)
    cm_ax.set_xlabel("Predicted")
    cm_ax.set_ylabel("Actual")
    cm_ax.set_title(f"{model_name} - Confusion Matrix")

    # Panel 2: feature importances (largest at the top)
    if feat_importance_df is not None:
        importance_ax.barh(
            feat_importance_df["Feature"],
            feat_importance_df["Importance"],
            color="skyblue",
        )
        importance_ax.invert_yaxis()
        importance_ax.set_xlabel("Importance")
        importance_ax.set_title(f"{model_name} - Top {top_n_features} Features")
    else:
        importance_ax.text(
            0.5,
            0.5,
            "No feature importances",
            horizontalalignment="center",
            verticalalignment="center",
        )
        importance_ax.set_axis_off()

    # Panel 3: ROC curve with the chance diagonal for reference
    roc_ax.plot(fpr, tpr, color="blue", label=f"AUC = {roc_auc:.2f}")
    roc_ax.plot([0, 1], [0, 1], color="grey", linestyle="--")
    roc_ax.set_xlabel("False Positive Rate")
    roc_ax.set_ylabel("True Positive Rate")
    roc_ax.set_title(f"{model_name} - ROC Curve")
    roc_ax.legend(loc="lower right")

    plt.tight_layout()
    plt.show()

### Logistic regression

In [None]:
# Logistic-regression baseline: fit on the training data, then report and plot
# the performance on the held-out test data.
log_reg = LogisticRegression(solver="lbfgs", max_iter=1000)
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)
print(classification_report(y_test, y_pred))

plot_model_evaluation_sns(
    log_reg,
    X_test,
    y_test,
    X_train=X_train,
    model_name="Logistic Regression",
)

### XGBoost

In [None]:
# Gradient-boosted trees with a small, fixed configuration.
xgb_params = {
    "n_estimators": 100,  # number of trees
    "max_depth": 3,  # tree depth
    "learning_rate": 0.1,  # step size shrinkage
    "subsample": 0.8,  # row sampling
    "colsample_bytree": 0.8,  # column sampling
    "random_state": 42,
}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

# Report and plot the performance on the held-out test data.
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))
plot_model_evaluation_sns(
    xgb_model,
    X_test,
    y_test,
    X_train=X_train,
    model_name="XGBoost",
)

### Random Forest
Baseline med 100 trær; grid search over hyperparametere ligger deaktivert i cellen under.

In [None]:
# Random-forest baseline with 100 trees and a fixed seed.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

# Report and plot the performance on the held-out test data.
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))
plot_model_evaluation_sns(
    rf,
    X_test,
    y_test,
    X_train=X_train,
    model_name="Random Forest",
)

In [None]:
# Disabled hyperparameter grid search for the random forest. The whole cell is
# wrapped in a string literal, so none of it executes; remove the surrounding
# triple quotes to run the search (5-fold CV, scored on accuracy) and evaluate
# the best estimator on the test data.
"""rf = RandomForestClassifier(random_state=42)
param_grid = {
        "n_estimators": [100, 200, 500],
        "max_depth": [None, 5, 10, 20],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "max_features": ["sqrt", "log2", None]
}

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

best_rf = grid_search.best_estimator_


y_pred = best_rf.predict(X_test)
print(classification_report(y_test, y_pred))
plot_model_evaluation_sns(
    best_rf, X_test, y_test, X_train=X_train, model_name="Random forest"
)"""