In [None]:
%load_ext autoreload
%autoreload 2

# Default figure size as a (width, height) pair.
# NOTE(review): presumably consumed by the plotting helpers — confirm where it is used.
figsize = (14, 4)
import os
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from utils import file, plot, data, stat
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score, ConfusionMatrixDisplay, confusion_matrix, accuracy_score, mean_squared_error, mean_absolute_error, r2_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from imblearn.over_sampling import SMOTE
from itertools import product
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.linear_model import ElasticNet
import time
from sklearn.feature_selection import RFECV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Relative location of the production data folder and the two CSV extracts in it.
data_folder = os.path.join("../../../..", "data/prod")
file_name_collection, file_name_konto = (
    "Collection_data.csv",
    "konto_data_trimmed.csv",
)
file_path_collection = os.path.join(data_folder, file_name_collection)
file_path_konto = os.path.join(data_folder, file_name_konto)

In [None]:
# Maximum context window for each case. Set to -1 to use the full window,
# i.e. all information available before the case.
max_context_window = -1

# ------------------------------------------------------------
# LOAD DATA
# ------------------------------------------------------------
# Account ("konto") data first, then collection data, each through its own loader.
konto_frame = file.load_konto_data(file_path_konto)
collection_frame = file.load_collection_data(file_path_collection)


# ------------------------------------------------------------
# DEFINE COLLECTION TARGETS
# ------------------------------------------------------------
# Keep only collections whose person also appears in the account data.
ids_in_kont = konto_frame["PersonId"].unique()
collection_frame = collection_frame[collection_frame["PersonId"].isin(ids_in_kont)]

# One target row per collection: its chronologically latest record.
# The rows are sorted by YearMonth (stable, so ties keep their loaded order)
# before `tail(1)`, so the selected record does not depend on the order in
# which the file happened to be loaded.
target_columns = [
    "Collectionid",
    "PersonId",
    "AccountId",
    "CollectionOpenedDate",
    "CollectionClosedDate",
    "BalanceSentAmt",
    "MonthInDCA",
    "CumulativeLossAmt",
]
target_frame = (
    collection_frame.sort_values("YearMonth", kind="stable")[target_columns]
    .groupby("Collectionid")
    .tail(1)
    .set_index("Collectionid")
)

# Close cases when they are moved to monitoring ("overvåk"): take the earliest
# row with MonthsInZCOV == 0 per collection and use the end of that month as
# the ZCOV date.
zcov_frame = collection_frame[collection_frame["MonthsInZCOV"] == 0].copy()
zcov_frame = zcov_frame.sort_values("YearMonth").drop_duplicates("Collectionid")
zcov_frame["ZCOVDate"] = zcov_frame["YearMonth"] + pd.offsets.MonthEnd(0)
target_frame["ZCOVDate"] = target_frame.index.map(
    zcov_frame.set_index("Collectionid")["ZCOVDate"]
)

# Where no closed date exists but a ZCOV date does, use the ZCOV date as the
# closed date and flag that the value was imputed.
target_frame["ClosedDateSetByZCOVDate"] = 0
cond = target_frame["CollectionClosedDate"].isna() & target_frame["ZCOVDate"].notna()
target_frame.loc[cond, "CollectionClosedDate"] = target_frame.loc[cond, "ZCOVDate"]
target_frame.loc[cond, "ClosedDateSetByZCOVDate"] = 1

# Flag whether the case is still open (no closed date after the step above).
target_frame["IsOpen"] = target_frame["CollectionClosedDate"].isna().astype(int)

# Flag whether the case was opened within the dataset (on or after 2023-09-01).
target_frame["CollectionOpenedAfter202309"] = (
    target_frame["CollectionOpenedDate"] >= pd.Timestamp("2023-09-01")
).astype(int)

# Number of days the case has been open. Note that ZCOVDate is set to the
# last day of the month.
target_frame["DurationDays"] = (
    pd.to_datetime(target_frame["CollectionClosedDate"])
    - pd.to_datetime(target_frame["CollectionOpenedDate"])
).dt.days

# Define the loss. Losses below 0 are set to 0; losses above the balance sent
# are set to the balance sent. Both conditions are evaluated on the raw
# cumulative loss.
cumulative_loss = target_frame["CumulativeLossAmt"]
balance_sent = target_frame["BalanceSentAmt"]
target_frame["Loss"] = cumulative_loss.mask(cumulative_loss < 0, 0).mask(
    cumulative_loss > balance_sent, balance_sent
)

# LGD as a share of the balance sent.
target_frame["LGD"] = target_frame["Loss"] / target_frame["BalanceSentAmt"]

# Remove all cases that have no balance, were opened before the dataset
# starts, or are not closed.
mask_filter = (
    (target_frame["BalanceSentAmt"] != 0)
    & (target_frame["CollectionOpenedAfter202309"] == 1)
    & (target_frame["IsOpen"] == 0)
)
target_frame = target_frame[mask_filter]


# ------------------------------------------------------------
# CREATE DATASET WITH KONTODATA FOR EACH COLLECTION
# ------------------------------------------------------------
# For every collection, gather the account rows of the same person and account
# that lie strictly before the collection was opened, and tag them with the
# collection id.
collection_histories = []
case_keys = target_frame[["PersonId", "AccountId", "CollectionOpenedDate"]]
for case in case_keys.itertuples():
    history_mask = (
        (konto_frame["PersonId"] == case.PersonId)
        & (konto_frame["AccountId"] == case.AccountId)
        & (konto_frame["YearMonth"] < case.CollectionOpenedDate)
    )
    # Sort chronologically (stable, so ties keep their loaded order) so that
    # `tail` keeps the most recent months and later "last" aggregations see
    # the latest row, regardless of the order in which konto_frame was loaded.
    history = konto_frame[history_mask].sort_values("YearMonth", kind="stable")
    if max_context_window != -1:
        history = history.tail(max_context_window)
    # `assign` returns a new frame with Collectionid appended as last column.
    collection_histories.append(history.assign(Collectionid=case.Index))

if not collection_histories:
    # pd.concat would fail with a generic message on an empty list.
    raise ValueError(
        "target_frame is empty after filtering; no collections to build features for"
    )

feature_frame = pd.concat(collection_histories, ignore_index=True)
feature_frame = data.reorder_column(feature_frame, "Collectionid", 3)


# ------------------------------------------------------------
# ADD FEATURES
# ------------------------------------------------------------
# Account status descriptions grouped by the integer risk level (0-5) they are
# mapped to; status_map below is the flattened description -> level lookup.
statuses_by_risk_level = {
    0: [
        "Normal",
        "Ikke aktivert",
        "Avsluttet",
        "Avsluttet av kunde",
    ],
    1: [
        "Faktura forfalt",
        "Purring",
        "Under avslutning",
        "Under avslutning manglende KYC",
        "Sperret",
        "Sperret, propagert",
        "Eget misbruk",
        "Avsluttet inkasso, oppgjort",
    ],
    2: [
        "Purring forfalt",
        "Purring med kortsperre",
        "Spesialengasjement, manuell behandling",
        "Betalingsplan, nedbetaling",
    ],
    3: [
        "Inkassovarsel",
    ],
    4: [
        "Overført inkassobyrå",
    ],
    5: [
        "Inkasso, overført overvåk",
        "Avsluttet inkasso, med tap",
        "Gjeldsordning ikke DCA Offentlig",
        "Gjeldsordning ikke DCA Privat",
        "Konkursbo Privat",
    ],
}
status_map = {
    status: level
    for level, statuses in statuses_by_risk_level.items()
    for status in statuses
}
# Descriptions missing from status_map become NaN in RisikoStatus.
feature_frame["RisikoStatus"] = feature_frame["GeneralStatusDesc"].map(status_map)
feature_frame = data.reorder_column(feature_frame, "RisikoStatus", 10)

# Replace the numeric product id by the product name (the column is renamed
# to "Productname" at the end of this section).
product_map = {
    1: "SB1 GOLD MC",
    2: "Sparebank 1 Platinum MC",
    4: "SH BUSINESS VISA",
    7: "LOfavør Mastercard",
    8: "SB1 UNG MC",
    30: "SH GOLD MC",
    34: "SB1 EXTRA MC",
    38: "LOfavør Mastercard Ung",
    40: "SpareBank 1 Mastercard Elite",
}
feature_frame["ProductId"] = feature_frame["ProductId"].map(product_map)

# Integer-encode gender (F=0, M=1) and the age bands (youngest band = 0).
# The cast to int raises if a value is not covered by the mapping.
gender_codes = {"F": 0, "M": 1}
age_group_labels = ["0 - 24", "25 - 34", "35 - 44", "45 - 54", "55 - 64", "> 64"]
age_group_codes = {label: code for code, label in enumerate(age_group_labels)}
feature_frame["Gender"] = feature_frame["Gender"].map(gender_codes).astype(int)
feature_frame["AgeGroup"] = (
    feature_frame["AgeGroup"].map(age_group_codes).astype(int)
)

# Final column names: the mapped product column and the 0/1 gender column.
col_rename_dict = {"Gender": "Male"}
feature_frame.rename(
    columns={"ProductId": "Productname", **col_rename_dict}, inplace=True
)


# ------------------------------------------------------------
# AGGREGATE EACH COLLECTION-SEQUENCE INTO A SINGLE ROW
# ------------------------------------------------------------
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator with undefined results replaced by 0.

    A zero denominator yields NaN (0 / 0) or +/-inf (x / 0) in pandas. A plain
    ``fillna(0)`` only removes the NaN case, so infinities are converted to NaN
    first to keep the resulting feature finite.
    """
    ratio = numerator / denominator
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)


# Per-category spend columns: summed per collection, converted to a share of
# total turnover further down, and then dropped.
cols_to_share_by_turnover = [
    "Last_Airlines",
    "Last_Amusement and Entertainment",
    "Last_Automobile / Vehicle Rental",
    "Last_Business Services",
    "Last_Clothing Stores",
    "Last_Contracted Services",
    "Last_Government Services",
    "Last_Hotels",
    "Last_Includes all lodging merchants",
    "Last_Mail Order / Telephone Order Providers",
    "Last_Miscellaneous Stores",
    "Last_Others",
    "Last_Professional Services and Membership Organizations",
    "Last_Repair Services",
    "Last_Retail Stores",
    "Last_Service Providers",
    "Last_Transportation",
    "Last_Utilities",
    "Last_Wholesale Distributors and Manufacturers",
]

# Snapshot columns take the last row of each sequence; category spend is summed.
agg_dict = {
    "PersonId": "last",
    "Productname": "last",
    "DistributorId": "last",
    "AgeGroup": "last",
    "Male": "last",
    "MonthsSinceAccountCreatedNum": "last",
    "BalanceAmt": "last",
    "CreditLimitAmt": "last",
    "OverdueAmt": "last",
    "StatementClosingBalanceAmt": "last",
    **{category: "sum" for category in cols_to_share_by_turnover},
}

feature_copy = feature_frame.copy()

# Row-level helper columns, added before grouping so that one groupby object
# can serve every aggregation below.
# Turnover and balance as a share of the credit limit, with the sign flipped.
# NOTE(review): the "* -1" presumably compensates for amounts stored with the
# opposite sign — confirm. A zero CreditLimitAmt gives inf/NaN here, which is
# left untouched as in the original feature definition.
feature_copy["AndelTurnoverAvKredittgrense"] = (
    feature_copy["TurnoverAmt"] / feature_copy["CreditLimitAmt"] * -1
)
feature_copy["AndelBalanseAvKredittgrense"] = (
    feature_copy["BalanceAmt"] / feature_copy["CreditLimitAmt"] * -1
)
# 1 when interest-earning lending exceeds the credit limit (overdraft), else 0.
feature_copy["Overtrekk"] = (
    feature_copy["InterestEarningLendingAmt"] > feature_copy["CreditLimitAmt"]
).astype(int)

grouped = feature_copy.groupby("Collectionid")
agg_frame = grouped.agg(agg_dict)

# Number of months with data.
agg_frame["NumberOfMonths"] = grouped["YearMonth"].nunique()
# Number of accounts.
agg_frame["NumberOfAccounts"] = grouped["AccountId"].nunique()
# Mean risk status.
agg_frame["RisikoStatusSnitt"] = grouped["RisikoStatus"].mean()
# Sum of absolute row-to-row changes in risk status.
# TODO: a person can have several accounts; this needs a merge-accounts step.
agg_frame["SumRisikoStatusEndringer"] = grouped["RisikoStatus"].apply(
    lambda status: status.diff().abs().sum()
)
# Average risk-status change per month.
agg_frame["SnittRisikoStatusEndringerPerMåned"] = (
    agg_frame["SumRisikoStatusEndringer"] / agg_frame["NumberOfMonths"]
)
# Share of rows in each risk level.
risk_counts = grouped["RisikoStatus"].value_counts().unstack(fill_value=0)
risk_props = risk_counts.div(risk_counts.sum(axis=1), axis=0)
risk_props.columns = [f"RisikoStatus_{col}_share" for col in risk_props.columns]
agg_frame = agg_frame.join(risk_props)

# Mean balance.
agg_frame["BalanceAmtMean"] = grouped["BalanceAmt"].mean()
# Standard deviation of the balance (0 when it is undefined, e.g. one row).
agg_frame["BalanceAmtStd"] = grouped["BalanceAmt"].std().fillna(0)

# Number of credit-limit increases.
agg_frame["AntallCreditLimitIncrease"] = grouped["CreditLimitIncreaseFlag"].sum()
# Number of rows flagged as collection, and the same per month.
agg_frame["AntallCollectionFlag"] = grouped["CollectionFlag"].sum()
agg_frame["SnittCollectionFlagPerMåned"] = (
    agg_frame["AntallCollectionFlag"] / agg_frame["NumberOfMonths"]
)

# Total and per-month turnover.
agg_frame["SumTurnover"] = grouped["TurnoverAmt"].sum()
agg_frame["SnittTurnover"] = agg_frame["SumTurnover"] / agg_frame["NumberOfMonths"]
# Total and per-month number of transactions.
agg_frame["TurnoverNumSum"] = grouped["TurnoverNum"].sum()
agg_frame["SnittTurnoverNum"] = (
    agg_frame["TurnoverNumSum"] / agg_frame["NumberOfMonths"]
)
# Average transaction size (0 when there are no transactions).
agg_frame["AverageTransactionSize"] = _safe_ratio(
    agg_frame["SumTurnover"], agg_frame["TurnoverNumSum"]
)

# Domestic turnover and its share of total turnover.
agg_frame["SumTurnoverDom"] = grouped["TurnoverDomAmt"].sum()
agg_frame["ShareOfTurnoverIsDomestic"] = _safe_ratio(
    agg_frame["SumTurnoverDom"], agg_frame["SumTurnover"]
)

# Withdrawals: fund transfers + ATM cash + counter cash.
agg_frame["SumFundtransfer"] = grouped["FundtransferAmt"].sum()
agg_frame["SumCashAtm"] = grouped["CashAtmAmt"].sum()
agg_frame["SumCashCounter"] = grouped["CashCounterAmt"].sum()
agg_frame["WithdrawalSum"] = (
    agg_frame["SumFundtransfer"] + agg_frame["SumCashAtm"] + agg_frame["SumCashCounter"]
)
# Withdrawals proportional to turnover.
agg_frame["WithdrawalPropOfTurnover"] = _safe_ratio(
    agg_frame["WithdrawalSum"], agg_frame["SumTurnover"]
)

# Total expenditure as a proportion of the (last) credit limit.
# NOTE(review): not zero-filled, as in the original definition; a zero
# CreditLimitAmt yields inf/NaN in this column.
agg_frame["ExpenditureAsShareOfCreditlimit"] = (
    agg_frame["SumTurnover"] + agg_frame["WithdrawalSum"]
) / agg_frame["CreditLimitAmt"]

# Mean monthly turnover share of the credit limit.
agg_frame["SnittTurnoverAndelAvKredittgrense"] = grouped[
    "AndelTurnoverAvKredittgrense"
].mean()
# Last balance as a share of the credit limit.
agg_frame["LastAndelBalanseAvKredittgrense"] = grouped[
    "AndelBalanseAvKredittgrense"
].last()
# Number of overdrafts, in total and per month.
agg_frame["AntallOvertrekk"] = grouped["Overtrekk"].sum()
agg_frame["SnittOvertrekkPerMåned"] = (
    agg_frame["AntallOvertrekk"] / agg_frame["NumberOfMonths"]
)

# Overdue amount relative to the statement closing balance.
agg_frame["OverdueShareOfDebt"] = _safe_ratio(
    agg_frame["OverdueAmt"], agg_frame["StatementClosingBalanceAmt"]
)

# Mean of the payment-overdue flag and of the revolving flag.
agg_frame["SnittPaymentOverDueFlag"] = grouped["PaymentOverDueFlag"].mean()
agg_frame["SnittRevolvingFlag"] = grouped["RevolvingFlag"].mean()

# Each spend category as a share of total turnover.
for category_col in cols_to_share_by_turnover:
    category_name = re.sub(r"^Last_", "", category_col)
    agg_frame[f"{category_name}_share_of_Turnover"] = _safe_ratio(
        agg_frame[category_col], agg_frame["SumTurnover"]
    )

# Drop the intermediate sums that only served as inputs to the ratios above.
cols_to_drop = [
    "SumTurnoverDom",
    "SumFundtransfer",
    "SumCashAtm",
    "SumCashCounter",
] + cols_to_share_by_turnover
agg_frame = agg_frame.drop(cols_to_drop, axis=1)

# One-hot encode the categorical columns and expose Collectionid as a column.
agg_frame_encoded = pd.get_dummies(
    agg_frame, columns=["Productname", "DistributorId"]
).reset_index()


# ------------------------------------------------------------
# MERGE TARGET VALUE INTO FEATURE-FRAME
# ------------------------------------------------------------
# Attach the LGD of each collection to its feature row (left merge, so every
# feature row is kept), index by collection id and order the rows by person.
lgd_by_collection = target_frame.reset_index()[["Collectionid", "LGD"]]
feature_target_frame = (
    agg_frame_encoded.merge(lgd_by_collection, how="left", on="Collectionid")
    .set_index("Collectionid", drop=True)
    .sort_values(by="PersonId", ascending=True)
)
feature_target_frame

In [None]:
# Keep only collections whose LGD lies strictly between 0 and 1, then remove
# the PersonId column from the modelling frame.
lgd_values = feature_target_frame["LGD"]
mask = lgd_values.gt(0) & lgd_values.lt(1)
feature_target_frame = feature_target_frame[mask].drop(columns=["PersonId"])
feature_target_frame

In [None]:
# Feature matrix: every column except the target, with boolean columns cast
# to integers.
X = feature_target_frame.drop(columns=["LGD"])
bool_columns = [name for name, dtype in X.dtypes.items() if dtype == "bool"]
X = X.astype({name: int for name in bool_columns})

# Target vector.
y = feature_target_frame["LGD"]

# Reproducible 80/20 train/test split.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
def _uses_scaled_features(model_name):
    """Return True for the linear models, which are fitted on standardized features."""
    return any(tag in model_name for tag in ("Ridge", "Lasso", "Elastic"))


# Standardize using statistics from the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sizes reported in the messages below are taken from the data instead of
# being hard-coded, so they stay correct when the dataset changes.
n_features = X_train.shape[1]
n_samples = len(X_train) + len(X_test)

# ============================================
# Try multiple regularized models using ALL features
# ============================================
print(f"\n=== Training Multiple Models (ALL {n_features} features) ===")

models = {
    "Ridge (alpha=1)": Ridge(alpha=1.0),
    "Ridge (alpha=10)": Ridge(alpha=10.0),
    "Ridge (alpha=50)": Ridge(alpha=50.0),
    "Lasso (alpha=0.001)": Lasso(alpha=0.001, max_iter=10000),
    "Lasso (alpha=0.01)": Lasso(alpha=0.01, max_iter=10000),
    "ElasticNet (0.5)": ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=10000),
    "ElasticNet (0.7)": ElasticNet(alpha=0.01, l1_ratio=0.7, max_iter=10000),
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=30,
        max_depth=2,
        learning_rate=0.1,
        subsample=0.7,
        max_features=0.3,
        random_state=42,
    ),
    "XGBoost": XGBRegressor(
        n_estimators=40,
        max_depth=2,
        learning_rate=0.08,
        subsample=0.7,
        colsample_bytree=0.3,
        reg_alpha=2.0,
        reg_lambda=7.0,
        random_state=42,
    ),
    "RandomForest": RandomForestRegressor(
        n_estimators=100,
        max_depth=3,
        min_samples_leaf=10,
        max_features=0.3,
        random_state=42,
    ),
}

results = []

for name, model in models.items():
    # Linear models need scaled data, tree models don't.
    if _uses_scaled_features(name):
        train_inputs, test_inputs = X_train_scaled, X_test_scaled
    else:
        train_inputs, test_inputs = X_train, X_test

    model.fit(train_inputs, y_train)
    y_train_pred = model.predict(train_inputs)
    y_test_pred = model.predict(test_inputs)

    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_mae = mean_absolute_error(y_test, y_test_pred)

    results.append(
        {
            "Model": name,
            "Train R²": train_r2,
            "Test R²": test_r2,
            "R² Diff": train_r2 - test_r2,
            "Test RMSE": test_rmse,
            "Test MAE": test_mae,
        }
    )

# ============================================
# Display results sorted by test R²
# ============================================
results_df = pd.DataFrame(results).sort_values("Test R²", ascending=False)

print("\n" + "=" * 90)
print("MODEL COMPARISON (sorted by Test R²)")
print("=" * 90)
print(results_df.to_string(index=False))

# Find best model.
# NOTE(review): the winner is chosen on the test split, so its reported test
# scores are optimistically biased; consider selecting with cross-validation
# on the training split instead.
best_row = results_df.iloc[0]
best_model_name = best_row["Model"]
best_test_r2 = best_row["Test R²"]
best_diff = best_row["R² Diff"]

print("\n" + "=" * 90)
print(f"BEST MODEL: {best_model_name}")
print("=" * 90)
print(f"  Test R²:        {best_test_r2:.4f}")
print(f"  R² Difference:  {best_diff:.4f}")

if best_diff > 0.15:
    print(
        f"  ⚠️  Still overfitting, but this is the best we can do with {n_samples} samples"
    )
elif best_diff > 0.05:
    print("  ⚠️  Some overfitting (acceptable for small datasets)")
else:
    print("  ✓  Good generalization")

# ============================================
# Train final model with best performer
# ============================================
print("\n=== Training Final Model ===")

# Retrain the best model so that y_test_pred holds its predictions (after the
# loop above it still holds those of the last model in `models`).
best_model = models[best_model_name]
best_is_linear = _uses_scaled_features(best_model_name)

if best_is_linear:
    best_model.fit(X_train_scaled, y_train)
    y_test_pred = best_model.predict(X_test_scaled)

    # Show feature coefficients for linear models.
    coefs = pd.DataFrame(
        {"feature": X_train.columns, "coefficient": best_model.coef_}
    ).sort_values("coefficient", key=abs, ascending=False)

    print("\nTop 15 features by absolute coefficient:")
    for feature, coefficient in coefs.head(15).itertuples(index=False, name=None):
        print(f"  {feature:45s} {coefficient:8.4f}")
else:
    best_model.fit(X_train, y_train)
    y_test_pred = best_model.predict(X_test)

    # Show feature importance for tree models.
    if hasattr(best_model, "feature_importances_"):
        importances = pd.DataFrame(
            {"feature": X_train.columns, "importance": best_model.feature_importances_}
        ).sort_values("importance", ascending=False)

        print("\nTop 15 most important features:")
        for feature, importance in importances.head(15).itertuples(
            index=False, name=None
        ):
            print(f"  {feature:45s} {importance:8.4f}")

# ============================================
# Prediction distribution analysis
# ============================================
print(f"\n{'='*90}")
print("PREDICTION DISTRIBUTION")
print(f"{'='*90}")
print("\nActual y_test distribution:")
print(f"  Min:    {y_test.min():.4f}")
print(f"  Q1:     {y_test.quantile(0.25):.4f}")
print(f"  Median: {y_test.median():.4f}")
print(f"  Q3:     {y_test.quantile(0.75):.4f}")
print(f"  Max:    {y_test.max():.4f}")

print("\nPredicted y_test distribution:")
print(f"  Min:    {y_test_pred.min():.4f}")
print(f"  Q1:     {np.percentile(y_test_pred, 25):.4f}")
print(f"  Median: {np.median(y_test_pred):.4f}")
print(f"  Q3:     {np.percentile(y_test_pred, 75):.4f}")
print(f"  Max:    {y_test_pred.max():.4f}")

print("\n✓ Pipeline complete!")
print("\nKey insight: Using ALL features with strong regularization")
print("             performs better than aggressive feature selection.")
print("\nTo use the best model on new data:")
if best_is_linear:
    print("  1. Scale: X_new_scaled = scaler.transform(X_new)")
    print("  2. Predict: predictions = best_model.predict(X_new_scaled)")
else:
    print("  predictions = best_model.predict(X_new)")