In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes (edits to the local `utils` package are
# picked up without restarting the kernel).
%load_ext autoreload
%autoreload 2

# NOTE(review): not referenced in the visible cells — presumably a
# (width, height) figure size for matplotlib; confirm where it is consumed.
figsize=(14, 4)
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils import file, plot, data, stat, models
# Never truncate columns when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

# Input files live in the production data folder, addressed relative to
# the notebook's working directory.
data_folder = os.path.join('../../..', 'data/prod')
file_name_collection, file_name_konto = "Collection_data.csv", "konto_data_trimmed.csv"

# Full paths handed to the loaders.
file_path_collection = os.path.join(data_folder, file_name_collection)
file_path_konto = os.path.join(data_folder, file_name_konto)

In [None]:
# Load both inputs through the project loaders. A single tuple assignment is
# used so that neither name is bound unless both loads succeed.
konto_frame, collection_frame = (
    file.load_konto_data(file_path_konto),
    file.load_collection_data(file_path_collection),
)

In [None]:
# Baseline similarity model: z-score normalisation, cosine similarity,
# aggregated with the 'mean-ignore-0' method.
seq_sim = models.SequenceSimilarity(
    normalization_method="z-score",
    similarity_method="cosine",
    aggregation_method='mean-ignore-0',
)
# Fit on the account frame using five sequence columns keyed by PersonId.
# NOTE(review): n_unique_ids=20 presumably limits the fit to 20 distinct ids —
# confirm against utils.models.
seq_sim.fit(
    konto_frame,
    sequence_cols=[
        'BalanceAmt',
        'TurnoverAmt',
        'TurnoverNum',
        'OverdueAmt',
        'StatementEffectivePaymentsAmt',
    ],
    id_col="PersonId",
    n_unique_ids=20,
)

In [None]:
# Plot the similarities held by the `seq_sim` model, using the plot's default title.
seq_sim.plot_similarities()

In [None]:
# Mapping of id -> correlations, as returned by the model for cap_value=0.5.
# NOTE(review): the exact meaning of the cap is defined in utils.models — confirm.
top_corrs = seq_sim.get_capped_largest_correlations_dict(cap_value=0.5)

# One header line per id followed by its correlations; a single print with a
# newline separator emits exactly the same text as two consecutive prints.
for pid, series in top_corrs.items():
    print(f"\nTop correlations for {pid}:", series, sep="\n")

In [None]:
# Derive a per-person loss label from the collection cases.

# A case without a close date is still open.
collection_frame["Status"] = np.where(
    collection_frame["CollectionClosedDate"].isna(), "PENDING", "COMPLETED"
)

# Keep only the last row per PersonId, in the frame's current row order.
# NOTE(review): there is no explicit sort here — confirm the loader returns
# rows in chronological order. `.copy()` makes the subset an independent frame
# before new columns are written to it.
collection_frame = (
    collection_frame.groupby(["PersonId"], as_index=False).tail(1).copy()
)

# Loss as a percentage of the balance that was sent to collection.
loss_pct = (
    collection_frame["CumulativeLossAmt"] / collection_frame["BalanceSentAmt"] * 100
)
collection_frame["Loss"] = loss_pct.round(2)

# Flag from the UNROUNDED percentage: testing the rounded value would label a
# real loss below 0.005 % of the balance (which rounds to 0.0) as "no loss".
collection_frame["LossBool"] = (loss_pct > 0).astype(int)

# Labels for the ids the similarity model actually processed.
col_frame = collection_frame.loc[
    collection_frame["PersonId"].isin(seq_sim.get_processed_ids()),
    ["PersonId", "LossBool"],
]
col_frame.head(40)

In [None]:
# TurnoverAmt sequences for every id the similarity model processed.
data.get_sequences_side_by_side_id_based(
    konto_frame,
    seq_sim.get_processed_ids(),
    "TurnoverAmt",
)

In [None]:
# All five sequence columns for two hard-coded ids
# (NOTE(review): presumably PersonId values picked from the plots above — confirm).
data.get_sequences_side_by_side_id_and_columns(
    konto_frame,
    [51210, 1699128],
    [
        'BalanceAmt',
        'TurnoverAmt',
        'TurnoverNum',
        'OverdueAmt',
        'StatementEffectivePaymentsAmt',
    ],
)

In [None]:
# Derive a per-person loss label from the collection cases.
# NOTE(review): this cell repeats the label derivation of an earlier cell in
# this notebook; consider keeping only one of them.

# A case without a close date is still open.
collection_frame["Status"] = np.where(
    collection_frame["CollectionClosedDate"].isna(), "PENDING", "COMPLETED"
)

# Keep only the last row per PersonId, in the frame's current row order.
# NOTE(review): there is no explicit sort here — confirm the loader returns
# rows in chronological order. `.copy()` makes the subset an independent frame
# before new columns are written to it.
collection_frame = (
    collection_frame.groupby(["PersonId"], as_index=False).tail(1).copy()
)

# Loss as a percentage of the balance that was sent to collection.
loss_pct = (
    collection_frame["CumulativeLossAmt"] / collection_frame["BalanceSentAmt"] * 100
)
collection_frame["Loss"] = loss_pct.round(2)

# Flag from the UNROUNDED percentage: testing the rounded value would label a
# real loss below 0.005 % of the balance (which rounds to 0.0) as "no loss".
collection_frame["LossBool"] = (loss_pct > 0).astype(int)

# Labels for the ids the similarity model actually processed.
col_frame = collection_frame.loc[
    collection_frame["PersonId"].isin(seq_sim.get_processed_ids()),
    ["PersonId", "LossBool"],
]
col_frame.head(40)

In [None]:
import itertools

# Parameter grid: every normalisation x similarity x aggregation combination
# (5 * 4 * 5 = 100 runs).
normalization_methods = ["z-score", "min-max", "robust", "mean", "none"]
similarity_methods = ["pearson", "cosine", "euclidean", "manhattan"]
aggregation_methods = ["mean", "mean-ignore-0", "median", "max", "min"]

# Columns compared between ids, and the column that identifies an id.
sequence_cols = [
    "BalanceAmt",
    "TurnoverAmt",
    "TurnoverNum",
    "OverdueAmt",
    "StatementEffectivePaymentsAmt",
]
id_col = "PersonId"

# NOTE(review): each iteration rebinds the notebook-level `seq_sim`, so after
# this cell it refers to the last successfully constructed combination rather
# than the model fitted earlier in the notebook — confirm that is intended.
for norm, sim, agg in itertools.product(
    normalization_methods, similarity_methods, aggregation_methods
):
    print(
        f"\n=== Running combination: normalization={norm}, similarity={sim}, aggregation={agg} ==="
    )

    # Best-effort sweep: a failing combination is reported and skipped so the
    # remaining combinations still run.
    try:
        seq_sim = models.SequenceSimilarity(
            normalization_method=norm, similarity_method=sim, aggregation_method=agg
        )
        seq_sim.fit(
            konto_frame, sequence_cols=sequence_cols, id_col=id_col, n_unique_ids=20
        )

        # Upper-cased "NORM | SIM | AGG" label for the figure.
        title = " | ".join(part.upper() for part in (norm, sim, agg))
        seq_sim.plot_similarities(title=title)
    except Exception as err:
        print(f"⚠️ Skipped due to error: {err}")