# Sequence-based Person Clustering
Ønsker å clustere personer basert på oppførselsmønsteret de viser i kontodataene.

In [None]:
%load_ext autoreload
%autoreload 2

# Shared figure size for plots.
# NOTE(review): presumably (width, height) in inches as matplotlib expects; it is
# not referenced by any cell visible here — confirm it is still needed.
figsize=(14, 4)
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils import file, plot, data, stat
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Both input CSV files are read from the same folder, given relative to the
# current working directory.
data_folder = os.path.join('../../..', 'data/prod')
file_name_collection, file_name_konto = "Collection_data.csv", "konto_data_trimmed.csv"
file_path_collection, file_path_konto = (
    os.path.join(data_folder, csv_name)
    for csv_name in (file_name_collection, file_name_konto)
)

In [None]:
# Load the account ("konto") table first, then the collection table, using the
# project loaders from utils.file.
konto_frame = file.load_konto_data(file_path_konto)
collection_frame = file.load_collection_data(file_path_collection)

In [None]:
# Display the (max, min) range of the account balance column.
balance_amounts = konto_frame['BalanceAmt']
balance_amounts.max(), balance_amounts.min()

In [None]:
# Show up to 40 collection rows belonging to PersonId 84.
# (Alternative: collection_frame.head(40) to peek at the whole table.)
person_84_rows = collection_frame["PersonId"] == 84
collection_frame.loc[person_84_rows].head(40)

In [None]:
# Build one normalized single-column sequence per person for the first n
# distinct PersonIds found in the account data.
n = 10
interesting_column = "TurnoverNum"
first_n_ids = konto_frame["PersonId"].unique()[:n]

# Each entry holds that person's rows (PersonId + the chosen column) with a
# fresh 0..k index, passed through stat.z_normalize on the chosen column.
# (To flip the sign of the series, multiply the column by -1 before normalizing.)
dfs = [
    stat.z_normalize(
        konto_frame.loc[
            konto_frame["PersonId"] == person_id, ["PersonId", interesting_column]
        ].reset_index(drop=True),
        interesting_column,
    )
    for person_id in first_n_ids
]
# Optional visual check of the sequences:
# plot.multi_linear_step_plot(dfs, interesting_column, "PersonId")

`pairwise_similarity_stepwise` er $O(n^2 \cdot m)$, hvor $n$ er antall personer og $m$ er lengden på sekvensen. `symmetric_matrix=False` halverer antall operasjoner, men minnebruket blir det samme uansett: det lagres en float på hver posisjon i matrisen, og å lagre NaN i stedet for en verdi påvirker ikke minnebruket.

| n     | Tid                        | Minne    |
|-------|----------------------------|----------|
| 100   | 0.0s                       | 78.9 KB  |
| 200   | 0.3s                       | 314.1 KB |
| 250   | 0.5s                       | 490.2 KB |
| 300   | 0.7s                       | 705.5 KB |
| 400   | 1.2s                       | 1.2 MB   |
| 500   | 1.9s                       | 1.9 MB   |
| 750   | 4.3s                       | 4.3 MB   |
| 1000  | 7.9s                       | 7.6 MB   |
| 1250  | 12.6s                      | 11.9 MB  |
| 1500  | 17.7s                      | 17.2 MB  |
| 2000  | 31.8s                      | 30.5 MB  |
| 2500  | 50.0s                      | 47.7 MB  |
| 3000  | 1m 12.5s                   | 68.7 MB  |
| 5000  | 7m 14.7s (usikker måling)  | 190.8 MB |
| 10000 | 40m 16.5s (usikker måling) | 763 MB   |



In [None]:
# Pairwise stepwise similarity between the per-person sequences built above.
# The result is indexed by PersonId on both axes (it is later drawn as a heatmap).
sim_df = stat.pairwise_similarity_stepwise(
    dfs,
    value_col=interesting_column,
    id_col="PersonId",
    symmetric_matrix=False,
)

In [None]:
# Draw the single-column similarity matrix as a heatmap.
# NOTE(review): drop_0 presumably hides zero-valued cells — confirm in utils.plot.
plot.heatmap_plot(sim_df, drop_0=True)

In [None]:
# Pull the raw (un-normalized) sequences of two specific persons and line them
# up row by row for manual comparison.
person_ids = konto_frame["PersonId"]
s1 = konto_frame.loc[person_ids == 1359, interesting_column].reset_index(drop=True)
s2 = konto_frame.loc[person_ids == 2065, interesting_column].reset_index(drop=True)

# Sequences of different length are padded with NaN by index alignment.
df_side_by_side = pd.DataFrame({"First sequence": s1, "Second sequence": s2})

df_side_by_side

In [None]:
# Display the raw sequence of the selected column for PersonId 1501.
konto_frame.loc[konto_frame["PersonId"] == 1501, interesting_column]

### Multi-column stepwise similarity

In [None]:
# Compute one pairwise similarity matrix per account column for the first n
# distinct persons. sim_dfs[k] corresponds to cols[k].
cols = ['BalanceAmt', 'TurnoverAmt', 'TurnoverNum', 'OverdueAmt', 'StatementEffectivePaymentsAmt']
id_col = "PersonId"
n = 30
first_n_ids = konto_frame[id_col].unique()[:n]

# Restrict the account table to the selected persons once. The per-person
# filters below then scan only these rows instead of the full table for every
# (column, person) combination; row order and content are unchanged.
selected_rows = konto_frame[konto_frame[id_col].isin(first_n_ids)]

sim_dfs = []
for col in cols:
    # One frame per person: id + current column, re-indexed from 0 and passed
    # through stat.z_normalize so sequences are compared on a common scale.
    col_dfs = [
        stat.z_normalize(
            selected_rows[selected_rows[id_col] == person_id][[id_col, col]].reset_index(drop=True),
            col,
        )
        for person_id in first_n_ids
    ]
    sim_df = stat.pairwise_similarity_stepwise(col_dfs, value_col=col, id_col=id_col, symmetric_matrix=False)
    sim_dfs.append(sim_df)

In [None]:
# One heatmap per column, titled with the column name it was computed from.
for column_name, column_similarity in zip(cols, sim_dfs):
    plot.heatmap_plot(column_similarity, title=column_name, drop_0=True)

# Element-wise mean of the per-column similarity matrices.
similarity_total = sum(sim_dfs)
avg_df = similarity_total / len(sim_dfs)
plot.heatmap_plot(avg_df, title="Average", drop_0=True)

In [None]:
# Place the raw StatementEffectivePaymentsAmt sequences of persons 1365 and
# 1605 next to each other; shorter sequences are NaN-padded by index alignment.
df_side_by_side = pd.DataFrame({
    label: konto_frame.loc[
        konto_frame[id_col] == person_id, "StatementEffectivePaymentsAmt"
    ].reset_index(drop=True)
    for label, person_id in (("First sequence", 1365), ("Second sequence", 1605))
})

df_side_by_side

In [None]:
# Reduce the collection data to one row per person and derive loss indicators.

# A case without a close date is treated as still open.
collection_frame['Status'] = np.where(collection_frame['CollectionClosedDate'].isna(), 'PENDING', 'COMPLETED')

# Keep the last row of each person (in the frame's current row order).
# tail(1) returns a row-filtered selection of the original frame; the explicit
# .copy() makes it an independent frame so the column assignments below do not
# raise SettingWithCopyWarning or depend on view/copy semantics.
collection_frame = collection_frame.groupby(['PersonId'], as_index=False).tail(1).copy()

# Loss = CumulativeLossAmt as a percentage of BalanceSentAmt, rounded to 2 decimals.
collection_frame['Loss'] = (collection_frame['CumulativeLossAmt']/collection_frame['BalanceSentAmt'] * 100).round(2)

# 1 when any loss was recorded, else 0. A NaN loss compares False and becomes 0.
collection_frame["LossBool"] = (collection_frame["Loss"] > 0).astype(int)

# Show the 40 persons with the highest loss percentage.
collection_frame[["PersonId", "Loss"]].sort_values(by=["Loss"], ascending=False).head(40)

In [None]:
# Show up to 40 collection rows for PersonId 794010.
collection_frame.loc[collection_frame["PersonId"] == 794010].head(40)

In [None]:
# Loss flag for the persons that were part of the similarity computation.
in_selection = collection_frame["PersonId"].isin(first_n_ids)
col_frame = collection_frame.loc[in_selection, ["PersonId", "LossBool"]]
col_frame.head(40)


In [None]:
# Rank person pairs by their averaged similarity value, highest first.

# Blank out the diagonal (each person paired with itself). DataFrame.mask
# returns a new frame, so this does not rely on `.values` being a writable
# view of the data: under pandas Copy-on-Write that array is read-only and
# np.fill_diagonal on it raises, and for mixed dtypes it is a copy whose
# modification would be silently lost.
self_pairs = np.eye(*avg_df.shape, dtype=bool)
matrix = avg_df.mask(self_pairs)

# unstack() turns the matrix into a Series keyed by (column label, row label);
# NaN entries (the diagonal and any cell that was already NaN) are dropped.
stacked = matrix.unstack().dropna()
stacked = stacked.sort_values(ascending=False)

# NOTE: `n` is the notebook-level person count set in an earlier cell, so
# despite the name this keeps the first n pairs, not necessarily ten.
top10 = stacked.head(n)
top10_df = top10.reset_index()
top10_df.columns = ['person1', 'person2', 'correlation']

print(top10_df)

In [None]:
# Attach each person's loss flag to both sides of every ranked pair.
loss_of_person1 = col_frame.rename(columns={'PersonId': 'person1', 'LossBool': 'LossBool1'})
loss_of_person2 = col_frame.rename(columns={'PersonId': 'person2', 'LossBool': 'LossBool2'})

# Left joins keep every pair; a person without a row in col_frame gets NaN.
merged = top10_df.merge(loss_of_person1, on='person1', how='left')
merged = merged.merge(loss_of_person2, on='person2', how='left')

# True when both persons carry the same loss flag. NaN never compares equal,
# so a pair with a missing flag on either side is counted as False.
merged['SameLossBool'] = merged['LossBool1'].eq(merged['LossBool2'])

merged.head(40)

In [None]:
# Running share (in %) of pairs with matching loss flags among the top-k pairs,
# for k = 1 .. number of ranked pairs.
running_match_rate = merged['SameLossBool'].expanding().mean()
merged['cumulative_accuracy'] = running_match_rate * 100  # in %

# Plot the running accuracy against the number of pairs included.
pair_counts = range(1, len(merged) + 1)
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(pair_counts, merged['cumulative_accuracy'], marker='o')
ax.set_title('Accuracy of SameLossBool vs. Number of Top Correlations')
ax.set_xlabel('Number of Top Correlation Pairs (N)')
ax.set_ylabel('Accuracy (%)')
ax.grid(True)
plt.show()