# Data Analysis 2 – Login Feature Clustering

This notebook loads the transaction and behaviour datasets, extracts the specified login‑related features, computes descriptive statistics per class, fits a KMeans clustering model on the standardised features, and visualises the results using a 2‑D PCA projection.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Notebook-wide plotting defaults: seaborn grid style and a default figure size
# (individual cells below override the size via their own figsize arguments)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 8)

In [None]:
# Paths (adjust if needed)
# Both are relative paths, so they are resolved against the kernel's
# current working directory, not against this notebook's location.
TX_PATH = '../docs/транзакции в Мобильном интернет Банкинге.csv'
BEH_PATH = '../docs/поведенческие паттерны клиентов.csv'

def load_csv(path):
    """Read a delimited text file whose real header is on the second line.

    The first physical line of the file is skipped and the second one is used
    as the column names (``header=1``). Several encoding / separator
    combinations are tried and the first one that yields more than one column
    wins.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    ValueError
        If the file exists but no encoding / separator combination produces a
        table with more than one column. The last parser error is chained.
    """
    import os  # local import keeps this cell independent of the imports cell

    # Fail fast with an accurate message instead of retrying nine times and
    # then blaming every kind of failure on a missing file.
    if not os.path.exists(path):
        raise FileNotFoundError(f'Unable to read {path}: file does not exist')

    last_error = None
    # UTF-8 must be tried before cp1251: cp1251 accepts almost any byte
    # sequence, so a UTF-8 file would otherwise be "successfully" decoded into
    # mojibake. 'utf-8-sig' also strips a leading BOM. latin1 stays last as a
    # catch-all because it never fails to decode.
    for enc in ('utf-8-sig', 'cp1251', 'latin1'):
        for sep in (';', '\t', ','):
            try:
                df = pd.read_csv(path, encoding=enc, sep=sep, engine='python', header=1)
            except OSError:
                # Permission / I/O problems will not be fixed by another encoding.
                raise
            except Exception as exc:
                # Wrong encoding or separator for this attempt; remember why
                # and move on to the next combination.
                last_error = exc
                continue
            if df.shape[1] > 1:
                return df
    raise ValueError(
        f'Unable to parse {path}: no encoding/separator combination produced more than one column'
    ) from last_error

tx = load_csv(TX_PATH)
beh = load_csv(BEH_PATH)

# Strip stray leading/trailing whitespace from every header of both frames
for frame in (tx, beh):
    frame.columns = [str(label).strip() for label in frame.columns]

FileNotFoundError: Unable to read ../docs/транзакции в Мобильном интернет Банкинге.csv

In [3]:
# Identify customer id column (cst_dim_id is the canonical name)
def find_id(cols):
    """Pick the column that most likely holds the customer identifier.

    Matching is case-insensitive and runs in two passes over the candidate
    names, in priority order:

    1. exact match of the (whitespace-stripped) column name;
    2. substring match.

    The exact pass runs first so that a real ``id`` column is not shadowed by
    an unrelated column that merely contains the letters (e.g. ``paid_amount``).

    Parameters
    ----------
    cols : iterable
        Column labels to search (e.g. ``DataFrame.columns``).

    Returns
    -------
    The matching element of ``cols``; the first column if nothing matches.

    Raises
    ------
    IndexError
        If ``cols`` is empty.
    """
    candidates = ['cst_dim_id', 'cust', 'client', 'customer', 'id']
    cols = list(cols)
    if not cols:
        raise IndexError('find_id: no columns to search for a customer id')

    # Pair each normalised name with the original label so the original
    # object is what gets returned.
    normalised = [(str(c).strip().lower(), c) for c in cols]

    for cand in candidates:
        for name, c in normalised:
            if name == cand:
                return c
    for cand in candidates:
        for name, c in normalised:
            if cand in name:
                return c
    # Nothing looked like an id: fall back to the first column.
    return cols[0]

# Resolve the id column of each frame, then expose it under the common
# name 'cust_id' as a whitespace-stripped string so the frames can be joined.
cust_tx, cust_beh = find_id(tx.columns), find_id(beh.columns)
for frame, id_col in ((tx, cust_tx), (beh, cust_beh)):
    frame['cust_id'] = frame[id_col].astype(str).str.strip()

In [None]:
# Find target label – prefer a column named exactly like one of the aliases,
# otherwise accept a column whose name merely contains one of them
def find_target(cols):
    """Return the column that holds the class label, or ``None``.

    Aliases are tried in priority order (``target``, ``is_fraud``, ``fraud``,
    ``label``), case-insensitively. An exact match on the whitespace-stripped
    name is searched first across all aliases; only if none exists is a
    substring match accepted. This keeps a derived column such as
    ``target_enc`` from shadowing the real ``target`` column.

    Parameters
    ----------
    cols : iterable
        Column labels to search (e.g. ``DataFrame.columns``).

    Returns
    -------
    The matching element of ``cols``, or ``None`` when nothing matches.
    """
    aliases = ['target', 'is_fraud', 'fraud', 'label']
    normalised = [(str(c).strip().lower(), c) for c in cols]

    for cand in aliases:
        for name, c in normalised:
            if name == cand:
                return c
    for cand in aliases:
        for name, c in normalised:
            if cand in name:
                return c
    return None

target_col = find_target(beh.columns)
print(f"Detected target column: {target_col}")

if target_col is not None:
    # The column should already contain 0/1 values. Coerce defensively:
    # a plain astype(int) raises on NaN or non-numeric text, so parse with
    # to_numeric first and treat anything unparseable as 0, which matches
    # the fillna(0) applied to unmatched customers after the merge.
    target_numeric = pd.to_numeric(beh[target_col], errors='coerce')
    n_unparsed = int(target_numeric.isna().sum())
    if n_unparsed:
        print(f'Warning: {n_unparsed} target values were missing or non-numeric and were set to 0')
    beh['target'] = target_numeric.fillna(0).astype(int)
else:
    beh['target'] = 0
    print('Warning: target column not found – all rows set to 0')

Detected target column: None


: 

In [None]:
# List of login‑related features required by the user
login_features = [
    'logins_last_7_days', 'logins_last_30_days',
    'login_frequency_7d', 'login_frequency_30d',
    'freq_change_7d_vs_mean', 'logins_7d_over_30d_ratio',
    'avg_login_interval_30d', 'std_login_interval_30d', 'var_login_interval_30d', 'ewm_login_interval_7d',
    'burstiness_login_interval', 'fano_factor_login_interval', 'zscore_avg_login_interval_7d'
]

# Lower-cased column name -> actual column label in tx. setdefault keeps the
# first column when two labels differ only by case.
tx_columns_by_lower = {}
for c in tx.columns:
    tx_columns_by_lower.setdefault(str(c).lower(), c)

# Keep only columns that actually exist (case‑insensitive match), using the
# label as it is spelled in tx so later tx[...] lookups succeed.
present_features = [
    tx_columns_by_lower[f.lower()]
    for f in login_features
    if f.lower() in tx_columns_by_lower
]
missing_features = [f for f in login_features if f.lower() not in tx_columns_by_lower]

print('Features found:', present_features)
if missing_features:
    # Surface silently dropped features so a schema change is noticed.
    print('Features missing:', missing_features)

In [None]:
# Subset and coerce to numeric
tx_sub = tx[['cust_id'] + present_features].copy()
for col in present_features:
    # Values may use a decimal comma; swap it for a dot (literal replace, not
    # a regex) and turn anything that still is not a number into NaN.
    tx_sub[col] = pd.to_numeric(
        tx_sub[col].astype(str).str.replace(',', '.', regex=False),
        errors='coerce',
    )

# Merge with behaviour (target)
# NOTE(review): if beh holds more than one row per cust_id this left join
# repeats the matching tx rows — confirm beh is unique per customer.
merged = tx_sub.merge(beh[['cust_id', 'target']], on='cust_id', how='left')
# Customers without a behaviour row get target 0
merged['target'] = merged['target'].fillna(0).astype(int)

print('Merged shape:', merged.shape)

In [None]:
# Descriptive statistics per class
stats = merged.groupby('target')[present_features].describe().transpose()
display(stats)

# Simple summary table (mean per class)
# groupby sorts the classes, so the columns come out as 0 then 1. Rename by
# value rather than by position: 0 is the non-fraud class, 1 is fraud. A
# mapping also keeps working when only one class is present in the data.
summary = (
    merged.groupby('target')[present_features]
    .mean()
    .T
    .rename(columns={0: 'Non‑Fraud', 1: 'Fraud'})
)
display(summary)

In [None]:
# Clustering pipeline: median-impute -> standardise -> (PCA for plotting, KMeans for labels)
# Every intermediate object is kept as a notebook-level variable.
X = merged[present_features].values
# Replace missing feature values with the per-column median
imp = SimpleImputer(strategy='median')
X_imp = imp.fit_transform(X)
# Standardise each feature so that no single feature dominates the distance metric
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imp)

# PCA for 2‑D visualisation
# The two components are stored on `merged` for the scatter plots below;
# they are not used as input to KMeans.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)
merged['pca1'] = X_pca[:, 0]
merged['pca2'] = X_pca[:, 1]

# KMeans (k=4)
# Fitted on the full standardised feature matrix, not on the PCA projection.
# Fixed random_state and n_init make the cluster assignment repeatable.
k = 4
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
merged['cluster'] = kmeans.fit_predict(X_scaled)

print('Clustering completed. Cluster counts:')
print(merged['cluster'].value_counts())

In [None]:
# Visualise PCA coloured by fraud label
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=merged, x='pca1', y='pca2', hue='target',
    # Keyed by class value so each colour stays tied to its class even when
    # only one class is present (green = non-fraud, red = fraud).
    palette={0: '#2ecc71', 1: '#e74c3c'},
    alpha=0.7, ax=ax,
)
ax.set_title('PCA of Login Features – Fraud vs Non‑Fraud')
# Rename the legend entries through the handles seaborn created. Passing only
# labels=[...] to plt.legend would relabel artists by position and detach the
# legend from the hue levels.
target_names = {'0': 'Non‑Fraud', '1': 'Fraud'}
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, [target_names.get(lbl, lbl) for lbl in labels], title='User Type')
plt.show()

In [None]:
# Scatter of the two PCA components, one colour per KMeans cluster
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=merged,
    x='pca1',
    y='pca2',
    hue='cluster',
    palette='tab10',
    alpha=0.7,
    ax=ax,
)
ax.set_title('PCA of Login Features – KMeans Clusters')
plt.show()

In [None]:
# Bar chart: one group of bars per feature, one bar per cluster (mean value)
cluster_means = merged.groupby('cluster')[present_features].mean()
ax = cluster_means.T.plot(kind='bar', figsize=(12, 6))
ax.set_title('Mean Login Features per Cluster')
ax.set_ylabel('Mean value')
ax.legend(title='Cluster')
ax.figure.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between the selected login features, drawn as an
# annotated heatmap with a diverging colour map centred on zero
corr = merged[present_features].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Save artefacts
# The summary file name is derived from k so it stays truthful when the number
# of clusters is changed in the clustering cell (k=4 gives the same name as before).
merged_path = 'login_features_merged.csv'
summary_path = f'cluster_summary_k{k}.csv'

# Row-level table: features, target, PCA coordinates and cluster label
merged.to_csv(merged_path, index=False)
# Per-cluster mean of every login feature (cluster id is kept as the index column)
cluster_summary = merged.groupby('cluster')[present_features].mean()
cluster_summary.to_csv(summary_path)
print(f'Files saved: {merged_path}, {summary_path}')