<a href="https://colab.research.google.com/github/krell11/customer_churn_prediction/blob/master/ML_Cource.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from google.colab import drive

# Make Google Drive reachable under /content/drive for the rest of the notebook.
drive.mount('/content/drive')

# Sanity check: show what is stored in the Colab Notebooks folder.
print("Contents of '/content/drive/MyDrive/Colab Notebooks/':")
notebook_dir_entries = os.listdir('/content/drive/MyDrive/Colab Notebooks/')
print(notebook_dir_entries)

In [None]:
# Folder on the mounted Google Drive that holds the CSV files read below.
# The trailing slash matters: file names are appended with plain `+`.
PATH = "/content/drive/MyDrive/Colab Notebooks/datasets/"

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Full paths of the four source tables.
clients_csv = PATH + 'clients.csv'
report_dates_csv = PATH + 'report_dates.csv'
train_csv = PATH + 'train.csv'
transactions_csv = PATH + 'transactions.csv'

# Only the date-like columns are parsed while reading.
clients = pd.read_csv(clients_csv)
report_dates = pd.read_csv(report_dates_csv, parse_dates=['report_dt'])
train_labels = pd.read_csv(train_csv)
transactions = pd.read_csv(transactions_csv, parse_dates=['transaction_dttm'])



In [None]:
# Attach the report date to every client via the `report` key.
#
# The cell overwrites `clients`, so a plain merge is not re-runnable: a second
# run would find the report_dates columns already present and produce `_x`/`_y`
# suffixed duplicates. Dropping those columns first makes the cell idempotent
# (on the first run nothing is dropped).
report_value_cols = report_dates.columns.difference(['report'])
clients = (
    clients
    .drop(columns=report_value_cols, errors='ignore')
    # validate= fails loudly if `report` is duplicated in report_dates, which
    # would otherwise silently multiply client rows.
    .merge(report_dates, on='report', how='left', validate='many_to_one')
)

In [None]:
# Preview the client table after the report dates have been merged in.
clients

In [None]:
# Keep only transactions whose user_id is present in `clients`, carrying each
# client's report date along with every transaction row.
client_report_dates = clients[['user_id', 'report_dt']]
tx = pd.merge(transactions, client_report_dates, on='user_id', how='inner')

In [None]:
# Whole days between a transaction and the client's report date.
# NOTE(review): transactions dated after report_dt would give negative values
# here — confirm the data contains none.
time_to_report = tx['report_dt'] - tx['transaction_dttm']
tx['days_before'] = time_to_report.dt.days

In [None]:
# Preview the transactions joined with report dates and the derived days_before column.
tx

In [None]:
# Per-user transaction aggregates. Named aggregation ties every output column
# to its source column and function, so no positional renaming is needed.
aggs = tx.groupby('user_id').agg(
    recency=('days_before', 'min'),           # days from the latest transaction to the report date
    total_cnt=('transaction_amt', 'count'),
    total_sum=('transaction_amt', 'sum'),
    avg_check=('transaction_amt', 'mean'),    # average transaction amount over the whole history
    std_check=('transaction_amt', 'std'),     # standard deviation of the transaction amount
    unique_mcc=('mcc_code', 'nunique'),
    unique_currency=('currency_rk', 'nunique'),
)

# The sample standard deviation is NaN for users with a single transaction.
aggs['std_check'] = aggs['std_check'].fillna(0)

# Same aggregates with user_id as a regular column, ready for merging.
features = aggs.reset_index()
features.describe()

In [None]:
# Preview the per-user aggregates (still indexed by user_id).
aggs

In [None]:
# Left join: every client is kept; clients without transactions get NaN features.
df_full = pd.merge(clients, features, on='user_id', how='left')

In [None]:
# Preview the client table with the per-user transaction features attached.
df_full

In [None]:
!pip install lifelines

In [None]:
# Numeric transaction aggregates attached to every client.
num_features = [
    'recency',
    'total_cnt',
    'total_sum',
    'avg_check',
    'std_check',
    'unique_mcc',
    'unique_currency'
]

# Clients without any transactions have NaN in all aggregates after the left
# merge. Zero is a natural "no activity" value for counts, sums and means, but
# not for recency (days from the latest transaction to the report date): there
# 0 would mean "transacted on the report date", i.e. the most active client.
# Use a value just beyond the largest observed recency instead. If recency is
# entirely NaN the sentinel is NaN too and the generic fill below applies.
no_tx_recency = df_full['recency'].max() + 1
df_full['recency'] = df_full['recency'].fillna(no_tx_recency)
df_full[num_features] = df_full[num_features].fillna(0)

# Monetary columns that are log-scaled in the next cell.
money_cols = ['total_sum', 'avg_check', 'std_check']

In [None]:
# Compress the monetary columns with log1p of the absolute value (the sign is
# discarded). The columns are overwritten in place, so running this cell a
# second time applies the transform again.
df_full[money_cols] = np.log1p(df_full[money_cols].abs())

In [None]:
# Categorical features handed to CatBoost by name later on.
cat_cols = ['employee_count_nm', 'bankemplstatus']

# Missing categories become an explicit 'MISSING' level and everything is
# stored as a string.
for cat_col in cat_cols:
    filled = df_full[cat_col].fillna('MISSING')
    df_full[cat_col] = filled.astype(str)

In [None]:
# Inner join: keep only clients that have a row in the training labels.
labeled_df = pd.merge(df_full, train_labels, on='user_id', how='inner')

In [None]:
# Preview the labelled subset (clients present in train_labels).
labeled_df

## Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter

In [None]:
# Kaplan-Meier estimate over the labelled clients: `time` is the duration and
# `target` flags whether the event was observed.
observed_time = labeled_df['time']
event_flag = labeled_df['target']

kmf = KaplanMeierFitter()
kmf.fit(durations=observed_time, event_observed=event_flag)

In [None]:
# Draw the fitted survival function on an explicitly created axes.
km_fig, km_ax = plt.subplots(figsize=(10, 6))
kmf.plot_survival_function(ax=km_ax)
km_ax.set_title("Кривая выживаемости")
km_ax.set_xlabel("Время (дни/месяцы)")
km_ax.set_ylabel("Вероятность остаться клиентом")
km_ax.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

sns.set(style="whitegrid")
fig, axes = plt.subplots(1, 3, figsize=(20, 5))

# (column, number of bins, colour, log-scaled y axis?) for each panel.
# Note: total_sum has already been log1p-transformed at this point.
hist_specs = [
    ('total_cnt', 50, 'teal', True),
    ('total_sum', 50, 'coral', True),
    ('recency', 30, 'purple', False),
]

for panel, (feature, n_bins, colour, log_y) in zip(axes, hist_specs):
    sns.histplot(df_full[feature], bins=n_bins, ax=panel, color=colour)
    panel.set_title(feature)
    if log_y:
        panel.set_yscale('log')

plt.tight_layout()
plt.show()

In [None]:
# Features whose pairwise rank correlation is inspected.
check_cols = ['recency', 'total_cnt', 'total_sum', 'avg_check', 'unique_mcc']

# Spearman correlation is rank-based, so monotone transforms of the features
# (such as the earlier log1p) do not affect it.
corr = df_full[check_cols].corr(method='spearman')

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=corr_ax)
plt.show()

In [None]:
# Class balance of the target, with each bar annotated by its share of all rows.
balance_fig, ax = plt.subplots(figsize=(6, 5))
sns.countplot(x='target', data=train_labels, palette='viridis', ax=ax)

total = len(train_labels)
for bar in ax.patches:
    bar_height = bar.get_height()
    share_label = '{:.1f}%'.format(100 * bar_height / total)
    # Label sits just above the bar, slightly left of its centre.
    label_x = bar.get_x() + bar.get_width() / 2 - 0.05
    ax.annotate(share_label, (label_x, bar_height), ha='center', va='bottom')

ax.set_xlabel('active/churn')
ax.set_ylabel('Quantity')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 90/10 hold-out split, stratified on the event flag so both parts keep the
# same class ratio; the seed is fixed for reproducibility.
event_flag_for_split = labeled_df['target']
split_parts = train_test_split(
    labeled_df,
    test_size=0.1,
    random_state=42,
    stratify=event_flag_for_split,
)
train_df, val_df = split_parts

In [None]:
# Identifiers, dates and label columns that must not be used as model inputs.
drop_cols = ['user_id', 'report', 'report_dt', 'target', 'time']

# errors='ignore' tolerates any of these columns being absent.
X_train, X_val = (
    part.drop(columns=drop_cols, errors='ignore')
    for part in (train_df, val_df)
)

In [None]:
# Single signed label per row: +time where target == 1, -time where target == 0.
# The cross-validation loop decodes these labels the same way (`> 0` for the
# event flag, `.abs()` for the time).
train_sign = 2 * train_df['target'] - 1
val_sign = 2 * val_df['target'] - 1

y_train = train_sign * train_df['time']
y_val = val_sign * val_df['time']

In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostRegressor, Pool

In [None]:
# Wrap features and signed labels into CatBoost pools; categorical columns are
# declared by name.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_cols)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_cols)

In [None]:
# Hyperparameters of the hold-out Cox model, kept in one place for readability.
cox_params = {
    'loss_function': 'Cox',
    'eval_metric': 'Cox',
    'iterations': 1000,
    'learning_rate': 0.05,
    'depth': 6,
    'l2_leaf_reg': 3,
    'verbose': 100,                 # log every 100th iteration
    'early_stopping_rounds': 50,
}
model = CatBoostRegressor(**cox_params)

In [None]:
# Train on the training pool; the validation pool is passed as eval_set
# (presumably what early_stopping_rounds=50 monitors — confirm).
model.fit(train_pool, eval_set=val_pool)

In [None]:
# Model outputs for the validation features; used below as risk scores for the C-index.
preds_risk = model.predict(X_val)

In [None]:
!pip install scikit-survival

In [None]:
from sksurv.metrics import concordance_index_censored

In [None]:
# Largest predicted risk score on the validation split.
preds_risk.max()

In [None]:
# Concordance index on the validation split: boolean event flags, observed
# times and the predicted risk scores.
events = val_df['target'].astype(bool)
times = val_df['time']

concordance_result = concordance_index_censored(events, times, preds_risk)
c_index = concordance_result[0]  # first element is the index itself

In [None]:
# Concordance index of the hold-out model on the validation split.
c_index

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, f1_score

In [None]:
# Turn the continuous risk score into a binary churn prediction by scanning
# 100 evenly spaced thresholds and keeping the one with the highest F1.
# NOTE(review): the threshold is chosen and evaluated on the same validation
# split, so the reported F1 and confusion matrix are optimistic.
risk_pred = model.predict(X_val)
y_true_binary = val_df['target'].values

thresholds = np.linspace(risk_pred.min(), risk_pred.max(), 100)
scores = []
for threshold in thresholds:
    predicted_at_threshold = (risk_pred > threshold).astype(int)
    scores.append(f1_score(y_true_binary, predicted_at_threshold))

best_idx = np.argmax(scores)
best_threshold = thresholds[best_idx]

print(f"Оптимальный порог риска: {best_threshold:.4f}")
print(f"Максимальный F1-score: {scores[best_idx]:.4f}")

y_pred_binary = (risk_pred > best_threshold).astype(int)

cm = confusion_matrix(y_true_binary, y_pred_binary)

# Confusion matrix at the selected threshold.
class_names = ['Остался (0)', 'Ушел (1)']
tick_positions = [0.5, 1.5]

cm_fig, cm_ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=cm_ax)
cm_ax.set_xlabel('Предсказание модели')
cm_ax.set_ylabel('Истинный класс')
cm_ax.set_title(f'Матрица ошибок (Порог = {best_threshold:.2f})')
plt.xticks(tick_positions, class_names)
plt.yticks(tick_positions, class_names)
plt.show()

In [None]:
from sklearn.model_selection import StratifiedKFold
from catboost import Pool

In [None]:
from sklearn.model_selection import StratifiedKFold
from catboost import Pool

# Five shuffled folds, stratified on the event flag, with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold results collected by the cross-validation loop.
scores, models, c_scores = [], [], []

# Feature matrix, stratification target and signed Cox labels
# (+time where target == 1, -time where target == 0) for all labelled clients.
X = labeled_df.drop(columns=drop_cols, errors='ignore')
y_stratify = labeled_df['target']
label_sign = 2 * labeled_df['target'] - 1
y_cox = label_sign * labeled_df['time']


In [None]:
# Stratified 5-fold cross-validation of the Cox model.
#
# The result lists are reset here so that re-running this cell does not stack
# a second set of folds on top of the previous run.
scores, models, c_scores = [], [], []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_stratify)):
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y_cox.iloc[train_idx], y_cox.iloc[val_idx]

    train_pool_fold = Pool(X_train_fold, label=y_train_fold, cat_features=cat_cols)
    val_pool_fold = Pool(X_val_fold, label=y_val_fold, cat_features=cat_cols)

    model_fold = CatBoostRegressor(
        loss_function='Cox',
        eval_metric='Cox',
        iterations=500,
        learning_rate=0.05,
        depth=6,
        verbose=False,
        early_stopping_rounds=30
    )

    model_fold.fit(train_pool_fold, eval_set=val_pool_fold)
    # Keep the fitted model; `models` was declared for this but never filled.
    models.append(model_fold)

    # Decode the signed labels: positive means the event was observed, and
    # the absolute value is the time.
    # Fold-specific names keep `risk_pred` and `c_index` from the hold-out
    # evaluation cells from being overwritten.
    event_bool_fold = y_val_fold > 0
    time_abs_fold = y_val_fold.abs()
    risk_pred_fold = model_fold.predict(X_val_fold)

    best_score = model_fold.get_best_score()['validation']['Cox']
    scores.append(best_score)
    c_index_fold = concordance_index_censored(event_bool_fold, time_abs_fold, risk_pred_fold)[0]
    c_scores.append(c_index_fold)

    print(f"Fold {fold+1}: Cox Score = {best_score:.4f}, C-index = {c_index_fold:.4f}")

print(f"\nСредний результат: {np.mean(scores):.4f} (std: {np.std(scores):.4f})")
# The C-index was computed for every fold but never summarised.
print(f"Средний C-index: {np.mean(c_scores):.4f} (std: {np.std(c_scores):.4f})")

In [None]:
# Per-fold concordance indices collected by the cross-validation loop.
c_scores

In [None]:

# Rebuild the full labelled design matrix and signed Cox labels so this cell
# does not depend on the cross-validation cells having been run.
X = labeled_df.drop(columns=drop_cols, errors='ignore')
label_sign = 2 * labeled_df['target'] - 1
y_cox = label_sign * labeled_df['time']
cat_cols = ['employee_count_nm', 'bankemplstatus']

full_pool = Pool(data=X, label=y_cox, cat_features=cat_cols)

# Candidate values sampled by the randomized search.
grid = dict(
    learning_rate=[0.01, 0.03, 0.05, 0.1],
    depth=[4, 5, 6, 7, 8],
    l2_leaf_reg=[1, 3, 5, 7, 9],
    iterations=[500, 700, 1000],
)

# Base estimator; the searched hyperparameters are supplied from `grid`.
search_base_params = {
    'loss_function': 'Cox',
    'eval_metric': 'Cox',
    'verbose': False,
    'early_stopping_rounds': 50,
}
model_search = CatBoostRegressor(**search_base_params)



In [None]:
# Turn off Colab's custom widget manager before running the search.
# NOTE(review): the search below is called with plot=True — confirm the
# training plot still renders with the widget manager disabled.
from google.colab import output
output.disable_custom_widget_manager()

In [None]:
# Settings of the randomized hyperparameter search.
search_kwargs = dict(
    X=full_pool,
    cv=3,                 # number of cross-validation folds
    n_iter=20,            # how many random parameter combinations to try
    partition_random_seed=42,
    calc_cv_statistics=True,
    # NOTE(review): with search_by_train_test_split=True the candidates appear
    # to be compared on a single train/test split, with cross-validation used
    # only for the statistics of the best parameters — confirm against the
    # CatBoost documentation if a fully cross-validated search was intended.
    search_by_train_test_split=True,
    plot=True,
    verbose=False,
)

randomized_search_result = model_search.randomized_search(grid, **search_kwargs)
