<a href="https://colab.research.google.com/github/ithelga/bank-churn-predictor/blob/develop/Team2_HW3_Outliers_Detecting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Обнаружение выбросов

In [1]:
!pip install plotly kaleido

import kaleido
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import DBSCAN
from scipy.stats import zscore, iqr
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib.colors import LinearSegmentedColormap
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score



In [2]:
import warnings
# Ignore every warning from here on (no category or module filter is given).
# NOTE(review): this also hides deprecation and convergence warnings — confirm that is intended.
warnings.filterwarnings('ignore')

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive (requires the google.colab runtime).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
# Fill colours for the plots.
colors = [
    "#FFAFCC",
    "#FFC8DD",
    "#CDB4DB",
    "#BDE0FE",
    "#A2D2FF",
]

# Project folders. Both paths are relative, so they resolve against the
# current working directory.
# NOTE(review): presumably the notebook runs from /content, where Drive is mounted — confirm.
data_path = 'drive/MyDrive/Colab Notebooks/Bank churn predictor/data'
graph_path = 'drive/MyDrive/Colab Notebooks/Bank churn predictor/graph'

# Dataset that the outlier detection below works on.
extract_df = pd.read_csv(f'{data_path}/extract_dataset.csv')

# Обнаружение выбросов

In [23]:
# Numeric features (checked for outliers)
numeric_features = [
    'Age',
    'Balance',
    'NumOfProducts',
    'CreditScore NumOfProducts',
    'Age^2',
    'Age Balance',
    'Balance NumOfProducts',
    'NumOfProducts^2',
]

# Categorical / binary features (not checked for outliers)
categorical_features = [
    'Geography_Germany',
    'Gender',
]

# Target variable
target = 'Exited'

### Статистические методы

In [24]:
# Z-Score method
def detect_outliers_zscore(data, threshold=3):
    """Flag values whose absolute z-score exceeds ``threshold``.

    Z-scores come from ``scipy.stats.zscore`` along its default axis 0,
    i.e. separately for every column of a 2-D input.

    Missing values are left out of the mean and standard deviation
    (``nan_policy='omit'``). With the default ``'propagate'`` policy a
    single NaN turns the z-scores of its whole column into NaN, and because
    ``NaN > threshold`` is False no outlier would ever be reported for that
    column.

    Parameters
    ----------
    data : array-like
        Numeric values, e.g. a DataFrame holding the numeric features.
    threshold : float, default 3
        Cut-off applied to the absolute z-score.

    Returns
    -------
    Boolean mask with the same shape as ``data``; True marks an outlier.
    NaN entries are never flagged.
    """
    z_scores = np.abs(zscore(data, nan_policy='omit'))
    return z_scores > threshold

# Element-wise z-score flags for the numeric feature columns.
zscore_outliers = detect_outliers_zscore(extract_df.loc[:, numeric_features])

# A row counts as an outlier as soon as one of its features is flagged.
extract_df['Z_outlier'] = zscore_outliers.any(axis=1)

print(f"Количество выбросов по Z-score: {extract_df['Z_outlier'].sum()}")

Количество выбросов по Z-score: 559


In [25]:
# IQR method
def detect_outliers_iqr(data, factor=2.0):
    """Flag values that fall outside the interquartile-range fences.

    A value is an outlier when it is below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR``. Q1 and Q3 are the 0.25 and 0.75 quantiles
    returned by ``data.quantile`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas object providing ``quantile`` (a DataFrame in this notebook)
        Values to check.
    factor : float, default 2.0
        Distance of the fences from the quartiles, in multiples of the IQR.

    Returns
    -------
    Boolean mask shaped like ``data``; True marks an outlier.
    """
    lower_quartile = data.quantile(0.25)
    upper_quartile = data.quantile(0.75)
    fence_width = factor * (upper_quartile - lower_quartile)

    too_low = data < (lower_quartile - fence_width)
    too_high = data > (upper_quartile + fence_width)
    return too_low | too_high

# Element-wise IQR flags for the numeric feature columns.
iqr_outliers = detect_outliers_iqr(extract_df.loc[:, numeric_features])

# A row counts as an outlier as soon as one of its features is flagged.
extract_df['IQR_outlier'] = iqr_outliers.any(axis=1)

print(f"Количество выбросов по IQR: {extract_df['IQR_outlier'].sum()}")

Количество выбросов по IQR: 489


### Алгоритм DBSCAN

In [26]:
# DBSCAN
# Cluster the rows on their numeric features; rows that receive the
# label -1 are recorded as outliers.
X_numeric = extract_df[numeric_features].values

dbscan = DBSCAN(
    eps=0.4,
    min_samples=10,
    n_jobs=-1,
)
dbscan_labels = dbscan.fit_predict(X_numeric)

is_noise = dbscan_labels == -1
extract_df['DBSCAN_outlier'] = is_noise

print(f"Количество выбросов по DBSCAN: {extract_df['DBSCAN_outlier'].sum()}")

Количество выбросов по DBSCAN: 2


### Визуализация

In [27]:
# Combine the detectors: a row is an outlier when at least one of the
# three boolean flag columns is True.
outlier_flag_columns = ['Z_outlier', 'IQR_outlier', 'DBSCAN_outlier']
outliers_combined = extract_df[outlier_flag_columns].any(axis=1)

print(f"Общее количество выбросов (Z-score, IQR или DBSCAN): {outliers_combined.sum()}")
print(f"Доля удаленных строк: {outliers_combined.mean():.2%}")

Общее количество выбросов (Z-score, IQR или DBSCAN): 730
Доля удаленных строк: 7.30%


Создадим при помощи Plotly интерактивную визуализацию `violin plot`:

> Violin Plot (скрипичный график) — это визуализация, которая сочетает элементы box plot и плотности распределения (kernel density estimation, KDE). Он показывает распределение данных, медиану, квартили и плотность значений, что делает его подходящим для анализа выбросов.



In [28]:
# Marker style per detector: (flag column in extract_df, legend label, colour, symbol).
outlier_markers = [
    ('Z_outlier', 'Z-score Outliers', '#f72585', 'circle'),
    ('IQR_outlier', 'IQR Outliers', '#3f37c9', 'triangle-up'),
    ('DBSCAN_outlier', 'DBSCAN Outliers', '#7209b7', 'square'),
]

fig = go.Figure()

# One violin per numeric feature, cycling through the colour list.
for i, feature in enumerate(numeric_features):
    fig.add_trace(go.Violin(
        y=extract_df[feature],
        x=[feature] * len(extract_df),
        name=feature,
        box_visible=True,
        meanline_visible=True,
        fillcolor=colors[i % len(colors)],
        line_color='gray',
        opacity=1
    ))

# Overlay the flagged rows on every violin. The flag columns are row-level,
# so a marker shows the feature value of a row flagged by that detector,
# not necessarily a value that is extreme for this particular feature.
for i, feature in enumerate(numeric_features):
    for flag_column, label, color, symbol in outlier_markers:
        flagged_values = extract_df.loc[extract_df[flag_column], feature]
        if flagged_values.empty:
            continue
        fig.add_trace(go.Scatter(
            x=[feature] * len(flagged_values),
            y=flagged_values,
            mode='markers',
            name=label,
            # Group the per-feature traces so the single legend entry
            # toggles this detector's markers on all violins at once.
            legendgroup=label,
            marker=dict(color=color, size=4, symbol=symbol),
            showlegend=(i == 0)  # one legend entry per detector
        ))

fig.update_layout(
    title='Violin Plots with Outliers (Z-score, IQR, DBSCAN)',
    xaxis_title='Features',
    yaxis_title='Value',
    template="plotly_white",
    showlegend=True,
    height=600,
    width=1300
)

# Optional static export, left disabled. graph_path is a relative path,
# so it must not be prefixed with '/'.
# fig.write_image(f'{graph_path}/violin_plot_with_outliers.png')
fig.show()

### Удаление выбросов

In [39]:
# Keep only the rows no detector flagged, discard the helper flag columns
# and renumber the index from zero.
helper_columns = ['Z_outlier', 'IQR_outlier', 'DBSCAN_outlier']
cleaned_df = (
    extract_df.loc[~outliers_combined]
    .drop(columns=helper_columns)
    .reset_index(drop=True)
)
print(f"Размер датасета после удаления выбросов: {len(cleaned_df)} строк")

Размер датасета после удаления выбросов: 9268 строк


Сохранение очищенного датасета

In [40]:
# Write the cleaned dataset into the data folder; the index is not saved.
cleaned_dataset_path = f'{data_path}/cleaned_dataset.csv'
cleaned_df.to_csv(cleaned_dataset_path, index=False)

Проверка распределения классов

In [41]:
# Print the relative class frequencies of the target before and after
# the outlier rows were removed.
for caption, frame in (
    ("\nРаспределение классов до удаления выбросов:", extract_df),
    ("\nРаспределение классов после удаления выбросов:", cleaned_df),
):
    print(caption)
    print(frame[target].value_counts(normalize=True))


Распределение классов до удаления выбросов:
Exited
0    0.796259
1    0.203741
Name: proportion, dtype: float64

Распределение классов после удаления выбросов:
Exited
0    0.81981
1    0.18019
Name: proportion, dtype: float64


# Обучение модели Логистической регрессии

Обучите модель из Спринта 2. Оцените метрики. Сделайте вывод.


In [42]:
# Data: display the cleaned dataset as the cell output
cleaned_df

Unnamed: 0,Geography_Germany,Age,Balance,NumOfProducts,CreditScore NumOfProducts,Age^2,Age Balance,Balance NumOfProducts,NumOfProducts^2,Gender,Exited
0,0.0,0.324324,0.000000,0.000000,0.088197,0.176904,0.000000,0.000000,0.0,0.0,1
1,0.0,0.310811,0.334031,0.000000,0.084590,0.166708,0.249049,0.107315,0.0,0.0,0
2,0.0,0.283784,0.000000,0.333333,0.343607,0.147052,0.000000,0.000000,0.2,0.0,0
3,0.0,0.351351,0.453394,0.333333,0.308197,0.198034,0.362780,0.291325,0.2,1.0,1
4,0.0,0.432432,0.000000,0.333333,0.424262,0.267322,0.000000,0.000000,0.2,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...
9263,0.0,0.283784,0.000000,0.333333,0.390820,0.147052,0.000000,0.000000,0.2,1.0,0
9264,0.0,0.229730,0.228657,0.000000,0.054426,0.110688,0.145535,0.073461,0.0,1.0,0
9265,0.0,0.243243,0.000000,0.000000,0.117705,0.119410,0.000000,0.000000,0.0,0.0,1
9266,1.0,0.324324,0.299226,0.333333,0.391475,0.176904,0.228541,0.192266,0.2,1.0,1
