In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings as wr
import starbars
from scipy.stats import mannwhitneyu 
from pathlib import Path

# Blanket filter: every warning raised from here on is suppressed.
wr.filterwarnings(action='ignore')

# The data folder 'Work' is resolved two directory levels above the current
# working directory, so the notebook must be started from its own folder.
work_dir = Path.cwd().parent.parent.joinpath('Work')
df = pd.read_csv(work_dir.joinpath("BankChurners.csv"))

In [43]:
# Display the frame's dimensions as (rows, columns).
df.shape

(10127, 23)

In [44]:
# Columns to remove from df before the analysis.
columns_to_drop = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]

# This cell rebinds `df`, so on a second execution the columns are already gone
# and a plain drop() would raise KeyError. errors='ignore' makes the cell safe to
# re-run. Trade-off: a misspelled name is skipped silently, so check the shape
# displayed below (the column count should fall by len(columns_to_drop) on the
# first run).
df = df.drop(columns=columns_to_drop, errors='ignore')
df.shape

(10127, 21)

In [45]:
# Show which distinct labels Attrition_Flag holds and how many of them there are.
attrition_flag = df['Attrition_Flag']
print(f"Уникальные классы: {attrition_flag.unique()}")
print(f"Количество классов: {attrition_flag.nunique()}")

Уникальные классы: ['Existing Customer' 'Attrited Customer']
Количество классов: 2


In [46]:
# Count the missing entries in each column.
df.isna().sum()

CLIENTNUM                   0
Attrition_Flag              0
Customer_Age                0
Gender                      0
Dependent_count             0
Education_Level             0
Marital_Status              0
Income_Category             0
Card_Category               0
Months_on_book              0
Total_Relationship_Count    0
Months_Inactive_12_mon      0
Contacts_Count_12_mon       0
Credit_Limit                0
Total_Revolving_Bal         0
Avg_Open_To_Buy             0
Total_Amt_Chng_Q4_Q1        0
Total_Trans_Amt             0
Total_Trans_Ct              0
Total_Ct_Chng_Q4_Q1         0
Avg_Utilization_Ratio       0
dtype: int64

In [47]:
# Number of rows for each distinct Attrition_Flag value.
print(df['Attrition_Flag'].value_counts())

Attrition_Flag
Existing Customer    8500
Attrited Customer    1627
Name: count, dtype: int64


In [48]:
# IQR outlier screen: for every numeric column, count the values lying outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], print the per-column share, then the overall share.
# NOTE(review): CLIENTNUM looks like an identifier rather than a measurement, yet it
# is counted in the totals below - confirm whether it should be excluded.
numeric_columns = df.select_dtypes(include=[np.number]).columns
row_count = len(df)

total_outliers = 0
total_numeric_values = 0

print("Выбросы по столбцам (метод IQR):")
print("-" * 40)

for column in numeric_columns:
    values = df[column]

    # Both quartiles come from a single quantile() call.
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1
    fence_low = q1 - 1.5 * iqr
    fence_high = q3 + 1.5 * iqr

    is_outlier = (values < fence_low) | (values > fence_high)
    outliers = is_outlier.sum()
    # Percentage is relative to all rows of the frame.
    outlier_percent = (outliers / row_count) * 100

    total_outliers += outliers
    total_numeric_values += row_count

    print(f"{column}: {outliers} выбросов ({outlier_percent:.1f}%)")

# Overall share of outliers among all numeric values
overall_outlier_percentage = (total_outliers / total_numeric_values) * 100

print("-" * 40)
print(f"Общее количество числовых значений: {total_numeric_values}")
print(f"Общее количество выбросов: {total_outliers}")
print(f"Общий процент выбросов: {overall_outlier_percentage:.1f}%")

Выбросы по столбцам (метод IQR):
----------------------------------------
CLIENTNUM: 0 выбросов (0.0%)
Customer_Age: 2 выбросов (0.0%)
Dependent_count: 0 выбросов (0.0%)
Months_on_book: 386 выбросов (3.8%)
Total_Relationship_Count: 0 выбросов (0.0%)
Months_Inactive_12_mon: 331 выбросов (3.3%)
Contacts_Count_12_mon: 629 выбросов (6.2%)
Credit_Limit: 984 выбросов (9.7%)
Total_Revolving_Bal: 0 выбросов (0.0%)
Avg_Open_To_Buy: 963 выбросов (9.5%)
Total_Amt_Chng_Q4_Q1: 396 выбросов (3.9%)
Total_Trans_Amt: 896 выбросов (8.8%)
Total_Trans_Ct: 2 выбросов (0.0%)
Total_Ct_Chng_Q4_Q1: 394 выбросов (3.9%)
Avg_Utilization_Ratio: 0 выбросов (0.0%)
----------------------------------------
Общее количество числовых значений: 151905
Общее количество выбросов: 4983
Общий процент выбросов: 3.3%
