In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings as wr
import starbars
from scipy.stats import mannwhitneyu 
from pathlib import Path

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the imported
# libraries — confirm that a blanket 'ignore' is really intended.
wr.filterwarnings('ignore')

# Loading and reading dataset
# The CSV is looked up in a 'Work' folder located two levels above the current
# working directory, so the result depends on where the kernel was started.
work_dir = Path.cwd().parent.parent / 'Work'
df = pd.read_csv(work_dir / "BankChurners.csv")

In [2]:
# Dimensions of the freshly loaded frame as (rows, columns).
df.shape

(10127, 23)

In [3]:
# Remove the two 'Naive_Bayes_Classifier_*' columns from the frame and show the
# resulting dimensions.
columns_to_drop = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]

# errors='ignore' keeps this cell re-runnable: `df` is rebound to the reduced
# frame, so without it a second execution would raise KeyError because the
# columns are already gone.
df = df.drop(columns=columns_to_drop, errors='ignore')
df.shape

(10127, 21)

In [4]:
# Show the distinct values of the 'Attrition_Flag' column and how many of them there are.
print(f"Уникальные классы: {df['Attrition_Flag'].unique()}")
print(f"Количество классов: {df['Attrition_Flag'].nunique()}")

Уникальные классы: ['Existing Customer' 'Attrited Customer']
Количество классов: 2


In [5]:
# Count missing (NaN) entries in every column.
df.isna().sum()

CLIENTNUM                   0
Attrition_Flag              0
Customer_Age                0
Gender                      0
Dependent_count             0
Education_Level             0
Marital_Status              0
Income_Category             0
Card_Category               0
Months_on_book              0
Total_Relationship_Count    0
Months_Inactive_12_mon      0
Contacts_Count_12_mon       0
Credit_Limit                0
Total_Revolving_Bal         0
Avg_Open_To_Buy             0
Total_Amt_Chng_Q4_Q1        0
Total_Trans_Amt             0
Total_Trans_Ct              0
Total_Ct_Chng_Q4_Q1         0
Avg_Utilization_Ratio       0
dtype: int64

In [6]:
# Number of rows for each 'Attrition_Flag' value.
print(df['Attrition_Flag'].value_counts())

Attrition_Flag
Existing Customer    8500
Attrited Customer    1627
Name: count, dtype: int64


In [12]:
# IQR-based outlier screening over every numeric column (CLIENTNUM is skipped).
del_c = ['CLIENTNUM']
numeric_columns = (
    df.select_dtypes(include=[np.number])
    .drop(columns=del_c, errors='ignore')
    .columns
    .tolist()
)

# Running totals across all screened columns.
emissions_count = 0
total_numeric_values = 0

for column in numeric_columns:
    values = df[column]

    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    IQR = q3 - q1

    # A value counts as an outlier when it lies more than 1.5 * IQR
    # below the first quartile or above the third quartile.
    lower_bound, upper_bound = q1 - 1.5 * IQR, q3 + 1.5 * IQR

    emissions = (values.lt(lower_bound) | values.gt(upper_bound)).sum()
    outlier_percent = emissions / len(df) * 100

    emissions_count = emissions_count + emissions
    total_numeric_values = total_numeric_values + len(values)

    print(f"{column}: {emissions} выбросов ({outlier_percent:.1f}%)")

# Share of outliers among all inspected numeric cells.
total_emissions_percentage = emissions_count / total_numeric_values * 100

print(f"\n\nОбщее количество числовых значений: {total_numeric_values}")
print(f"Общее количество выбросов: {emissions_count}")
print(f"Общий процент выбросов: {total_emissions_percentage:.1f}%")

Customer_Age: 2 выбросов (0.0%)
Dependent_count: 0 выбросов (0.0%)
Months_on_book: 386 выбросов (3.8%)
Total_Relationship_Count: 0 выбросов (0.0%)
Months_Inactive_12_mon: 331 выбросов (3.3%)
Contacts_Count_12_mon: 629 выбросов (6.2%)
Credit_Limit: 984 выбросов (9.7%)
Total_Revolving_Bal: 0 выбросов (0.0%)
Avg_Open_To_Buy: 963 выбросов (9.5%)
Total_Amt_Chng_Q4_Q1: 396 выбросов (3.9%)
Total_Trans_Amt: 896 выбросов (8.8%)
Total_Trans_Ct: 2 выбросов (0.0%)
Total_Ct_Chng_Q4_Q1: 394 выбросов (3.9%)
Avg_Utilization_Ratio: 0 выбросов (0.0%)


Общее количество числовых значений: 141778
Общее количество выбросов: 4983
Общий процент выбросов: 3.5%


In [20]:
# Print the dtype of every column, then split the features into a numeric
# group (without CLIENTNUM) and a categorical (object-dtype) group.
print(df.dtypes)

# np.number is the same selector the outlier-screening cell uses, so both cells
# agree on what "numeric" means even if a column is stored with another width
# (e.g. int32 / float32) instead of int64 / float64.
del_c = ['CLIENTNUM']
numeric_columns = [
    column
    for column in df.select_dtypes(include=[np.number]).columns
    if column not in del_c
]
categorical_columns = df.select_dtypes(include=['object']).columns

print(f"\nЧисловые признаки ({len(numeric_columns)}):")
print(numeric_columns)
print(f"Категориальные признаки ({len(categorical_columns)}):")
print(categorical_columns.tolist())

CLIENTNUM                     int64
Attrition_Flag               object
Customer_Age                  int64
Gender                       object
Dependent_count               int64
Education_Level              object
Marital_Status               object
Income_Category              object
Card_Category                object
Months_on_book                int64
Total_Relationship_Count      int64
Months_Inactive_12_mon        int64
Contacts_Count_12_mon         int64
Credit_Limit                float64
Total_Revolving_Bal           int64
Avg_Open_To_Buy             float64
Total_Amt_Chng_Q4_Q1        float64
Total_Trans_Amt               int64
Total_Trans_Ct                int64
Total_Ct_Chng_Q4_Q1         float64
Avg_Utilization_Ratio       float64
dtype: object

Числовые признаки (14):
['Customer_Age', 'Dependent_count', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chn