In [12]:
import pandas as pd
import numpy as np
import pandera.pandas as pa
from pandera import Column, Check


df = pd.read_csv('data/raw/UCI_Credit_Card.csv')
df


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000.0,1,3,1,39,0,0,0,0,...,88004.0,31237.0,15980.0,8500.0,20000.0,5003.0,3047.0,5000.0,1000.0,0
29996,29997,150000.0,1,3,2,43,-1,-1,-1,-1,...,8979.0,5190.0,0.0,1837.0,3526.0,8998.0,129.0,0.0,0.0,0
29997,29998,30000.0,1,2,2,37,4,3,2,-1,...,20878.0,20582.0,19357.0,0.0,0.0,22000.0,4200.0,2000.0,3100.0,1
29998,29999,80000.0,1,3,1,41,1,-1,0,0,...,52774.0,11855.0,48944.0,85900.0,3409.0,1178.0,1926.0,52964.0,1804.0,1


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          30000 non-null  int64  
 1   LIMIT_BAL                   30000 non-null  float64
 2   SEX                         30000 non-null  int64  
 3   EDUCATION                   30000 non-null  int64  
 4   MARRIAGE                    30000 non-null  int64  
 5   AGE                         30000 non-null  int64  
 6   PAY_0                       30000 non-null  int64  
 7   PAY_2                       30000 non-null  int64  
 8   PAY_3                       30000 non-null  int64  
 9   PAY_4                       30000 non-null  int64  
 10  PAY_5                       30000 non-null  int64  
 11  PAY_6                       30000 non-null  int64  
 12  BILL_AMT1                   30000 non-null  float64
 13  BILL_AMT2                   300

In [15]:
# Удалим дубликаты
df = df.drop_duplicates()

df.at[0, 'AGE'] = 101 #проверка ошибки

# Проверим пропуски
print("Пропуски в данных:")
print(df.isna().sum())

# Преобразуем числовые колонки
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# === 3. Feature Engineering ===
# Создадим агрегированные признаки из истории платежей
pay_cols = [f'PAY_{i}' for i in [0,2,3,4,5,6]]
df['PAY_MEAN'] = df[pay_cols].mean(axis=1)
df['PAY_MAX'] = df[pay_cols].max(axis=1)
df['PAY_MIN'] = df[pay_cols].min(axis=1)

# Биннинг возраста
df['AGE_BINNED'] = pd.cut(df['AGE'], bins=[20, 30, 40, 50, 60, 80], labels=False)

# === 4. Определяем схему данных с Pandera ===
schema = pa.DataFrameSchema({
    "ID": Column(int, Check.ge(1), nullable=False),
    "LIMIT_BAL": Column(float, Check.between(10000, 1000000), nullable=False),
    "SEX": Column(int, Check.isin([1, 2])),
    "EDUCATION": Column(int, Check.isin([0, 1, 2, 3, 4, 5, 6])),
    "MARRIAGE": Column(int, Check.isin([0, 1, 2, 3])),
    "AGE": Column(int, Check.between(18, 100)),
    **{col: Column(int, nullable=True) for col in pay_cols},
    "PAY_MEAN": Column(float),
    "PAY_MAX": Column(int),
    "PAY_MIN": Column(int),
    "AGE_BINNED": Column(int, Check.isin([0, 1, 2, 3, 4]))
})

# === 5. Проверка данных ===
try:
    validated_df = schema.validate(df, lazy=True)
    print("✅ Все проверки пройдены успешно.")
except pa.errors.SchemaErrors as err:
    print("❌ Ошибки валидации данных:")
    print(err.failure_cases)

# === 6. Пример теста ===
def test_data_integrity():
    """Тест падает, если данные не соответствуют схеме."""
    try:
        schema.validate(df)
    except pa.errors.SchemaError as e:
        raise AssertionError(f"Validation failed: {e}")

# Пример вызова теста
test_data_integrity()


Пропуски в данных:
ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default.payment.next.month    0
PAY_MEAN                      0
PAY_MAX                       0
PAY_MIN                       0
AGE_BINNED                    0
dtype: int64
❌ Ошибки валидации данных:
  schema_cont

AssertionError: Validation failed: Column 'AGE' failed element-wise validator number 0: in_range(18, 100) failure cases: 101