In [None]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
from ucimlrepo import fetch_ucirepo

# Download the dataset registered under id 27 in the UCI repository
credit_approval = fetch_ucirepo(id=27)

# Features and targets (both exposed as pandas dataframes)
X, y = credit_approval.data.features, credit_approval.data.targets
X.head()

ModuleNotFoundError: No module named 'ucimlrepo'

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the feature frame
X.describe()

Unnamed: 0,A15,A14,A11,A8,A3,A2
count,690.0,677.0,690.0,690.0,690.0,678.0
mean,1017.385507,184.014771,2.4,2.223406,4.758725,31.568171
std,5210.102598,173.806768,4.86294,3.346513,4.978163,11.957862
min,0.0,0.0,0.0,0.0,0.0,13.75
25%,0.0,75.0,0.0,0.165,1.0,22.6025
50%,5.0,160.0,0.0,1.0,2.75,28.46
75%,395.5,276.0,3.0,2.625,7.2075,38.23
max,100000.0,2000.0,67.0,28.5,28.0,80.25


In [None]:
import pandas as pd
from scipy.stats import spearmanr

# X is expected to be a DataFrame of independent variables and y either a
# Series or a single-column DataFrame holding the target variable.

# If y is a DataFrame, reduce it to a Series
if isinstance(y, pd.DataFrame):
    y_series = y.iloc[:, 0].copy()  # assumes the target lives in the first column
else:
    y_series = y.copy()

# Encode the target numerically ('+' -> 1, '-' -> 0); any other label maps to NaN
y_numeric = y_series.map({'+': 1, '-': 0})

# Make sure y_numeric is a pandas Series that contains only 0 and 1
assert isinstance(y_numeric, pd.Series), "y_numeric должен быть pandas Series"
assert set(y_numeric.unique()) <= {0, 1}, "y_numeric должен содержать только 0 и 1"

# Categorical columns to evaluate
categorical_columns = ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

# Spearman correlation of every categorical column with the target
spearman_correlations = {}

for col in categorical_columns:
    # cat.codes encodes NaN as -1, which would silently rank "missing" as the
    # lowest category; keep only the rows where the feature is observed.
    # The mask is a NumPy array so that selection stays positional, matching
    # how spearmanr pairs the two inputs.
    observed = X[col].notna().to_numpy()
    ordinal_encoded = X.loc[observed, col].astype('category').cat.codes
    corr, _ = spearmanr(ordinal_encoded, y_numeric[observed])
    spearman_correlations[col] = corr

# Show the results
print(spearman_correlations)


{'A1': -0.00892234911540353, 'A4': -0.19594736836679594, 'A5': -0.1856677777059537, 'A6': 0.13053752320966439, 'A7': 0.012214606399450807, 'A9': 0.7204068158989549, 'A10': 0.4583013316079435, 'A12': 0.03162481448371771, 'A13': -0.09139497708038209}


In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [None]:
# Preview the first rows of the raw feature frame before cleaning/encoding
X.head()

Unnamed: 0,A15,A14,A13,A12,A11,A10,A9,A8,A7,A6,A5,A4,A3,A2,A1
0,0,202.0,g,f,1,t,t,1.25,v,w,g,u,0.0,30.83,b
1,560,43.0,g,f,6,t,t,3.04,h,q,g,u,4.46,58.67,a
2,824,280.0,g,f,0,f,t,1.5,h,q,g,u,0.5,24.5,a
3,3,100.0,g,t,5,t,t,3.75,v,w,g,u,1.54,27.83,b
4,0,120.0,s,f,0,f,t,1.71,v,w,g,u,5.625,20.17,b


In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder

def clean_data(X, y, target_encoder_columns, one_hot_encoder_columns):
    """Impute missing values and encode the categorical columns of ``X``.

    Numeric columns are mean-imputed and object columns get the placeholder
    category ``'missing'``. The columns in ``target_encoder_columns`` are then
    target-encoded against ``y``, and the columns in
    ``one_hot_encoder_columns`` are one-hot encoded with the first level
    dropped.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. Only numeric and object dtype columns are carried over
        into the result.
    y : array-like
        Target passed to ``TargetEncoder.fit_transform``.
    target_encoder_columns : list of str
        Object columns to target-encode. May be empty.
    one_hot_encoder_columns : list of str
        Object columns to one-hot encode. May be empty.

    Returns
    -------
    pandas.DataFrame
        Numeric columns first, then the remaining (target-encoded) categorical
        columns, then the one-hot indicator columns. The row index of ``X`` is
        preserved.

    Notes
    -----
    All transformers are fitted on the ``X``/``y`` passed in. If the result is
    split into train/test afterwards, statistics of the test rows (including
    their targets, via the target encoder) have already been used.
    """
    # Split the features by dtype
    X_num = X.select_dtypes(include=['number'])
    X_cat = X.select_dtypes(include=['object'])

    # Mean imputation for numeric data
    imputer_num = SimpleImputer(strategy='mean')
    # Categorical gaps become their own explicit "missing" level
    imputer_cat = SimpleImputer(strategy='constant', fill_value='missing')

    # Rebuild the frames on the ORIGINAL index: the imputers return bare
    # arrays, and falling back to a fresh RangeIndex would misalign rows with
    # y (and with anything else keyed on X.index) for a non-default index.
    X_num_filled = pd.DataFrame(
        imputer_num.fit_transform(X_num), columns=X_num.columns, index=X.index
    )
    X_cat_filled = pd.DataFrame(
        imputer_cat.fit_transform(X_cat), columns=X_cat.columns, index=X.index
    )
    X_filled = pd.concat([X_num_filled, X_cat_filled], axis=1)

    # Target encoding (skipped when no columns were requested)
    if len(target_encoder_columns) > 0:
        target_encoder = TargetEncoder(cols=target_encoder_columns)
        X_filled[target_encoder_columns] = target_encoder.fit_transform(
            X_filled[target_encoder_columns], y
        )

    # Nothing to one-hot encode: the imputed / target-encoded frame is final
    if len(one_hot_encoder_columns) == 0:
        return X_filled

    # One-hot encoding; drop='first' removes one level per feature
    one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
    X_one_hot_encoded = pd.DataFrame(
        one_hot_encoder.fit_transform(X_filled[one_hot_encoder_columns]),
        columns=one_hot_encoder.get_feature_names_out(one_hot_encoder_columns),
        index=X.index,
    )

    # Combine the processed pieces
    X_final = pd.concat(
        [X_filled.drop(columns=one_hot_encoder_columns), X_one_hot_encoded],
        axis=1,
    )

    return X_final

# Example usage
# X, y = ...  # X and y are assumed to be defined already
target_encoder_columns = ['A9', 'A10']
one_hot_encoder_columns = ['A1', 'A4', 'A5', 'A6', 'A7', 'A12', 'A13']

# The target encoder averages the target per category, so it needs a numeric
# target. On a top-to-bottom run `y` still holds the raw '+'/'-' labels at this
# point, so pass the 0/1 Series `y_numeric` built in the correlation cell above.
X_final = clean_data(X, y_numeric, target_encoder_columns, one_hot_encoder_columns)
print(X_final.head())


     A15    A14  A11    A8     A3     A2       A10        A9  A1_b  \
0    0.0  202.0  1.0  1.25  0.000  30.83  0.708475  0.786704   1.0   
1  560.0   43.0  6.0  3.04  4.460  58.67  0.708475  0.786704   0.0   
2  824.0  280.0  0.0  1.50  0.500  24.50  0.248101  0.786704   0.0   
3    3.0  100.0  5.0  3.75  1.540  27.83  0.708475  0.786704   1.0   
4    0.0  120.0  0.0  1.71  5.625  20.17  0.248101  0.786704   1.0   

   A1_missing  ...  A7_h  A7_j  A7_missing  A7_n  A7_o  A7_v  A7_z  A12_t  \
0         0.0  ...   0.0   0.0         0.0   0.0   0.0   1.0   0.0    0.0   
1         0.0  ...   1.0   0.0         0.0   0.0   0.0   0.0   0.0    0.0   
2         0.0  ...   1.0   0.0         0.0   0.0   0.0   0.0   0.0    0.0   
3         0.0  ...   0.0   0.0         0.0   0.0   0.0   1.0   0.0    1.0   
4         0.0  ...   0.0   0.0         0.0   0.0   0.0   1.0   0.0    0.0   

   A13_p  A13_s  
0    0.0    0.0  
1    0.0    0.0  
2    0.0    0.0  
3    0.0    0.0  
4    0.0    1.0  

[5 rows

In [None]:
# Binarise the target ('+' -> 1, anything else -> 0).
# Derive it from the untouched source targets instead of from `y` itself, so
# re-running this cell is idempotent (comparing an already-converted `y` with
# '+' would turn every label into 0).
y = (credit_approval.data.targets == '+').astype(int)
y.value_counts()

A16
0      383
1      307
Name: count, dtype: int64

In [None]:
# Display the cleaned and encoded feature matrix
X_final

Unnamed: 0,A15,A14,A11,A8,A3,A2,A10,A9,A1_b,A1_missing,...,A7_h,A7_j,A7_missing,A7_n,A7_o,A7_v,A7_z,A12_t,A13_p,A13_s
0,0.0,202.0,1.0,1.25,0.000,30.83,0.708475,0.786704,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,560.0,43.0,6.0,3.04,4.460,58.67,0.708475,0.786704,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,824.0,280.0,0.0,1.50,0.500,24.50,0.248101,0.786704,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,100.0,5.0,3.75,1.540,27.83,0.708475,0.786704,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.0,120.0,0.0,1.71,5.625,20.17,0.248101,0.786704,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,0.0,260.0,0.0,1.25,10.085,21.08,0.248101,0.069909,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
686,394.0,200.0,2.0,2.00,0.750,22.67,0.708475,0.069909,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
687,1.0,200.0,1.0,2.00,13.500,25.25,0.708475,0.069909,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
688,750.0,280.0,0.0,0.04,0.205,17.92,0.248101,0.069909,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
# Hyper-parameter grid explored for the decision tree
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# Stratified 80/20 hold-out split
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=42, stratify=y
)


# Build the model. The seed is fixed because max_features='sqrt'/'log2' makes
# the tree sample candidate features at random; unseeded, the selected
# hyper-parameters and scores change from run to run.
dt = DecisionTreeClassifier(random_state=42)

# Exhaustive grid search with 5-fold cross-validation on the training split
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the search
grid_search.fit(X_train, y_train)

# Report the best hyper-parameters and their score.
# best_score_ is the mean cross-validated score of the best candidate, not a
# score measured on the training data, so the label says so.
print("Лучшие параметры:", grid_search.best_params_)
print("Лучшее среднее качество на кросс-валидации:", grid_search.best_score_)

# Evaluate the refitted best estimator on the held-out test split
y_pred = grid_search.best_estimator_.predict(X_test)
print("Точность на тестовых данных:", accuracy_score(y_test, y_pred))
print("Отчет о классификации:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Лучшие параметры: {'criterion': 'entropy', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 5}
Лучшее качество на тренировочных данных: 0.8313677313677313
Точность на тестовых данных: 0.855072463768116
Отчет о классификации:
               precision    recall  f1-score   support

           0       0.86      0.88      0.87        77
           1       0.85      0.82      0.83        61

    accuracy                           0.86       138
   macro avg       0.85      0.85      0.85       138
weighted avg       0.85      0.86      0.85       138

