In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import pickle

In [16]:
# 1. Загрузка данных
data = pd.read_csv('bank-full.csv', sep=';')

In [17]:
# 2. Выбор признаков
selected_cols = ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
data_selected = data[selected_cols]
print("\nОтобранные признаки: \n", data_selected.head())


Отобранные признаки: 
    age           job  marital  education  balance housing  contact  day month  \
0   58    management  married   tertiary     2143     yes  unknown    5   may   
1   44    technician   single  secondary       29     yes  unknown    5   may   
2   33  entrepreneur  married  secondary        2     yes  unknown    5   may   
3   47   blue-collar  married    unknown     1506     yes  unknown    5   may   
4   33       unknown   single    unknown        1      no  unknown    5   may   

   duration  campaign  pdays  previous poutcome   y  
0       261         1     -1         0  unknown  no  
1       151         1     -1         0  unknown  no  
2        76         1     -1         0  unknown  no  
3        92         1     -1         0  unknown  no  
4       198         1     -1         0  unknown  no  


In [18]:
# 3. Проверка пропусков
print("\nКоличество пропущенных значений:\n", data_selected.isnull().sum())


Количество пропущенных значений:
 age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [19]:
# 4. Анализ колонки 'education'
education_counts = data_selected['education']
most_frequent_education = education_counts.mode()[0]
print(f"\nСамое частое значение в столбце 'education': {most_frequent_education}")


Самое частое значение в столбце 'education': secondary


In [20]:
# 5. Отбор численных признаков
numeric_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
numeric_data = data_selected[numeric_features]

# 6. Расчет корреляционной матрицы
correlation_matrix = numeric_data.corr()
print("\nКорреляционная матрица: \n", correlation_matrix)


Корреляционная матрица: 
                age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000


In [21]:
# 7. Поиск пары признаков с наибольшей корреляцией
max_correlation = 0
highest_correlated_pair = ('', '')
for i in range(len(correlation_matrix.columns)):
    for j in range(i + 1, len(correlation_matrix.columns)):
        correlation_value = abs(correlation_matrix.iloc[i, j])
        if correlation_value > max_correlation:
            max_correlation = correlation_value
            highest_correlated_pair = (correlation_matrix.columns[i], correlation_matrix.columns[j])
print(f"\nНаибольшая корреляция: {max_correlation:.4f} между {highest_correlated_pair[0]} и {highest_correlated_pair[1]}")


Наибольшая корреляция: 0.4548 между pdays и previous


In [22]:
# 8. Подготовка данных для классификации
data_prep = pd.read_csv('bank-full.csv', sep=';')
data_prep = data_prep[selected_cols]

# 9. Кодирование целевой переменной
data_prep['y'] = data_prep['y'].map({'yes': 1, 'no': 0})
print("\nПервые 5 строк с закодированной целевой переменной: \n", data_prep.head())


Первые 5 строк с закодированной целевой переменной: 
    age           job  marital  education  balance housing  contact  day month  \
0   58    management  married   tertiary     2143     yes  unknown    5   may   
1   44    technician   single  secondary       29     yes  unknown    5   may   
2   33  entrepreneur  married  secondary        2     yes  unknown    5   may   
3   47   blue-collar  married    unknown     1506     yes  unknown    5   may   
4   33       unknown   single    unknown        1      no  unknown    5   may   

   duration  campaign  pdays  previous poutcome  y  
0       261         1     -1         0  unknown  0  
1       151         1     -1         0  unknown  0  
2        76         1     -1         0  unknown  0  
3        92         1     -1         0  unknown  0  
4       198         1     -1         0  unknown  0  


In [23]:
# 10. Кодирование категориальных переменных
label_encoder = LabelEncoder()
for column in data_prep.select_dtypes(include=['object']):
    data_prep[column] = label_encoder.fit_transform(data_prep[column])

# 11. Выбор категориальных признаков для модели
categorical_features = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']
X_data = data_prep[categorical_features]

y_data = data_prep['y']

# 12. Разделение на тренировочный, валидационный и тестовый наборы
X_train, X_temp, y_train, y_temp = train_test_split(X_data, y_data, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
print(f"\nРазмеры наборов: \nТренировочный: {X_train.shape}, {y_train.shape}\nВалидационный: {X_val.shape}, {y_val.shape}\nТестовый: {X_test.shape}, {y_test.shape}")


Размеры наборов: 
Тренировочный: (27126, 7), (27126,)
Валидационный: (9042, 7), (9042,)
Тестовый: (9043, 7), (9043,)


In [24]:
# 13. Вычисление взаимной информации
mutual_info = mutual_info_classif(X_train, y_train, discrete_features=True)
mutual_info_series = pd.Series(mutual_info, index=X_train.columns, name='Mutual Info')
mutual_info_series = mutual_info_series.round(2)
print("\nВзаимная информация: \n", mutual_info_series)


Взаимная информация: 
 job          0.01
marital      0.00
education    0.00
housing      0.01
contact      0.01
month        0.02
poutcome     0.03
Name: Mutual Info, dtype: float64


In [25]:
# 14. Определение признака с наибольшей взаимной информацией
feature_max_mutual_info = mutual_info_series.idxmax()
print(f"\n Признак с наиб. взаимной информацией: {feature_max_mutual_info}")


 Признак с наиб. взаимной информацией: poutcome


In [26]:
# 15. Кодирование с помощью OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False) # sparse=False for easier handling
X_train_encoded = encoder.fit_transform(X_train)
X_val_encoded = encoder.transform(X_val)

# 16. Обучение модели логистической регрессии
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)

# 17. Прогноз и оценка точности
y_pred = model.predict(X_val_encoded)
accuracy = accuracy_score(y_val, y_pred)
print(f"\nТочность на валидационном наборе: {accuracy:.2f}")


Точность на валидационном наборе: 0.89


In [27]:
# 18. Подготовка полных данных
data_full = pd.read_csv('bank-full.csv', sep=';')
data_full = data_full[selected_cols]
data_full['y'] = data_full['y'].map({'yes': 1, 'no': 0})

# 19. Label Encoding
label_encoder_full = LabelEncoder()
for col in data_full.select_dtypes(include=['object']):
    data_full[col] = label_encoder_full.fit_transform(data_full[col])

# 20. Выбор всех признаков
all_features = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome', 'age', 'balance', 'previous', 'duration', 'pdays', 'campaign']
X_all = data_full[all_features]
y_all = data_full['y']

# 21. Разделение полных данных
X_train_full, X_temp_full, y_train_full, y_temp_full = train_test_split(X_all, y_all, test_size=0.4, random_state=42)
X_val_full, X_test_full, y_val_full, y_test_full = train_test_split(X_temp_full, y_temp_full, test_size=0.5, random_state=42)

# 22. One Hot Encoding
categorical_cols = X_all.select_dtypes(include=['object']).columns

encoder_full = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
X_encoded_full = encoder_full.fit_transform(X_all[categorical_cols])
feature_names_full = encoder_full.get_feature_names_out(categorical_cols)
X_encoded_df = pd.DataFrame(X_encoded_full, columns=feature_names_full)
numeric_cols_full = X_all.select_dtypes(include=['number']).columns
X_final = pd.concat([X_encoded_df, X_all[numeric_cols_full].reset_index(drop=True)], axis=1)

# 23. Разделение final dataset
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(X_final, y_all, test_size=0.2, random_state=42)

# 24. Взаимная информация для полных данных
mutual_info_full = mutual_info_classif(X_train_final, y_train_final)
mutual_info_df_full = pd.DataFrame(mutual_info_full, index=X_train_final.columns, columns=['Mutual Info']).sort_values(by='Mutual Info', ascending=False)
print("\nВзаимная информация для всех признаков: \n", mutual_info_df_full.head())


Взаимная информация для всех признаков: 
           Mutual Info
duration     0.069842
poutcome     0.033243
pdays        0.028056
month        0.027712
balance      0.018756


In [28]:
# 25. Подбор гиперпараметра C
C_values = [0.01, 0.1, 1, 10]
best_val_accuracy = 0
best_C = None

for C in C_values:
    model_full = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model_full.fit(X_train_final, y_train_final)
    y_pred_val = model_full.predict(X_val_full)
    val_accuracy = accuracy_score(y_val_full, y_pred_val)
    print(f"Точность для C = {C}: {val_accuracy:.3f}")

    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_C = C

print(f"\n Наилучшая точность: {best_val_accuracy:.3f} (C = {best_C})")

Точность для C = 0.01: 0.889
Точность для C = 0.1: 0.890
Точность для C = 1: 0.891
Точность для C = 10: 0.891

 Наилучшая точность: 0.891 (C = 10)
