In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.feature_selection import SelectKBest, chi2, RFE, SelectFromModel

In [2]:
# Load the dataset from a CSV file (path is relative to the working directory)
df = pd.read_csv('content/income.csv')

In [3]:
df

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income >50K
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0
32557,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1
32558,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0
32559,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0


In [4]:
# Count the missing values in every column
print(df.isna().sum())

age                  0
workclass         1836
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income >50K          0
dtype: int64


In [5]:
# Build a copy of the frame without any row that contains a missing value.
# NOTE(review): the later visible cells keep working with `df`, not `df_drop` —
# confirm whether the cleaned frame was meant to be used downstream.
df_drop = df.dropna(axis=0, how='any')
print("\n# Удаление строк с пропусками")
# Per-column missing-value counts after the drop (expected to be all zeros)
print(df_drop.isna().sum())


# Удаление строк с пропусками
age               0
workclass         0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income >50K       0
dtype: int64


In [7]:
# Binary flag `stonks`: 1 where capital-gain is at least 5000, otherwise 0,
# stored with the pandas `category` dtype.
df['stonks'] = pd.Series(
    np.where(df['capital-gain'] >= 5000, 1, 0).astype(int),
    index=df.index,
).astype('category')

# Cast every text-valued column to `category` so the encoding step can find them by dtype.
for column in ('workclass', 'education', 'marital-status', 'occupation',
               'relationship', 'race', 'sex', 'native-country'):
    df[column] = df[column].astype('category')

In [8]:
# Target variable: the categorical `stonks` flag
Y = df['stonks']
# Candidate predictors: every column except the target
X = df.drop(columns=['stonks'])

In [9]:
# Besides the target itself, also drop `capital-gain`: it is the source column
# the `stonks` target was derived from.
X = X.drop(columns=['capital-gain'])

In [10]:
df.drop('capital-gain', axis=1).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             32561 non-null  int64   
 1   workclass       30725 non-null  category
 2   education       32561 non-null  category
 3   education-num   32561 non-null  int64   
 4   marital-status  32561 non-null  category
 5   occupation      30718 non-null  category
 6   relationship    32561 non-null  category
 7   race            32561 non-null  category
 8   sex             32561 non-null  category
 9   capital-loss    32561 non-null  int64   
 10  hours-per-week  32561 non-null  int64   
 11  native-country  31978 non-null  category
 12  income >50K     32561 non-null  int64   
 13  stonks          32561 non-null  category
dtypes: category(9), int64(5)
memory usage: 1.5 MB


In [11]:
df

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income >50K,stonks
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0,0
32557,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1,0
32558,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0,0
32559,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0,0


In [13]:
# Work on a copy so the feature frame X itself stays untouched
X_processed = X.copy()

# Names of the columns carrying the pandas `category` dtype.
# `.tolist()` makes the value an actual list of strings, as the annotation states
# (`.columns` alone is a pandas Index).
category_columns: list[str] = X_processed.select_dtypes(include=['category']).columns.tolist()

# One-hot encode the categorical columns. drop_first=True removes the first level of
# each variable so the indicator columns are not perfectly collinear.
# NOTE(review): `dummy_na` is left at its default, so rows with a missing category
# presumably get no indicator of their own — confirm this is acceptable for
# workclass / occupation / native-country, which contain nulls.
X_processed = pd.get_dummies(X_processed, columns=category_columns, drop_first=True)

In [14]:
X_processed

Unnamed: 0,age,education-num,capital-loss,hours-per-week,income >50K,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,39,13,0,40,0,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,50,13,0,13,0,False,False,False,False,True,...,False,False,False,False,False,False,False,True,False,False
2,38,9,0,40,0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
3,53,7,0,40,0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
4,28,13,0,40,0,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,12,0,38,0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
32557,40,9,0,40,1,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
32558,58,9,0,40,0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
32559,22,9,0,20,0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False


In [15]:
# Names of the int64 / float64 columns (the one-hot indicator columns are not selected)
numeric_features = list(
    X_processed.select_dtypes(include=['float64', 'int64']).columns
)

# Fit the min-max scaler on the numeric columns, then overwrite them with the scaled values.
# NOTE(review): the scaler is fitted on all rows here, while the train/test split
# happens later inside train_and_evaluate — confirm this is intended.
scaler = MinMaxScaler().fit(X_processed[numeric_features])
X_processed[numeric_features] = scaler.transform(X_processed[numeric_features])

In [16]:
X_processed

Unnamed: 0,age,education-num,capital-loss,hours-per-week,income >50K,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,0.301370,0.800000,0.0,0.397959,0.0,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,0.452055,0.800000,0.0,0.122449,0.0,False,False,False,False,True,...,False,False,False,False,False,False,False,True,False,False
2,0.287671,0.533333,0.0,0.397959,0.0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
3,0.493151,0.400000,0.0,0.397959,0.0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
4,0.150685,0.800000,0.0,0.397959,0.0,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,0.733333,0.0,0.377551,0.0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
32557,0.315068,0.533333,0.0,0.397959,1.0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
32558,0.561644,0.533333,0.0,0.397959,0.0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
32559,0.068493,0.533333,0.0,0.193878,0.0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False


In [17]:
from sklearn.model_selection import cross_val_score

def train_and_evaluate(X, Y, test_size=0.2, random_state=42, max_iter=1000):
    """Fit a logistic regression on a stratified split and print its quality metrics.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn estimators.
    Y : target labels; also used to stratify the split.
    test_size : share of the rows held out for testing (default 0.2).
    random_state : seed of the train/test split (default 42).
    max_iter : iteration limit passed to LogisticRegression (default 1000).

    Returns
    -------
    The fitted LogisticRegression model.

    Side effects
    ------------
    Prints train and test accuracy and the test-set classification report.
    """
    # Stratified split so both parts keep the class proportions of Y
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state, stratify=Y)

    # Create and fit the model
    model = LogisticRegression(max_iter=max_iter)
    model.fit(X_train, Y_train)

    # Accuracy on the data the model was trained on
    Y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(Y_train, Y_train_pred)

    # Accuracy on the held-out data
    Y_test_pred = model.predict(X_test)
    test_accuracy = accuracy_score(Y_test, Y_test_pred)

    print(f"Точность на обучающей выборке: {train_accuracy:.4f}")
    print(f"Точность на тестовой выборке: {test_accuracy:.4f}")

    # zero_division=0 reports 0 for a class that is never predicted, without
    # emitting an undefined-metric warning for every affected average.
    print("\nКлассификационный отчет на тестовой выборке:")
    print(classification_report(Y_test, Y_test_pred, zero_division=0))

    return model

In [18]:
train_and_evaluate(X_processed, Y)

Точность на обучающей выборке: 0.9496
Точность на тестовой выборке: 0.9492

Классификационный отчет на тестовой выборке:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      6183
           1       0.33      0.00      0.01       330

    accuracy                           0.95      6513
   macro avg       0.64      0.50      0.49      6513
weighted avg       0.92      0.95      0.92      6513



In [24]:
# Remove outlier rows using interquartile-range fences (1.5 * IQR by default)
def remove_outliers_based_on_iqr(X, y, iqr_multiplier=1.5, columns=None):
    """Drop the rows of X (and the matching entries of y) that contain an IQR outlier.

    A value is an outlier when it lies outside
    [Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR] of its column.
    A row is removed when at least one checked column holds an outlier.

    Parameters
    ----------
    X : pandas DataFrame with the features.
    y : pandas Series indexed like X.
    iqr_multiplier : width of the fences in IQR units (default 1.5).
    columns : optional iterable of column names to check; by default every
        numeric column of X is checked.

    Returns
    -------
    (X_filtered, y_filtered) : the surviving rows of X with ALL of its columns
        (non-numeric columns are kept, they are just not checked) and the
        entries of y with the same index labels.

    Notes
    -----
    A column whose IQR is 0 has both fences equal to its quartile value, so
    every other value in that column counts as an outlier. Use `columns` to
    leave such columns (e.g. 0/1 indicators) out of the check.
    """
    # Columns that take part in the outlier check
    if columns is None:
        X_numeric = X.select_dtypes(include=['number'])
    else:
        X_numeric = X[list(columns)]

    # First and third quartiles and the interquartile range, per column
    Q1 = X_numeric.quantile(0.25)
    Q3 = X_numeric.quantile(0.75)
    IQR = Q3 - Q1

    lower_fence = Q1 - iqr_multiplier * IQR
    upper_fence = Q3 + iqr_multiplier * IQR

    # True for rows with at least one value outside the fences
    is_outlier_row = ((X_numeric < lower_fence) | (X_numeric > upper_fence)).any(axis=1)

    # Filter the rows of the full frame so non-numeric columns are not lost
    X_filtered = X.loc[~is_outlier_row]

    # Keep y in sync with the surviving rows
    y_filtered = y.loc[X_filtered.index]

    return X_filtered, y_filtered

In [25]:
# Filter rows with wider fences than the function default: 3 * IQR instead of 1.5 * IQR
X_iqr, y_iqr = remove_outliers_based_on_iqr(X_processed, Y, iqr_multiplier=3)

In [26]:
X_processed.describe()

Unnamed: 0,age,education-num,capital-loss,hours-per-week,income >50K
count,32561.0,32561.0,32561.0,32561.0,32561.0
mean,0.295639,0.605379,0.020042,0.402423,0.24081
std,0.186855,0.171515,0.092507,0.125994,0.427581
min,0.0,0.0,0.0,0.0,0.0
25%,0.150685,0.533333,0.0,0.397959,0.0
50%,0.273973,0.6,0.0,0.397959,0.0
75%,0.424658,0.733333,0.0,0.44898,0.0
max,1.0,1.0,1.0,1.0,1.0


In [27]:
X_iqr.describe()

Unnamed: 0,age,education-num,capital-loss,hours-per-week,income >50K
count,20324.0,20324.0,20324.0,20324.0,20324.0
mean,0.270328,0.574962,0.0,0.407252,0.0
std,0.17674,0.161178,0.0,0.069889,0.0
min,0.0,0.0,0.0,0.244898,0.0
25%,0.123288,0.533333,0.0,0.397959,0.0
50%,0.246575,0.533333,0.0,0.397959,0.0
75%,0.383562,0.6,0.0,0.397959,0.0
max,1.0,1.0,0.0,0.602041,0.0


In [28]:
train_and_evaluate(X_iqr, y_iqr)

Точность на обучающей выборке: 0.9933
Точность на тестовой выборке: 0.9934

Классификационный отчет на тестовой выборке:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      4038
           1       0.00      0.00      0.00        27

    accuracy                           0.99      4065
   macro avg       0.50      0.50      0.50      4065
weighted avg       0.99      0.99      0.99      4065



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
# Remove outlier rows using mean +/- k standard deviations

def remove_outliers_based_on_mean(X, y, mean_multiplier=2, columns=None):
    """Drop the rows of X (and the matching entries of y) that lie far from the column means.

    A value is an outlier when it lies outside
    [mean - mean_multiplier * std, mean + mean_multiplier * std] of its column.
    A row is removed when at least one checked column holds an outlier.

    Parameters
    ----------
    X : pandas DataFrame with the features.
    y : pandas Series indexed like X.
    mean_multiplier : width of the band in standard deviations (default 2).
    columns : optional iterable of column names to check; by default every
        numeric column of X is checked.

    Returns
    -------
    (X_filtered, y_filtered) : the surviving rows of X with ALL of its columns
        (non-numeric columns are kept, they are just not checked) and the
        entries of y with the same index labels.
    """
    # Columns that take part in the outlier check
    if columns is None:
        X_numeric = X.select_dtypes(include=['number'])
    else:
        X_numeric = X[list(columns)]

    # Per-column mean and standard deviation
    mean_values = X_numeric.mean()
    std_values = X_numeric.std()

    # Lower and upper thresholds around the mean
    lower_threshold = mean_values - mean_multiplier * std_values
    upper_threshold = mean_values + mean_multiplier * std_values

    # True for rows with at least one value outside the thresholds
    is_outlier_row = ((X_numeric < lower_threshold) | (X_numeric > upper_threshold)).any(axis=1)

    # Filter the rows of the full frame so non-numeric columns are not lost
    X_filtered = X.loc[~is_outlier_row]

    # Keep y in sync with the surviving rows
    y_filtered = y.loc[X_filtered.index]

    return X_filtered, y_filtered

In [30]:
# Filter rows with the function's default mean_multiplier (2); copies of the inputs are passed in
X_mean, y_mean = remove_outliers_based_on_mean(X_processed.copy(), Y.copy())

In [31]:
X_processed.describe()

Unnamed: 0,age,education-num,capital-loss,hours-per-week,income >50K
count,32561.0,32561.0,32561.0,32561.0,32561.0
mean,0.295639,0.605379,0.020042,0.402423,0.24081
std,0.186855,0.171515,0.092507,0.125994,0.427581
min,0.0,0.0,0.0,0.0,0.0
25%,0.150685,0.533333,0.0,0.397959,0.0
50%,0.273973,0.6,0.0,0.397959,0.0
75%,0.424658,0.733333,0.0,0.44898,0.0
max,1.0,1.0,1.0,1.0,1.0


In [32]:
X_mean.describe()

Unnamed: 0,age,education-num,capital-loss,hours-per-week,income >50K
count,26919.0,26919.0,26919.0,26919.0,26919.0
mean,0.275529,0.616826,0.000153,0.406594,0.233924
std,0.164203,0.141904,0.004837,0.090555,0.423332
min,0.0,0.266667,0.0,0.153061,0.0
25%,0.136986,0.533333,0.0,0.397959,0.0
50%,0.260274,0.6,0.0,0.397959,0.0
75%,0.39726,0.733333,0.0,0.44898,0.0
max,0.657534,0.933333,0.20202,0.653061,1.0


In [33]:
train_and_evaluate(X_mean, y_mean)

Точность на обучающей выборке: 0.9472
Точность на тестовой выборке: 0.9471

Классификационный отчет на тестовой выборке:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      5099
           1       0.00      0.00      0.00       285

    accuracy                           0.95      5384
   macro avg       0.47      0.50      0.49      5384
weighted avg       0.90      0.95      0.92      5384



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [34]:
from sklearn.feature_selection import SelectKBest, f_regression

In [35]:
# Score every feature against Y and keep the 5 highest-scoring ones.
# NOTE(review): f_regression is a regression scorer while Y is a categorical
# target; f_classif or chi2 is the conventional choice here — confirm intent.
selector = SelectKBest(f_regression, k=5)
selector.fit(X_processed, Y)
X_kbest = selector.transform(X_processed)

In [36]:
selector.scores_

array([4.40346601e+02, 8.43512593e+02, 8.16828478e+01, 3.12006800e+02,
       4.85285196e+03, 1.07701954e+00, 3.73237929e-01, 5.41931257e+01,
       2.78163878e+02, 7.67621497e+00, 1.82362144e-01, 7.46644967e-01,
       2.72144588e+01, 9.43425016e+00, 7.01073171e+00, 8.87443716e+00,
       2.00564702e+01, 2.18035675e+01, 5.04862728e-01, 1.74447327e-02,
       2.16357571e+02, 9.50656364e+01, 1.48072051e+02, 2.40396219e+02,
       1.02186696e+00, 4.33470902e+02, 4.95771017e+01, 2.43794935e-02,
       7.10982128e+02, 6.27790223e+00, 4.09955799e+02, 1.99987603e+01,
       1.16973583e+01, 4.79908393e-01, 7.30524924e+00, 3.10179019e+02,
       6.47112423e+00, 4.18482801e+01, 4.03571869e+01, 1.37849410e+02,
       6.00458765e+00, 3.00707530e+02, 7.68847821e-01, 1.91383023e+00,
       1.85674449e+00, 5.38836145e+00, 9.33335464e+01, 3.27073826e+01,
       2.51423778e+02, 5.52992204e+01, 4.80148876e+01, 5.20798466e-02,
       2.84526566e+01, 3.49304397e+00, 2.74991065e+01, 1.81664513e+02,
      

In [37]:
X_processed.columns

Index(['age', 'education-num', 'capital-loss', 'hours-per-week', 'income >50K',
       'workclass_Local-gov', 'workclass_Never-worked', 'workclass_Private',
       'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc',
       'workclass_State-gov', 'workclass_Without-pay', 'education_11th',
       'education_12th', 'education_1st-4th', 'education_5th-6th',
       'education_7th-8th', 'education_9th', 'education_Assoc-acdm',
       'education_Assoc-voc', 'education_Bachelors', 'education_Doctorate',
       'education_HS-grad', 'education_Masters', 'education_Preschool',
       'education_Prof-school', 'education_Some-college',
       'marital-status_Married-AF-spouse', 'marital-status_Married-civ-spouse',
       'marital-status_Married-spouse-absent', 'marital-status_Never-married',
       'marital-status_Separated', 'marital-status_Widowed',
       'occupation_Armed-Forces', 'occupation_Craft-repair',
       'occupation_Exec-managerial', 'occupation_Farming-fishing',
       'occupatio

In [38]:
X_processed

Unnamed: 0,age,education-num,capital-loss,hours-per-week,income >50K,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,0.301370,0.800000,0.0,0.397959,0.0,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,0.452055,0.800000,0.0,0.122449,0.0,False,False,False,False,True,...,False,False,False,False,False,False,False,True,False,False
2,0.287671,0.533333,0.0,0.397959,0.0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
3,0.493151,0.400000,0.0,0.397959,0.0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
4,0.150685,0.800000,0.0,0.397959,0.0,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,0.733333,0.0,0.377551,0.0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
32557,0.315068,0.533333,0.0,0.397959,1.0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
32558,0.561644,0.533333,0.0,0.397959,0.0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
32559,0.068493,0.533333,0.0,0.193878,0.0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
