In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
pd.options.display.max_columns = None

In [3]:
data = pd.read_csv('adult.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [4]:
data.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education.num      int64
marital.status    object
occupation        object
relationship      object
race              object
sex               object
capital.gain       int64
capital.loss       int64
hours.per.week     int64
native.country    object
income            object
dtype: object

In [5]:
for column in data.select_dtypes('object').columns:
    display(data[column].value_counts().to_frame())

Unnamed: 0,workclass
Private,22696
Self-emp-not-inc,2541
Local-gov,2093
?,1836
State-gov,1298
Self-emp-inc,1116
Federal-gov,960
Without-pay,14
Never-worked,7


Unnamed: 0,education
HS-grad,10501
Some-college,7291
Bachelors,5355
Masters,1723
Assoc-voc,1382
11th,1175
Assoc-acdm,1067
10th,933
7th-8th,646
Prof-school,576


Unnamed: 0,marital.status
Married-civ-spouse,14976
Never-married,10683
Divorced,4443
Separated,1025
Widowed,993
Married-spouse-absent,418
Married-AF-spouse,23


Unnamed: 0,occupation
Prof-specialty,4140
Craft-repair,4099
Exec-managerial,4066
Adm-clerical,3770
Sales,3650
Other-service,3295
Machine-op-inspct,2002
?,1843
Transport-moving,1597
Handlers-cleaners,1370


Unnamed: 0,relationship
Husband,13193
Not-in-family,8305
Own-child,5068
Unmarried,3446
Wife,1568
Other-relative,981


Unnamed: 0,race
White,27816
Black,3124
Asian-Pac-Islander,1039
Amer-Indian-Eskimo,311
Other,271


Unnamed: 0,sex
Male,21790
Female,10771


Unnamed: 0,native.country
United-States,29170
Mexico,643
?,583
Philippines,198
Germany,137
Canada,121
Puerto-Rico,114
El-Salvador,106
India,100
Cuba,95


Unnamed: 0,income
<=50K,24720
>50K,7841


Выше можно увидеть, что в некоторых переменных - workclass, occupation, native.country - встречается ?. Это обозначение пропущенных данных.

Выполним замену ? на NaN и заполним пропуски модой.

In [6]:
data[data == '?'] = np.nan
data.isna().sum()

  result = method(y)


age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

In [7]:
for column in ['workclass', 'occupation', 'native.country']:
    data[column].fillna(data[column].mode()[0], inplace=True)
data

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Private,77053,HS-grad,9,Widowed,Prof-specialty,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,Private,186061,Some-college,10,Widowed,Prof-specialty,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


In [8]:
X = data.drop(columns=data.select_dtypes('object').columns)
y = data['income'].replace({'<=50K': -1, '>50K': 1})

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [10]:
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [11]:
roc_auc_score(y_test, y_pred)

0.6175194033932127

In [12]:
accuracy_score(y_test, y_pred)

0.7976251407513564

Теперь, для классификации будем использовать все переменые. Выполним преобразование категориальных переменных в dummy-переменные, после чего проведем обучение классификатора и посмотрим на результаты его работы

In [13]:
X = pd.get_dummies(data.drop(columns='income'))
y = data['income'].replace({'<=50K': -1, '>50K': 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [14]:
clf = LogisticRegression(C=0.1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [15]:
roc_auc_score(y_test, y_pred)

0.6175194033932127

In [16]:
accuracy_score(y_test, y_pred)

0.7976251407513564

Продолжаем использовать для классификации все переменые. На этот раз преобразуем категориальные переменные с помощью LabelEncoder, после чего проведем обучение классификатора и посмотрим на результаты его работы

In [18]:
encoded_data = data.copy()
for feature in encoded_data.select_dtypes('object').columns:
        le = LabelEncoder()
        encoded_data[feature] = le.fit_transform(encoded_data[feature])

In [19]:
X = encoded_data.copy().drop(columns='income')
y = encoded_data['income'].copy()

In [20]:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [22]:
clf = LogisticRegression(C=0.1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [23]:
roc_auc_score(y_test, y_pred)

0.6918803514149446

In [24]:
accuracy_score(y_test, y_pred)

0.8218855563517249

На этот раз воспользуемся методом главных компонент, для того чтобы определить наиболее значимые переменные.
После чего проведем обучение классификатора и посмотрим на результаты его работы

Поискать что-нибудь на тему feature importance; понять, насколько актуален здесь PCA

In [30]:
X = encoded_data.copy().drop(columns='income')
y = encoded_data['income'].copy()

In [31]:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)

In [32]:
pca = PCA()
X = pca.fit_transform(X)
pca.explained_variance_ratio_

array([0.14783223, 0.10161135, 0.08056461, 0.07870396, 0.07427552,
       0.07329118, 0.07027089, 0.06772609, 0.06487451, 0.06119792,
       0.0608425 , 0.0486531 , 0.04276666, 0.02738948])

In [35]:
pd.DataFrame(pca.components_)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.314791,0.108278,-0.052445,0.105856,0.212485,-0.335866,0.038225,-0.53932,0.165233,0.472076,0.144169,0.115255,0.369721,0.063438
1,0.080224,0.043167,0.145299,-0.611544,-0.608146,0.000342,0.003967,-0.208468,-0.034967,0.280068,-0.151913,-0.074194,-0.053439,-0.268677
2,-0.488311,-0.023642,0.457471,0.122733,0.191504,0.42042,0.342019,-0.159483,-0.143482,0.315748,0.066965,-0.016855,0.130203,-0.196995
3,0.274151,-0.176441,0.13011,0.069678,0.100591,-0.24347,-0.076773,0.074319,-0.610875,-0.152282,0.244518,0.012157,0.119572,-0.566145
4,0.085559,0.406683,-0.150399,-0.112657,-0.004041,0.06538,0.292843,0.05124,0.033282,-0.051037,0.605687,-0.566648,-0.086395,0.013436
5,0.198943,-0.018979,-0.355825,-0.153206,0.083247,0.094838,0.72905,-0.005408,-0.147935,-0.050494,-0.101734,0.441801,-0.178915,-0.022981
6,-0.091549,0.82697,-0.134509,0.104115,0.009383,0.122647,-0.228556,0.054711,-0.090629,-0.014439,-0.162688,0.299893,0.073221,-0.279748
7,0.16901,0.145429,0.642356,-0.121355,0.006364,-0.160326,0.074648,0.152979,0.346433,-0.201195,0.32007,0.422896,-0.156581,0.043807
8,-0.207077,-0.194201,-0.30761,-0.269456,-0.07703,0.297118,-0.309278,0.014024,-0.059455,0.062599,0.572586,0.414358,0.209253,0.106417
9,0.312517,0.047538,0.120318,0.191371,-0.003088,0.305341,-0.2413,-0.2676,-0.341531,0.213163,0.083077,0.047768,-0.617337,0.271156
