In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('adult.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
X = data[['age', 'workclass', 'education','educational-num', 'occupation', 'gender', 'native-country']]
y = data['income']

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
age                48842 non-null int64
workclass          48842 non-null object
fnlwgt             48842 non-null int64
education          48842 non-null object
educational-num    48842 non-null int64
marital-status     48842 non-null object
occupation         48842 non-null object
relationship       48842 non-null object
race               48842 non-null object
gender             48842 non-null object
capital-gain       48842 non-null int64
capital-loss       48842 non-null int64
hours-per-week     48842 non-null int64
native-country     48842 non-null object
income             48842 non-null object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [5]:
X = pd.get_dummies(X, columns = ['education', 'workclass', 'occupation', 'gender', 'native-country'])

In [6]:
X.head()

Unnamed: 0,age,educational-num,education_10th,education_11th,education_12th,education_1st-4th,education_5th-6th,education_7th-8th,education_9th,education_Assoc-acdm,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,25,7,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,38,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,28,12,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,44,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,18,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

## Модель логистической регрессии

In [8]:
from sklearn.linear_model import LogisticRegression


In [9]:
le.fit(data['income'])

LabelEncoder()

In [10]:
y = le.transform(data['income'])
y

array([0, 0, 1, ..., 0, 0, 1])

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [12]:
penalty = ['l1', 'l2']
C = [0.001, 0.01, 0.1, 1.0, 10.0]

In [13]:
lr = LogisticRegression()

In [14]:
lr.fit(X_train, y_train)
preditation = lr.predict_proba(X_test)



In [15]:
preditation

array([[0.97745768, 0.02254232],
       [0.8579344 , 0.1420656 ],
       [0.87942645, 0.12057355],
       ...,
       [0.92379134, 0.07620866],
       [0.96064755, 0.03935245],
       [0.81047288, 0.18952712]])

In [16]:
lr.score(X, y)

0.8082388108595062

In [17]:
from sklearn.metrics import roc_auc_score

In [18]:
roc_auc_score(y_test, preditation[:, 1])

0.8228622242024122

In [19]:
params = []
scores = []

for p in penalty:
    for c in C:
        lr = LogisticRegression(penalty=p, C=c)
        lr.fit(X_train, y_train)
        
        probas = lr.predict_proba(X_test)[:, 1]
        
        params.append((p, c))
        scores.append(roc_auc_score(y_test, probas))



In [20]:
for i in range((len(params))):
    print('{}_{}: {}'.format(params[i][0], params[i][1], scores[i]))

l1_0.001: 0.7396809648628565
l1_0.01: 0.815553996411161
l1_0.1: 0.8225080355903417
l1_1.0: 0.8229082074789936
l1_10.0: 0.8226566263891951
l2_0.001: 0.781574855772764
l2_0.01: 0.8175347477401338
l2_0.1: 0.8228023081461338
l2_1.0: 0.8228622242024122
l2_10.0: 0.8227447397377167


## Модель на основе деревьев

In [21]:
from sklearn.tree import DecisionTreeClassifier

In [22]:
dtc=DecisionTreeClassifier(max_depth=4)


In [23]:
criterion_dtc=['gini','entropy']
max_depth_dtc=[x for x in range(2, 11)]

In [27]:
params_dtc = []
scores_dtc = []

for c in criterion_dtc:
    for m in max_depth_dtc:
        dtc = DecisionTreeClassifier(criterion=c, max_depth=m)
        dtc.fit(X_train, y_train)
        
        probas = dtc.predict_proba(X_test)[:, 1]
        
        params_dtc.append((c, m))
        scores_dtc.append(roc_auc_score(y_test, probas))

In [28]:
for i in range((len(params_dtc))):
    print('{}_{}: {}'.format(params_dtc[i][0], params_dtc[i][1], scores_dtc[i]))

gini_2: 0.7504187999640708
gini_3: 0.7907001757673746
gini_4: 0.8087949140760884
gini_5: 0.8172177132072546
gini_6: 0.8232360871242742
gini_7: 0.8240300259057838
gini_8: 0.8236461982394682
gini_9: 0.8228793849979177
gini_10: 0.820420785257588
entropy_2: 0.7481641138547969
entropy_3: 0.7843126980189934
entropy_4: 0.8008848080644766
entropy_5: 0.8136495308787144
entropy_6: 0.818919298584062
entropy_7: 0.8242180291189993
entropy_8: 0.8259246670422904
entropy_9: 0.82716178815234
entropy_10: 0.8251361380580258


## Модель на основе случайного леса

In [29]:
from sklearn.ensemble import RandomForestClassifier 

In [30]:
n_estimators_rf=[x for x in range(10, 15)]
max_depth_rf=[x for x in range(2, 11)]
min_samples_leaf_rf=[x for x in range(20, 25)]
criterion_rf=['gini','entropy']
max_features_rf=['auto', 0.5]

In [31]:
params_dtc = []
scores_dtc = []

for c in criterion_rf:
    for m in max_depth_rf:
        for m_s in min_samples_leaf_rf:
            for m_f in max_features_rf:
                for n in n_estimators_rf:
                    dtc = RandomForestClassifier(n_estimators=n, max_depth=m, criterion=c, min_samples_leaf=m_s, max_features=m_f, n_jobs=-1)
                    dtc.fit(X_train, y_train)

                    probas = dtc.predict_proba(X_test)[:, 1]

                    params_dtc.append((c, m, n, m_s, m_f))
                    scores_dtc.append(roc_auc_score(y_test, probas))

In [32]:
for i in range((len(params_dtc))):
    print('{} max_depth:{} n_estimators:{} min_samples_leaf:{} max_features_rf:{}: {}'.format(params_dtc[i][0], params_dtc[i][1], params_dtc[i][2], params_dtc[i][3], params_dtc[i][4], scores_dtc[i]))

gini max_depth:2 n_estimators:10 min_samples_leaf:20 max_features_rf:auto: 0.7870595608469497
gini max_depth:2 n_estimators:11 min_samples_leaf:20 max_features_rf:auto: 0.7881644864163052
gini max_depth:2 n_estimators:12 min_samples_leaf:20 max_features_rf:auto: 0.7971388169283784
gini max_depth:2 n_estimators:13 min_samples_leaf:20 max_features_rf:auto: 0.794136366698513
gini max_depth:2 n_estimators:14 min_samples_leaf:20 max_features_rf:auto: 0.7677008438262986
gini max_depth:2 n_estimators:10 min_samples_leaf:20 max_features_rf:0.5: 0.8021418203865658
gini max_depth:2 n_estimators:11 min_samples_leaf:20 max_features_rf:0.5: 0.8028149828927922
gini max_depth:2 n_estimators:12 min_samples_leaf:20 max_features_rf:0.5: 0.7853826640495496
gini max_depth:2 n_estimators:13 min_samples_leaf:20 max_features_rf:0.5: 0.7948238575120649
gini max_depth:2 n_estimators:14 min_samples_leaf:20 max_features_rf:0.5: 0.7948627723271519
gini max_depth:2 n_estimators:10 min_samples_leaf:21 max_features_