In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [2]:
data = pd.read_csv( 'adult.csv', na_values = '?')
data.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
6,29,,227026,HS-grad,9,Never-married,,Unmarried,Black,Male,0,0,40,United-States,<=50K
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        46043 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       46033 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   47985 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [4]:
#проверка пустых значений
print('workclass NaN {}'.format(len(data[pd.isnull(data.workclass)])))
print('occupation NaN {}'.format(len(data[pd.isnull(data.occupation)])))
print('native-country NaN {}'.format(len(data[pd.isnull(data['native-country'])])))

workclass NaN 2799
occupation NaN 2809
native-country NaN 857


In [5]:
#обнуление пустых значений
data = data[pd.isnull(data.workclass) == 0]
data = data[pd.isnull(data.occupation) == 0]
data = data[pd.isnull(data['native-country']) == 0]

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45222 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              45222 non-null  int64 
 1   workclass        45222 non-null  object
 2   fnlwgt           45222 non-null  int64 
 3   education        45222 non-null  object
 4   educational-num  45222 non-null  int64 
 5   marital-status   45222 non-null  object
 6   occupation       45222 non-null  object
 7   relationship     45222 non-null  object
 8   race             45222 non-null  object
 9   gender           45222 non-null  object
 10  capital-gain     45222 non-null  int64 
 11  capital-loss     45222 non-null  int64 
 12  hours-per-week   45222 non-null  int64 
 13  native-country   45222 non-null  object
 14  income           45222 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.5+ MB


In [7]:
#выборка признаков для Х
df = data.iloc[:, :-1]
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States


In [8]:
#перевод столбцов с типом object к int
X = pd.get_dummies(df, columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country'])
X.head(10)

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,25,226802,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,38,89814,9,0,0,50,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,28,336951,12,0,0,40,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,44,160323,10,7688,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
5,34,198693,6,0,0,30,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
7,63,104626,15,3103,0,32,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8,24,369667,10,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
9,55,104996,4,0,0,10,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
10,65,184454,9,6418,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
11,36,212465,13,0,0,40,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [9]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45222 entries, 0 to 48841
Columns: 104 entries, age to native-country_Yugoslavia
dtypes: int64(6), uint8(98)
memory usage: 6.6 MB


In [10]:
le = LabelEncoder()
le.fit(data['income'])

LabelEncoder()

In [11]:
#целевая переменная
Y = pd.Series(data = le.transform(data['income']))
Y.head(10)

0    0
1    0
2    1
3    1
4    0
5    1
6    0
7    0
8    1
9    0
dtype: int32

In [12]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [13]:
#для модели логистической регрессии
def make_regression(x_train, y_train, x_test, y_test):
    model = LogisticRegression()
    model.fit(x_train, y_train)

    print("Score train:", model.score(x_train, y_train))
    
    print("MSE train:", mean_squared_error(y_train, model.predict(x_train)))
    print("MSE test:", mean_squared_error(y_test, model.predict(x_test)))
    
    print("R2 train:", r2_score(y_train, model.predict(x_train)))
    print("R2 test:", r2_score(y_test, model.predict(x_test)))

In [14]:
make_regression(x_train, y_train, x_test, y_test)

Score train: 0.791165657738342
MSE train: 0.20883434226165795
MSE test: 0.20796019900497512
R2 train: -0.11943608995579913
R2 test: -0.11883176375335602


In [15]:
#для модели полиномиальной регрессии
def make_regression_p(x_train, x_train_disp, y_train, x_test, x_test_disp, y_test):
    model = LogisticRegression()
    model.fit(x_train, y_train)

    print("Score train poly:", model.score(x_train, y_train))
    
    print("MSE train poly:", mean_squared_error(y_train, model.predict(x_train)))
    print("MSE test poly:", mean_squared_error(y_test, model.predict(x_test)))
    
    print("R2 train poly:", r2_score(y_train, model.predict(x_train)))
    print("R2 test poly:", r2_score(y_test, model.predict(x_test)))

In [16]:
polynomial_features = PolynomialFeatures(degree=2)
x_train_poly = polynomial_features.fit_transform(x_train)
x_test_poly = polynomial_features.fit_transform(x_test)
make_regression_p(x_train_poly, x_train, y_train, x_test_poly, x_test, y_test)

Score train poly: 0.791801420792216
MSE train poly: 0.20819857920778395
MSE test poly: 0.20906578220011054
R2 train poly: -0.11602814421536456
R2 test poly: -0.12477983267283155


In [17]:
from sklearn import svm
from sklearn.metrics import recall_score
from sklearn.decomposition import PCA

In [38]:
#на всей выборке у меня не получилось дождаться результата от модели
X_ = data.loc[:2500,['age', 'gender']]
X_ = pd.get_dummies(X_, columns = ['gender'])
X_.head(10)

Unnamed: 0,age,gender_Female,gender_Male
0,25,0,1
1,38,0,1
2,28,0,1
3,44,0,1
5,34,0,1
7,63,0,1
8,24,1,0
9,55,0,1
10,65,0,1
11,36,0,1


In [39]:
X_.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2310 entries, 0 to 2500
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   age            2310 non-null   int64
 1   gender_Female  2310 non-null   uint8
 2   gender_Male    2310 non-null   uint8
dtypes: int64(1), uint8(2)
memory usage: 40.6 KB


In [40]:
y = Y.loc[:2309]
y.head()

0    0
1    0
2    1
3    1
4    0
dtype: int32

In [41]:
pca = PCA(n_components=3)
x_ = pca.fit_transform(X_)
x1_train, x1_test, y1_train, y1_test = train_test_split(x_, y, test_size=0.2)

In [45]:
#SVM модель
model1 = svm.SVC(kernel='linear')
model1.fit(x1_train, y1_train)
y1_predict = model1.predict(x1_test)

In [46]:
model1.score(x1_train, y1_train)

0.7435064935064936