# Домашнее задание к занятию "Классификация: Логистическая регрессия и SVM"

Имеются данные adult.csv (см. в материалах к занятию).  
Целевой переменной является уровень дохода income (крайний правый столбец).  
Описание признаков можно найти по ссылке http://www.cs.toronto.edu/~delve/data/adult/adultDetail.html  
Вам необходимо построить модели логистической регрессии и SVM, которые предсказывает уровень дохода человека.  
Вывести качество полученных моделей на тестовой выборке, используя функцию score у модели.  
Готовый ноутбук выложить на гитхаб и прислать ссылку.

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
df_adult = pd.read_csv('adult.csv', na_values=['?'])
df_adult[:5]

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


Проверим процент заполненности данных в каждом столбце

In [3]:
print('Процент заполненности данных:')
for column_name in df_adult.columns:
    percent = 100 - df_adult[column_name].isna().mean()
    print(f'{column_name} - {round(percent,2)}%')

Процент заполненности данных:
age - 100.0%
workclass - 99.94%
fnlwgt - 100.0%
education - 100.0%
educational-num - 100.0%
marital-status - 100.0%
occupation - 99.94%
relationship - 100.0%
race - 100.0%
gender - 100.0%
capital-gain - 100.0%
capital-loss - 100.0%
hours-per-week - 100.0%
native-country - 99.98%
income - 100.0%


Столбцы workclass, occupation и native-country имеют пропуски. Так как задача учебная и незаполненных данных достаточно мало, то заполним их медианой. 

Для этой операции требуется преобразовать тектовые категориальные данные в числовые. Преобразуем все текcтовые категориальные данные в нашем датафрейме.

In [4]:
workclass_label_enc = LabelEncoder()
workclass_label_enc.fit(df_adult.loc[~df_adult.workclass.isna(), 'workclass'])

education_label_enc = LabelEncoder()
education_label_enc.fit(df_adult.loc[~df_adult.education.isna(), 'education'])

maritalstatus_label_enc = LabelEncoder()
maritalstatus_label_enc.fit(df_adult.loc[~df_adult['marital-status'].isna(), 'marital-status'])

occupation_label_enc = LabelEncoder()
occupation_label_enc.fit(df_adult.loc[~df_adult.occupation.isna(), 'occupation'])

relationship_label_enc = LabelEncoder()
relationship_label_enc.fit(df_adult.loc[~df_adult.occupation.isna(), 'relationship'])

race_label_enc = LabelEncoder()
race_label_enc.fit(df_adult.loc[~df_adult.occupation.isna(), 'race'])

gender_label_enc = LabelEncoder()
gender_label_enc.fit(df_adult.loc[~df_adult.occupation.isna(), 'gender'])

country_label_enc = LabelEncoder()
country_label_enc.fit(df_adult.loc[~df_adult['native-country'].isna(), 'native-country'])

income_label_enc = LabelEncoder()
income_label_enc.fit(df_adult.loc[~df_adult.income.isna(), 'income'])

LabelEncoder()

In [5]:
df_adult.loc[~df_adult.workclass.isna(), 
             'workclass'] = workclass_label_enc.transform(df_adult.loc[~df_adult.workclass.isna(), 'workclass'])
df_adult.loc[~df_adult.education.isna(), 
             'education'] = education_label_enc.transform(df_adult.loc[~df_adult.education.isna(), 'education'])
df_adult.loc[~df_adult['marital-status'].isna(), 
             'marital-status'] = maritalstatus_label_enc.transform(df_adult.loc[~df_adult['marital-status'].isna(), 
                                                                                'marital-status'])
df_adult.loc[~df_adult.occupation.isna(), 
             'occupation'] = occupation_label_enc.transform(df_adult.loc[~df_adult.occupation.isna(), 'occupation'])
df_adult.loc[~df_adult.relationship.isna(), 
             'relationship'] = relationship_label_enc.transform(df_adult.loc[~df_adult.relationship.isna(), 'relationship'])
df_adult.loc[~df_adult.race.isna(), 
             'race'] = race_label_enc.transform(df_adult.loc[~df_adult.race.isna(), 'race'])
df_adult.loc[~df_adult.gender.isna(), 
             'gender'] = gender_label_enc.transform(df_adult.loc[~df_adult.gender.isna(), 'gender'])

df_adult.loc[~df_adult['native-country'].isna(), 
             'native-country'] = country_label_enc.transform(df_adult.loc[~df_adult['native-country'].isna(), 
                                                                          'native-country'])
df_adult.loc[~df_adult.income.isna(), 
             'income'] = income_label_enc.transform(df_adult.loc[~df_adult.income.isna(), 'income'])

Заменим отсутствующие данные в workclass, occupation и native-country медианой.

In [6]:
df_adult.loc[df_adult.workclass.isna(), 'workclass'] = df_adult.workclass.median()
df_adult.workclass = df_adult.workclass.astype(np.int32)
df_adult.loc[df_adult.occupation.isna(), 'occupation'] = df_adult.occupation.median()
df_adult.occupation = df_adult.occupation.astype(np.int32)
df_adult.loc[df_adult['native-country'].isna(), 'native-country'] = df_adult['native-country'].median()
df_adult['native-country'] = df_adult['native-country'].astype(np.int32)

Проверим процент заполненности данных по всем столбцам.

In [7]:
print('Процент заполненности данных:')
for column_name in df_adult.columns:
    percent = 100 - df_adult[column_name].isna().mean()
    print(f'{column_name} - {round(percent,2)}%')

Процент заполненности данных:
age - 100.0%
workclass - 100.0%
fnlwgt - 100.0%
education - 100.0%
educational-num - 100.0%
marital-status - 100.0%
occupation - 100.0%
relationship - 100.0%
race - 100.0%
gender - 100.0%
capital-gain - 100.0%
capital-loss - 100.0%
hours-per-week - 100.0%
native-country - 100.0%
income - 100.0%


Столбцы occupation и native-country являются номинальными категориями, поэтому их следует преобразовать в вектора с помощью One-Hot Encoding.

In [8]:
df_adult['native-country'] = country_label_enc.inverse_transform(df_adult['native-country'])
df_adult.occupation = occupation_label_enc.inverse_transform(df_adult.occupation)
df_adult = pd.get_dummies(df_adult, columns=['native-country', 'occupation'])
df_adult

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,relationship,race,gender,capital-gain,...,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving
0,25,3,226802,1,7,4,3,2,1,0,...,0,0,1,0,0,0,0,0,0,0
1,38,3,89814,11,9,2,0,4,1,0,...,1,0,0,0,0,0,0,0,0,0
2,28,1,336951,7,12,2,0,4,1,0,...,0,0,0,0,0,0,1,0,0,0
3,44,3,160323,15,10,2,0,2,1,7688,...,0,0,1,0,0,0,0,0,0,0
4,18,3,103497,15,10,4,3,4,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,3,257302,7,12,2,5,4,0,0,...,0,0,0,0,0,0,0,0,1,0
48838,40,3,154374,11,9,2,0,4,1,0,...,0,0,1,0,0,0,0,0,0,0
48839,58,3,151910,11,9,6,4,4,0,0,...,0,0,0,0,0,0,0,0,0,0
48840,22,3,201490,11,9,4,3,4,1,0,...,0,0,0,0,0,0,0,0,0,0


Подготавливаем данные для использования логистической регрессии и SVM. Производим разделение на обучающую и тестовую выборки, выполняем стандартизацию всех данных.

In [9]:
X = df_adult.loc[:, df_adult.columns != 'income']
y = df_adult['income']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

scaler = StandardScaler()
scaler.fit(X_train)

X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [10]:
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train_std, y_train)
y_predict = lr.predict(X_test_std)
print("Точность модели логистической регрессии:", round(accuracy_score(y_predict, y_test),2))

Точность модели логистической регрессии: 0.83


In [11]:
svc = SVC()
svc.fit(X_train_std, y_train)
y_predict = svc.predict(X_test_std)
print("Точность модели SVM:", round(accuracy_score(y_predict, y_test),2))

Точность модели SVM: 0.84
