In [3]:
%matplotlib inline

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display

from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, make_scorer

from pathlib import Path

# Title

## Income dataset

In [9]:
income = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", header=None)

In [17]:
def clean_income_dataset(df):
    col_mapper = {
        0: 'age',
        1: 'workclass',
        2: 'final_weight',
        3: 'education',
        4: 'education-num',
        5: 'marital-status',
        6: 'occupation',
        7: 'relationship',
        8: 'race',
        9: 'sex',
        10: 'capital_gain',
        11: 'capital_loss',
        12: 'hours_per_week',
        13: 'native_country',
        14: 'income_class'
    }
    
    def clean_categorical_data(df_):
        return (df_
                .str.strip()
                .str.lower()
               )
    
    return (df
            .rename(columns=col_mapper)
            .pipe(clean_categorical_data)
            
#             .astype({
#                 col: 'category' for col in ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'income_class']})
           )


clean_income = clean_income_dataset(income)
clean_income

Unnamed: 0,age,workclass,final_weight,education,education-num,marital-status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_class
0,39,<=50k,77516,<=50k,13,<=50k,<=50k,<=50k,<=50k,<=50k,2174,0,40,<=50k,<=50k
1,50,<=50k,83311,<=50k,13,<=50k,<=50k,<=50k,<=50k,<=50k,0,0,13,<=50k,<=50k
2,38,<=50k,215646,<=50k,9,<=50k,<=50k,<=50k,<=50k,<=50k,0,0,40,<=50k,<=50k
3,53,<=50k,234721,<=50k,7,<=50k,<=50k,<=50k,<=50k,<=50k,0,0,40,<=50k,<=50k
4,28,<=50k,338409,<=50k,13,<=50k,<=50k,<=50k,<=50k,<=50k,0,0,40,<=50k,<=50k
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,<=50k,257302,<=50k,12,<=50k,<=50k,<=50k,<=50k,<=50k,0,0,38,<=50k,<=50k
32557,40,>50k,154374,>50k,9,>50k,>50k,>50k,>50k,>50k,0,0,40,>50k,>50k
32558,58,<=50k,151910,<=50k,9,<=50k,<=50k,<=50k,<=50k,<=50k,0,0,40,<=50k,<=50k
32559,22,<=50k,201490,<=50k,9,<=50k,<=50k,<=50k,<=50k,<=50k,0,0,20,<=50k,<=50k


In [None]:
income_target = income_data.income_class
income_target = income_target.str.strip()

income_attributes = income_data.drop(columns="income_class")
income_attributes = pd.get_dummies(income_attributes, drop_first=True)
scaler = MinMaxScaler()
income_attributes = scaler.fit_transform(income_attributes)

income_attributes_train, income_attributes_test, \
income_target_train, income_target_test, \
= train_test_split(income_attributes, income_target, train_size=0.8)

for x in [income_attributes_train, income_attributes_test, income_target_train, income_target_test]:
    print(x.shape)

In [162]:
df.select_dtypes('object').columns

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'income_class'],
      dtype='object')

### Explore

In [83]:
df.dtypes

age                int64
workclass         object
final_weight       int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income_class      object
dtype: object

### Object

In [84]:
df.select_dtypes('object').describe()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native_country,income_class
count,32561,32561,32561,32561,32561,32561,32561,32561,32561
unique,9,16,7,15,6,5,2,42,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K
freq,22696,10501,14976,4140,13193,27816,21790,29170,24720


In [153]:
df.select_dtypes('object').isna().sum()

workclass         0
education         0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
native_country    0
income_class      0
dtype: int64

#### workclass

In [96]:
pd.unique(df.workclass)

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [136]:
(df
 .workclass
 .str.strip()
 .replace({'?': 'unknown'})
 .astype('category')
 .unique()
)

['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov', 'Local-gov', 'unknown', 'Self-emp-inc', 'Without-pay', 'Never-worked']
Categories (9, object): ['Federal-gov', 'Local-gov', 'Never-worked', 'Private', ..., 'Self-emp-not-inc', 'State-gov', 'Without-pay', 'unknown']

#### Education

In [113]:
(df
 .education
 .str.strip()
 .pipe(pd.Series.unique)
)

array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
       '5th-6th', '10th', '1st-4th', 'Preschool', '12th'], dtype=object)

#### Education

In [158]:
(df
 .education
 .unique()
)

array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
       ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
       ' Preschool', ' 12th'], dtype=object)

#### Sex, income_class

In [58]:
(df
 .assign(income_class=df.income_class.astype('category'),
         sex=df.sex.astype('category'))
)

Unnamed: 0,age,workclass,final_weight,education,education-num,marital-status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [3]:
income_target = income_data.income_class
income_target = income_target.str.strip()

income_attributes = income_data.drop(columns="income_class")
income_attributes = pd.get_dummies(income_attributes, drop_first=True)
scaler = MinMaxScaler()
income_attributes = scaler.fit_transform(income_attributes)

income_attributes_train, income_attributes_test, \
income_target_train, income_target_test, \
= train_test_split(income_attributes, income_target, train_size=0.8)

for x in [income_attributes_train, income_attributes_test, income_target_train, income_target_test]:
    print(x.shape)

(26048, 100)
(6513, 100)
(26048,)
(6513,)
