In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display

from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, make_scorer

from pathlib import Path

# Create Preprocess for Adult Income

## Final preprocessing functions

In [101]:
income = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", header=None)

In [104]:
def clean_income_dataset(df: pd.DataFrame):
    col_mapper = {
        0: 'age',
        1: 'workclass',
        2: 'final_weight',
        3: 'education',
        4: 'education_num',
        5: 'marital_status',
        6: 'occupation',
        7: 'relationship',
        8: 'race',
        9: 'sex',
        10: 'capital_gain',
        11: 'capital_loss',
        12: 'hours_per_week',
        13: 'native_country',
        14: 'income_class'
    }

    relevant_cols = ['age', 'workclass', 'final_weight', 'education',
            'marital_status', 'occupation', 'relationship', 'race',
            'sex', 'capital_gain', 'capital_loss', 'hours_per_week',
            'native_country', 'income_class']
    
    categorical_cols = ['workclass', 'education', 'marital_status', 
                    'occupation', 'relationship', 'race', 'sex', 
                    'native_country', 'income_class',]
    
    def tweak_categorical_column(col: pd.Series):
        return (col
                .str.strip()
                .str.lower()
                .astype('category')
                )
    
    df = df.rename(columns=col_mapper)

    return (df
            [relevant_cols]
            .assign(**{col: tweak_categorical_column(df[col]) for col in categorical_cols})
            .astype({'age': 'uint8', 'hours_per_week': 'uint8'})
            )

In [105]:
def split_income_dataset(df: pd.DataFrame):
    labels = (df
              .income_class
              .pipe(pd.get_dummies, drop_first=True)
             )
    
    attributes = (df
                  .drop(columns="income_class")
                  .pipe(pd.get_dummies, drop_first=True)
                  .pipe(MinMaxScaler().fit_transform)
                  )

    return train_test_split(attributes, labels, train_size=0.8, stratify=df.income_class)

In [106]:
(attributes_train, attributes_test, labels_train, labels_test) = (income
                                                                  .pipe(clean_income_dataset)
                                                                  .pipe(split_income_dataset))

for d in [attributes_train, attributes_test, labels_train, labels_test]:
    print(d.shape)

(26048, 99)
(6513, 99)
(26048, 1)
(6513, 1)


## Explore to develope the functions

In [107]:
col_mapper = {
    0: 'age',
    1: 'workclass',
    2: 'final_weight',
    3: 'education',
    4: 'education_num',
    5: 'marital_status',
    6: 'occupation',
    7: 'relationship',
    8: 'race',
    9: 'sex',
    10: 'capital_gain',
    11: 'capital_loss',
    12: 'hours_per_week',
    13: 'native_country',
    14: 'income_class'
}

In [83]:
df.dtypes

age                int64
workclass         object
final_weight       int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income_class      object
dtype: object

### Object

In [84]:
df.select_dtypes('object').describe()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native_country,income_class
count,32561,32561,32561,32561,32561,32561,32561,32561,32561
unique,9,16,7,15,6,5,2,42,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K
freq,22696,10501,14976,4140,13193,27816,21790,29170,24720


In [153]:
df.select_dtypes('object').isna().sum()

workclass         0
education         0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
native_country    0
income_class      0
dtype: int64

#### workclass

In [96]:
pd.unique(df.workclass)

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [136]:
(df
 .workclass
 .str.strip()
 .replace({'?': 'unknown'})
 .astype('category')
 .unique()
 )

['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov', 'Local-gov', 'unknown', 'Self-emp-inc', 'Without-pay', 'Never-worked']
Categories (9, object): ['Federal-gov', 'Local-gov', 'Never-worked', 'Private', ..., 'Self-emp-not-inc', 'State-gov', 'Without-pay', 'unknown']

#### Education

In [113]:
(df
 .education
 .str.strip()
 .pipe(pd.Series.unique)
 )

array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
       '5th-6th', '10th', '1st-4th', 'Preschool', '12th'], dtype=object)

#### Education

In [158]:
(df
 .education
 .unique()
 )

array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
       ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
       ' Preschool', ' 12th'], dtype=object)

#### Sex, income_class

In [58]:
(df
 .assign(income_class=df.income_class.astype('category'),
         sex=df.sex.astype('category'))
 )

Unnamed: 0,age,workclass,final_weight,education,education-num,marital-status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [3]:
income_target = income_data.income_class
income_target = income_target.str.strip()

income_attributes = income_data.drop(columns="income_class")
income_attributes = pd.get_dummies(income_attributes, drop_first=True)
scaler = MinMaxScaler()
income_attributes = scaler.fit_transform(income_attributes)

(
    income_attributes_train,
    income_attributes_test,
    income_target_train,
    income_target_test,
) = train_test_split(income_attributes, income_target, train_size=0.8)

for x in [
    income_attributes_train,
    income_attributes_test,
    income_target_train,
    income_target_test,
]:
    print(x.shape)

(26048, 100)
(6513, 100)
(26048,)
(6513,)


### Integers

In [51]:
(income
 .rename(columns=col_mapper)
 .select_dtypes('integer')
 .describe()
 )

Unnamed: 0,age,final_weight,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


## Old versions

In [102]:
# V1
# def clean_income_dataset(df: pd.DataFrame):
#     def tweak_categorical_column(df_, col):
#         return (df_
#                 [col]
#                 .str.strip()
#                 .str.lower()
#                 .astype('category')
#                 )

#     col_mapper = {
#         0: 'age',
#         1: 'workclass',
#         2: 'final_weight',
#         3: 'education',
#         4: 'education_num',
#         5: 'marital_status',
#         6: 'occupation',
#         7: 'relationship',
#         8: 'race',
#         9: 'sex',
#         10: 'capital_gain',
#         11: 'capital_loss',
#         12: 'hours_per_week',
#         13: 'native_country',
#         14: 'income_class'
#     }
#     df = df.rename(columns=col_mapper)

#     cols = ['age', 'workclass', 'final_weight', 'education',
#             'marital_status', 'occupation', 'relationship', 'race',
#             'sex', 'capital_gain', 'capital_loss', 'hours_per_week',
#             'native_country', 'income_class']
#     return (df
#             [cols]
#             .assign(workclass=tweak_categorical_column(df.workclass),
#                     education=tweak_categorical_column(df.education),
#                     marital_status=tweak_categorical_column(df.marital_status),
#                     occupation=tweak_categorical_column(df.occupation),
#                     relationship=tweak_categorical_column(df.relationship),
#                     race=tweak_categorical_column(df.race),
#                     sex=tweak_categorical_column(df.sex),
#                     native_country=tweak_categorical_column(df.native_country),
#                     income_class=tweak_categorical_column(df.income_class),
#                     )
#             .astype({'age': 'uint8', 'hours_per_week': 'uint8'})
#             )