In [1]:
import pandas as pd

# to display nice model diagram
from sklearn import set_config
set_config(display='diagram')

# import data
adult_census = pd.read_csv('../data/adult-census.csv')

# separate feature & target data
target = adult_census['class']
features = adult_census.drop(columns='class')

In [2]:
features.dtypes

age                int64
workclass         object
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
dtype: object

In [3]:
from sklearn.compose import make_column_selector as selector

# create selector object based on data type
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

# get columns of interest
numerical_columns = numerical_columns_selector(features)
categorical_columns = categorical_columns_selector(features)

# results in a list containing relevant column names
numerical_columns

['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

In [4]:
numerical_features = features[numerical_columns]
numerical_features.describe()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,10.078089,1079.067626,87.502314,40.422382
std,13.71051,2.570973,7452.019058,403.004552,12.391444
min,17.0,1.0,0.0,0.0,1.0
25%,28.0,9.0,0.0,0.0,40.0
50%,37.0,10.0,0.0,0.0,40.0
75%,48.0,12.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,4356.0,99.0


In [5]:
#Standardization

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(numerical_features)

In [6]:
scaler.mean_

array([  38.64358544,   10.07808853, 1079.06762622,   87.50231358,
         40.42238238])

In [7]:
scaler.scale_

array([1.37103696e+01, 2.57094644e+00, 7.45194277e+03, 4.03000427e+02,
       1.23913172e+01])

In [8]:
numerical_features_scaled = scaler.transform(numerical_features)
numerical_features_scaled

array([[-0.99512893, -1.19725891, -0.14480353, -0.2171271 , -0.03408696],
       [-0.04694151, -0.41933527, -0.14480353, -0.2171271 ,  0.77292975],
       [-0.77631645,  0.74755018, -0.14480353, -0.2171271 , -0.03408696],
       ...,
       [ 1.41180837, -0.41933527, -0.14480353, -0.2171271 , -0.03408696],
       [-1.21394141, -0.41933527, -0.14480353, -0.2171271 , -1.64812038],
       [ 0.97418341, -0.41933527,  1.87131501, -0.2171271 , -0.03408696]])

In [9]:
# fitting and transforming in one step
scaler.fit_transform(numerical_features)

array([[-0.99512893, -1.19725891, -0.14480353, -0.2171271 , -0.03408696],
       [-0.04694151, -0.41933527, -0.14480353, -0.2171271 ,  0.77292975],
       [-0.77631645,  0.74755018, -0.14480353, -0.2171271 , -0.03408696],
       ...,
       [ 1.41180837, -0.41933527, -0.14480353, -0.2171271 , -0.03408696],
       [-1.21394141, -0.41933527, -0.14480353, -0.2171271 , -1.64812038],
       [ 0.97418341, -0.41933527,  1.87131501, -0.2171271 , -0.03408696]])

In [10]:
numerical_features = pd.DataFrame(
    numerical_features_scaled,
    columns=numerical_columns
)

numerical_features.describe()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0
mean,1.584958e-16,1.594573e-17,2.294458e-16,7.617582e-17,9.071110000000001e-17
std,1.00001,1.00001,1.00001,1.00001,1.00001
min,-1.578629,-3.53103,-0.1448035,-0.2171271,-3.181452
25%,-0.7763164,-0.4193353,-0.1448035,-0.2171271,-0.03408696
50%,-0.119879,-0.03037346,-0.1448035,-0.2171271,-0.03408696
75%,0.6824334,0.7475502,-0.1448035,-0.2171271,0.3694214
max,3.745808,2.303397,13.27438,10.59179,4.727312


In [11]:
from sklearn.preprocessing import MinMaxScaler

In [12]:
min_max_features = MinMaxScaler()
min_max_features.fit(numerical_features)

In [13]:
# fitting and transforming in one step
min_max_features.fit_transform(numerical_features)

array([[0.10958904, 0.4       , 0.        , 0.        , 0.39795918],
       [0.28767123, 0.53333333, 0.        , 0.        , 0.5       ],
       [0.15068493, 0.73333333, 0.        , 0.        , 0.39795918],
       ...,
       [0.56164384, 0.53333333, 0.        , 0.        , 0.39795918],
       [0.06849315, 0.53333333, 0.        , 0.        , 0.19387755],
       [0.47945205, 0.53333333, 0.1502415 , 0.        , 0.39795918]])

In [14]:
numerical_features = pd.DataFrame(
    min_max_features.fit_transform(numerical_features),
    columns=numerical_columns
)

numerical_features.describe()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0
mean,0.296487,0.605206,0.010791,0.020088,0.402269
std,0.187815,0.171398,0.074521,0.092517,0.126443
min,0.0,0.0,0.0,0.0,0.0
25%,0.150685,0.533333,0.0,0.0,0.397959
50%,0.273973,0.6,0.0,0.0,0.397959
75%,0.424658,0.733333,0.0,0.0,0.44898
max,1.0,1.0,1.0,1.0,1.0


In [15]:
#Changing the range of the scaler
min_max_features = MinMaxScaler(feature_range=(-1,1))
min_max_features.fit(numerical_features)
# fitting and transforming in one step
min_max_features.fit_transform(numerical_features)

array([[-0.78082192, -0.2       , -1.        , -1.        , -0.20408163],
       [-0.42465753,  0.06666667, -1.        , -1.        ,  0.        ],
       [-0.69863014,  0.46666667, -1.        , -1.        , -0.20408163],
       ...,
       [ 0.12328767,  0.06666667, -1.        , -1.        , -0.20408163],
       [-0.8630137 ,  0.06666667, -1.        , -1.        , -0.6122449 ],
       [-0.04109589,  0.06666667, -0.699517  , -1.        , -0.20408163]])

In [16]:
#Modeling Piplines
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(), LogisticRegression())
model

In [17]:
from sklearn.model_selection import train_test_split

# split our data into train & test
X_train, X_test, y_train, y_test = train_test_split(numerical_features, target, random_state=123)

# fit our pipeline model
model.fit(X_train, y_train)

# score our model on the test data
model.score(X_test, y_test)

0.8135287855212513

In [18]:
features[["education"]]

Unnamed: 0,education
0,11th
1,HS-grad
2,Assoc-acdm
3,Some-college
4,Some-college
...,...
48837,Assoc-acdm
48838,HS-grad
48839,HS-grad
48840,HS-grad


In [19]:
#Preprocessing categorical data

from sklearn.preprocessing import OrdinalEncoder

# let's illustrate with the 'education' feature
education_column = features[["education"]]

encoder = OrdinalEncoder()
education_encoded = encoder.fit_transform(education_column)
education_encoded

array([[ 1.],
       [11.],
       [ 7.],
       ...,
       [11.],
       [11.],
       [11.]])

In [20]:
encoder.categories_

[array([' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th',
        ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate',
        ' HS-grad', ' Masters', ' Preschool', ' Prof-school',
        ' Some-college'], dtype=object)]

In [21]:
ed_levels = [' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th', ' 11th', 
             ' 12th', ' HS-grad', ' Prof-school', ' Some-college', ' Assoc-acdm', 
             ' Assoc-voc', ' Bachelors', ' Masters', ' Doctorate']

encoder = OrdinalEncoder(categories=[ed_levels])
education_encoded = encoder.fit_transform(education_column)
education_encoded

array([[ 6.],
       [ 8.],
       [11.],
       ...,
       [ 8.],
       [ 8.],
       [ 8.]])

In [22]:
encoder.categories_

[array([' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th',
        ' 11th', ' 12th', ' HS-grad', ' Prof-school', ' Some-college',
        ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Masters',
        ' Doctorate'], dtype=object)]

In [23]:
#Encoding Nominal Categories

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)
education_encoded = encoder.fit_transform(education_column)
education_encoded

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [24]:
feature_names = encoder.get_feature_names(input_features=["education"])
pd.DataFrame(education_encoded, columns=feature_names)

Unnamed: 0,education_ 10th,education_ 11th,education_ 12th,education_ 1st-4th,education_ 5th-6th,education_ 7th-8th,education_ 9th,education_ Assoc-acdm,education_ Assoc-voc,education_ Bachelors,education_ Doctorate,education_ HS-grad,education_ Masters,education_ Preschool,education_ Prof-school,education_ Some-college
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
48839,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
48840,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [25]:
# get all categorical features
categorical_features = features[categorical_columns]

# one-hot encode all features
categorical_features_encoded = encoder.fit_transform(categorical_features)

# view as a data frame
columns_encoded = encoder.get_feature_names(categorical_features.columns)
pd.DataFrame(categorical_features_encoded, columns=columns_encoded).head()

Unnamed: 0,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 10th,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [26]:
#My turn

categorical_features[["occupation"]]

Unnamed: 0,occupation
0,Machine-op-inspct
1,Farming-fishing
2,Protective-serv
3,Machine-op-inspct
4,?
...,...
48837,Tech-support
48838,Machine-op-inspct
48839,Adm-clerical
48840,Adm-clerical


In [27]:
# let's illustrate with the 'education' feature
occupation_column = categorical_features[["occupation"]]

encoder = OrdinalEncoder()
occupation_encoded = encoder.fit_transform(occupation_column)
occupation_encoded

array([[ 7.],
       [ 5.],
       [11.],
       ...,
       [ 1.],
       [ 1.],
       [ 4.]])

In [28]:
encoder.categories_

[array([' ?', ' Adm-clerical', ' Armed-Forces', ' Craft-repair',
        ' Exec-managerial', ' Farming-fishing', ' Handlers-cleaners',
        ' Machine-op-inspct', ' Other-service', ' Priv-house-serv',
        ' Prof-specialty', ' Protective-serv', ' Sales', ' Tech-support',
        ' Transport-moving'], dtype=object)]

In [29]:
encoder = OneHotEncoder(sparse=False)
occupation_encoded = encoder.fit_transform(occupation_column)
occupation_encoded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [30]:
feature_names = encoder.get_feature_names(input_features=["occupation"])
pd.DataFrame(occupation_encoded, columns=feature_names)

Unnamed: 0,occupation_ ?,occupation_ Adm-clerical,occupation_ Armed-Forces,occupation_ Craft-repair,occupation_ Exec-managerial,occupation_ Farming-fishing,occupation_ Handlers-cleaners,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
48838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48839,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48840,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
#Using numerical and categorical variables together

# drop the duplicated column `"education-num"` as stated in the data exploration notebook
features = features.drop(columns='education-num')

# create selector object based on data type
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

# get columns of interest
numerical_columns = numerical_columns_selector(features)
categorical_columns = categorical_columns_selector(features)

# split into train & test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=123)

In [32]:
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

In [33]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)
])

In [34]:
model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))
model

In [35]:
# fit our model
_ = model.fit(X_train, y_train)

# score on test set
model.score(X_test, y_test)

0.8503808041929408