In [1]:
import my_module

# Read the adult-census CSV and separate the predictors from the 'class' column.
features, target = my_module.get_features_and_target(
    csv_file='../data/adult-census.csv', target_col='class'
)

# 'education-num' is considered redundant (see the earlier discussion), so remove it.
features = features.drop(columns='education-num')

# Build the column preprocessor from the remaining feature columns.
preprocessor = my_module.make_preprocessor(features)

In [2]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
# RandomForestRegressor stays imported in case other cells rely on the name.
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Chain the column preprocessor with a logistic-regression classifier.
model = make_pipeline(preprocessor, LogisticRegression())
# Alternative: a random forest. This is a classification task, so the classifier
# variant is the right one (with RandomForestRegressor, .score() would report R^2
# instead of accuracy), and the preprocessor has to stay in the pipeline so the
# forest receives the same transformed columns.
# model = make_pipeline(preprocessor, RandomForestClassifier())

In [3]:
# Display the assembled pipeline so its steps and column assignments can be inspected.
# NOTE(review): in the recorded output the 'one-hot-encoder' transformer is bound to an
# empty column Index, i.e. no columns are routed to it — confirm that
# my_module.make_preprocessor detects the categorical columns as intended.
model

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('one-hot-encoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  Index([], dtype='object')),
                                                 ('standard_scaler',
                                                  StandardScaler(),
                                                  Index(['age', 'capital-gain', 'capital-loss', 'hours-per-week'], dtype='object'))])),
                ('logisticregression', LogisticRegression())])

In [4]:
from sklearn.model_selection import train_test_split

# The labels arrive as strings: mark rows containing '>50K' as 1 and all others as 0.
target = (
    target.str.contains('>50K')
    .astype(int)
)

# Hold out a test split; random_state is pinned so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=123,
)

# Train the pipeline on the training portion (the returned estimator is discarded).
_ = model.fit(X_train, y_train)

# Evaluate on the held-out portion.
model.score(X_test, y_test)

0.7988698714274015

In [5]:
import pandas as pd
# Try make_preprocessor on a different dataset (planes.csv).
fake_features = pd.read_csv('../data/planes.csv')
# Use a dedicated name instead of rebinding `preprocessor`: that global still refers
# to the census preprocessor, and overwriting it here would make a re-run of the
# model-building cell silently pick up a transformer built for the wrong columns.
fake_preprocessor = my_module.make_preprocessor(fake_features)
fake_preprocessor

ColumnTransformer(transformers=[('one-hot-encoder',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 Index(['tailnum', 'type', 'manufacturer', 'model', 'engine'], dtype='object')),
                                ('standard_scaler', StandardScaler(),
                                 Index(['year', 'engines', 'seats', 'speed'], dtype='object'))])

In [6]:
import my_module
from sklearn.preprocessing import Normalizer, OrdinalEncoder

# Start again from the raw CSV so this cell does not depend on earlier transformations.
features, target = my_module.get_features_and_target(
    csv_file='../data/adult-census.csv', target_col='class'
)
features = features.drop(columns='education-num')
# Encode the string labels as 1 for '>50K' and 0 otherwise.
target = (
    target.str.contains('>50K')
    .astype(int)
)

# Same pipeline as before, but the numeric columns go through Normalizer
# (which rescales each sample to unit norm) instead of the default numeric step.
preprocessor = my_module.make_preprocessor(
    features,
    numeric_preprocessor=Normalizer(),
)
model = make_pipeline(preprocessor, LogisticRegression())

# Same seed as the earlier split so the two scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=123,
)

# Fit on the training data, then report the score on the held-out data.
_ = model.fit(X_train, y_train)
model.score(X_test, y_test)

0.7806076488412087