# copyright: 
@dataschool

https://www.youtube.com/@dataschool

https://www.youtube.com/watch?v=sCt4LVD5hPc&list=PL5-da3qGB5ID7YYAqireYEew2mWVvgmj6&index=2

This notebook is for personal study purposes only

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Toy Titanic-style sample used by the first tips: two numeric columns
# (Fare, Age) and two categorical columns (Embarked, Sex).
# The last Age is NaN so that imputation has something to fill in.
X = pd.DataFrame(
    data={
        'Fare': [7.25, 71.82, 7.92, 53.1, 8.05, 8.46],
        'Embarked': ['S', 'C', 'S', 'S', 'S', 'Q'],
        'Sex': ['male', 'female', 'female', 'female', 'male', 'male'],
        'Age': [22, 38, 26, 35, 35, np.nan],
    },
)
X

Unnamed: 0,Fare,Embarked,Sex,Age
0,7.25,S,male,22.0
1,71.82,C,female,38.0
2,7.92,S,female,26.0
3,53.1,S,female,35.0
4,8.05,S,male,35.0
5,8.46,Q,male,


In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer

## Tip 1: use ColumnTransformer to apply different preprocessing to different columns
https://www.youtube.com/watch?v=sCt4LVD5hPc&list=PL5-da3qGB5ID7YYAqireYEew2mWVvgmj6&index=2

In [7]:
# Apply a different preprocessing step to each group of columns:
#   - one-hot encode the categorical columns 'Embarked' and 'Sex'
#   - impute missing 'Age' values (SimpleImputer defaults to the mean)
#   - remainder='passthrough' keeps every other column (here 'Fare')
#     unchanged, appended after the transformed columns
ct = make_column_transformer(
    (OneHotEncoder(), ['Embarked', 'Sex']),
    (SimpleImputer(), ['Age']),
    remainder='passthrough',
)

In [8]:
ct.fit_transform(X)

array([[ 0.  ,  0.  ,  1.  ,  0.  ,  1.  , 22.  ,  7.25],
       [ 1.  ,  0.  ,  0.  ,  1.  ,  0.  , 38.  , 71.82],
       [ 0.  ,  0.  ,  1.  ,  1.  ,  0.  , 26.  ,  7.92],
       [ 0.  ,  0.  ,  1.  ,  1.  ,  0.  , 35.  , 53.1 ],
       [ 0.  ,  0.  ,  1.  ,  0.  ,  1.  , 35.  ,  8.05],
       [ 0.  ,  1.  ,  0.  ,  0.  ,  1.  , 31.2 ,  8.46]])

## Tip 2: ways to select columns using Column Transformer

In [9]:
from sklearn.compose import make_column_selector

In [10]:
# Seven ways to select the same two columns ('Embarked' and 'Sex') of X.
# Each assignment simply overwrites `ct`; only the selector differs.

# 1. list of column names
ct = make_column_transformer(
    (OneHotEncoder(), ['Embarked', 'Sex'])
)

# 2. list of integer positions (X's columns are Fare, Embarked, Sex, Age)
ct = make_column_transformer(
    (OneHotEncoder(), [1, 2])
)

# 3. slice of positions (stop index 3 is exclusive)
ct = make_column_transformer(
    (OneHotEncoder(), slice(1, 3))
)

# 4. boolean mask with one entry per column
ct = make_column_transformer(
    (OneHotEncoder(), [False, True, True, False])
)

# 5. regex on the column names: any name containing 'E' or 'S'
ct = make_column_transformer(
    (OneHotEncoder(), make_column_selector(pattern='E|S'))
)

# 6. by dtype: include only object (string) columns
ct = make_column_transformer(
    (OneHotEncoder(), make_column_selector(dtype_include=object))
)

# 7. by dtype: exclude all numeric columns
ct = make_column_transformer(
    (OneHotEncoder(), make_column_selector(dtype_exclude='number'))
)

## Tip 3: fit vs transform

## Tip 4: use fit_transform on training data, but transform (ONLY) on testing/new data

## Tip 5: reasons to use scikit-learn, NOT pandas, for ML preprocessing
1. can cross-validate entire workflow (pipeline)
2. can grid search all pipeline hyperparameters (including model and preprocessing)
3. avoid adding new columns to the source dataframe

    for example, with pandas get_dummies, a category with 20 levels adds 20 new columns to the dataframe

    a ColumnTransformer does not modify the source df
4. pandas lacks separate fit/transform steps to prevent data leakage

## Tip 6: OneHotEncoder (unordered data) vs OrdinalEncoder (ordered data)
Unordered: male/female

Ordered: S, M, L, XL, XXL (size); 1st class, 2nd class, 3rd class

## Tip 7: for one-hot encoded feature, what if a new level shows up?
Start with handle_unknown='ignore', so that transform does not raise an error on an unseen level (it is encoded as all zeros)

Later, when you re-train the model, include the new levels in the training data

In [11]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [12]:
XX = pd.DataFrame.from_dict({'col': ['A', 'B', 'C', 'B']})
XX

Unnamed: 0,col
0,A
1,B
2,C
3,B


In [13]:
ohe.fit_transform(XX)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [14]:
XX_new = pd.DataFrame.from_dict({'col': ['A', 'C', 'D']})
XX_new

Unnamed: 0,col
0,A
1,C
2,D


In [15]:
ohe.transform(XX_new)

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 0.]])

## Tip 8: how to use pipeline to chain multiple steps

In [16]:
train = pd.DataFrame.from_dict(
    {
        'feat1': [10, 20, np.nan, 2],
        'feat2': [25, 20, 5, 3],
        'label': ['A', 'A', 'B', 'B'],
    }
)

test = pd.DataFrame.from_dict(
    {
        'feat1': [30, 5, 15],
        'feat2': [12, 10, np.nan],
    }
)

In [17]:
features = ['feat1', 'feat2']
X, y = train[features], train['label']
X_new = test[features]

In [18]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Chain imputation and the classifier into a single estimator:
# missing values are filled first, then the model is trained on the result.
pipe = make_pipeline(
    SimpleImputer(),
    LogisticRegression(),
)

# X contains a NaN in feat1; it is imputed inside the pipeline during fit
pipe.fit(X, y)

In [19]:
pipe.predict(X_new)

array(['A', 'B', 'A'], dtype=object)

## Tip 9: when imputing missing values, preserve info about which values were missing and use THAT as a feature

In [20]:
X = pd.DataFrame(columns=['Age'], data=[20, 30, 10, np.nan, 10])
X

Unnamed: 0,Age
0,20.0
1,30.0
2,10.0
3,
4,10.0


In [21]:
from sklearn.impute import SimpleImputer

# add_indicator=True appends a binary "was missing" column next to the imputed
# values, so the fact that a value was absent can itself be used as a feature
imputer = SimpleImputer(add_indicator=True)
imputer.fit_transform(X)


array([[20. ,  0. ],
       [30. ,  0. ],
       [10. ,  0. ],
       [17.5,  1. ],
       [10. ,  0. ]])

## Tip 10: set a random_state to make code reproducible

## Tip 11: KNN Imputer and Iterative Imputer

Simple Imputer: take mean, median, mode, ...

Multivariate Approaches

KNN Imputer: 3 features a, b, c; c has missing values
* for a row where c is missing
* find the most similar rows (nearest neighbors) in terms of a and b
* use the c values of those rows (their average) to fill in

Iterative Imputer: 3 features abc, c has missing values
* Use ab as features, c as target, train a regression model
* Fill missing c with predictions

## Tip 12: Pipeline vs make_pipeline

Pipeline requires you to name each step
make_pipeline names the steps automatically

In [22]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [23]:
ct = make_column_transformer(
    (OneHotEncoder(), ['Embarked', 'Sex']),
    (SimpleImputer(), ['Age']),
    remainder='passthrough',
)
make_pipeline(ct, LogisticRegression())

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

ct = ColumnTransformer(
    [
        ('encoder', OneHotEncoder(), ['Embarked', 'Sex']),
        ('imputer', SimpleImputer(), ['Age']),
    ],
    remainder='passthrough',
)
Pipeline(
    [
        ('preprocessor', ct), 
        ('classifier', LogisticRegression()),
    ]
)

## Tip 13: examine intermediate steps in pipeline

In [30]:
# Tiny Titanic-like sample: one list per column, stacked side by side.
# The NaN in `age` forces every column to float64.
df = pd.DataFrame(
    data=np.column_stack(
        (
            [22, 38, 26, 35, 35, np.nan],  # age
            [3, 1, 3, 1, 3, 3],            # class
            [0, 1, 1, 1, 0, 0],            # survived
        )
    ),
    columns=['age', 'class', 'survived'],
)
df

Unnamed: 0,age,class,survived
0,22.0,3.0,0.0
1,38.0,1.0,1.0
2,26.0,3.0,1.0
3,35.0,1.0,1.0
4,35.0,3.0,0.0
5,,3.0,0.0


In [31]:
X = df[['age', 'class']]
y = df['survived']

In [32]:
pipe = make_pipeline(SimpleImputer(), LogisticRegression())
pipe.fit(X, y)

In [38]:
pipe.named_steps.logisticregression.coef_

array([[ 0.03232238, -0.83741131]])

In [39]:
pipe.named_steps.simpleimputer.statistics_

array([31.2       ,  2.33333333])

In [40]:
pipe.named_steps.simpleimputer.__dict__

{'missing_values': nan,
 'add_indicator': False,
 'keep_empty_features': False,
 'strategy': 'mean',
 'fill_value': None,
 'copy': True,
 'feature_names_in_': array(['age', 'class'], dtype=object),
 'n_features_in_': 2,
 '_fit_dtype': dtype('float64'),
 'indicator_': None,
 'statistics_': array([31.2       ,  2.33333333])}

## Tip 14: handle nan
1. drop rows that contain NaN
2. drop columns that contain NaN
3. fill NaN with imputed values
4. use a model that natively handles NaN

In [44]:
train = pd.concat([X, y], axis = 1)
train

Unnamed: 0,age,class,survived
0,22.0,3.0,0.0
1,38.0,1.0,1.0
2,26.0,3.0,1.0
3,35.0,1.0,1.0
4,35.0,3.0,0.0
5,,3.0,0.0


In [45]:
# HistGradientBoostingClassifier has been a stable estimator since
# scikit-learn 1.0, so the old opt-in import
# `from sklearn.experimental import enable_hist_gradient_boosting`
# is no longer needed (it only emits a warning now).
from sklearn.ensemble import HistGradientBoostingClassifier



In [50]:
clf = HistGradientBoostingClassifier()
clf.fit(X, y)

In [51]:
clf.predict(X)

array([0., 0., 0., 0., 0., 0.])

## Tip 15: don't use drop='first' with OneHotEncoder
1. multicollinearity is rarely an issue with scikit-learn models
2. drop='first' is incompatible with handle_unknown='ignore'
3. problematic if you standardize all features or use a regularized model

## Tip 16: use cross_val_score and GridSearchCV on a pipeline

In [160]:
from sklearn.feature_extraction.text import CountVectorizer

In [161]:
corpus = [
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document?',
]

In [167]:
cv = CountVectorizer()
cv.fit_transform(corpus).toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

In [168]:
cv.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [53]:
# Preprocess two columns differently, then classify:
#   - 'Sex' is selected with a list, so OneHotEncoder receives the 2D input
#     it expects
#   - 'Name' is selected with a plain string, so CountVectorizer receives the
#     1D column of documents it requires (selecting with ['Name'] would hand
#     it a 2D frame and it would not see one document per row)
pipe = make_pipeline(
    make_column_transformer(
        (OneHotEncoder(), ['Sex']),
        (CountVectorizer(), 'Name'),
    ),
    LogisticRegression(),
)

In [68]:
# Build a small synthetic dataset with a categorical column, a text column
# and a binary label.
# NOTE: np.random.choice is not seeded here, so 'Sex' and 'label' change on
# every run (see Tip 10 about reproducibility).
df = pd.DataFrame(columns = ['Sex', 'Name', 'label'], index=range(5))
df['Sex'] = np.random.choice(['M', 'F'], 5, replace=True)
df['Name'] = ['Sam', 'Bob', 'Nancy', 'Caty', 'Emily']
df['label'] = np.random.choice([0, 1], 5, replace = True)

# Stack the frame on itself: 10 rows, every sample appears twice
# (presumably so that the 5-fold cross-validation below has enough rows).
df = pd.concat([df, df], axis = 0)

df

Unnamed: 0,Sex,Name,label
0,M,Sam,1
1,F,Bob,0
2,M,Nancy,1
3,M,Caty,1
4,F,Emily,0
0,M,Sam,1
1,F,Bob,0
2,M,Nancy,1
3,M,Caty,1
4,F,Emily,0


In [69]:
X = df[['Sex', 'Name']]
y = df['label']

In [None]:
from sklearn.model_selection import cross_val_score

cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

In [None]:
# Hyperparameter grid for the pipeline. Keys follow the
# <step name>__<parameter name> convention; nested steps chain with '__'.
params = {
    'columntransformer__countvectorizer__min_df': [1, 2],
    'logisticregression__C': [0.1, 1, 10],
    'logisticregression__penalty': ['l1', 'l2'],
    # The default 'lbfgs' solver does not support penalty='l1';
    # 'liblinear' supports both 'l1' and 'l2'.
    'logisticregression__solver': ['liblinear'],
}

In [71]:
pipe

## Tip 17: if grid-search-cv takes too long, try Randomized-Search-CV with a small number of iterations. 
Make Sure to specify a distribution for continuous parameters. 

In [72]:
from sklearn.naive_bayes import MultinomialNB

In [73]:
pipe = make_pipeline(CountVectorizer(), MultinomialNB())

In [74]:
import scipy as sp
params = {
    'countvectorizer__min_df': [1, 2, 3, 4],
    'countvectorizer__lowercase': [True, False],
    'multinomialnb__alpha': sp.stats.uniform(scale=1),
}

In [75]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: sample n_iter parameter settings from `params`
# instead of exhaustively trying every combination.
rand = RandomizedSearchCV(
    pipe,
    params,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=1,  # makes the sampled settings reproducible
)

# The pipeline starts with CountVectorizer, which needs a 1D collection of
# documents. Passing the whole two-column DataFrame would make it iterate
# over the column labels, so fit on the text column only.
rand.fit(X['Name'], y)

{'countvectorizer__min_df': [1, 2, 3, 4],
 'countvectorizer__lowercase': [True, False],
 'multinomialnb__alpha': <scipy.stats._distn_infrastructure.rv_continuous_frozen at 0x7fa67e662e00>}

## Tip 18: hyperparameter search results from GridSearchCV and RandomizedSearchCV can be stored in a pandas DataFrame

## Tip 19: tune params for LogisticRegression

C: inverse of regularization strength

penalty

solver

## Tip 20: plot a confusion matrix in one line of code

In [None]:
## Tip 21: 

## Tip 22: pipeline fit(), predict()  vs fit_transform() and transform()

Pipeline has steps

Type 1: Pipeline that ends with a model (classifier or regressor)
* pipe.fit(): all steps before the last one run fit_transform(), the final step runs fit()
* pipe.predict(): all steps before the last run transform(), the final step runs predict()

Type 2: Pipeline that ends with a transformer
* pipe.fit_transform(): all steps run fit_transform()
* pipe.transform(): all steps run transform()

In [None]:
## Tip 23: 

## Tip 24: visualize a decision tree
from sklearn.tree import plot_tree, export_text

In [None]:
## Tip 25: 

## Tip 26: use train_test_split with stratify=y

## Tip 27: impute missing values for categorical feature
1. most frequent
2. replace with 'missing', treated as a separate category

## Tip 28: save a model 
```
joblib.dump(pipe, 'pipe.joblib')
```

## Tip 29: vectorize two text columns into a ColumnTransformer

In [78]:
make_column_transformer(
    (CountVectorizer(), 'Name'),
    (CountVectorizer(), 'Cabin'),
)

## Tip 30: display part of a pipeline

In [None]:
pipe.named_steps.logisticregression.coef_

In [None]:
pipe[1].coef_

## Tip 31: cross-validation, and samples are NOT in random order, then SHUFFLING may be needed to get meaningful result

In [80]:
from sklearn.model_selection import KFold, StratifiedKFold

In [None]:
from sklearn.linear_model import LinearRegression

# Regression problem: plain KFold, shuffled because the rows may be ordered.
# 'r2' is a regression metric, so it is paired with a regressor here;
# X and y are expected to hold a regression dataset in this example.
cross_val_score(
    LinearRegression(),
    X,
    y,
    cv=KFold(5, shuffle=True, random_state=1),
    scoring='r2',
)

cross_val_score(
    LogisticRegression(),
    X,
    y,
    cv=StratifiedKFold(5, shuffle=True, random_state=1),
    scoring='accuracy',
)

## Tip 32: AUC can also be used for multiclass problems
1. one-vs-one
2. one-vs-rest

In [86]:
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)

In [88]:
X.shape, y.shape

((150, 4), (150,))

In [89]:
from sklearn.metrics import roc_auc_score

In [91]:
lr = LogisticRegression()
lr.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [92]:
y_score = lr.predict_proba(X)

In [96]:
np.unique(y, return_counts=True)

(array([0, 1, 2]), array([50, 50, 50]))

In [93]:
roc_auc_score(y, y_score, multi_class='ovo')

0.9984000000000001

In [94]:
roc_auc_score(y, y_score, multi_class='ovr')

0.9984000000000001

## Tip 33: feature engineering with ColumnTransformer or Pipeline
1. select an existing function (or write your own)
2. convert it into a transformer using FunctionTransformer

In [98]:
from sklearn.preprocessing import FunctionTransformer

In [101]:
clip_values_transformer = FunctionTransformer(
    np.clip, kw_args={'a_min': 100, 'a_max': 600}
)


def first_letter(df):
    """Return a frame in which every string value is cut to its first character.

    Each column is processed independently through its ``.str`` accessor,
    so missing values stay missing.
    """
    def _leading_char(column):
        # Characters in [0, 1) of each string, i.e. only the first one.
        return column.str.slice(0, 1)

    return df.apply(_leading_char)
get_first_letter_transformer = FunctionTransformer(first_letter)




ct = make_column_transformer(
    (clip_values_transformer, ['Fare']),
    (get_first_letter_transformer, ['Code', 'Deck']),
)

In [103]:
X = pd.DataFrame.from_dict(
    {
        'Fare': [200, 300, 50, 900],
        'Code': ['X12', 'Y20', 'Z7', np.nan],
        'Deck': ['A101', 'C102', 'A200', 'C300'],
    }
)
X

Unnamed: 0,Fare,Code,Deck
0,200,X12,A101
1,300,Y20,C102
2,50,Z7,A200
3,900,,C300


In [104]:
ct.fit_transform(X)

array([[200, 'X', 'A'],
       [300, 'Y', 'C'],
       [100, 'Z', 'A'],
       [600, nan, 'C']], dtype=object)

In [109]:
clip_values_transformer.fit_transform(X[['Fare']])

Unnamed: 0,Fare
0,200
1,300
2,100
3,600


In [111]:
get_first_letter_transformer.fit_transform(X[['Code']])

Unnamed: 0,Code
0,X
1,Y
2,Z
3,


In [112]:
get_first_letter_transformer.fit_transform(X[['Deck']])

Unnamed: 0,Deck
0,A
1,C
2,A
3,C


## Tip 34: add feature selection to Pipeline
1. Use SelectPercentile to keep highest score features
2. Add feature selection after preprocessing but before model building

In [113]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [114]:
pipe = make_pipeline(
    CountVectorizer(),
    LogisticRegression(),
)

In [116]:
from sklearn.feature_selection import SelectPercentile, chi2

pipe = make_pipeline(
    CountVectorizer(),
    SelectPercentile(chi2, percentile=50),
    LogisticRegression(),
)

In [118]:
X, y = load_iris(return_X_y=True)
fs = SelectPercentile(chi2, percentile=10).fit(X, y)

In [120]:
fs.__dict__

{'score_func': <function sklearn.feature_selection._univariate_selection.chi2(X, y)>,
 'percentile': 10,
 'n_features_in_': 4,
 'scores_': array([ 10.81782088,   3.7107283 , 116.31261309,  67.0483602 ]),
 'pvalues_': array([4.47651499e-03, 1.56395980e-01, 5.53397228e-26, 2.75824965e-15])}

In [124]:
SelectPercentile(chi2, percentile=50).fit_transform(X, y).shape

(150, 2)

In [None]:
## Tip 35: no need to use .values when passing df to scikit-learn

In [None]:
## Tip 36: most params should be passed as keyword arguments

## Tip 37: interactive diagrams of Pipelines in Jupyter

In [125]:
pipe

## Tip 38: get_feature_names for ColumnTransformer

In [133]:
X, y = load_iris(return_X_y=True)
X = pd.DataFrame(X, columns = ['Embarked', 'Sex', 'Parch', 'Fare']).head(10)
X

Unnamed: 0,Embarked,Sex,Parch,Fare
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


In [134]:
ct = make_column_transformer(
    (OneHotEncoder(), ['Embarked', 'Sex']),
    remainder='passthrough',
)

ct.fit_transform(X).shape

(10, 17)

In [136]:
ct.get_feature_names_out()

array(['onehotencoder__Embarked_4.4', 'onehotencoder__Embarked_4.6',
       'onehotencoder__Embarked_4.7', 'onehotencoder__Embarked_4.9',
       'onehotencoder__Embarked_5.0', 'onehotencoder__Embarked_5.1',
       'onehotencoder__Embarked_5.4', 'onehotencoder__Sex_2.9',
       'onehotencoder__Sex_3.0', 'onehotencoder__Sex_3.1',
       'onehotencoder__Sex_3.2', 'onehotencoder__Sex_3.4',
       'onehotencoder__Sex_3.5', 'onehotencoder__Sex_3.6',
       'onehotencoder__Sex_3.9', 'remainder__Parch', 'remainder__Fare'],
      dtype=object)

## Tip 39: 
as_frame=True

return_X_y=True

In [138]:
X, y = load_iris(as_frame=True, return_X_y=True)

In [139]:
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


## Tip 40: estimators only print params NOT set to default values

In [142]:
lr = LogisticRegression(C=0.1, solver='liblinear')
lr

In [144]:
lr.get_params()

{'C': 0.1,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

## Tip 41: 

drop='if_binary' with OneHotEncoder 

## Tip 42: 

ColumnTransformer

passthrough some, drop others


In [None]:
# Option 1: list the columns to keep, drop everything else
make_column_transformer(
    (SimpleImputer(), ['A']),
    ('passthrough', ['B', 'C']),  # keep B and C unchanged
    remainder='drop',  # throw away all remaining columns
)

# Option 2: list the columns to drop, keep everything else
make_column_transformer(
    (SimpleImputer(), ['A']),
    ('drop', ['B', 'C']),  # discard B and C
    remainder='passthrough',  # keep all remaining columns unchanged
)

## Tip 43: 

tree-based model

try OrdinalEncoder instead of OneHotEncoder, even for nominal (unordered) features

accuracy similar, speed faster


OrdinalEncoder:
* encode different levels (size = S, M, L, XL, XXL) into single column
* even for variable like occupation, or unordered features, can use OrdinalEncoder
* way less time (because single column is easier to train than many columns of dummies)
* tree-based model: can learn the same information from OrdinalEncoder as from OneHotEncoder, even if the feature is unordered, because it learns through recursive splits
* linear models: OneHotEncoder works far better than OrdinalEncoder for unordered features



## Tip 44: speed up GridSearchCV using Parallel Processing

set n_jobs=-1: using all CPUs

## Tip 45: 

PolynomialFeatures: create feature interactions

not necessary if using tree-based model

tree-based: can learn interaction on its own through recursive splitting

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# original: A, B, C
# new: A, B, C, A*B, A*C, B*C
PolynomialFeatures(include_bias=False, interaction_only=True)

## Tip 46: 

Want to increase accuracy? 

create multiple models, ensemble them using VotingClassifier 

In [None]:
# These two estimators are not imported anywhere earlier in the notebook
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

lr = LogisticRegression()
rf = RandomForestClassifier()


# ensemble
#   voting='hard': majority vote over the predicted class labels
#   voting='soft': argmax of the summed predicted probabilities;
#                  requires every model to implement predict_proba
vc = VotingClassifier(
    [
        ('lr', lr),
        ('rf', rf),
    ],
    voting='soft',
)
# ensemble itself can be improved through GridSearch


# VotingRegressor: average of all regressors


## Tip 47: VotingClassifier

Tune parameters:
1. voting
2. weights

In [None]:
# These two estimators are not imported anywhere earlier in the notebook
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

lr = LogisticRegression()
rf = RandomForestClassifier()
nb = MultinomialNB()

# Three-model ensemble; `voting` and `weights` are left at their defaults
# so that they can be tuned by a grid search.
vc = VotingClassifier(
    [
        ('lr', lr),
        ('rf', rf),
        ('nb', nb),
    ]
)


In [None]:
# GridSearchCV is not imported anywhere earlier in the notebook
from sklearn.model_selection import GridSearchCV

# Tune the ensemble itself: the voting strategy and the per-model weights
# (one weight per estimator, in the order lr, rf, nb).
params = {
    'voting': ['hard', 'soft'],
    'weights': [(1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2)],
}

grid = GridSearchCV(vc, params)
grid.fit(X, y)

# A notebook cell only displays its last expression, so return both
# results together instead of two separate bare expressions.
grid.best_params_, grid.best_score_

## Tip 48: slice Pipeline

In [None]:
pipe = Pipeline(
    [
        ('preprocessing', ct),
        ('feature selector', fs),
        ('classifier', lr),
    ]
)
pipe

In [None]:
pipe[0].fit_transform(X)

In [None]:
# access step 0 and step 1: how data look like after preprocessing and feature selection
pipe[0:2].fit_transform(X, y)

In [None]:
# what features are selected during step 1
pipe[1].get_support()

In [None]:
## Tip 49: 
Can tune 2 models using the same Grid Search

## Tip 50:

Simple Pattern for ML problems (below)

1. assume all columns have proper data types
2. may include irrelevant or improper features (this pattern use all features)
3. does not handle text or date columns well
    * text: CountVectorizer
    * date: extract relevant features from date, instead of OneHotEncoder
4. no feature engineering
5. ordinal encoding may be better
6. numeric features may not need to be scaled
7. a different model may be better
8. ...

In [151]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [154]:
# set up preprocessing of numeric columns
imp_median = SimpleImputer(strategy='median', add_indicator=True)
scaler = StandardScaler()

In [156]:
# set up preprocessing of categorical columns
imp_constant = SimpleImputer(strategy='constant')
ohe = OneHotEncoder(handle_unknown='ignore')

In [157]:
# select column by data types
num_cols = make_column_selector(dtype_include='number')
cat_cols = make_column_selector(dtype_exclude='number')

In [158]:
# do all preprocessing:
#   numeric columns     -> median imputation (+ missing indicator) -> standard scaling
#   non-numeric columns -> constant-value imputation -> one-hot encoding
preprocessor = make_column_transformer(
    (make_pipeline(imp_median, scaler), num_cols),
    (make_pipeline(imp_constant, ohe), cat_cols),
)

In [159]:
# create a pipeline
pipe = make_pipeline(
    preprocessor, LogisticRegression()
)

In [None]:
# cross-validate the pipeline
cross_val_score(pipe, X, y).mean()

In [None]:
pipe.fit(X, y)