In [1]:
import pandas as pd
from sklearn.experimental import (
    enable_iterative_imputer,
)
from sklearn import (
    ensemble,
    impute,
    model_selection,    
    preprocessing,
    tree,
)
from sklearn.base import (
    BaseEstimator,
    TransformerMixin,
)
from sklearn.ensemble import (
    RandomForestClassifier,
)
from sklearn.pipeline import Pipeline
def tweak_titanic(df):
    df = df.drop(
        columns=[
            "name",
            "ticket",
            "home.dest",
            "boat",
            "body",
            "cabin",
        ]
    ).pipe(pd.get_dummies, drop_first=True)
    return df

class TitanicTransformer(
    BaseEstimator, TransformerMixin
):
    def transform(self, X):
        # assumes X is output
        # from reading Excel file
        X = tweak_titanic(X)
        X = X.drop(columns="survived")
        return X
    def fit(self, X, y):
        return self
pipe = Pipeline(
    [
        ("titan", TitanicTransformer()),
        ("impute", impute.IterativeImputer()),
        (
            "std",
            preprocessing.StandardScaler(),
        ),
        ("rf", RandomForestClassifier()),
    ]
)

In [2]:
from sklearn.model_selection import (
    train_test_split,
)
url = (
    "http://biostat.mc.vanderbilt.edu/"
    "wiki/pub/Main/DataSets/titanic3.xls"
)
df = pd.read_excel(url)
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    df,
    df.survived,
    test_size=0.3,
    random_state=42,
)
pipe.fit(X_train2, y_train2)
pipe.score(X_test2, y_test2)



0.7837150127226463

In [3]:
params = {
    "rf__max_features": [0.4, "auto"],
    "rf__n_estimators": [15, 200],
}
grid = model_selection.GridSearchCV(
    pipe, cv=3, param_grid=params
)
grid.fit(df, df.survived)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('titan', TitanicTransformer()),
                                       ('impute',
                                        IterativeImputer(add_indicator=False,
                                                         estimator=None,
                                                         imputation_order='ascending',
                                                         initial_strategy='mean',
                                                         max_iter=10,
                                                         max_value=None,
                                                         min_value=None,
                                                         missing_values=nan,
                                                         n_nearest_features=None,
                                                         random_state=None,
          

In [4]:
grid.best_params_
pipe.set_params(**grid.best_params_)
pipe.fit(X_train2, y_train2)
pipe.score(X_test2, y_test2)

0.7786259541984732

In [5]:
from sklearn import metrics
metrics.roc_auc_score(
    y_test2, pipe.predict(X_test2)
)

0.7673024091293321

In [6]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn import (
    model_selection,
    preprocessing,
)
b = load_boston()
bos_X = pd.DataFrame(
    b.data, columns=b.feature_names
)
bos_y = b.target
bos_X_train, bos_X_test, bos_y_train, bos_y_test = model_selection.train_test_split(
    bos_X,
    bos_y,
    test_size=0.3,
    random_state=42,
)
bos_sX = preprocessing.StandardScaler().fit_transform(
    bos_X
)
bos_sX_train, bos_sX_test, bos_sy_train, bos_sy_test = model_selection.train_test_split(
    bos_sX,
    bos_y,
    test_size=0.3,
    random_state=42,
)


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
reg_pipe = Pipeline(
    [
        (
            "std",
            preprocessing.StandardScaler(),
        ),
        ("lr", LinearRegression()),
    ]
)
reg_pipe.fit(bos_X_train, bos_y_train)
reg_pipe.score(bos_X_test, bos_y_test)

0.7112260057484933

In [8]:
reg_pipe.named_steps["lr"].intercept_
reg_pipe.named_steps["lr"].coef_

array([-1.10834602,  0.80843998,  0.34313466,  0.81386426, -1.79804295,
        2.913858  , -0.29893918, -2.94251148,  2.09419303, -1.44706731,
       -2.05232232,  1.02375187, -3.88579002])

In [9]:
from sklearn import metrics
metrics.mean_squared_error(
    bos_y_test, reg_pipe.predict(bos_X_test)
)

21.517444231177205

In [17]:
from sklearn.decomposition import PCA
pca_pipe = Pipeline(
    [
        ("titan", TitanicTransformer()),
        ("impute", impute.IterativeImputer()),
        (
            "std",
            preprocessing.StandardScaler(),
        ),
        ("pca", PCA()),
    ]
)
X_pca = pca_pipe.fit_transform(df, df.survived)

In [19]:
pca_pipe.named_steps[
    "pca"
].explained_variance_ratio_
spca_pipe.named_steps["pca"].components_[0]

array([-0.63591201,  0.39601221, -0.00210875,  0.10899408,  0.58278257,
       -0.19349714, -0.19275661, -0.11258022])