**Pipeline and Model Construction**
---

Imports

In [None]:
from sklearn.ensemble import RandomForestRegressor ### Model construction tool
from sklearn.cluster import KMeans ### Model construction tools

from sklearn.pipeline import Pipeline ### Model construction tools
from sklearn.compose import ColumnTransformer ### Model construction tools

from sklearn.preprocessing import StandardScaler ### Model construction tools

from sklearn.model_selection import GridSearchCV ### Testing tools
import utils.data_handling_lib as dhl ### Testing tools

from utils.data_handling_lib import RANDOM_STATE ### Set constant


**Define pipeline elements**

In [None]:
def make_model_pipeline(X, y):
    """
    Builds the baseline preprocessing + random forest pipeline and fits it

    Args:
        X (iterable): Wine training features dataset
        y (iterable): Wine training labels dataset

    Returns:
        Pipeline: Pipeline object already fitted on X and y
    """

    # One column group per preprocessing branch.
    # NOTE(review): columns that appear in none of these lists never reach the
    # model (ColumnTransformer's default remainder is "drop") - confirm that no
    # feature is excluded unintentionally.
    columns_to_cluster = ["free sulfur dioxide", "total sulfur dioxide", "pH", "sulphates"]
    columns_to_scale = ["fixed acidity", "volatile acidity", "density",
                        "residual sugar", "chlorides", "citric acid"]
    columns_to_pass = ["color"]

    # KMeans is used as a transformer here: its output replaces the clustered columns.
    kmeans_branch = Pipeline(steps=[
        ("kmeans_cluster", KMeans(n_clusters=3, random_state=RANDOM_STATE)),
    ])
    scaler_branch = Pipeline(steps=[
        ("std_scaler", StandardScaler()),
    ])

    # Branch order determines the column order seen by the regressor.
    preprocessing = ColumnTransformer(transformers=[
        ("cluster", kmeans_branch, columns_to_cluster),
        ("num", scaler_branch, columns_to_scale),
        ("pass", "passthrough", columns_to_pass),
    ])

    forest = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1)

    assembled = Pipeline(steps=[
        ("preprocessing", preprocessing),
        ("rf_reg", forest),
    ])

    # Pipeline.fit returns the pipeline itself, so the fitted object is handed back.
    return assembled.fit(X, y)

**Load and stratified-split the dataset**

In [None]:
# Load the dataset through the project helper, then split it into training and
# test features/labels. "quality" is presumably the label column used for the
# stratified split - confirm against strat_split_dataset.
# NOTE(review): the unpacking order (X_train, y_train, X_test, y_test) differs from
# scikit-learn's train_test_split convention - verify it matches the helper's return order.
test_dataset = dhl.load_data()
X_train, y_train, X_test, y_test = dhl.strat_split_dataset(test_dataset, "quality")

In [None]:
# Fixed 25-row window of the test split (positions 100-124) reused by the
# model-comparison cells further down. .iloc makes it explicit that the slice
# is positional rather than label-based.
comparison_slice = X_test.iloc[100:125]
comparison_slice_labels = y_test.iloc[100:125]

**Create model object to run GridSearchCV on**

In [None]:
# Baseline pipeline, fitted on the training split; also used as the estimator
# template handed to GridSearchCV below.
test_model = make_model_pipeline(X_train, y_train)

In [None]:
# Spot-check: predictions for ten test rows (positions 90-99) next to the true labels.
# .iloc makes the positional slicing explicit.
prediction = test_model.predict(X_test.iloc[90:100])
actual = y_test.iloc[90:100]

for val in prediction:
    print(val)
print(f"Actual:\n{actual}")

In [None]:
# Search space: 2 * 3 * 2 * 2 * 3 * 3 = 216 candidate combinations.
# Keys use sklearn's "<step>__<sub-step>__<param>" routing through the nested
# pipeline and column transformer.
param_grid = {
    # KMeans step inside the "cluster" branch
    "preprocessing__cluster__kmeans_cluster__n_clusters": [2, 3],
    "preprocessing__cluster__kmeans_cluster__n_init": [10, 50, 100],
    # StandardScaler step inside the "num" branch
    "preprocessing__num__std_scaler__with_mean": [False, True],
    # Random forest regressor
    "rf_reg__bootstrap": [False, True],
    "rf_reg__n_estimators": [100, 500, 1000],
    "rf_reg__min_samples_leaf": [1, 2, 5],
}

grid_search = GridSearchCV(estimator=test_model, param_grid=param_grid, n_jobs=-1)

In [None]:
# List every parameter name the search object exposes; handy for checking the
# double-underscore keys used in param_grid.
grid_search.get_params().keys()

In [None]:
# The search is not re-run here: the fit call is left commented out and a
# stored search object is loaded instead. Uncomment to run the search from scratch.
#grid_search.fit(X_train, y_train)

# Rebinds grid_search to the object returned by the project loader.
# NOTE(review): load_model presumably deserializes a saved object - only load
# files from a trusted source; confirm the storage format in data_handling_lib.
grid_search = dhl.load_model("grid_search_1_rf_reg")

# Displayed as the cell output.
grid_search.best_score_

In [None]:
# Show the winning parameter combination, then the corresponding best estimator.
print(f"{grid_search.best_params_}",
      f"\n\n{grid_search.best_estimator_}")

**Final Function Call Construction**

In [None]:
def make_best_pipeline(X, y):
    """
    Builds the pipeline with fixed settings (n_clusters=2, with_mean=True,
    n_estimators=1000, bootstrap=True) and fits it

    Args:
        X (iterable): Wine training features dataset
        y (iterable): Wine training labels dataset

    Returns:
        Pipeline: Pipeline object already fitted on X and y
    """

    # One column group per preprocessing branch.
    # NOTE(review): columns that appear in none of these lists never reach the
    # model (ColumnTransformer's default remainder is "drop") - confirm that no
    # feature is excluded unintentionally.
    columns_to_cluster = ["free sulfur dioxide", "total sulfur dioxide", "pH", "sulphates"]
    columns_to_scale = ["fixed acidity", "volatile acidity", "density",
                        "residual sugar", "chlorides", "citric acid"]
    columns_to_pass = ["color"]

    # KMeans is used as a transformer here: its output replaces the clustered columns.
    kmeans_branch = Pipeline(steps=[
        ("kmeans_cluster", KMeans(n_clusters=2, random_state=RANDOM_STATE)),
    ])
    scaler_branch = Pipeline(steps=[
        ("std_scaler", StandardScaler(with_mean=True)),
    ])

    # Branch order determines the column order seen by the regressor.
    preprocessing = ColumnTransformer(
        transformers=[
            ("cluster", kmeans_branch, columns_to_cluster),
            ("num", scaler_branch, columns_to_scale),
            ("pass", "passthrough", columns_to_pass),
        ],
        n_jobs=-1,
    )

    forest = RandomForestRegressor(
        n_estimators=1000,
        random_state=RANDOM_STATE,
        bootstrap=True,
        n_jobs=-1,
    )

    assembled = Pipeline(steps=[
        ("preprocessing", preprocessing),
        ("rf_reg", forest),
    ])

    # Pipeline.fit returns the pipeline itself, so the fitted object is handed back.
    return assembled.fit(X, y)

Create a model with the final pipeline constructor

In [None]:
# Pipeline with the fixed "best" settings, fitted on the training split.
best_model = make_best_pipeline(X_train, y_train)

In [None]:
def make_comparison_pipeline(X, y):
    """
    Builds a pipeline without the KMeans branch (scaled numeric columns plus
    the passthrough column only) and fits it

    Args:
        X (iterable): Wine training features dataset
        y (iterable): Wine training labels dataset

    Returns:
        Pipeline: Pipeline object already fitted on X and y
    """

    # Only these columns reach the model; every other column is dropped
    # (ColumnTransformer's default remainder is "drop").
    columns_to_scale = ["fixed acidity", "volatile acidity", "density",
                        "residual sugar", "chlorides", "citric acid"]
    columns_to_pass = ["color"]

    scaler_branch = Pipeline(steps=[
        ("std_scaler", StandardScaler(with_mean=True)),
    ])

    # Branch order determines the column order seen by the regressor.
    preprocessing = ColumnTransformer(
        transformers=[
            ("num", scaler_branch, columns_to_scale),
            ("pass", "passthrough", columns_to_pass),
        ],
        n_jobs=-1,
    )

    forest = RandomForestRegressor(
        n_estimators=1000,
        random_state=RANDOM_STATE,
        bootstrap=True,
        n_jobs=-1,
    )

    assembled = Pipeline(steps=[
        ("preprocessing", preprocessing),
        ("rf_reg", forest),
    ])

    # Pipeline.fit returns the pipeline itself, so the fitted object is handed back.
    return assembled.fit(X, y)

Model Comparison

In [None]:
# Pipeline without the KMeans branch, fitted on the training split for comparison.
comparison_model = make_comparison_pipeline(X=X_train, y=y_train)

In [None]:
# Scores on the comparison slice, in order: baseline, fixed "best" settings, no-cluster model.
# NOTE(review): the slice holds only 25 rows, so these scores are noisy - consider the full test split.
tuple(
    fitted.score(comparison_slice, comparison_slice_labels)
    for fitted in (test_model, best_model, comparison_model)
)

In [None]:
def predict_scores(model, X_test, y_test):
    """
    Prints the predicted and actual quality for every sample, one line each

    Args:
        model: Object exposing predict(X); called once on X_test
        X_test (iterable): Features passed to model.predict
        y_test (pandas.Series): True labels; its index supplies the printed "Index" value

    Returns:
        None: Output is written to stdout only
    """
    predicted = model.predict(X_test)

    # Walk the label index by position so predictions and labels stay aligned.
    for position, row_label in enumerate(y_test.index):
        report = (
            f"Index: {row_label}",
            f"Predicted quality: {predicted[position]}",
            f"Actual quality: {y_test.iloc[position]}",
        )
        print(*report)
        

In [None]:
# Print per-row predictions of the "best" pipeline against the true labels of the comparison slice.
predict_scores(best_model, comparison_slice, comparison_slice_labels)