**Pipeline and Model Construction**
---

Imports

In [1]:
from sklearn.ensemble import RandomForestRegressor ### Model construction tool
from sklearn.cluster import KMeans ### Model construction tools

from sklearn.pipeline import Pipeline ### Model construction tools
from sklearn.compose import ColumnTransformer ### Model construction tools

from sklearn.preprocessing import StandardScaler ### Model construction tools

from sklearn.model_selection import GridSearchCV ### Testing tools
import utils.data_handling_lib as dhl ### Testing tools

from utils.data_handling_lib import RANDOM_STATE ### Set constant


**Define pipeline elements**

In [2]:
def make_model_pipeline(X, y, *, n_clusters=3, with_mean=True, n_estimators=100,
                        bootstrap=True, min_samples_leaf=1):
    """
    Builds the wine-quality regression pipeline and fits it on the given data

    The preprocessing stage is a ColumnTransformer with three branches:
    KMeans applied to the sulfur dioxide / pH / sulphates columns, a
    StandardScaler applied to the remaining numeric columns, and "color"
    passed through untouched. Its output feeds a RandomForestRegressor.

    The keyword-only arguments expose the hyperparameters tuned elsewhere in
    this notebook; their defaults reproduce the original hard-coded baseline,
    so make_model_pipeline(X, y) behaves exactly as before.

    Args:
        X (DataFrame-like): Wine training features dataset; columns are
            selected by name, so it must contain every column listed below
        y (iterable): Wine training labels dataset
        n_clusters (int): Number of KMeans clusters
        with_mean (bool): Whether StandardScaler centers the data
        n_estimators (int): Number of trees in the random forest
        bootstrap (bool): Whether the forest uses bootstrap samples
        min_samples_leaf (int): Minimum samples per leaf in the forest

    Returns:
        Pipeline: Pipeline object already fitted on X and y
    """

    cluster_features = ["free sulfur dioxide", "total sulfur dioxide", "pH", "sulphates"]
    scale_features = ["fixed acidity", "volatile acidity", "density",
                        "residual sugar", "chlorides", "citric acid"]
    # NOTE(review): "color" is passed through unchanged, so it is presumably
    # already numerically encoded by the data loader - confirm.
    cat_features = ["color"]

    # Step names below are part of the public contract: the grid-search
    # parameter keys ("preprocessing__cluster__kmeans_cluster__...") depend on them.
    cluster_pipeline = Pipeline([
        ("kmeans_cluster", KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE))
        ])

    num_pipeline = Pipeline([
        ("std_scaler", StandardScaler(with_mean=with_mean)),
        ])

    data_preparation_pipeline = ColumnTransformer([
        ("cluster", cluster_pipeline, cluster_features),
        ("num", num_pipeline, scale_features),
        ("pass", "passthrough", cat_features)
        ])

    model_pipeline = Pipeline([
        ("preprocessing", data_preparation_pipeline),
        ("rf_reg", RandomForestRegressor(n_estimators=n_estimators,
                                         bootstrap=bootstrap,
                                         min_samples_leaf=min_samples_leaf,
                                         random_state=RANDOM_STATE,
                                         n_jobs=-1)),
        ])

    # The pipeline is fitted before being returned; callers use it directly
    # for predict()/score().
    model_pipeline.fit(X=X, y=y)

    return model_pipeline

**Load the dataset and perform a stratified train/test split**

In [3]:
# Load the dataset through the project helper. Despite the name, `test_dataset`
# holds the data *before* it is split into train and test parts.
test_dataset = dhl.load_data()
# Split into train/test features and labels.
# NOTE(review): presumably "quality" is both the label column and the column
# the split is stratified on - confirm in utils.data_handling_lib.
X_train, y_train, X_test, y_test = dhl.strat_split_dataset(test_dataset, "quality")

In [4]:
# Fixed 25-row window of the test set (positions 100-124), reused later to
# compare the baseline model against the tuned one on identical rows.
comparison_slice = X_test.iloc[100:125]
comparison_slice_labels = y_test.iloc[100:125]

**Create model object to run GridSearchCV on**

In [5]:
# Baseline pipeline with the default hyperparameters. make_model_pipeline
# fits it on the training data, so it can be used for predict()/score() as-is.
test_model = make_model_pipeline(X_train, y_train)

In [6]:
# Candidate hyperparameters. Each key follows the nested step names of the
# pipeline: <pipeline step>__<transformer name>__<inner step>__<parameter>.
param_grid = {
    "preprocessing__cluster__kmeans_cluster__n_clusters": [2, 3],
    "preprocessing__cluster__kmeans_cluster__n_init": [10, 50, 100],
    "preprocessing__num__std_scaler__with_mean": [False, True],
    "rf_reg__bootstrap": [False, True],
    "rf_reg__n_estimators": [100, 500, 1000],
    "rf_reg__min_samples_leaf": [1, 2, 5],
}

# 2 * 3 * 2 * 2 * 3 * 3 = 216 parameter combinations, each cross-validated,
# so fitting this search is expensive.
grid_search = GridSearchCV(estimator=test_model, param_grid=param_grid, n_jobs=-1)

In [7]:
# List every parameter name the search object exposes. Names read from the
# GridSearchCV wrapper carry an "estimator__" prefix; the keys in param_grid
# are the same names without that prefix.
grid_search.get_params().keys()

dict_keys(['cv', 'error_score', 'estimator__memory', 'estimator__steps', 'estimator__verbose', 'estimator__preprocessing', 'estimator__rf_reg', 'estimator__preprocessing__n_jobs', 'estimator__preprocessing__remainder', 'estimator__preprocessing__sparse_threshold', 'estimator__preprocessing__transformer_weights', 'estimator__preprocessing__transformers', 'estimator__preprocessing__verbose', 'estimator__preprocessing__verbose_feature_names_out', 'estimator__preprocessing__cluster', 'estimator__preprocessing__num', 'estimator__preprocessing__pass', 'estimator__preprocessing__cluster__memory', 'estimator__preprocessing__cluster__steps', 'estimator__preprocessing__cluster__verbose', 'estimator__preprocessing__cluster__kmeans_cluster', 'estimator__preprocessing__cluster__kmeans_cluster__algorithm', 'estimator__preprocessing__cluster__kmeans_cluster__copy_x', 'estimator__preprocessing__cluster__kmeans_cluster__init', 'estimator__preprocessing__cluster__kmeans_cluster__max_iter', 'estimator__p

In [8]:
# The search is not re-fitted in this notebook run; uncomment the next line to
# run it from scratch instead of loading the stored result.
#grid_search.fit(X_train, y_train)

# NOTE(review): presumably load_model returns a previously fitted GridSearchCV
# saved under this name - confirm in utils.data_handling_lib. This rebinds
# `grid_search`, replacing the unfitted object created above.
grid_search = dhl.load_model("grid_search_1_rf_reg")

# Mean cross-validated score of the best parameter combination.
grid_search.best_score_

0.4541016232606202

In [9]:
# Spot-check the baseline model: predictions for ten test rows (positions
# 90-99) printed one per line, followed by the true labels for the same rows.
prediction = test_model.predict(X_test.iloc[90:100])
actual = y_test.iloc[90:100]

for val in prediction:
    print(f"{val}")
print(f"Actual:\n{actual}")

5.26
5.88
5.76
6.75
6.36
5.71
5.41
5.92
6.17
5.6
Actual:
370     5
6390    6
1174    6
3898    7
5887    6
6479    6
1332    6
3735    6
5124    6
6377    6
Name: quality, dtype: int64


In [10]:
# Show the winning hyperparameter combination, then the pipeline that
# GridSearchCV built with it.
print(f"{grid_search.best_params_}",
      f"\n\n{grid_search.best_estimator_}")


{'preprocessing__cluster__kmeans_cluster__n_clusters': 2, 'preprocessing__cluster__kmeans_cluster__n_init': 10, 'preprocessing__num__std_scaler__with_mean': True, 'rf_reg__bootstrap': True, 'rf_reg__min_samples_leaf': 1, 'rf_reg__n_estimators': 1000} 

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('cluster',
                                                  Pipeline(steps=[('kmeans_cluster',
                                                                   KMeans(n_clusters=2,
                                                                          random_state=17))]),
                                                  ['free sulfur dioxide',
                                                   'total sulfur dioxide', 'pH',
                                                   'sulphates']),
                                                 ('num',
                                                  Pipeline(steps=[('std_scaler',
                         

In [11]:
def make_best_pipeline(X, y):
    """
    Builds the tuned wine-quality pipeline and fits it on the given data

    Same layout as the baseline pipeline (KMeans branch, StandardScaler
    branch, "color" passthrough, RandomForestRegressor), but with the
    hyperparameters hard-coded to the values reported by the grid search:
    2 clusters, mean-centering enabled, bootstrap sampling and 1000 trees.

    Args:
        X (DataFrame-like): Wine training features dataset; columns are
            selected by name
        y (iterable): Wine training labels dataset

    Returns:
        Pipeline: Pipeline object already fitted on X and y
    """

    clustered_columns = ["free sulfur dioxide", "total sulfur dioxide", "pH", "sulphates"]
    scaled_columns = ["fixed acidity", "volatile acidity", "density",
                      "residual sugar", "chlorides", "citric acid"]
    passthrough_columns = ["color"]

    # NOTE(review): n_init is left at the library default; the grid search
    # selected n_init=10 - confirm the installed scikit-learn default matches.
    kmeans_branch = Pipeline([
        ("kmeans_cluster", KMeans(n_clusters=2, random_state=RANDOM_STATE)),
    ])
    scaling_branch = Pipeline([
        ("std_scaler", StandardScaler(with_mean=True)),
    ])

    # Branch order is kept as-is: it determines the column order fed to the forest.
    preprocessing = ColumnTransformer(
        [
            ("cluster", kmeans_branch, clustered_columns),
            ("num", scaling_branch, scaled_columns),
            ("pass", "passthrough", passthrough_columns),
        ],
        n_jobs=-1,
    )

    forest = RandomForestRegressor(
        n_estimators=1000,
        random_state=RANDOM_STATE,
        bootstrap=True,
        n_jobs=-1,
    )

    best_pipeline = Pipeline([
        ("preprocessing", preprocessing),
        ("rf_reg", forest),
    ])

    # Pipeline.fit returns the pipeline itself, so the fitted object is returned.
    return best_pipeline.fit(X, y)

In [12]:
# Tuned pipeline, fitted on the same training data as the baseline model.
best_model = make_best_pipeline(X_train, y_train)

In [13]:
# Baseline vs. tuned model, scored on the same 25-row comparison slice.
# The score is the regressor's default metric (R^2 per the scikit-learn docs).
# NOTE(review): 25 rows is a small sample, so the gap between the two numbers
# is only indicative.
test_model.score(comparison_slice, comparison_slice_labels), best_model.score(comparison_slice, comparison_slice_labels)

(0.4844428571428572, 0.5177752142857142)