**Test Environment Notebook**
------

In [33]:
### Tool imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### sklearn imports
from sklearn.cluster import KMeans, MiniBatchKMeans

### Local imports
import utils.data_handling_lib as dhl
import utils.graphics_lib as gl

from utils.data_handling_lib import RANDOM_STATE

Create raw test dataset

In [34]:
dataset_file_path = "../datasets/winequality-combined.csv"
test_raw_data = dhl.load_data(dataset_file_path)

Reduced features test

In [35]:
test_reduced_raw_data = test_raw_data.copy()

test_reduced_raw_data.drop(["free sulfur dioxide", "total sulfur dioxide", "pH", "residual sugar", "sulphates"], axis=1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,chlorides,density,alcohol,quality,color
0,7.4,0.70,0.00,0.076,0.99780,9.4,5,1
1,7.8,0.88,0.00,0.098,0.99680,9.8,5,1
2,7.8,0.76,0.04,0.092,0.99700,9.8,5,1
3,11.2,0.28,0.56,0.075,0.99800,9.8,6,1
4,7.4,0.70,0.00,0.076,0.99780,9.4,5,1
...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,0.039,0.99114,11.2,6,0
6493,6.6,0.32,0.36,0.047,0.99490,9.6,5,0
6494,6.5,0.24,0.19,0.041,0.99254,9.4,6,0
6495,5.5,0.29,0.30,0.022,0.98869,12.8,7,0


Split dataset

In [36]:
train, train_labels, test, test_labels = dhl.strat_split_dataset(test_reduced_raw_data, "quality", 1)

In [37]:
print(test.head())

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
4472            5.8              0.21         0.32             1.6      0.045   
4632            6.8              0.24         0.38             8.3      0.045   
4589            5.0              0.27         0.32             4.5      0.032   
4738            6.2              0.19         0.29             4.3      0.045   
4346            7.3              0.26         0.33            11.8      0.057   

      free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
4472                 38.0                  95.0  0.98946  3.23       0.94   
4632                 50.0                 185.0  0.99578  3.15       0.50   
4589                 58.0                 178.0  0.98956  3.45       0.31   
4738                 33.0                 126.0  0.99658  3.18       0.42   
4346                 48.0                 127.0  0.99693  3.10       0.55   

      alcohol  color  
4472     12.4      0  
4632

**Random Forest w/ KMeans Clustering Test**

In [38]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [39]:
import utils.clustering_tools_lib as ctl

In [40]:

def fit_test_pipeline_1(data, labels):
    rfkc_test_pipeline = Pipeline([
        ("kmeans", KMeans(n_clusters=5, random_state=RANDOM_STATE)),
        ("rf_reg", RandomForestRegressor(n_estimators=100, max_features=4))
    ])
    
    rfkc_test_pipeline.fit(data, labels)
    
    return rfkc_test_pipeline

In [41]:
param_grid = dict(kmeans__n_clusters=[10, 100, 1000])

In [42]:
test_1 = fit_test_pipeline_1(train, train_labels)

In [43]:
grid_search = GridSearchCV(test_1, param_grid=param_grid)

In [44]:
grid_search.fit(train, train_labels)

GridSearchCV(estimator=Pipeline(steps=[('kmeans',
                                        KMeans(n_clusters=5, random_state=19)),
                                       ('rf_reg',
                                        RandomForestRegressor(max_features=4))]),
             param_grid={'kmeans__n_clusters': [10, 100, 1000]})

In [45]:
print(f"\n{grid_search.best_score_}\n{grid_search.best_params_}\n{grid_search.best_estimator_}")


0.16681016323225006
{'kmeans__n_clusters': 1000}
Pipeline(steps=[('kmeans', KMeans(n_clusters=1000, random_state=19)),
                ('rf_reg', RandomForestRegressor(max_features=4))])


In [46]:
grid_search.best_score_

0.16681016323225006