In [None]:
# Regular EDA and plotting libraries
import numpy as np # np is short for numpy
import pandas as pd # pandas is so commonly used, it's shortened to pd
import matplotlib.pyplot as plt
import seaborn as sns # seaborn gets shortened to sns

# We want our plots to appear in the notebook
%matplotlib inline 

## Model evaluators
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve

# data preparation
from sklearn.preprocessing import *
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE, RFECV
from sklearn.utils import resample
from imblearn.datasets import make_imbalance
from imblearn.over_sampling import SMOTE

# machine learning
from sklearn.linear_model import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression


## Data, Evaluation, Features

In [None]:
## Get the data ready - our problem is classification cos we want to classify whether wine quality is low or high.
wine_quality = pd.read_csv("winequality-white.csv")
wine_quality.head()

In [None]:
wine_quality.shape

In [None]:
wine_quality = wine_quality.rename(columns={"fixed acidity": "fixed_acidity", "volatile acidity": "vol_acidity","citric acid": "citr_acid", 
                             "residual sugar": "residual_sugar", "free sulfur dioxide": "free_sulfur_dioxide",
                             "total sulfur dioxide": "total_sulfur_dioxide"})
wine_quality.head()

In [None]:
wine_quality.head(10)

In [None]:
# Let's see label (quality) distribution in our dataframe
wine_quality.quality.value_counts()

In [None]:
# Binarise the 'quality' score into a 'label' column: 0 = low (<= 5), 1 = high (>= 6).
quality = wine_quality['quality']

# np.select picks, row by row, the value paired with the first condition that holds
# (rows matching neither condition fall back to np.select's default of 0).
conditions = [quality <= 5, quality >= 6]
values = [0, 1]
wine_quality['label'] = np.select(conditions, values)

# Peek at ten rows (positions 300-309) to check the new column
wine_quality.iloc[300:310]

In [None]:
wine_quality.label.value_counts()

In [None]:
wine_quality = wine_quality.drop(["quality"], axis=1)
wine_quality.head()

In [None]:
# Plot the value counts with a bar graph
wine_quality.label.value_counts().plot(kind="bar", color=["salmon", "lightblue"]);

In [None]:
wine_quality.describe()

In [None]:
# Scatter of alcohol (x) against residual sugar (y), one colour per quality label
plt.figure(figsize=(8, 5))

# Row masks for the two classes
is_low = wine_quality.label == 0
is_high = wine_quality.label == 1

# Low quality wines are drawn first, high quality wines on top of the same axes
plt.scatter(wine_quality.alcohol[is_low],
            wine_quality.residual_sugar[is_low],
            c="magenta")
plt.scatter(wine_quality.alcohol[is_high],
            wine_quality.residual_sugar[is_high],
            c="lightblue")

# Title, axis labels and legend (legend entries follow the drawing order above)
plt.title("Wine Quality as a function of Alcohol and Residual Sugar")
plt.xlabel("Alcohol")
plt.ylabel("Residual Sugar")
plt.legend(["Low", "High"]);

In [None]:
# Histograms are a great way to check the distribution of a variable
wine_quality.alcohol.plot.hist();

In [None]:
# Seed NumPy's global random generator, then separate the feature matrix X
# (every column except "label") from the target vector y (the "label" column).
# Note: this is NOT a train/test split yet - it only separates features from target.
np.random.seed(42)
X = wine_quality.drop(["label"], axis=1)
y = wine_quality["label"]
X.head()

### Correlation between independent variables

Finally, we'll compare all of the independent variables in one hit.

Why?

Because this may give an idea of which independent variables may or may not have an impact on our target variable.

We can do this using `df.corr()` which will create a [**correlation matrix**](https://www.statisticshowto.datasciencecentral.com/correlation-matrix/) for us, in other words, a big table of numbers telling us how related each variable is to the others.

In [None]:
# Find the correlation between our independent variables
corr_matrix = wine_quality.corr()
corr_matrix 

In [None]:
# Let's make it look a little prettier
corr_matrix = wine_quality.corr()
plt.figure(figsize=(8, 5))
sns.heatmap(corr_matrix, 
            annot=True, 
            linewidths=0.5, 
            fmt= ".2f", 
            cmap="YlGnBu");

Much better. A higher positive value means a potential positive correlation (increase) and a higher negative value means a potential negative correlation (decrease).

## Modelling

We've explored the data, now we'll try to use machine learning to predict our target variable based on the 11 independent variables.

And remember our evaluation metric?

> If we can reach 80% accuracy at predicting whether or not a bottle of wine is high quality, we'll adopt this project.

That's what we'll be aiming for.

But before we build a model, we have to get our dataset ready.

Let's look at it again.

In [None]:
wine_quality.head()

In [None]:
# Seed NumPy's global random generator and build the feature matrix X (all
# columns except "label") and the target vector y (the "label" column).
# NOTE(review): the train/test split further down passes no random_state, so it
# presumably relies on this seed having been set immediately beforehand - confirm
# the cells are always run in order.
np.random.seed(42)
X = wine_quality.drop(["label"], axis=1)
y = wine_quality["label"]
X.head()

In [None]:
wine_quality['label'].value_counts()

In [None]:
# Bar chart of how many wines carry each label (shows the class imbalance)
label_counts = wine_quality["label"].value_counts()
ax = label_counts.plot(kind="bar", color=["salmon", "lightblue"])
ax.set_title('Label Class Imbalance')
ax.set_xlabel('Label values')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Split the data into train and test: 20% of the rows are held out as the test set.
# No random_state is passed here, so the shuffle is drawn from NumPy's global
# random generator. NOTE(review): reproducibility therefore depends on
# np.random.seed(42) having been run just before this cell - confirm run order.
# (train_test_split is already imported in the first cell; this re-import is harmless.)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
X_train

In [None]:
y_train, len(y_train)

In [None]:
# Scaling the data set

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, so no test-set statistics are
# used, then apply that same fitted transformation to both splits.
scaler = StandardScaler()
dt_scaler = scaler.fit(X_train)
sc_X_train, sc_X_test = (dt_scaler.transform(split) for split in (X_train, X_test))

In [None]:
# Balancing the data sets

In [None]:
# Undersample both classes to the same size: 1300 rows each when available.
# make_imbalance raises a ValueError if asked for more rows than a class has,
# and the train/test split above is not stratified, so the minority class can
# end up with slightly fewer than 1300 training rows. Capping the target at the
# actual minority count keeps the cell runnable for any split, and leaves the
# result unchanged whenever at least 1300 rows per class exist.
n_per_class = min(1300, int(y_train.value_counts().min()))
X_train_1, y_train_1 = make_imbalance(sc_X_train, y_train,
                                  sampling_strategy={0: n_per_class, 1: n_per_class},
                                  random_state=14)

In [None]:
y_train_1.value_counts()

In [None]:
# Bar chart of the class counts after undersampling (both bars should be equal)
y_train_1.value_counts().plot(kind='bar', color='slateblue')
plt.title('label balance (Undersample)')
plt.xlabel('label values')
plt.ylabel('Frequency')  # fixed typo: was 'Freqency'
plt.show()

In [None]:
# Second balancing strategy: random oversampling of the scaled training data.
# The sampler is seeded (random_state=0) so the resampled set is reproducible.
# The test set is left untouched.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_train_2, y_train_2 = ros.fit_resample(sc_X_train, y_train,)

In [None]:
y_train_2.value_counts()

In [None]:
y_train_2.value_counts().plot(kind='bar', color='orange')
plt.title('Label Balance (Basic Upsample)')
plt.xlabel('Label values')
plt.ylabel('Frequency')
plt.show()

In [None]:
y_train_2.head()

In [None]:
# Third balancing strategy: SMOTE applied to the scaled training data.
# Seeded (random_state=14) for reproducibility; the test set is left untouched.
smote = SMOTE(random_state = 14)
X_train_3, y_train_3 = smote.fit_resample(sc_X_train, y_train)

In [None]:
y_train_3.value_counts()

In [None]:
y_train_3.value_counts().plot(kind='bar', color=["salmon", "lightblue"])
plt.title('Label Balance (SMOTE)')
plt.xlabel('Label values')
plt.ylabel('Frequency')
plt.show()

### Model choices

Now we've got our data prepared, we can start to fit models. We'll be using the following and comparing their results.

1. Logistic Regression - [`LogisticRegression()`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
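2. K-Nearest Neighbors - [`KNeighborsClassifier()`](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)
3. Linear Support Vector Classifier - [`LinearSVC()`](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html)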

In [None]:
# Put models in a dictionary and fitting with Undersampling Balanced dataset
models = {"KNN": KNeighborsClassifier(),
          "Logistic Regression": LogisticRegression(),
          "Linear SVC": LinearSVC(max_iter=10000)}

# Create function to fit and score models
def fit_and_score(models, X_train_1, sc_X_test, y_train_1, y_test):
    """
    Fit each model on the given training data and score it on the test data
    (used in this notebook with the Undersample Balanced Data).

    models : dict mapping a display name to an estimator exposing
             fit(X, y) and score(X, y); every estimator is fitted in place
    X_train_1 : training features
    sc_X_test : test features
    y_train_1 : labels associated with X_train_1
    y_test : labels associated with sc_X_test

    Returns a dict {name: model.score(sc_X_test, y_test)}, in the iteration
    order of `models`.
    """
    # Reset NumPy's global random generator so repeated calls start from the same state
    np.random.seed(42)

    # name -> score, filled in as each estimator is trained
    results = {}
    for label, estimator in models.items():
        estimator.fit(X_train_1, y_train_1)
        results[label] = estimator.score(sc_X_test, y_test)
    return results

In [None]:
model_scores_1 = fit_and_score(models=models,
                             X_train_1=X_train_1,
                             sc_X_test=sc_X_test,
                             y_train_1=y_train_1,
                             y_test=y_test
                             )
model_scores_1

In [None]:
# Put models in a dictionary and fitting with Oversampling Balanced dataset
models = {"KNN": KNeighborsClassifier(),
          "Logistic Regression": LogisticRegression(),
          "Linear SVC": LinearSVC(max_iter=10000)}

# Create function to fit and score models
def fit_and_score(models, X_train_2, sc_X_test, y_train_2, y_test):
    """
    Fit each model on the given training data and score it on the test data
    (used in this notebook with the Upsample Balanced Data).

    models : dict mapping a display name to an estimator exposing
             fit(X, y) and score(X, y); every estimator is fitted in place
    X_train_2 : training features
    sc_X_test : test features
    y_train_2 : labels associated with X_train_2
    y_test : labels associated with sc_X_test

    Returns a dict {name: model.score(sc_X_test, y_test)}, in the iteration
    order of `models`.
    """
    # Reset NumPy's global random generator so repeated calls start from the same state
    np.random.seed(42)

    # name -> score, filled in as each estimator is trained
    results = {}
    for label, estimator in models.items():
        estimator.fit(X_train_2, y_train_2)
        results[label] = estimator.score(sc_X_test, y_test)
    return results

In [None]:
model_scores_2 = fit_and_score(models=models,
                             X_train_2=X_train_2,
                             sc_X_test=sc_X_test,
                             y_train_2=y_train_2,
                             y_test=y_test
                             )
model_scores_2

In [None]:
# Put models in a dictionary and fitting with SMOTE Balanced dataset
models = {"KNN": KNeighborsClassifier(),
          "Logistic Regression": LogisticRegression(),
          "Linear SVC": LinearSVC(max_iter=10000)}

# Create function to fit and score models
def fit_and_score(models, X_train_3, sc_X_test, y_train_3, y_test):
    """
    Fit each model on the given training data and score it on the test data
    (used in this notebook with the SMOTE balanced data).

    models : dict mapping a display name to an estimator exposing
             fit(X, y) and score(X, y); every estimator is fitted in place
    X_train_3 : training features
    sc_X_test : test features
    y_train_3 : labels associated with X_train_3
    y_test : labels associated with sc_X_test

    Returns a dict {name: model.score(sc_X_test, y_test)}, in the iteration
    order of `models`.
    """
    # Reset NumPy's global random generator so repeated calls start from the same state
    np.random.seed(42)

    # name -> score, filled in as each estimator is trained
    results = {}
    for label, estimator in models.items():
        estimator.fit(X_train_3, y_train_3)
        results[label] = estimator.score(sc_X_test, y_test)
    return results

In [None]:
model_scores_3 = fit_and_score(models=models,
                             X_train_3=X_train_3,
                             sc_X_test=sc_X_test,
                             y_train_3=y_train_3,
                             y_test=y_test
                             )
model_scores_3

In [None]:
# Seed NumPy's global random generator before any fitting
np.random.seed(42)

# One LinearSVC per balanced training set (1 = undersampled, 2 = oversampled,
# 3 = SMOTE), fitted in that order. fit() returns the fitted estimator itself.
lsvc_model_1 = LinearSVC(max_iter=10000).fit(X_train_1, y_train_1)
lsvc_model_2 = LinearSVC(max_iter=10000).fit(X_train_2, y_train_2)
lsvc_model_3 = LinearSVC(max_iter=10000).fit(X_train_3, y_train_3)

# Report each LinearSVC's score on the (scaled) test set as a percentage
print(f"LinearSVC Model Score with Undersampling Balancing method: {lsvc_model_1.score(sc_X_test, y_test)*100:.2f}%")
print(f"LinearSVC Model Score with Oversampling Balancing method: {lsvc_model_2.score(sc_X_test, y_test)*100:.2f}%")
print(f"LinearSVC Model Score with SMOTE Balancing method: {lsvc_model_3.score(sc_X_test, y_test)*100:.2f}%")

In [None]:
# Seed NumPy's global random generator before any fitting
np.random.seed(42)

# One LogisticRegression per balanced training set (1 = undersampled,
# 2 = oversampled, 3 = SMOTE). fit() returns the fitted estimator itself.
logreg_model_1 = LogisticRegression().fit(X_train_1, y_train_1)
logreg_model_2 = LogisticRegression().fit(X_train_2, y_train_2)
logreg_model_3 = LogisticRegression().fit(X_train_3, y_train_3)

# Report each LogisticRegression's score on the (scaled) test set as a percentage
print(f"Logistic Regression Model Score with Undersampling Balancing method: {logreg_model_1.score(sc_X_test, y_test)*100:.2f}%")
print(f"Logistic Regression Model Score with Oversampling Balancing method: {logreg_model_2.score(sc_X_test, y_test)*100:.2f}%")
print(f"Logistic Regression Model Score with SMOTE Balancing method: {logreg_model_3.score(sc_X_test, y_test)*100:.2f}%")

Beautiful! Since our models are fitting, let's compare them visually.

## Model Comparison

Since we've saved our models scores to a dictionary, we can plot them by first converting them to a DataFrame.

In [None]:
# Undersample Balanced Data
model_compare_1 = pd.DataFrame(model_scores_1, index=['Accuracy'])
model_compare_1.plot.bar();

In [None]:
# Upsample Balanced Data
model_compare_2 = pd.DataFrame(model_scores_2, index=['Accuracy'])
model_compare_2.plot.bar();

In [None]:
# Smote Balanced Data
model_compare_3 = pd.DataFrame(model_scores_3, index=['Accuracy'])
model_compare_3.plot.bar();

Beautiful! We can't really see it from the graph but looking at the dictionary, the KNN model performs best.

However, we can try to optimize the performance of each model, and evaluate it more thoroughly, by looking at the following:



* **Hyperparameter tuning** - Each model you use has a series of dials you can turn to dictate how they perform. Changing these values may increase or decrease model performance.
* **Feature importance** - If there are a large amount of features we're using to make predictions, do some have more importance than others? 
* [**Confusion matrix**](https://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/) - Compares the predicted values with the true values in a tabular way. If the predictions are 100% correct, all values in the matrix will lie on the diagonal line from top left to bottom right.
* [**Cross-validation**](https://scikit-learn.org/stable/modules/cross_validation.html) - Splits your dataset into multiple parts and train and tests your model on each part and evaluates performance as an average. 
* [**Precision**](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html#sklearn.metrics.precision_score) - Proportion of true positives over the total number of predicted positives (true positives plus false positives). Higher precision leads to fewer false positives.
* [**Recall**](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html#sklearn.metrics.recall_score) - Proportion of true positives over the total number of true positives and false negatives. Higher recall leads to fewer false negatives.
* [**F1 score**](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score) - Combines precision and recall into one metric. 1 is best, 0 is worst.
* [**Classification report**](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html) - Sklearn has a built-in function called `classification_report()` which returns some of the main classification metrics such as precision, recall and f1-score.
* [**ROC Curve**](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html) - [Receiver Operating Characteristic](https://en.wikipedia.org/wiki/Receiver_operating_characteristic) is a plot of true positive rate versus false positive rate.
* [**Area Under Curve (AUC)**](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html) - The area underneath the ROC curve. A perfect model achieves a score of 1.0.

### Tune KNeighborsClassifier (K-Nearest Neighbors or KNN) by hand

There's one main hyperparameter we can tune for the K-Nearest Neighbors (KNN) algorithm, and that is the number of neighbours. The default is 5 (`n_neighbors=5`).

We try a few different values of `n_neighbors`.

## Hyperparameters

In [None]:
# Scores collected per value of n_neighbors (undersampled training set)
train_scores = []
test_scores = []

# Candidate neighbour counts: 1 to 20
neighbors = range(1, 21)

# A single classifier is reconfigured and refitted for every candidate value
knn = KNeighborsClassifier()

for i in neighbors:
    # set_params returns the estimator, so configure and fit in one go
    knn.set_params(n_neighbors=i).fit(X_train_1, y_train_1)

    # Record the score on the data it was trained on, then on the test data
    train_scores.append(knn.score(X_train_1, y_train_1))
    test_scores.append(knn.score(sc_X_test, y_test))

Let's look at KNN's train scores.

In [None]:
train_scores

In [None]:
test_scores

In [None]:
plt.plot(neighbors, train_scores, label="Train score 1")
plt.plot(neighbors, test_scores, label="Test score 1")
plt.xticks(np.arange(1, 21, 1))
plt.xlabel("Number of neighbors")
plt.ylabel("Model score")
plt.legend()

print(f"Maximum KNN score on the test data: {max(test_scores)*100:.2f}%")

In [None]:
# Scores collected per value of n_neighbors (oversampled training set)
train_scores_2 = []
test_scores_2 = []

# Candidate neighbour counts: 1 to 20
neighbors = range(1, 21)

# A single classifier is reconfigured and refitted for every candidate value
knn = KNeighborsClassifier()

for i in neighbors:
    # set_params returns the estimator, so configure and fit in one go
    knn.set_params(n_neighbors=i).fit(X_train_2, y_train_2)

    # Record the score on the data it was trained on, then on the test data
    train_scores_2.append(knn.score(X_train_2, y_train_2))
    test_scores_2.append(knn.score(sc_X_test, y_test))

In [None]:
test_scores_2

In [None]:
plt.plot(neighbors, train_scores_2, label="Train score 2")
plt.plot(neighbors, test_scores_2, label="Test score 2")
plt.xticks(np.arange(1, 21, 1))
plt.xlabel("Number of neighbors")
plt.ylabel("Model score")
plt.legend()

print(f"Maximum KNN score on the test data: {max(test_scores_2)*100:.2f}%")

In [None]:
# Scores collected per value of n_neighbors (SMOTE training set)
train_scores_3 = []
test_scores_3 = []

# Candidate neighbour counts: 1 to 20
neighbors = range(1, 21)

# A single classifier is reconfigured and refitted for every candidate value
knn = KNeighborsClassifier()

for i in neighbors:
    # set_params returns the estimator, so configure and fit in one go
    knn.set_params(n_neighbors=i).fit(X_train_3, y_train_3)

    # Record the score on the data it was trained on, then on the test data
    train_scores_3.append(knn.score(X_train_3, y_train_3))
    test_scores_3.append(knn.score(sc_X_test, y_test))

In [None]:
test_scores_3

In [None]:
plt.plot(neighbors, train_scores_3, label="Train score 3")
plt.plot(neighbors, test_scores_3, label="Test score 3")
plt.xticks(np.arange(1, 21, 1))
plt.xlabel("Number of neighbors")
plt.ylabel("Model score")
plt.legend()

print(f"Maximum KNN score on the test data: {max(test_scores_3)*100:.2f}%")

Looking at the graph, `n_neighbors = 1` seems best.

From the above, the `KNN` model's performance is better than the others, but we can try to optimize the other two before drawing our conclusion.

Instead of us having to manually try different hyperparameters by hand, `RandomizedSearchCV` tries a number of different combinations, evaluates them and saves the best.

### Tuning models with [`RandomizedSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)


We create a hyperparameter grid (a dictionary of different hyperparameters) for each and then test them out.

In [None]:
# LogisticRegression search space: 20 log-spaced values of C between 1e-4 and
# 1e4, always with the liblinear solver. One independent dict per balanced set.
log_reg_grid_1, log_reg_grid_2, log_reg_grid_3 = (
    {"C": np.logspace(-4, 4, 20), "solver": ["liblinear"]}
    for _ in range(3)
)

# LinearSVC search space: 20 log-spaced values each for C and intercept_scaling.
# One independent dict per balanced set.
log_lin_svc_1, log_lin_svc_2, log_lin_svc_3 = (
    {"C": np.logspace(-4, 4, 20), "intercept_scaling": np.logspace(-4, 4, 20)}
    for _ in range(3)
)

Now let's use `RandomizedSearchCV` to try and tune our `LogisticRegression` model.

We'll pass each search its own hyperparameter grid (`log_reg_grid_1`, `log_reg_grid_2`, `log_reg_grid_3`, one per balanced dataset) and set `n_iter = 20`. This means `RandomizedSearchCV` will try 20 different combinations of hyperparameters from each grid and save the best ones.

In [None]:
# Seed NumPy's global random generator before the searches run
np.random.seed(42)

# One randomised hyperparameter search for LogisticRegression per balanced
# training set: 20 sampled candidates, each evaluated with 5-fold cross-validation.
rs_log_reg_1 = RandomizedSearchCV(LogisticRegression(),
                                  param_distributions=log_reg_grid_1,
                                  n_iter=20,
                                  cv=5,
                                  verbose=True)
rs_log_reg_2 = RandomizedSearchCV(LogisticRegression(),
                                  param_distributions=log_reg_grid_2,
                                  n_iter=20,
                                  cv=5,
                                  verbose=True)
rs_log_reg_3 = RandomizedSearchCV(LogisticRegression(),
                                  param_distributions=log_reg_grid_3,
                                  n_iter=20,
                                  cv=5,
                                  verbose=True)

# Run the searches in order (undersampled, oversampled, SMOTE)
for search, features, labels in (
    (rs_log_reg_1, X_train_1, y_train_1),
    (rs_log_reg_2, X_train_2, y_train_2),
    (rs_log_reg_3, X_train_3, y_train_3),
):
    search.fit(features, labels)

In [None]:
print(rs_log_reg_1.best_params_)
print(rs_log_reg_2.best_params_)
print(rs_log_reg_3.best_params_)

In [None]:
print(rs_log_reg_1.score(sc_X_test, y_test)) # tuning hyperparameters C and solver gives us a lower degree of accuracy 
print(rs_log_reg_2.score(sc_X_test, y_test))                           
print(rs_log_reg_3.score(sc_X_test, y_test))

In [None]:
# Seed NumPy's global random generator before the searches run
np.random.seed(42)

# Setup random hyperparameter search for LinearSVC, one per balanced training set.
# max_iter=10000 matches every other LinearSVC in this notebook; with the default
# iteration cap the solver is far more likely to stop before converging (especially
# for the large C values in the grid), which makes the comparison with the
# untuned LinearSVC models unfair.
rs_lin_svc_1 = RandomizedSearchCV(LinearSVC(max_iter=10000),
                                param_distributions=log_lin_svc_1,
                                cv=5,
                                n_iter=20,
                                verbose=True)

rs_lin_svc_2 = RandomizedSearchCV(LinearSVC(max_iter=10000),
                                param_distributions=log_lin_svc_2,
                                cv=5,
                                n_iter=20,
                                verbose=True)

rs_lin_svc_3 = RandomizedSearchCV(LinearSVC(max_iter=10000),
                                param_distributions=log_lin_svc_3,
                                cv=5,
                                n_iter=20,
                                verbose=True)

# Fit random hyperparameter search model (undersampled, oversampled, SMOTE)
rs_lin_svc_1.fit(X_train_1, y_train_1);
rs_lin_svc_2.fit(X_train_2, y_train_2);
rs_lin_svc_3.fit(X_train_3, y_train_3);

In [None]:
print(rs_lin_svc_1.best_params_)
print(rs_lin_svc_2.best_params_)
print(rs_lin_svc_3.best_params_)

In [None]:
print(rs_lin_svc_1.score(sc_X_test, y_test)) # tuning hyperparameters C and intercept_scaling gives 
print(rs_lin_svc_2.score(sc_X_test, y_test)) # us a lower degree of accuracy
print(rs_lin_svc_3.score(sc_X_test, y_test))

In [None]:
# Time taken to train the lInearSVC model.
%timeit rs_lin_svc_1.fit(X_train_1, y_train_1);
%timeit rs_lin_svc_2.fit(X_train_2, y_train_2);
%timeit rs_lin_svc_3.fit(X_train_3, y_train_3);

In [None]:
# Time taken to predict using the linearSVC model.
%timeit lsvc_model_1.predict(sc_X_test)
%timeit lsvc_model_2.predict(sc_X_test)
%timeit lsvc_model_3.predict(sc_X_test)

Interesting! Rather than giving a boost, tuning the hyperparameters led to a slight reduction in performance for both the `LinearSVC` and `LogisticRegression` models.

This is akin to tuning the settings on your oven and getting it to cook your favourite dish just right.

We give `LogisticRegression` another try to see if we can pull out something better using [`GridSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html).

### Tuning a model with [`GridSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

The difference between `RandomizedSearchCV` and `GridSearchCV` is where `RandomizedSearchCV` searches over a grid of hyperparameters performing `n_iter` combinations, `GridSearchCV` will test every single possible combination.

In short:
* `RandomizedSearchCV` - tries `n_iter` combinations of hyperparameters and saves the best.
* `GridSearchCV` - tries every single combination of hyperparameters and saves the best.

Let's see it in action.

In [None]:
# LogisticRegression grids for the exhaustive search: 20 log-spaced values of C
# with the liblinear solver. One independent dict per balanced training set.
gs_log_reg_grid_1 = {"C": np.logspace(-4, 4, 20),
                "solver": ["liblinear"]}
gs_log_reg_grid_2 = {"C": np.logspace(-4, 4, 20),
                "solver": ["liblinear"]}
gs_log_reg_grid_3 = {"C": np.logspace(-4, 4, 20),
                "solver": ["liblinear"]}

# Setup grid hyperparameter search for LogisticRegression (5-fold CV).
# Fixed: the searches now use the gs_log_reg_grid_* dictionaries defined in this
# cell instead of the log_reg_grid_* dictionaries from the RandomizedSearchCV
# section, so the grids above are no longer dead code.
gs_log_reg_1 = GridSearchCV(LogisticRegression(),
                          param_grid=gs_log_reg_grid_1,
                          cv=5,
                          verbose=True)
gs_log_reg_2 = GridSearchCV(LogisticRegression(),
                          param_grid=gs_log_reg_grid_2,
                          cv=5,
                          verbose=True)
gs_log_reg_3 = GridSearchCV(LogisticRegression(),
                          param_grid=gs_log_reg_grid_3,
                          cv=5,
                          verbose=True)

# Fit grid hyperparameter search model (undersampled, oversampled, SMOTE)
gs_log_reg_1.fit(X_train_1, y_train_1);
gs_log_reg_2.fit(X_train_2, y_train_2);
gs_log_reg_3.fit(X_train_3, y_train_3);

In [None]:
# Check the best parameters
print(gs_log_reg_1.best_params_)
print(gs_log_reg_2.best_params_)
print(gs_log_reg_3.best_params_)

In [None]:
# Evaluate the model
print(gs_log_reg_1.score(sc_X_test, y_test))
print(gs_log_reg_2.score(sc_X_test, y_test))
print(gs_log_reg_3.score(sc_X_test, y_test))

In [None]:
# Time taken to train the logistic Regression model.
%timeit gs_log_reg_1.fit(X_train_1, y_train_1);
%timeit gs_log_reg_2.fit(X_train_2, y_train_2);
%timeit gs_log_reg_3.fit(X_train_3, y_train_3);

In [None]:
# Time taken to predict using the logistic Regression model.
%timeit logreg_model_1.predict(sc_X_test)
%timeit logreg_model_2.predict(sc_X_test)
%timeit logreg_model_3.predict(sc_X_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# KNN with n_neighbors=1, one model per balanced training set
# (1 = undersampled, 2 = oversampled, 3 = SMOTE). fit() returns the estimator.
knn_model_1 = KNeighborsClassifier(n_neighbors=1).fit(X_train_1, y_train_1)
knn_model_2 = KNeighborsClassifier(n_neighbors=1).fit(X_train_2, y_train_2)
knn_model_3 = KNeighborsClassifier(n_neighbors=1).fit(X_train_3, y_train_3)

In [None]:
# Time taken to train the KNN model.
%timeit knn_model_1.fit(X_train_1, y_train_1);
%timeit knn_model_2.fit(X_train_2, y_train_2);
%timeit knn_model_3.fit(X_train_3, y_train_3);

In [None]:
# Time taken to predict the KNN model.
%timeit knn_model_1.predict(sc_X_test)
%timeit knn_model_2.predict(sc_X_test)
%timeit knn_model_3.predict(sc_X_test)

In [None]:
knn_preds_1 = knn_model_1.predict(sc_X_test)
knn_preds_2 = knn_model_2.predict(sc_X_test)
knn_preds_3 = knn_model_3.predict(sc_X_test)

In [None]:
knn_preds_2[:10]

In [None]:
y_test[:10]

## Evaluating a classification model, beyond accuracy

Now we've got a tuned model, let's get some of the metrics we discussed before.

We want:
* ROC curve and AUC score - [`plot_roc_curve()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.plot_roc_curve.html#sklearn.metrics.plot_roc_curve)
* Confusion matrix - [`confusion_matrix()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html)
* Classification report - [`classification_report()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html)
* Precision - [`precision_score()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html)
* Recall - [`recall_score()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html)
* F1-score - [`f1_score()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)


To access them, we'll have to use our model to make predictions on the test set. You can make predictions by calling `predict()` on a trained model and passing it the data you'd like to predict on.

We'll make predictions on the test data.

In [None]:
# Make predictions on test data for Logistic Regression
lr_preds_1 = logreg_model_1.predict(sc_X_test)
lr_preds_2 = logreg_model_2.predict(sc_X_test)
lr_preds_3 = logreg_model_3.predict(sc_X_test)

Let's see them.

They look like our original test data labels, except they differ wherever the model has predicted wrong.

In [None]:
# Make predictions on test data for LinearSVC
lsvc_preds_1 = lsvc_model_1.predict(sc_X_test)
lsvc_preds_2 = lsvc_model_2.predict(sc_X_test)
lsvc_preds_3 = lsvc_model_3.predict(sc_X_test)

Since we've got our prediction values we can find the metrics we want.

Let's start with the ROC curve and AUC scores.

### ROC Curve and AUC Scores

It's a way of understanding how our model is performing by comparing the true positive rate to the false positive rate.

In our case, a false positive occurs when a wine is predicted to be high quality but is actually low quality. A false negative, on the other hand, occurs when a wine is predicted to be low quality but is actually high quality.


In [None]:
# Seed NumPy's global random generator before any fitting
np.random.seed(42)

# Refit one LogisticRegression per balanced training set (1 = undersampled,
# 2 = oversampled, 3 = SMOTE). fit() returns the fitted estimator itself.
logreg_model_1 = LogisticRegression().fit(X_train_1, y_train_1)
logreg_model_2 = LogisticRegression().fit(X_train_2, y_train_2)
logreg_model_3 = LogisticRegression().fit(X_train_3, y_train_3)

# Report each LogisticRegression's score on the (scaled) test set as a percentage
print(f"Logistic Regression Model Score with Undersampling Balancing method: {logreg_model_1.score(sc_X_test, y_test)*100:.2f}%")
print(f"Logistic Regression Model Score with Oversampling Balancing method: {logreg_model_2.score(sc_X_test, y_test)*100:.2f}%")
print(f"Logistic Regression Model Score with SMOTE Balancing method: {logreg_model_3.score(sc_X_test, y_test)*100:.2f}%")

In [None]:
lr_preds_1 = logreg_model_1.predict(sc_X_test)
lr_preds_2 = logreg_model_2.predict(sc_X_test)
lr_preds_3 = logreg_model_3.predict(sc_X_test)

In [None]:
# Logistic Regression ROC curves, one per balancing method, on a shared plot.
# `metrics` is imported here because it is first needed in this cell.
# (plot_roc_curve is already imported in the first cell and is not used here,
# so the redundant re-import was dropped.)
from sklearn import metrics

# Fit one logistic regression per balanced training set
logreg_model_1 = LogisticRegression()
logreg_model_1.fit(X_train_1, y_train_1)
logreg_model_2 = LogisticRegression()
logreg_model_2.fit(X_train_2, y_train_2)
logreg_model_3 = LogisticRegression()
logreg_model_3.fit(X_train_3, y_train_3)

# Hard 0/1 class predictions on the test set for each model
lr_preds_1 = logreg_model_1.predict(sc_X_test)
lr_preds_2 = logreg_model_2.predict(sc_X_test)
lr_preds_3 = logreg_model_3.predict(sc_X_test)  # fixed: was predicted with logreg_model_2

# A ROC curve sweeps a decision threshold over a continuous score. Feeding it
# hard 0/1 predictions collapses the curve to a single operating point and
# understates AUC, so score each sample with the predicted probability of the
# positive class (column 1 = label 1, "high quality").
for legend_prefix, fitted_model in (
    ("UnderSampling, AUC=", logreg_model_1),
    ("OverSampling, AUC=", logreg_model_2),
    ("SMOTE Sampling, AUC=", logreg_model_3),
):
    high_quality_proba = fitted_model.predict_proba(sc_X_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, high_quality_proba)
    auc = round(metrics.roc_auc_score(y_test, high_quality_proba), 2)
    plt.plot(fpr, tpr, label=legend_prefix + str(auc))

# Diagonal reference line: the ROC of a classifier that guesses at random
plt.plot([0,1], [0,1], 'r--')
plt.xlim([-0.01, 1.01])
plt.ylim([0,1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Logistic Regression ROC Curve')
#add legend
plt.legend(loc = 'lower right')
plt.show()

In [None]:
# 1-nearest-neighbour ROC comparison across the three balanced training sets.
# fit() returns the fitted estimator, so each model is built and trained in one line.
knn_model_1 = KNeighborsClassifier(n_neighbors=1).fit(X_train_1, y_train_1)
knn_preds_1 = knn_model_1.predict(sc_X_test)

knn_model_2 = KNeighborsClassifier(n_neighbors=1).fit(X_train_2, y_train_2)
knn_preds_2 = knn_model_2.predict(sc_X_test)

knn_model_3 = KNeighborsClassifier(n_neighbors=1).fit(X_train_3, y_train_3)
knn_preds_3 = knn_model_3.predict(sc_X_test)

# Draw one curve per balancing strategy, in the same order as the models above
for curve_prefix, curve_preds in (
    ("UnderSampling, AUC=", knn_preds_1),
    ("OverSampling, AUC=", knn_preds_2),
    ("SMOTE Sampling, AUC=", knn_preds_3),
):
    fpr, tpr, _ = metrics.roc_curve(y_test, curve_preds)
    auc = round(metrics.roc_auc_score(y_test, curve_preds), 2)
    plt.plot(fpr, tpr, label=curve_prefix + str(auc))

# Dashed red diagonal: reference line from (0, 0) to (1, 1)
plt.plot([0,1], [0,1], 'r--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('K-Neighbours ROC Curve')
plt.legend(loc = 'lower right')
plt.show()

In [None]:
# LinearSVC ROC comparison across the three balanced training sets.
# Models are fitted in the original 1 -> 2 -> 3 order; max_iter is raised to 10000.
lsvc_model_1 = LinearSVC(max_iter=10000).fit(X_train_1, y_train_1)
lsvc_preds_1 = lsvc_model_1.predict(sc_X_test)

lsvc_model_2 = LinearSVC(max_iter=10000).fit(X_train_2, y_train_2)
lsvc_preds_2 = lsvc_model_2.predict(sc_X_test)

lsvc_model_3 = LinearSVC(max_iter=10000).fit(X_train_3, y_train_3)
lsvc_preds_3 = lsvc_model_3.predict(sc_X_test)

# Draw one curve per balancing strategy, in the same order as the models above
for curve_prefix, curve_preds in (
    ("UnderSampling, AUC=", lsvc_preds_1),
    ("OverSampling, AUC=", lsvc_preds_2),
    ("SMOTE Sampling, AUC=", lsvc_preds_3),
):
    fpr, tpr, _ = metrics.roc_curve(y_test, curve_preds)
    auc = round(metrics.roc_auc_score(y_test, curve_preds), 2)
    plt.plot(fpr, tpr, label=curve_prefix + str(auc))

# Dashed red diagonal: reference line from (0, 0) to (1, 1)
plt.plot([0,1], [0,1], 'r--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('LinearSVC ROC Curve')
plt.legend(loc = 'lower right')
plt.show()

Our model does far better than guessing (which would be a line going from the bottom left corner to the top right corner), with AUC = 0.72. But a perfect model would achieve an AUC score of 1.0, so there's still room for improvement.


### Confusion matrix 

A confusion matrix is a visual way to show where your model made the right predictions and where it made the wrong predictions (or in other words, got confused).

Scikit-Learn allows us to create a confusion matrix using [`confusion_matrix()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html) and passing it the true labels and predicted labels.

Since we are presenting a paper we want to make it visual.

Let's create a function which uses Seaborn's [`heatmap()`](https://seaborn.pydata.org/generated/seaborn.heatmap.html) for doing so.

In [None]:
sns.set(font_scale=1) # apply the seaborn theme at font_scale=1 (base size, no enlargement)

def plot_conf_mat(y_test, lr_preds_1, title="LogisticRegression Undersampling\n Confusion Matrix"):
    """
    Plots a confusion matrix using Seaborn's heatmap().

    Parameters
    ----------
    y_test : array-like
        True labels.
    lr_preds_1 : array-like
        Predicted labels to compare against `y_test`.
    title : str, optional
        Figure title. Defaults to the logistic-regression / undersampling
        caption, so existing two-argument calls render exactly as before.
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    sns.heatmap(confusion_matrix(y_test, lr_preds_1),
                annot=True, # Annotate the boxes
                cmap='YlGnBu', fmt='g',
                ax=ax) # draw on the axes created above, not the implicit current axes
    ax.set_xlabel("Predicted Quality") # predictions go on the x-axis
    ax.set_ylabel("True Quality") # true labels go on the y-axis
    ax.set_title(title)
plot_conf_mat(y_test, lr_preds_1)

In [None]:
sns.set(font_scale=1) # apply the seaborn theme at font_scale=1 (base size, no enlargement)

def plot_conf_mat(y_test, lr_preds_2, title="LogisticRegression Oversampling\n Confusion Matrix"):
    """
    Plots a confusion matrix using Seaborn's heatmap().

    Parameters
    ----------
    y_test : array-like
        True labels.
    lr_preds_2 : array-like
        Predicted labels to compare against `y_test`.
    title : str, optional
        Figure title. Defaults to the logistic-regression / oversampling
        caption, so existing two-argument calls render exactly as before.
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    sns.heatmap(confusion_matrix(y_test, lr_preds_2),
                annot=True, # Annotate the boxes
                cmap='YlGnBu', fmt='g',
                ax=ax) # draw on the axes created above, not the implicit current axes
    ax.set_xlabel("Predicted Quality") # predictions go on the x-axis
    ax.set_ylabel("True Quality") # true labels go on the y-axis
    ax.set_title(title)
plot_conf_mat(y_test, lr_preds_2)

In [None]:

sns.set(font_scale=1) # apply the seaborn theme at font_scale=1 (base size, no enlargement)

def plot_conf_mat(y_test, lr_preds_3, title="LogisticRegression SMOTE sampling\n Confusion Matrix"):
    """
    Plots a confusion matrix using Seaborn's heatmap().

    Parameters
    ----------
    y_test : array-like
        True labels.
    lr_preds_3 : array-like
        Predicted labels to compare against `y_test`.
    title : str, optional
        Figure title. Defaults to the logistic-regression / SMOTE caption,
        so existing two-argument calls render exactly as before.
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    sns.heatmap(confusion_matrix(y_test, lr_preds_3),
                annot=True, # Annotate the boxes
                cmap='YlGnBu', fmt='g',
                ax=ax) # draw on the axes created above, not the implicit current axes
    ax.set_xlabel("Predicted Quality") # predictions go on the x-axis
    ax.set_ylabel("True Quality") # true labels go on the y-axis
    ax.set_title(title)
plot_conf_mat(y_test, lr_preds_3)

In [None]:

sns.set(font_scale=1) # apply the seaborn theme at font_scale=1 (base size, no enlargement)

def plot_conf_mat(y_test, lsvc_preds_1, title="LinearSVC Undersampling\n Confusion Matrix"):
    """
    Plots a confusion matrix using Seaborn's heatmap().

    Parameters
    ----------
    y_test : array-like
        True labels.
    lsvc_preds_1 : array-like
        Predicted labels to compare against `y_test`.
    title : str, optional
        Figure title. Defaults to the LinearSVC / undersampling caption,
        so existing two-argument calls render exactly as before.
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    sns.heatmap(confusion_matrix(y_test, lsvc_preds_1),
                annot=True, # Annotate the boxes
                cmap='YlGnBu', fmt='g',
                ax=ax) # draw on the axes created above, not the implicit current axes
    ax.set_xlabel("Predicted Quality") # predictions go on the x-axis
    ax.set_ylabel("True Quality") # true labels go on the y-axis
    ax.set_title(title)
plot_conf_mat(y_test, lsvc_preds_1)

In [None]:

sns.set(font_scale=1) # apply the seaborn theme at font_scale=1 (base size, no enlargement)

def plot_conf_mat(y_test, lsvc_preds_2, title="LinearSVC Oversampling\n Confusion Matrix"):
    """
    Plots a confusion matrix using Seaborn's heatmap().

    Parameters
    ----------
    y_test : array-like
        True labels.
    lsvc_preds_2 : array-like
        Predicted labels to compare against `y_test`.
    title : str, optional
        Figure title. Defaults to the LinearSVC / oversampling caption,
        so existing two-argument calls render exactly as before.
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    sns.heatmap(confusion_matrix(y_test, lsvc_preds_2),
                annot=True, # Annotate the boxes
                cmap='YlGnBu', fmt='g',
                ax=ax) # draw on the axes created above, not the implicit current axes
    ax.set_xlabel("Predicted Quality") # predictions go on the x-axis
    ax.set_ylabel("True Quality") # true labels go on the y-axis
    ax.set_title(title)
plot_conf_mat(y_test, lsvc_preds_2)

In [None]:

sns.set(font_scale=1) # apply the seaborn theme at font_scale=1 (base size, no enlargement)

def plot_conf_mat(y_test, lsvc_preds_3, title="LinearSVC SMOTE sampling\n Confusion Matrix"):
    """
    Plots a confusion matrix using Seaborn's heatmap().

    Parameters
    ----------
    y_test : array-like
        True labels.
    lsvc_preds_3 : array-like
        Predicted labels to compare against `y_test`.
    title : str, optional
        Figure title. Defaults to the LinearSVC / SMOTE caption,
        so existing two-argument calls render exactly as before.
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    sns.heatmap(confusion_matrix(y_test, lsvc_preds_3),
                annot=True, # Annotate the boxes
                cmap='YlGnBu', fmt='g',
                ax=ax) # draw on the axes created above, not the implicit current axes
    ax.set_xlabel("Predicted Quality") # predictions go on the x-axis
    ax.set_ylabel("True Quality") # true labels go on the y-axis
    ax.set_title(title)
plot_conf_mat(y_test, lsvc_preds_3)

In [None]:
# Theme is applied before plotting; it used to run inside the function after the
# heatmap was already drawn, where it could only affect later figures.
sns.set(font_scale=1)

def plot_conf_mat(y_test, knn_preds_1, title="KNN Undersampling\n Confusion Matrix"):
    """
    Plots a confusion matrix using Seaborn's heatmap().

    Parameters
    ----------
    y_test : array-like
        True labels.
    knn_preds_1 : array-like
        Predicted labels to compare against `y_test`.
    title : str, optional
        Figure title. Defaults to the KNN / undersampling caption,
        so existing two-argument calls render exactly as before.
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    sns.heatmap(confusion_matrix(y_test, knn_preds_1),
                annot=True, # Annotate the boxes
                cmap='YlGnBu', fmt='g',
                ax=ax) # draw on the axes created above, not the implicit current axes
    ax.set_xlabel("Predicted Quality") # predictions go on the x-axis
    ax.set_ylabel("True Quality") # true labels go on the y-axis
    ax.set_title(title)

plot_conf_mat(y_test, knn_preds_1)

In [None]:
def plot_conf_mat(y_test, knn_preds_2, title="KNN Oversampling\n Confusion Matrix"):
    """
    Plots a confusion matrix using Seaborn's heatmap().

    Parameters
    ----------
    y_test : array-like
        True labels.
    knn_preds_2 : array-like
        Predicted labels to compare against `y_test`.
    title : str, optional
        Figure title. Defaults to the KNN / oversampling caption,
        so existing two-argument calls render exactly as before.
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    sns.heatmap(confusion_matrix(y_test, knn_preds_2),
                annot=True, # Annotate the boxes
                cmap='YlGnBu', fmt='g',
                ax=ax) # draw on the axes created above, not the implicit current axes
    ax.set_xlabel("Predicted Quality") # predictions go on the x-axis
    ax.set_ylabel("True Quality") # true labels go on the y-axis
    ax.set_title(title)

plot_conf_mat(y_test, knn_preds_2)

In [None]:
def plot_conf_mat(y_test, knn_preds_3, title="KNN SMOTE\n Confusion Matrix"):
    """
    Plots a confusion matrix using Seaborn's heatmap().

    Parameters
    ----------
    y_test : array-like
        True labels.
    knn_preds_3 : array-like
        Predicted labels to compare against `y_test`.
    title : str, optional
        Figure title. Defaults to the KNN / SMOTE caption,
        so existing two-argument calls render exactly as before.
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    sns.heatmap(confusion_matrix(y_test, knn_preds_3),
                annot=True, # Annotate the boxes
                cmap='YlGnBu', fmt='g',
                ax=ax) # draw on the axes created above, not the implicit current axes
    ax.set_xlabel("Predicted Quality") # predictions go on the x-axis
    ax.set_ylabel("True Quality") # true labels go on the y-axis
    ax.set_title(title) # caption naming the model and balancing method

plot_conf_mat(y_test, knn_preds_3)

Beautiful! That looks much better. 

We can see the model gets confused (predicts the wrong label) at a roughly similar rate across both classes. In essence, there are several occasions where the model predicted 0 when it should've been 1 (false negative) and occasions where the model predicted 1 instead of 0 (false positive).

### Classification report

We can make a classification report using [`classification_report()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html) and passing it the true labels as well as our models predicted labels. 

A classification report will also give us information of the precision and recall of our model for each class.

In [None]:
# Per-class precision / recall / F1 for knn_preds_1 (KNN trained on the undersampled set)
print(classification_report(y_true=y_test, y_pred=knn_preds_1))

In [None]:
# Per-class precision / recall / F1 for knn_preds_2 (KNN trained on the oversampled set)
print(classification_report(y_true=y_test, y_pred=knn_preds_2))

In [None]:
# Per-class precision / recall / F1 for knn_preds_3 (KNN trained on the SMOTE set)
print(classification_report(y_true=y_test, y_pred=knn_preds_3))

In [None]:
# Per-class precision / recall / F1 for lr_preds_1 (logistic regression, undersampled set)
print(classification_report(y_true=y_test, y_pred=lr_preds_1))

In [None]:
# Per-class precision / recall / F1 for lr_preds_2 (logistic regression, oversampled set)
print(classification_report(y_true=y_test, y_pred=lr_preds_2))

In [None]:
# Per-class precision / recall / F1 for lr_preds_3 (logistic regression; intended as the SMOTE variant)
print(classification_report(y_true=y_test, y_pred=lr_preds_3))

In [None]:
# Per-class precision / recall / F1 for lsvc_preds_1 (LinearSVC trained on the undersampled set)
print(classification_report(y_true=y_test, y_pred=lsvc_preds_1))

In [None]:
# Per-class precision / recall / F1 for lsvc_preds_2 (LinearSVC trained on the oversampled set)
print(classification_report(y_true=y_test, y_pred=lsvc_preds_2))

In [None]:
# Per-class precision / recall / F1 for lsvc_preds_3 (LinearSVC trained on the SMOTE set)
print(classification_report(y_true=y_test, y_pred=lsvc_preds_3))

What's going on here?

Let's get a refresh.

* **Precision** - Indicates the proportion of positive identifications (model predicted class 1) which were actually correct. A model which produces no false positives has a precision of 1.0.
* **Recall** - Indicates the proportion of actual positives which were correctly classified. A model which produces no false negatives has a recall of 1.0.
* **F1 score** - A combination of precision and recall. A perfect model achieves an F1 score of 1.0.
* **Support** - The number of samples each metric was calculated on.
* **Accuracy** - The accuracy of the model in decimal form. Perfect accuracy is equal to 1.0.
* **Macro avg** - Short for macro average, the average precision, recall and F1 score between classes. Macro avg doesn’t take class imbalance into account, so if you do have class imbalances, pay attention to this metric.
* **Weighted avg** - Short for weighted average, the weighted average precision, recall and F1 score between classes. Weighted means each metric is calculated with respect to how many samples there are in each class. This metric will favour the majority class (e.g. will give a high value when one class out performs another due to having more samples).

Ok, now we've got a few deeper insights on our model. But these were all calculated using a single training and test set.

What we'll do to make them more solid is calculate them using cross-validation.

How?

We'll take the best model along with the best hyperparameters and use [`cross_val_score()`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html) along with various `scoring` parameter values.

In [None]:
# Check best hyperparameters
# gs_log_reg.best_params_

In [None]:
# Import cross_val_score
from sklearn.model_selection import cross_val_score

# Instantiate the classifier that the cross-validation cells below will score.
# NOTE(review): LogisticRegression() is built with default hyperparameters here;
# no GridSearchCV best_params_ are passed in — confirm whether tuned values were intended.
clf = LogisticRegression()

Now we've got an instantiated classifier, let's find some cross-validated metrics.

In [None]:
# 5-fold cross-validated accuracy of `clf` on each balanced training set,
# evaluated in order: _1 (undersampling), _2 (oversampling), _3 (SMOTE).
cv_acc_1, cv_acc_2, cv_acc_3 = (
    cross_val_score(clf, train_X, train_y, cv=5, scoring="accuracy")
    for train_X, train_y in (
        (X_train_1, y_train_1),
        (X_train_2, y_train_2),
        (X_train_3, y_train_3),
    )
)
# Each result holds one accuracy value per fold
print('Cross Validtion Accuracy Score with Undersampling', cv_acc_1)
print('Cross Validtion Accuracy Score with Oversampling', cv_acc_2)
print('Cross Validtion Accuracy Score with SMOTE sampling', cv_acc_3)

Since cross-validation returns 5 scores here (one per fold), we'll take the average.

In [None]:
# Standardise the full feature matrix X: learn the scaling and apply it in one step
scaler = StandardScaler()
X_scale = scaler.fit_transform(X)
dt_scaler = scaler  # fit() returns the estimator itself, so both names refer to the fitted scaler

In [None]:
# Target vector: the "label" column of the wine dataframe
y = wine_quality.loc[:, "label"]
len(y)  # displayed as the cell output: number of target values

In [None]:
# Balancing by undersampling: request 1300 samples for class 0 and 1300 for class 1
UNDERSAMPLE_TARGETS = {0: 1300, 1: 1300}  # requested sample count per class label
X_scale_1, y1 = make_imbalance(
    X_scale,
    y,
    sampling_strategy=UNDERSAMPLE_TARGETS,
    random_state=14,  # fixed seed so the same rows are drawn on every run
)

In [None]:
# balancing it Oversampling
# NOTE(review): `ros` is created outside this cell — presumably a random
# over-sampler; verify against its definition.
# The call's two return values are unpacked as resampled features and labels.
X_scale_2, y2 = ros.fit_resample(X_scale, y)

In [None]:
# balancing it SMOTE - Oversampling
# NOTE(review): `smote` is created outside this cell — presumably an instance of
# the imblearn SMOTE class imported at the top; verify against its definition.
# The call's two return values are unpacked as resampled features and labels.
X_scale_3, y3 = smote.fit_resample(X_scale, y)

In [None]:
# Import cross_val_score
from sklearn.model_selection import cross_val_score

# Rebind `clf` to a fresh classifier for the cross-validation cells that follow.
# NOTE(review): LogisticRegression() is built with default hyperparameters here;
# no GridSearchCV best_params_ are passed in — confirm whether tuned values were intended.
clf = LogisticRegression()

In [None]:
# 5-fold cross-validated accuracy of the logistic regression `clf` on each
# resampled copy of the scaled data: 1 = undersampling, 2 = oversampling, 3 = SMOTE.
cv_acc_lr1, cv_acc_lr2, cv_acc_lr3 = (
    cross_val_score(clf, balanced_X, balanced_y, cv=5, scoring="accuracy")
    for balanced_X, balanced_y in (
        (X_scale_1, y1),
        (X_scale_2, y2),
        (X_scale_3, y3),
    )
)
# Each result holds one accuracy value per fold
print('Cross Validtion Accuracy Score with Undersampling', cv_acc_lr1)
print('Cross Validtion Accuracy Score with Oversampling', cv_acc_lr2)
print('Cross Validtion Accuracy Score with SMOTE sampling', cv_acc_lr3)

In [None]:
# Average the per-fold logistic regression accuracies (undersampling, oversampling, SMOTE)
cv_acc_lrm1, cv_acc_lrm2, cv_acc_lrm3 = (
    np.mean(fold_scores) for fold_scores in (cv_acc_lr1, cv_acc_lr2, cv_acc_lr3)
)
# One mean per line, in the same order as above
print(cv_acc_lrm1, cv_acc_lrm2, cv_acc_lrm3, sep="\n")

In [None]:
# 1-nearest-neighbour classifier, cross-validated (5 folds, accuracy) on each
# resampled copy of the scaled data: 1 = undersampling, 2 = oversampling, 3 = SMOTE.
clf_knn = KNeighborsClassifier(n_neighbors=1)
cv_acc_knn1, cv_acc_knn2, cv_acc_knn3 = (
    cross_val_score(clf_knn, balanced_X, balanced_y, cv=5, scoring="accuracy")
    for balanced_X, balanced_y in (
        (X_scale_1, y1),
        (X_scale_2, y2),
        (X_scale_3, y3),
    )
)
# Each result holds one accuracy value per fold
print('Cross Validtion Accuracy Score with Undersampling', cv_acc_knn1)
print('Cross Validtion Accuracy Score with Oversampling', cv_acc_knn2)
print('Cross Validtion Accuracy Score with SMOTE sampling', cv_acc_knn3)

In [None]:
# Average the per-fold KNN accuracies (undersampling, oversampling, SMOTE)
cv_acc_knn_m1, cv_acc_knn_m2, cv_acc_knn_m3 = (
    np.mean(fold_scores) for fold_scores in (cv_acc_knn1, cv_acc_knn2, cv_acc_knn3)
)
# One mean per line, in the same order as above
print(cv_acc_knn_m1, cv_acc_knn_m2, cv_acc_knn_m3, sep="\n")

In [None]:
# LinearSVC (max_iter raised to 10000), cross-validated (5 folds, accuracy) on each
# resampled copy of the scaled data: 1 = undersampling, 2 = oversampling, 3 = SMOTE.
clf_lsvc = LinearSVC(max_iter=10000)
cv_acc_lsvc1, cv_acc_lsvc2, cv_acc_lsvc3 = (
    cross_val_score(clf_lsvc, balanced_X, balanced_y, cv=5, scoring="accuracy")
    for balanced_X, balanced_y in (
        (X_scale_1, y1),
        (X_scale_2, y2),
        (X_scale_3, y3),
    )
)
# Each result holds one accuracy value per fold
print('Cross Validtion Accuracy Score with Undersampling', cv_acc_lsvc1)
print('Cross Validtion Accuracy Score with Oversampling', cv_acc_lsvc2)
print('Cross Validtion Accuracy Score with SMOTE sampling', cv_acc_lsvc3)

In [None]:
# Average the per-fold LinearSVC accuracies (undersampling, oversampling, SMOTE)
cv_acc_lsvc_m1, cv_acc_lsvc_m2, cv_acc_lsvc_m3 = (
    np.mean(fold_scores) for fold_scores in (cv_acc_lsvc1, cv_acc_lsvc2, cv_acc_lsvc3)
)
# One mean per line, in the same order as above
print(cv_acc_lsvc_m1, cv_acc_lsvc_m2, cv_acc_lsvc_m3, sep="\n")


Another model evaluation technique is feature importance.

## Feature importance

Feature importance is another way of asking, "which features contribute most to the outcomes of the model?"

In [None]:
# Peek at the first five rows to recall the column names and order
wine_quality.head(n=5)

In [None]:
# Pair each dataframe column name with the matching coefficient of logreg_model_3.
# zip() stops at the shorter input, so any columns beyond the number of
# coefficients are dropped without warning.
# NOTE(review): assumes the leading columns of wine_quality are exactly the model's
# features, in training order — confirm against how X was built.
features_dict = {
    column_name: coefficient
    for column_name, coefficient in zip(wine_quality.columns, logreg_model_3.coef_[0])
}
features_dict

In [None]:
# Bar chart of the coefficients collected in features_dict
features_df = pd.DataFrame(data=features_dict, index=[0])  # one row, one column per name
# Transpose so each name becomes a bar; the trailing ';' hides the Axes repr
features_df.transpose().plot(kind="bar", title="Feature Importance", legend=False);