<a href="https://colab.research.google.com/github/leemichaelwaters/ml-examples/blob/main/2_Predict_wine_quality_(random_forest_classification).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Summary
- Import wine data
  - 11 features and label assessing quality 0-10
- EDA performed; no data quality issues found
- Data split into training (80%) and testing (20%) sets
- Data standardized to mean value of 0 and standard deviation of 1
- Random forest classifier defined with 300 trees
- Model is validated with 5-fold CV and mean accuracy around 69%
- Grid search performed to tune n_estimators
- Model re-fit with best parameters (1000 trees)
- Model used to generate predictions on test data
- Accuracy of predictions on the test set is about 66% (0.656)


# Import data

In [None]:
import pandas as pd

# Red-wine quality dataset hosted by the UCI repository.
# Fetched over HTTPS (not plain HTTP) so the file cannot be altered in transit.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'

# The file is semicolon-delimited, hence sep=';'.
wine = pd.read_csv(url, sep=';')

# Exploratory data analysis

In [None]:
# Preview the first rows to sanity-check column names and values.
print('Head:')
print(wine.head())

print('')

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
print('Info:')
wine.info()

print('')

# (rows, columns) of the loaded frame.
print('Shape:')
print(wine.shape)

Head:
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8    

# Train and test model

In [None]:
# Import packages
from sklearn.model_selection import train_test_split

# Separate the target column from the predictor columns
y = wine['quality']
X = wine.drop('quality', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shape of each of the four pieces on a single line
print('Xtrain, Xtest, ytrain, ytest:')
print(*(part.shape for part in (Xtrain, Xtest, ytrain, ytest)))

Xtrain, Xtest, ytrain, ytest:
(1279, 11) (320, 11) (1279,) (320,)


In [None]:
# Standardize data
from sklearn import preprocessing

# Learn the per-feature mean and sd from the training split only, then apply
# that same scaling to both splits
scaler = preprocessing.StandardScaler()
Xtrain = scaler.fit_transform(Xtrain)
Xtest = scaler.transform(Xtest)

# Per-feature statistics after scaling; the trailing '' prints the blank
# separator line between sections
print('Xtrain mean:', Xtrain.mean(axis=0), '', sep='\n')
print('Xtrain sd:', Xtrain.std(axis=0), '', sep='\n')
print('Xtest mean:', Xtest.mean(axis=0), '', sep='\n')
print('Xtest sd:', Xtest.std(axis=0), sep='\n')

Xtrain mean:
[-1.11109106e-16  4.11798126e-16  1.26386609e-16  1.52775021e-17
 -1.04164787e-16 -7.63875107e-17 -7.56930788e-17  2.55939827e-14
 -1.35830883e-15  4.15270285e-16  4.77769158e-16]

Xtrain sd:
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

Xtest mean:
[-0.01175055 -0.07635958 -0.03823836 -0.05802946 -0.09945134 -0.00055896
 -0.02873498 -0.0735317  -0.01741397 -0.05366908  0.02318761]

Xtest sd:
[1.04825878 0.99202249 0.98315693 0.90535833 0.73761962 1.06953655
 0.99296586 1.08025655 1.01224123 0.84316158 1.06032677]


In [None]:
# Train model

# Define packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Define model
# n_estimators is the number of trees. random_state seeds the forest's
# randomness so the CV scores below are the same on every run
# (42 matches the seed already used for the train/test split).
model = RandomForestClassifier(n_estimators=300, random_state=42)

# K-fold cv: 5 folds over the training split only
scores = cross_val_score(model, Xtrain, ytrain, cv=5)

# One accuracy value per fold
print('Scores:')
print(scores)

print('')

# Average accuracy across the folds
print('Mean scores:')
print(np.mean(scores))

print('')

# Fold-to-fold spread of the accuracy
print('SD scores:')
print(np.std(scores))

Scores:
[0.6640625  0.69921875 0.65234375 0.69921875 0.75686275]

Mean scores:
0.6943412990196078

SD scores:
0.03643380317670562


In [None]:
# Perform gridsearch

# Import packages
from sklearn.model_selection import GridSearchCV

# Define grid: candidate numbers of trees to compare
param_grid = {'n_estimators': [100, 300, 500, 800, 1000]}

# Define model
# The forest is seeded so every candidate is evaluated under the same random
# state; without it, differences between n_estimators values are mixed with
# seed noise and best_params_ can change from run to run.
grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)

# Fit model: 5-fold CV on the training split for each candidate
grid.fit(Xtrain, ytrain)

print('Best params:')
print(grid.best_params_)

Best params:
{'n_estimators': 1000}


In [None]:
# Assess test accuracy

# Import packages
from sklearn.metrics import accuracy_score

# Take the winning estimator from the grid search
model = grid.best_estimator_

# Predict labels for the held-out test features
y_model = model.predict(Xtest)

# Fraction of test labels predicted correctly
print('Accuracy:', accuracy_score(ytest, y_model), sep='\n')

Accuracy:
0.65625
