In [92]:
# Created on Tue Dec 18 10:29:48 2018
# @author: Shashank Misra
# Analysis of Wine specific attributes to determine its Quality.

In [93]:
# Import libraries

In [94]:
# For numerical computations
import pandas as pd

# For dataframes and data handling
import numpy as np

# Module to choose between multiple Machine Learning models
from sklearn.model_selection import train_test_split

# Module for scaling, transforming and wrangling data
from sklearn import preprocessing

# Import Random Forest family
from sklearn.ensemble import RandomForestRegressor

# Import cross-validation tools
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# Import evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score

# Import module for saving scikit-learn models for future use
# Joblib is alternate to Python Pickle
from sklearn.externals import joblib

In [95]:
# Load Data

In [96]:
# Import from URL
dataset_url = "http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
wine_data = pd.read_csv(dataset_url)

In [97]:
# Peek at data from dataset - 10 rows
print(wine_data.head(10))

  fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
0   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                                     
1   7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5                                                                                                                     
2  7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...                                                                                                                     
3  11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...                                                                                                                     
4   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                  

In [98]:
# Importing data using seperation by ';'
wine_data = pd.read_csv(dataset_url, sep = ';')
print(wine_data.head(10))

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   
5            7.4              0.66         0.00             1.8      0.075   
6            7.9              0.60         0.06             1.6      0.069   
7            7.3              0.65         0.00             1.2      0.065   
8            7.8              0.58         0.02             2.0      0.073   
9            7.5              0.50         0.36             6.1      0.071   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0

In [99]:
# Shape of the data
print(wine_data.shape)

(1599, 12)


In [100]:
# Summary statistics
print(wine_data.describe())

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.319637          0.527821     0.270976        2.538806   
std         1.741096          0.179060     0.194801        1.409928   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.200000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  1599.000000          1599.000000           1599.000000  1599.000000   
mean      0.087467            15.874922             46.467792     0.996747   
std       0.047065            10.460157             32.895324     0.001887   
min       0.012000             1.000000         

In [101]:
# As seen above, various attributes have very different scales. So, we
# will need to standardize the data later.

In [102]:
# Data Split - Test and Train

In [103]:
# Separate Target (y) from Training features (x)
# "quality" is our target variable
y = wine_data.quality
X = wine_data.drop("quality", axis = 1)

In [104]:
# Split data into Train and Test sets
# test_size = 0.2 - 20% data for Test
# Set arbitrary Random state (seed) to reproduce the results
# Stratify by target variable - Ensure Training set looks similar to Test set
# (Evaluation metrics more reliable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123,
                                                    stratify = y)

In [105]:
# Data Preprocessing

In [106]:
# Standardization is the process of subtracting the means from each feature
# and then dividing by the feature standard deviations.
# Many algorithms assume that all features are centered around zero
# and have approximately the same variance.
# X_train_scaled = preprocessing.scale(X_train)
# print(X_train_scaled)
# View - Scaled data centered at zero
# print("Mean")
# print(X_train_scaled.mean(axis=0))
# print("Standard Deviation")
# print(X_train_scaled.std(axis=0))
# Using above will result in not scaling Test set same as Training set.

In [107]:
# Using scikit-learn Transformer API
scaler = preprocessing.StandardScaler().fit(X_train)

# scaler object has saved Means and Standard Deviations for each Training set feature
X_train_scaled = scaler.transform(X_train)
print("Mean")
print(X_train_scaled.mean(axis = 0))
print("Standard Deviation")
print(X_train_scaled.std(axis = 0))

Mean
[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.22218213e-17 -6.38877362e-17 -4.16659149e-18 -2.54439854e-15
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]
Standard Deviation
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [108]:
# Applying transformer to Test data
X_test_scaled = scaler.transform(X_test)
print("Mean")
print(X_test_scaled.mean(axis = 0))
print("Standard Deviation")
print(X_test_scaled.std(axis = 0))
# Note: the scaled features in the test set
# are not perfectly centered at zero with unit variance, because
# we're transforming the test set using the means from the training set,
# not from the test set itself.

Mean
[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]
Standard Deviation
[1.02160495 1.00135689 0.97456598 0.91099054 0.86716698 0.94193125
 1.03673213 1.03145119 0.95734849 0.83829505 1.0286218 ]


In [109]:
# Alternatively, declare class object
# First, transform the data using StandardScaler(), then fit a model using Random forest regressor
pipeline = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(n_estimators = 100))

In [110]:
# Declare Hyperparameters to tune

In [111]:
# Keys are Hyperparameter names and values are list of settings to try
hyperparameters = { 'randomforestregressor__max_features' : 
                   ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

In [112]:
# Tune model using a Cross-validation pipeline
# Cross - validation - Maximize model performance while reducing
# the chance of overfitting

In [113]:
# Alternate - scikit-learn cross-validation with pipeline
clf = GridSearchCV(pipeline, hyperparameters, cv = 10)

# Fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decr...mators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [114]:
print(clf.best_params_)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'sqrt'}


In [115]:
# Refit on the entire Training set

In [116]:
print(clf.refit)
# clf object can be used as a model when applying other sets of data.

True


In [117]:
# Evaluate model pipeline on Test data

In [118]:
# Predict a new set of data using clf object
y_pred = clf.predict(X_test)
print(r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))

0.4694410073248986
0.34235562500000005


In [119]:
# Saving model
joblib.dump(clf, 'rf_regressor.pkl')

['rf_regressor.pkl']