### Python Machine Learning Tutorial, Scikit-Learn Wine Snob Edition

Source: https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn

In [1]:
import numpy

In [2]:
import pandas as pd


In [3]:
from sklearn.model_selection import train_test_split

In [5]:
from sklearn import preprocessing

In [7]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [8]:
from sklearn.metrics import mean_squared_error, r2_score

In [9]:
from sklearn.externals import joblib

In [10]:
# Location of the red-wine quality CSV used throughout this notebook.
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
# First attempt: read the file with pandas' default (comma) delimiter.
data = pd.read_csv(dataset_url)

In [12]:
# Show the first rows to inspect how the file was parsed.
print(data.head())

  fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
0   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                                     
1   7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5                                                                                                                     
2  7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...                                                                                                                     
3  11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...                                                                                                                     
4   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                  

In [13]:
# Read the file again, this time splitting fields on ';' so that each
# measurement lands in its own column.
data = pd.read_csv(dataset_url, sep=';')
# Show the first rows of the re-parsed frame.
print(data.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [14]:
# Print summary statistics (count, mean, std, quartiles, ...) for each column.
print(data.describe())

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.319637          0.527821     0.270976        2.538806   
std         1.741096          0.179060     0.194801        1.409928   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.200000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  1599.000000          1599.000000           1599.000000  1599.000000   
mean      0.087467            15.874922             46.467792     0.996747   
std       0.047065            10.460157             32.895324     0.001887   
min       0.012000             1.000000         

In [15]:
# Features: every column except the target.
X = data.drop('quality', axis=1)
# Target: the 'quality' column.
y = data['quality']

In [22]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)


In [24]:
# Standardise the training features column by column.
# The result is stored as X_train_scaled -- the name the following cells
# read -- so the notebook runs top to bottom on a fresh kernel instead of
# relying on a variable that is only defined further down.
X_train_scaled = preprocessing.scale(X_train)
print (X_train_scaled)

[[ 0.89358298 -1.13946293  1.48264243 ... -1.03607885  0.40597574
   0.93295636]
 [-0.53104385  0.86341667 -0.57810487 ...  1.23111916  0.23188892
  -0.47443764]
 [ 1.91931429 -0.19365867  1.12201165 ... -0.7769705  -1.33489252
  -1.13122151]
 ...
 [-0.07516327 -1.58454728  0.34923141 ... -0.58263925  1.62458353
   0.46382503]
 [-0.92993936 -0.24929422 -1.29936643 ... -1.35996428 -0.7546031
  -0.66209018]
 [-0.98692444  0.19579014 -0.68114224 ...  0.71290247 -0.23234262
   1.21443516]]


In [25]:
# Per-column mean of the scaled training array (axis=0 reduces over rows).
print(X_train_scaled.mean(axis=0))

[-2.61106400e-16  1.74996843e-16  3.05550043e-17  1.34719792e-16
 -1.09720243e-16  8.88872852e-17  6.11100085e-17 -5.58531589e-15
 -4.91657796e-16 -4.72213702e-16  9.55538315e-16]


In [26]:
# Per-column standard deviation of the scaled training array.
print(X_train_scaled.std(axis=0))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [27]:
# Fit a StandardScaler on the training features only; the fitted object is
# reused below to transform both the training and the test features.
scaler = preprocessing.StandardScaler().fit(X_train)

In [28]:
# Apply the fitted scaler to the training features.
X_train_scaled = scaler.transform(X_train)

In [31]:
# Per-column mean of the scaler-transformed training array.
print(X_train_scaled.mean(axis=0))

[-2.61106400e-16  1.74996843e-16  3.05550043e-17  1.34719792e-16
 -1.09720243e-16  8.88872852e-17  6.11100085e-17 -5.58531589e-15
 -4.91657796e-16 -4.72213702e-16  9.55538315e-16]


In [33]:
# Per-column standard deviation of the scaler-transformed training array.
print(X_train_scaled.std(axis=0))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [34]:
# Transform the test features with the scaler that was fitted on the
# training features (the scaler is not re-fitted here).
X_test_scaled = scaler.transform(X_test)

In [38]:
# Per-column mean of the scaled test array.
print (X_test_scaled.mean(axis=0))

[-0.03491756  0.08373667 -0.03184584 -0.06182511 -0.01639929 -0.23555402
 -0.21049197 -0.05934262  0.03780378 -0.05481033  0.08123865]


In [39]:
# Per-column standard deviation of the scaled test array.
print (X_test_scaled.std(axis=0))

[0.95809934 0.97645773 1.01586477 0.86829634 0.84790218 0.79912665
 0.93883036 0.97949213 0.99821657 0.91231869 0.99516772]


In [41]:
from sklearn.ensemble import RandomForestRegressor

In [42]:
# Chain feature standardisation with a 100-tree random-forest regressor so
# both steps are fitted and applied together.
pipeline = make_pipeline(
    preprocessing.StandardScaler(),
    RandomForestRegressor(n_estimators=100),
)

In [44]:
# Print the parameters exposed by the pipeline and its steps.
print (pipeline.get_params())

{'memory': None, 'steps': [('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))], 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=Fals

In [45]:
# Search grid for the random-forest step of the pipeline. Keys follow the
# '<step name>__<parameter>' form used by the pipeline's parameter names.
hyperparameters = {
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
    'randomforestregressor__max_depth': [None, 5, 3, 1],
}

In [46]:
# Grid search over `hyperparameters` for the pipeline, using 10-fold
# cross-validation (cv=10).
clf = GridSearchCV(pipeline, hyperparameters, cv=10)

In [47]:
# Run the cross-validated search on the training split.
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decr...mators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [50]:
# Print the parameter combination selected by the search.
print (clf.best_params_)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'auto'}


In [51]:
# Print the search object's `refit` setting.
print (clf.refit)

True


In [54]:
# Predict the target for the held-out test features.
y_pred = clf.predict(X_test)

In [56]:
# R^2 score of the predictions against the true test targets.
print (r2_score(y_test, y_pred))

0.483129340137355


In [57]:
# Mean squared error of the predictions against the true test targets.
print (mean_squared_error(y_test, y_pred))

0.34322937499999995


In [59]:
# Persist the fitted search object to disk so it can be reloaded later.
import os

# Writing into a directory that does not exist fails, so create the target
# folder first when it is missing (e.g. on a fresh checkout).
if not os.path.isdir('Models'):
    os.makedirs('Models')
joblib.dump(clf, 'Models/rf_regressor.sav')

['Models/rf_regressor.sav']

In [60]:
# Load the model back from the file written by joblib.dump.
# NOTE(review): joblib files are pickle-based -- only load files from a
# trusted source.
clf2 = joblib.load('Models/rf_regressor.sav')

In [61]:
# Predict with the reloaded model on the test features.
clf2.predict(X_test)

array([6.6 , 5.02, 5.02, 6.4 , 5.74, 6.55, 6.21, 5.75, 5.64, 5.41, 5.01,
       6.73, 5.38, 6.06, 5.65, 5.92, 5.98, 6.09, 6.02, 5.27, 6.18, 5.38,
       6.42, 5.15, 5.07, 5.26, 6.45, 5.74, 6.11, 5.86, 5.16, 6.65, 5.69,
       5.02, 6.32, 5.17, 5.39, 6.05, 5.07, 5.11, 5.64, 6.43, 5.45, 4.85,
       5.66, 4.96, 5.36, 4.98, 5.85, 5.05, 5.46, 6.97, 5.37, 7.09, 5.22,
       6.67, 5.15, 5.03, 5.7 , 5.16, 4.93, 6.12, 5.16, 5.97, 6.56, 6.07,
       6.08, 5.22, 5.19, 5.94, 5.19, 5.97, 5.89, 6.1 , 4.81, 5.62, 4.97,
       5.11, 5.99, 5.65, 5.35, 5.07, 6.93, 5.07, 5.15, 4.87, 6.11, 5.03,
       5.19, 5.23, 5.64, 5.52, 5.05, 6.22, 6.3 , 5.51, 6.59, 6.46, 6.09,
       5.89, 6.65, 6.58, 5.09, 5.76, 5.05, 5.08, 5.45, 5.99, 5.91, 5.54,
       5.07, 6.47, 5.01, 5.91, 5.98, 6.63, 5.66, 4.94, 5.61, 5.07, 5.63,
       6.25, 5.62, 5.89, 5.99, 5.  , 7.06, 5.98, 5.82, 5.3 , 5.42, 5.8 ,
       5.33, 6.21, 5.64, 5.54, 5.72, 5.6 , 6.  , 6.11, 5.35, 5.69, 5.63,
       6.81, 5.21, 5.64, 4.93, 5.23, 5.8 , 6.83, 5.