# Importing Dependencies

In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib

# Load Data

In [4]:
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url)
data.head()

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5
1,7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5
2,7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...
3,11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...
4,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5


looks like our csv file use semicolon as seprator but it's super simple to fix

In [5]:
data = pd.read_csv(dataset_url, sep=';')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


Let's take a look at data

In [7]:
data.shape

(1599, 12)

We have 1,599 samples and 12 features, including our target feature. We can easily print some summary statistics.

In [8]:
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


List of all features:

In [9]:
list(data)

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

# Split Data

In [16]:
y = data.quality #target
X = data.drop('quality', axis=1) #features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y)
print("Trainning data shape {}".format(X_train.shape))
print("Test data shape {}".format(X_test.shape))

Trainning data shape (1279, 11)
Test data shape (320, 11)


Randomly 20% is for test & rest is for trainning

# Standardization

In [19]:
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))

# Setting HyperParameters

In [22]:
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

# Train & Tune via Cross Validation

In [23]:
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decr...mators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [24]:
clf.best_params_

{'randomforestregressor__max_depth': None,
 'randomforestregressor__max_features': 'sqrt'}

Refit to get more accuracy

In [25]:
clf.refit

True

# Testing

In [31]:
y_pred = clf.predict(X_test)
print("r2_score: {}".format(r2_score(y_test, y_pred)))
print("mean_squared: {}".format(mean_squared_error(y_test, y_pred)))

r2_score: 0.46820655003329514
mean_squared: 0.3431521875


# Save Model

In [32]:
joblib.dump(clf, 'rf_regressor.pkl')
clf2 = joblib.load('rf_regressor.pkl')
clf2.predict(X_test)

array([6.47, 5.76, 4.98, 5.48, 6.29, 5.49, 4.93, 4.82, 5.  , 6.01, 5.28,
       5.78, 5.79, 5.05, 5.81, 5.73, 6.6 , 5.69, 5.76, 6.95, 5.47, 5.68,
       5.02, 6.12, 5.95, 5.04, 5.38, 5.16, 5.94, 5.93, 5.88, 6.51, 5.98,
       5.01, 5.01, 5.86, 5.01, 6.07, 4.99, 5.85, 4.97, 5.87, 6.68, 5.17,
       6.29, 5.27, 5.58, 5.47, 5.1 , 6.56, 5.83, 5.32, 5.83, 5.16, 5.5 ,
       5.84, 5.38, 5.38, 4.98, 5.31, 5.29, 5.19, 5.07, 5.83, 5.97, 5.33,
       6.42, 5.01, 5.18, 6.73, 5.72, 5.66, 5.02, 5.02, 5.33, 5.97, 5.36,
       5.17, 5.24, 5.26, 6.25, 5.54, 6.15, 6.33, 5.07, 5.97, 6.35, 6.4 ,
       5.81, 5.77, 6.  , 5.34, 6.47, 5.79, 5.73, 5.84, 6.7 , 6.73, 5.47,
       6.73, 5.1 , 5.48, 5.15, 6.45, 5.08, 4.88, 5.69, 5.01, 5.59, 5.96,
       5.84, 5.57, 6.01, 5.39, 5.13, 5.3 , 5.93, 5.14, 4.78, 6.08, 5.82,
       5.09, 5.83, 6.01, 5.26, 5.34, 5.29, 5.97, 5.33, 5.4 , 5.77, 6.25,
       5.15, 5.35, 5.09, 6.3 , 5.03, 5.17, 6.69, 5.49, 5.31, 5.12, 5.67,
       6.04, 5.3 , 5.46, 5.1 , 6.51, 5.79, 5.1 , 5.