Scikit-learn is a machine library for the Python programming language.

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib



In [4]:
data = pd.read_csv("/Users/xujiayi/Downloads/datasets_1117_2013_winequality-red.csv")

In [5]:
data.shape

(1599, 12)

In [4]:
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [5]:
# split data into training and test sets
y = data.quality
x = data.drop("quality", axis = 1)

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                   test_size = 0.2,
                                                   random_state = 123) # random state 

In [7]:
# preprocessing
# scate x
scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [8]:
x_train_scaled

array([[ 0.89358298, -1.13946293,  1.48264243, ..., -1.03607885,
         0.40597574,  0.93295636],
       [-0.53104385,  0.86341667, -0.57810487, ...,  1.23111916,
         0.23188892, -0.47443764],
       [ 1.91931429, -0.19365867,  1.12201165, ..., -0.7769705 ,
        -1.33489252, -1.13122151],
       ...,
       [-0.07516327, -1.58454728,  0.34923141, ..., -0.58263925,
         1.62458353,  0.46382503],
       [-0.92993936, -0.24929422, -1.29936643, ..., -1.35996428,
        -0.7546031 , -0.66209018],
       [-0.98692444,  0.19579014, -0.68114224, ...,  0.71290247,
        -0.23234262,  1.21443516]])

In [9]:
x_train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1076,9.9,0.32,0.56,2.0,0.073,3.0,8.0,0.99534,3.15,0.73,11.4
847,7.4,0.68,0.16,1.8,0.078,12.0,39.0,0.99770,3.50,0.70,9.9
582,11.7,0.49,0.49,2.2,0.083,5.0,15.0,1.00000,3.19,0.43,9.2
172,8.0,0.42,0.17,2.0,0.073,6.0,18.0,0.99720,3.29,0.61,9.2
779,7.1,0.52,0.03,2.6,0.076,21.0,92.0,0.99745,3.50,0.60,9.8
...,...,...,...,...,...,...,...,...,...,...,...
1122,6.3,0.47,0.00,1.4,0.055,27.0,33.0,0.99220,3.45,0.48,12.3
1346,6.1,0.59,0.01,2.1,0.056,5.0,13.0,0.99472,3.52,0.56,11.4
1406,8.2,0.24,0.34,5.1,0.062,8.0,22.0,0.99740,3.22,0.94,10.9
1389,6.7,0.48,0.02,2.2,0.080,36.0,111.0,0.99524,3.10,0.53,9.7


**Question**: why we need to scale the input variables? <br/>
1) Several algorithms, in particular SVMs come to mind, can sometimes converge far faster on normalized data. <br/>
2) When your model is sensitive to magnitude, and the units of two different features are different, and arbitrary. SVM uses Euclidean distance.

In [10]:
# Pipeline with preprocessing and modelPython
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))

In [12]:
# list tunable hyperparameters
print(pipeline.get_params())

{'memory': None, 'steps': [('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False))], 'verbose': False, 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'randomforestregressor': RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurit

In [13]:
# Declare hyperparameters to tunePython
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [1, 2, 3]}

In [14]:
# sklearn cross validation for tuning model
clf = GridSearchCV(pipeline, hyperparameters, cv=3) # here use the k = 3, for quick computation
 
# Fit and tune model
clf.fit(x_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              ccp_alpha=0.0,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              max_samples=None,
                             

In [17]:
print(clf.best_params_)

{'randomforestregressor__max_depth': 3, 'randomforestregressor__max_features': 'auto'}


In [18]:
#we did refit the model with picked hyperparameter
print(clf.refit)

True


In [19]:
# Evaluate model on test data
test_pred = clf.predict(x_test)

In [20]:
print(r2_score(y_test, test_pred)) # negative 

0.34922648257262956


In [21]:
print(mean_squared_error(y_test, test_pred))

0.4321479337064821


In [22]:
# Backtesting on training dataset
train_pred = clf.predict(x_train)
print(r2_score(y_train, train_pred))
print(mean_squared_error(y_train, train_pred))

# Train, Dev, Test

0.40998293187877644
0.3826812931136619


Python pickle is used for serializing and de-serializing a Python object structure. Any object in python can be pickled so that it can be saved on disk. What pickle does is that it “serialises” the object first before writing it to file. Pickling is a way to convert a python object (list, dict, etc.) into a character stream. The idea is that this character stream contains all the information necessary to reconstruct the object in another python script.

In [23]:
# save your model for future use
joblib.dump(clf, "rf_4_5.pkl") 

['rf_4_5.pkl']

In [24]:
# load the model from .pkl file
clf2 = joblib.load("rf_4_5.pkl")

In [25]:
test_pred_2 = clf2.predict(x_test)