# Sklearn Practice

[link to the tutorial](https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn)

## Import libraries and modules

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
# 
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib

 ## Load red wine data.

In [2]:
# Load data from a remote URL
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url)
print (data.head())

  fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
0   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                                     
1   7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5                                                                                                                     
2  7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...                                                                                                                     
3  11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...                                                                                                                     
4   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                  

In [3]:
# Read CSV with semicolon separator as the one above seems quite messy
data = pd.read_csv(dataset_url, sep=';')
print (data.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [4]:
# Looking at the shape
print (data.shape)

(1599, 12)


In [5]:
# Summary statistics
print (data.describe())

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.319637          0.527821     0.270976        2.538806   
std         1.741096          0.179060     0.194801        1.409928   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.200000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  1599.000000          1599.000000           1599.000000  1599.000000   
mean      0.087467            15.874922             46.467792     0.996747   
std       0.047065            10.460157             32.895324     0.001887   
min       0.012000             1.000000         

## Split data into training and test sets

In [6]:
# Separate target from training features
y = data.quality
X = data.drop('quality', axis=1)

In [7]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y)


In [8]:
print (X_train.shape)
print(X_test.shape)
print(X.shape)

(1279, 11)
(320, 11)
(1599, 11)


In [9]:
# Fitting the Transformer API
scaler = preprocessing.StandardScaler().fit(X_train)

# Applying transformer to training data
X_train_scaled = scaler.transform(X_train)

print("Training Data")
print (X_train_scaled.mean(axis=0))
# [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
print (X_train_scaled.std(axis=0))
# [ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]



# Applying transformer to test data
X_test_scaled = scaler.transform(X_test)

print("Test Data") 
print (X_test_scaled.mean(axis=0))
# [ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
#  -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]
 
print (X_test_scaled.std(axis=0))
# [ 1.02160495  1.00135689  0.97456598  0.91099054  0.86716698  0.94193125
#  1.03673213  1.03145119  0.95734849  0.83829505  1.0286218 ]

Training Data
[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.22218213e-17 -6.38877362e-17 -4.16659149e-18 -2.54439854e-15
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Test Data
[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]
[1.02160495 1.00135689 0.97456598 0.91099054 0.86716698 0.94193125
 1.03673213 1.03145119 0.95734849 0.83829505 1.0286218 ]


## Declare data preprocessing steps

In practice, when we set up the cross-validation pipeline, we won't even need to manually fit the Transformer API. Instead, we'll simply declare the class object, like so:

In [19]:
# Pipeline with preprocessing and model
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))

print (pipeline.get_params())

{'memory': None,
 'steps': [('standardscaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('randomforestregressor',
   RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
              oob_score=False, random_state=None, verbose=0, warm_start=False))],
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_s

## Cross Validation

In [11]:
# Declare hyperparameters to tune
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                    'randomforestregressor__max_depth': [None, 5, 3, 1]
                  }

# Sklearn cross-validation with pipeline
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
# Fit and tune model
clf.fit(X_train, y_train)

# best set of parameters 
print (clf.best_params_)

# Confirm model will be retrained
print (clf.refit)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'sqrt'}
True


## Evaluate model pipeline on test data

In [18]:
# Predict a new set of data
y_pred = clf.predict(X_test)


print (r2_score(y_test, y_pred))
# 0.45044082571584243
 
print (mean_squared_error(y_test, y_pred))
# 0.35461593750000003

0.3389221875

## Saving and loading the model for future use

In [72]:
# Save model to a .pkl file
joblib.dump(clf, 'rf_regressor.pkl')

# Load model from .pkl file
clf2 = joblib.load('rf_regressor.pkl')
 
# Predict data set using loaded model
clf2.predict(X_test)


array([6.49, 5.74, 4.99, 5.48, 6.38, 5.64, 4.98, 4.87, 5.01, 6.16, 5.28,
       5.73, 5.75, 5.13, 5.83, 5.61, 6.55, 5.79, 5.73, 6.98, 5.51, 5.59,
       5.02, 6.08, 5.95, 5.05, 5.56, 5.14, 5.93, 5.88, 5.88, 6.37, 5.99,
       5.08, 4.93, 5.89, 5.09, 6.08, 5.02, 6.05, 4.99, 5.94, 6.63, 5.08,
       6.29, 5.38, 5.47, 5.69, 5.17, 6.5 , 6.09, 5.3 , 5.93, 5.15, 5.69,
       5.66, 5.29, 5.37, 4.98, 5.22, 5.22, 5.2 , 5.06, 5.8 , 6.05, 5.29,
       6.3 , 5.05, 5.16, 6.7 , 5.77, 5.92, 5.09, 5.02, 5.31, 5.99, 5.35,
       5.06, 5.22, 5.28, 6.3 , 5.54, 6.14, 6.36, 5.07, 6.02, 6.35, 6.31,
       5.73, 5.74, 5.87, 5.28, 6.45, 5.79, 5.66, 5.78, 6.78, 6.72, 5.69,
       6.8 , 5.15, 5.44, 5.13, 6.49, 5.01, 4.77, 5.72, 4.95, 5.64, 5.85,
       5.9 , 5.29, 6.11, 5.33, 5.19, 5.34, 5.86, 5.15, 4.87, 6.  , 5.87,
       5.07, 5.81, 6.18, 5.28, 5.45, 5.24, 5.96, 5.37, 5.41, 5.74, 6.11,
       5.13, 5.49, 5.05, 6.44, 5.05, 5.2 , 6.71, 5.64, 5.22, 5.05, 5.6 ,
       6.09, 5.32, 5.33, 5.18, 6.37, 5.92, 5.15, 5.

In [84]:
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url, sep=";", header=None)
# data.columns = ["x", "y"]
# data.head()
array = np.array(data)
array

array([['fixed acidity', 'volatile acidity', 'citric acid', ...,
        'sulphates', 'alcohol', 'quality'],
       ['7.4', '0.7', '0', ..., '0.56', '9.4', '5'],
       ['7.8', '0.88', '0', ..., '0.68', '9.8', '5'],
       ...,
       ['6.3', '0.51', '0.13', ..., '0.75', '11', '6'],
       ['5.9', '0.645', '0.12', ..., '0.71', '10.2', '5'],
       ['6', '0.31', '0.47', ..., '0.66', '11', '6']], dtype=object)