In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error,r2_score , accuracy_score
from sklearn.externals import joblib


dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url, sep=';')

In [33]:
data.head(3)
data.shape
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [67]:
y = data.quality
x = data.drop('quality',axis=1)

x_train,x_test,y_train,y_test =  train_test_split(x,y,test_size=0.2,random_state=123,stratify=y)




In [36]:
scalar = preprocessing.StandardScaler().fit(x_train)

x_scaled_tarin = scalar.transform(x_train)
x_scaled_test = scalar.transform(x_test)


pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))
pipeline.get_params()

{'memory': None,
 'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                       max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False),
 'randomforestregressor__bootstrap': True,
 'randomforestregressor__criterion': 'mse',
 'randomforestregressor__max_depth': None,
 'randomforestregressor__max_features': 'auto',
 'randomforestregressor__max_leaf_nodes': None,
 'randomforestregressor__min_impurity_decrease': 0.0,
 'randomforestregressor__min_impurity_split': None,
 'randomforestregressor__min_samples_leaf': 1,
 'randomforestregressor__min_samples_split': 2,
 'randomforestregressor__min_weight_fraction_leaf': 0.0,
 '

In [48]:
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
# Fit and tune model
clf.fit(x_train, y_train)
clf.refit

y_predict = clf.predict(x_test)



In [57]:

print(r2_score(y_test, y_predict))

print(mean_squared_error(y_test, y_predict))


0.468547975059023
0.342931875


In [25]:
#save the model

joblib.dump(clf,'rf_regressor.pkl')


['rf_regressor.pkl']

In [83]:
clf2 = joblib.load('rf_regressor.pkl')
clf2.predict(x_test)


array([6.5 , 5.75, 4.96, 5.37, 6.22, 5.6 , 4.96, 4.71, 5.01, 6.27, 5.35,
       5.75, 5.87, 5.02, 5.74, 5.72, 6.51, 5.71, 5.79, 6.96, 5.46, 5.69,
       5.02, 6.01, 5.91, 5.06, 5.43, 5.17, 5.84, 5.91, 5.89, 6.31, 5.98,
       5.04, 4.96, 5.98, 5.09, 6.23, 5.01, 6.11, 4.96, 5.95, 6.67, 5.12,
       6.19, 5.46, 5.54, 5.61, 5.11, 6.37, 6.11, 5.38, 5.81, 5.22, 5.68,
       5.75, 5.25, 5.37, 4.99, 5.31, 5.3 , 5.19, 5.06, 5.78, 5.92, 5.23,
       6.42, 5.06, 5.2 , 6.71, 5.77, 5.67, 5.06, 5.01, 5.38, 6.  , 5.28,
       5.15, 5.29, 5.26, 6.37, 5.62, 6.21, 6.51, 5.08, 6.06, 6.34, 6.34,
       5.61, 5.86, 5.96, 5.32, 6.37, 5.76, 5.73, 5.78, 6.72, 6.78, 5.61,
       6.7 , 5.03, 5.49, 5.13, 6.55, 5.06, 4.8 , 5.66, 4.94, 5.75, 5.93,
       5.81, 5.54, 6.04, 5.37, 5.19, 5.23, 5.99, 5.11, 5.1 , 6.07, 5.91,
       5.09, 5.74, 6.15, 5.26, 5.42, 5.33, 5.88, 5.57, 5.4 , 5.86, 6.4 ,
       5.16, 5.21, 4.98, 6.39, 5.02, 5.26, 6.77, 5.37, 5.17, 5.11, 5.71,
       6.1 , 5.31, 5.44, 5.13, 6.31, 5.79, 5.06, 5.

In [87]:
from sklearn import tree, svm, neighbors, linear_model
from sklearn.metrics import accuracy_score
import numpy as np


clf = tree.DecisionTreeClassifier()
clf2 = svm.SVC()
clf3 = neighbors.KNeighborsClassifier()
clf4 = linear_model.LogisticRegression(solver="liblinear",C=1)



# clf = clf.fit(x_scaled_tarin , y_train)
# prediction1 = clf.predict(x_scaled_test)
# print(prediction1)
print(prediction2)


clf = clf.fit(x_train, y_train)
prediction1 = clf.predict(x_test)
acc_clf = accuracy_score(y_test, prediction1)
print('Accuracy of Decision Tree is: ', acc_clf)
print('mean square error : ',mean_squared_error(y_test, prediction1))

clf2 = clf2.fit(x_train, y_train)
prediction2 = clf2.predict(x_test)
acc_clf2 = accuracy_score(y_test, prediction2)
print('Accuracy of Support Vector Machine is: ', acc_clf2)
print('mean square error : ',mean_squared_error(y_test, prediction2))


clf3 = clf3.fit(x_train, y_train)
prediction3 = clf3.predict(x_test)
acc_clf3 = accuracy_score(y_test, prediction3)
print('Accuracy of K-Nearest Neighbors is: ', acc_clf3)
print('mean square error : ',mean_squared_error(y_test, prediction3))


clf4 = clf4.fit(x_train, y_train)
prediction4 = clf4.predict(x_test)
acc_clf4 = accuracy_score(y_test, prediction4)
print('Accuracy of Logistic Regression is: ', acc_clf4)
print('mean square error : ',mean_squared_error(y_test, prediction4))



[6 6 6 5 5 6 6 5 5 6 5 6 6 5 6 6 6 6 6 7 5 6 5 6 6 5 6 5 6 6 6 5 6 5 5 5 5
 6 5 6 6 6 6 5 6 5 6 5 5 6 6 5 6 5 5 5 6 5 5 5 6 5 5 6 7 5 6 5 5 7 5 6 5 5
 6 6 5 5 5 5 6 5 6 5 5 5 5 6 6 5 6 6 7 6 6 6 5 6 5 7 5 6 5 7 5 5 6 5 5 6 7
 5 6 6 6 6 6 5 5 5 5 5 6 6 5 6 6 6 5 6 5 7 5 5 5 5 5 5 5 6 5 5 5 6 5 6 5 5
 6 6 5 5 5 5 5 6 5 6 6 5 5 5 5 5 5 6 6 5 6 5 6 5 6 5 5 5 6 5 6 5 6 5 6 5 5
 6 5 5 6 6 6 6 6 6 6 5 6 5 6 5 6 5 5 5 6 7 6 5 5 6 6 7 6 6 6 6 6 5 6 5 5 6
 6 5 6 5 5 5 5 5 5 5 5 5 5 5 7 6 6 6 6 6 5 6 6 5 5 6 6 6 6 5 5 5 5 6 6 6 5
 6 6 6 5 6 5 6 6 6 5 5 5 5 7 6 5 6 6 6 6 5 6 5 5 5 6 6 6 6 5 6 5 6 5 6 6 6
 6 6 6 6 5 5 5 6 6 6 7 5 5 5 5 6 5 6 6 5 6 5 6 5]
Accuracy of Decision Tree is:  0.61875
mean square error :  0.61875




Accuracy of Support Vector Machine is:  0.571875
mean square error :  0.603125
Accuracy of K-Nearest Neighbors is:  0.496875
mean square error :  0.74375
Accuracy of Logistic Regression is:  0.553125
mean square error :  0.559375


