<a href="https://colab.research.google.com/github/jks1192/TLOU/blob/ML/RedWine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
import pandas as pd

# 1. Load red wine data.
data = pd.read_csv('winequality-red.csv')
data.quality.value_counts()
X=data.drop(columns = 'quality',axis = 1)
Y=data.quality
Y.head()

# 2. Split data into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size = 0.25, random_state = 42)
x_test.shape

# 3. Declare data preprocessing steps
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
pipeline = make_pipeline(preprocessing.StandardScaler(),
                         RandomForestRegressor(n_estimators = 100))

# 4. Declare hyperparameters to tune
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

# 5. Tune model using cross-validation pipeline
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
clf.fit(x_train, y_train)


# 6. Evaluate model pipeline on test data
pred = clf.predict(x_test)
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
r2score=r2_score(y_test, pred)
mse=mean_squared_error(y_test, pred)
print(r2score)
print(mse)

# 7. Save model for future use
from sklearn.externals import joblib 
joblib.dump(clf, 'rf_regressor.pkl')
# To load: clf2 = joblib.load('rf_regressor.pkl')



0.5045847941399342
0.30646075000000006
[5.36 5.1  5.44 5.17 6.   5.08 5.14 4.99 6.31 5.85 6.7  5.4  5.81 5.29
 5.43 6.5  5.24 5.71 6.8  5.08 4.95 5.66 5.44 6.16 5.66 5.94 6.57 5.36
 5.36 6.1  5.32 5.55 5.97 5.34 5.5  5.15 6.38 5.9  5.62 6.24 5.06 5.16
 6.24 5.1  5.6  5.63 6.43 5.46 5.05 5.6  5.08 5.27 5.54 6.74 5.24 5.09
 6.05 5.77 5.74 5.02 5.62 6.07 5.43 5.25 6.65 5.42 6.71 5.62 6.49 5.38
 6.12 5.36 5.93 5.55 6.13 5.08 6.77 5.12 5.93 6.44 5.14 6.66 5.22 5.71
 5.73 6.27 5.05 5.97 6.49 5.47 6.2  5.68 5.38 5.17 5.37 5.68 5.11 5.76
 4.84 5.51 5.11 5.1  5.79 6.35 5.57 6.59 5.9  5.15 5.7  5.15 6.5  5.13
 6.37 5.01 5.34 6.04 5.69 5.27 5.11 5.67 6.12 5.78 5.8  5.45 5.62 5.2
 6.31 5.51 5.14 5.53 5.93 5.26 5.05 6.5  5.6  5.12 4.89 5.34 5.11 5.84
 6.59 6.07 6.77 5.42 5.57 5.09 5.85 5.67 5.55 5.15 5.86 6.36 5.35 5.21
 5.93 5.45 5.58 6.51 5.19 5.92 5.85 5.44 6.31 5.09 5.35 5.75 5.51 5.09
 4.92 5.28 5.18 4.78 6.48 5.29 6.6  5.88 6.25 5.19 5.4  5.33 4.85 5.92
 5.42 6.47 4.98 6.49 5.78 5.84 6.73 5.2

In [47]:
import numpy as np
pred = np.array(pred).astype(int)
r2score=r2_score(y_test, pred)
mse=mean_squared_error(y_test, pred)
print(r2score)
print(mse)

0.07047234150037884
0.575
