In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# 1. Load the dataset wineQualityReds (consider the first column as the index)

In [3]:
# read_csv
data = pd.read_csv("../data/wineQualityReds.csv", index_col=0)
data

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
1,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
2,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
3,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
4,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
5,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1595,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1596,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1597,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1598,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


# 2. Split the dataset in train and test considering that the target column is called 'quality'

In [4]:
X = data.loc[:,'fixed.acidity':'alcohol']
y = data['quality']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [5]:
X_train

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol
1132,5.9,0.19,0.21,1.7,0.045,57.0,135.0,0.99341,3.32,0.44,9.5
504,10.5,0.26,0.47,1.9,0.078,6.0,24.0,0.99760,3.18,1.04,10.9
1017,8.9,0.38,0.40,2.2,0.068,12.0,28.0,0.99486,3.27,0.75,12.6
682,8.0,0.45,0.23,2.2,0.094,16.0,29.0,0.99620,3.21,0.49,10.2
1156,8.3,0.60,0.25,2.2,0.118,9.0,38.0,0.99616,3.15,0.53,9.8
...,...,...,...,...,...,...,...,...,...,...,...
193,6.8,0.63,0.12,3.8,0.099,16.0,126.0,0.99690,3.28,0.61,9.5
756,7.8,0.91,0.07,1.9,0.058,22.0,47.0,0.99525,3.51,0.43,10.7
126,9.0,0.62,0.04,1.9,0.146,27.0,90.0,0.99840,3.16,0.70,9.4
427,6.4,0.67,0.08,2.1,0.045,19.0,48.0,0.99490,3.49,0.49,11.4


In [6]:
X_test

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol
121,7.3,1.070,0.09,1.7,0.178,10.0,89.0,0.99620,3.30,0.57,9.0
1020,7.0,0.500,0.14,1.8,0.078,10.0,23.0,0.99636,3.53,0.61,10.4
1346,8.2,0.440,0.24,2.3,0.063,10.0,28.0,0.99613,3.25,0.53,10.2
981,9.1,0.500,0.30,1.9,0.065,8.0,17.0,0.99774,3.32,0.71,10.5
1083,7.2,0.380,0.30,1.8,0.073,31.0,70.0,0.99685,3.42,0.59,9.5
...,...,...,...,...,...,...,...,...,...,...,...
447,12.5,0.380,0.60,2.6,0.081,31.0,72.0,0.99960,3.10,0.73,10.5
1275,7.8,0.580,0.13,2.1,0.102,17.0,36.0,0.99440,3.24,0.53,11.2
1423,7.0,0.540,0.00,2.1,0.079,39.0,55.0,0.99560,3.39,0.84,11.4
413,7.1,0.735,0.16,1.9,0.100,15.0,77.0,0.99660,3.27,0.64,9.3


# 3. Normalize train and test data

In [8]:
# StandardScaler + fit_transform
ss = StandardScaler()

newX_train = ss.fit_transform(X_train)
newX_test = ss.transform(X_test)

# 4. Create a RandomForest classifier with 300 estimators

In [9]:
# RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=300)
rfc

# 5. Get the accuracy of the model in cross validation (with K=5)

In [10]:
scores = cross_val_score(rfc, newX_train, y_train, cv = 5)
print(scores)
scores.mean()

[0.6484375 0.7265625 0.7421875 0.6875    0.6745098]


0.6958394607843138

# 6. Apply in cross validation a GridSearch with the following parameters to be tested:
- 'n_estimators': [100, 300, 500, 800, 1000],
- 'criterion': ['gini', 'entropy'],
- 'bootstrap': [True, False]

In [13]:
parameters = {'n_estimators':[100, 300, 500, 800, 1000], 
              'criterion':['gini', 'entropy'],
              'bootstrap':[True, False]
              }
clf = GridSearchCV(rfc, parameters, cv=5, n_jobs=-1)
clf.fit(newX_train, y_train)

# 7. Indicate the configuration that generates the best accuracy

In [17]:
print(f'{clf.best_params_} -> accuracy: {clf.best_score_}')

{'bootstrap': True, 'criterion': 'gini', 'n_estimators': 800} -> accuracy: 0.7044270833333333


# 8. Create a pipeline with the the preprocessing component and the model component.

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [23]:
my_pipeline = Pipeline(steps=[('preprocessing_transformer', ss),
                              ('model', RandomForestClassifier(n_estimators=clf.best_params_['n_estimators'], bootstrap=clf.best_params_['bootstrap'],
                                                               criterion=clf.best_params_['criterion']))], verbose = True)
my_pipeline

In [24]:
# Preprocessing of validation data, get predictions
my_pipeline.fit(X_train, y_train)
preds = my_pipeline.predict(X_test)

[Pipeline]  (step 1 of 2) Processing preprocessing_transformer, total=   0.0s
[Pipeline] ............. (step 2 of 2) Processing model, total=   3.0s


In [27]:
# Evaluate the model
score = accuracy_score(y_test, preds)
print('Accuracy Score:', score)

Accuracy Score: 0.7125
