In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [2]:
whitewine = pd.read_csv('code/data/whitewines.csv')
whitewine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,6.7,0.62,0.24,1.1,0.039,6.0,62.0,0.9934,3.41,0.32,10.4,5
1,5.7,0.22,0.2,16.0,0.044,41.0,113.0,0.99862,3.22,0.46,8.9,6
2,5.9,0.19,0.26,7.4,0.034,33.0,123.0,0.995,3.49,0.42,10.1,6
3,5.3,0.47,0.1,1.3,0.036,11.0,74.0,0.99082,3.48,0.54,11.2,4
4,6.4,0.29,0.21,9.65,0.041,36.0,119.0,0.99334,2.99,0.34,10.933333,6


## Separando features e label

In [3]:
X = whitewine.drop('quality', axis = 1)
y = whitewine['quality']

## Normalizando

In [4]:
stdScaler = StandardScaler()
normalizedX = pd.DataFrame(stdScaler.fit_transform(X), columns=X.columns)
normalizedX.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-0.183445,3.390995,-0.778394,-1.043355,-0.310008,-1.723457,-1.796987,-0.209783,1.468577,-1.488394,-0.092863
1,-1.368585,-0.577879,-1.108952,1.894609,-0.081131,0.334712,-0.596809,1.535686,0.210175,-0.261553,-1.311884
2,-1.131557,-0.875545,-0.613115,0.198872,-0.538886,-0.135727,-0.361481,0.325227,1.99843,-0.612079,-0.336667
3,-1.842641,1.902667,-1.935348,-1.003919,-0.447335,-1.429433,-1.514592,-1.072485,1.932199,0.439499,0.557282
4,-0.538987,0.116674,-1.026313,0.642524,-0.218457,0.040688,-0.455612,-0.229846,-1.313153,-1.313131,0.340567


## Separando teste e treino

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6)

In [19]:
randomForest = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100, 150], 'max_depth': [None, 3, 5, 7], 'max_features': ['auto', 'sqrt', 'log2']}
gridSearchCV = GridSearchCV(randomForest, params)

In [20]:
gridSearchCV.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 50, 100, 150], 'max_depth': [None, 3, 5, 7], 'max_features': ['auto', 'sqrt', 'log2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [21]:
bestRandomForest = gridSearchCV.best_estimator_

In [22]:
bestRandomForest

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [25]:
bestRandomForest.fit(X_train, y_train)
bestRandomForest.score(X_test, y_test)

0.66632653061224489

In [26]:

randomForest.fit(X_train, y_train)
randomForest.score(X_test, y_test)

0.6489795918367347

## Cross validation

In [27]:
cross_val_score(bestRandomForest, X_train, y_train, cv=10)



array([ 0.67088608,  0.69796954,  0.68527919,  0.70812183,  0.68286445,
        0.64102564,  0.67692308,  0.63589744,  0.72307692,  0.67435897])

In [30]:
cross_val_score(randomForest, X_train, y_train, cv=10)



array([ 0.65063291,  0.6319797 ,  0.68020305,  0.65736041,  0.62915601,
        0.63333333,  0.64358974,  0.62307692,  0.7       ,  0.65128205])