## Importação dos pacotes

In [12]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [13]:
# definir parâmetros extras
pd.set_option('precision', 4)
pd.set_option('display.max_columns', 100)

import warnings
warnings.filterwarnings("ignore")

## Carga dos dados

In [14]:
prefixo_arquivos = ''
#prefixo_arquivos = 'https://github.com/hjort/ai-labs/raw/master/kaggle/serpro-wine/'

In [15]:
# carregar arquivo de dados de treino
train_data = pd.read_csv(prefixo_arquivos + 'wine-train.csv', index_col='wine')
train_data.shape

(3265, 12)

In [16]:
# carregar arquivo de dados de teste
test_data = pd.read_csv(prefixo_arquivos + 'wine-test.csv', index_col='wine')
test_data.shape

(1633, 11)

In [17]:
# unir ambos os dados de treino e teste
data = pd.concat([train_data, test_data])
print(data.shape)

# mostrar alguns exemplos de registros
data.head()

(4898, 12)


Unnamed: 0_level_0,alcohol,chlorides,citric_acid,density,fixed_acidity,free_sulfur_dioxide,ph,quality,residual_sugar,sulphates,total_sulfur_dioxide,volatile_acidity
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2169,9.1,0.053,0.3,0.9986,7.4,48.5,3.14,good,12.8,0.49,229.0,0.19
1382,9.1,0.045,0.16,0.994,6.6,28.0,3.12,bad,3.1,0.35,92.0,0.56
3346,10.6,0.057,0.24,0.9952,6.7,64.0,3.12,bad,10.3,0.5,185.0,0.18
3308,10.6,0.039,0.28,0.9954,6.4,19.0,3.2,bad,12.6,0.43,124.0,0.35
3167,12.1,0.034,0.4,0.9914,5.6,36.0,3.21,good,6.1,0.43,118.0,0.28


## Transformações nos dados

In [18]:
# categorizar os valores dos sexos
data['quality'].fillna('unknown', inplace=True)
data['quality'] = data['quality'].map({'good': 1, 'bad': 0, 'unknown': -1}).astype(int)

In [19]:
data.head()

Unnamed: 0_level_0,alcohol,chlorides,citric_acid,density,fixed_acidity,free_sulfur_dioxide,ph,quality,residual_sugar,sulphates,total_sulfur_dioxide,volatile_acidity
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2169,9.1,0.053,0.3,0.9986,7.4,48.5,3.14,1,12.8,0.49,229.0,0.19
1382,9.1,0.045,0.16,0.994,6.6,28.0,3.12,0,3.1,0.35,92.0,0.56
3346,10.6,0.057,0.24,0.9952,6.7,64.0,3.12,0,10.3,0.5,185.0,0.18
3308,10.6,0.039,0.28,0.9954,6.4,19.0,3.2,0,12.6,0.43,124.0,0.35
3167,12.1,0.034,0.4,0.9914,5.6,36.0,3.21,1,6.1,0.43,118.0,0.28


In [20]:
data.columns.values

array(['alcohol', 'chlorides', 'citric_acid', 'density', 'fixed_acidity',
       'free_sulfur_dioxide', 'ph', 'quality', 'residual_sugar',
       'sulphates', 'total_sulfur_dioxide', 'volatile_acidity'],
      dtype=object)

In [21]:
# realizar normalização nos dados numéricos contínuos
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
print(scaler)

cols = ['alcohol', 'chlorides', 'citric_acid', 'density', 'fixed_acidity', 'free_sulfur_dioxide',
        'ph', 'residual_sugar', 'sulphates', 'total_sulfur_dioxide', 'volatile_acidity']

#for col in cols:
data.loc[:,cols] = scaler.fit_transform(data.loc[:,cols])

MinMaxScaler(copy=True, feature_range=(0, 1))


In [22]:
data.head()

Unnamed: 0_level_0,alcohol,chlorides,citric_acid,density,fixed_acidity,free_sulfur_dioxide,ph,quality,residual_sugar,sulphates,total_sulfur_dioxide,volatile_acidity
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2169,0.1774,0.1306,0.1807,0.2215,0.3462,0.162,0.3818,1,0.1871,0.314,0.5104,0.1078
1382,0.1774,0.1068,0.0964,0.1328,0.2692,0.0906,0.3636,0,0.0383,0.1512,0.1926,0.4706
3346,0.4194,0.1424,0.1446,0.1558,0.2788,0.216,0.3636,0,0.1488,0.3256,0.4084,0.098
3308,0.4194,0.089,0.1687,0.1596,0.25,0.0592,0.4364,0,0.184,0.2442,0.2668,0.2647
3167,0.6613,0.0742,0.241,0.0835,0.1731,0.1185,0.4455,1,0.0844,0.2442,0.2529,0.1961


## Modelagem preditiva

In [23]:
# importar os pacotes necessários para os algoritmos de classificação
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [24]:
from datetime import datetime
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# avalia o desempenho do modelo, retornando o valor da precisão
def evaluate_classification_model(model, X, y):
    start = datetime.now()
    kfold = KFold(n_splits=10, random_state=42)
    results = cross_val_score(model, X, y, cv=kfold, scoring='accuracy', verbose=1)
    end = datetime.now()
    elapsed = int((end - start).total_seconds() * 1000)
    score = 100.0 * results.mean()
    stddev = 100.0 * results.std()
    print(model, '\nScore: %.2f (+/- %.2f) [%5s ms]' % (score, stddev, elapsed))
    return score, stddev, elapsed

In [25]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# faz o ajuste fino do modelo, calculando os melhores hiperparâmetros
def fine_tune_model(model, params, X, y):
    print('\nFine Tuning Model:')
    print(model, "\nparams:", params)
    kfold = KFold(n_splits=10, random_state=42)
    grid = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', cv=kfold, verbose=1)
    grid.fit(X, y)
    print('\nGrid Best Score: %.2f' % (grid.best_score_ * 100.0))
    print('Best Params:', grid.best_params_)
    return grid

In [26]:
# definir dados de treino
train_data = data[data.quality >= 0]

# selecionar atributos para o modelo
cols = ['alcohol', 'chlorides', 'citric_acid', 'density', 'fixed_acidity', 'free_sulfur_dioxide',
        'ph', 'residual_sugar', 'sulphates', 'total_sulfur_dioxide', 'volatile_acidity']

X_train = train_data[cols]
y_train = train_data['quality']

print('Forma dos dados de treino:', X_train.shape, y_train.shape)

Forma dos dados de treino: (3265, 11) (3265,)


In [27]:
train_data.corr()

Unnamed: 0,alcohol,chlorides,citric_acid,density,fixed_acidity,free_sulfur_dioxide,ph,quality,residual_sugar,sulphates,total_sulfur_dioxide,volatile_acidity
alcohol,1.0,-0.363,-0.0445,-0.8026,-0.1331,-0.2207,0.1321,0.3858,-0.4598,-0.0198,-0.4452,0.0617
chlorides,-0.363,1.0,0.0764,0.2717,0.0183,0.1061,-0.0942,-0.186,0.1089,0.0175,0.2039,0.0816
citric_acid,-0.0445,0.0764,1.0,0.1333,0.2872,0.0871,-0.1624,-0.0272,0.0889,0.0778,0.1041,-0.1458
density,-0.8026,0.2717,0.1333,1.0,0.2806,0.2829,-0.108,-0.2889,0.8328,0.063,0.5372,0.0079
fixed_acidity,-0.1331,0.0183,0.2872,0.2806,1.0,-0.0365,-0.4286,-0.0735,0.0885,-0.0256,0.1117,-0.0125
free_sulfur_dioxide,-0.2207,0.1061,0.0871,0.2829,-0.0365,1.0,-0.0025,-0.0215,0.2897,0.063,0.6091,-0.0971
ph,0.1321,-0.0942,-0.1624,-0.108,-0.4286,-0.0025,1.0,0.0932,-0.1947,0.1622,-0.0094,-0.0302
quality,0.3858,-0.186,-0.0272,-0.2889,-0.0735,-0.0215,0.0932,1.0,-0.1203,0.0407,-0.161,-0.0725
residual_sugar,-0.4598,0.1089,0.0889,0.8328,0.0885,0.2897,-0.1947,-0.1203,1.0,-0.0392,0.3993,0.0449
sulphates,-0.0198,0.0175,0.0778,0.063,-0.0256,0.063,0.1622,0.0407,-0.0392,1.0,0.1343,-0.0328


In [28]:
# definir dados de teste
test_data = data[data.quality < 0]

X_test = test_data[cols]

print('Forma dos dados de teste:', X_test.shape)

Forma dos dados de teste: (1633, 11)


In [29]:
names = []
models = []
scores = []
stddevs = []
times = []

def add_model_info(name, model, score, stddev, elapsed):
    names.append(name)
    models.append((name, model))
    scores.append(score)
    stddevs.append(stddev)
    times.append(elapsed)

## Avaliação e ajuste fino de cada modelo preditivo

-  https://scikit-learn.org/stable/modules/classes.html

### Generalized Linear Models

In [31]:
model = LogisticRegression(random_state=42, solver='newton-cg', C=0.1, multi_class='auto', max_iter=500)

params = dict(
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    C=np.logspace(-3, 3, 7)
)
#fine_tune_model(model, params, X_train, y_train)

score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info('LR', model, score, stddev, elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False) 
Score: 78.71 (+/- 2.15) [ 1708 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.7s finished


### Decision Trees

In [32]:
model = DecisionTreeClassifier(random_state=42, criterion='entropy', max_depth=6, min_samples_split=0.25)

#criterion=’mse’, splitter=’best’, max_depth=None, min_samples_split=2, min_samples_leaf=1, 
#min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, 
#min_impurity_decrease=0.0, min_impurity_split=None, presort=False

params = dict(
    criterion=['gini','entropy'],
    max_depth=[4, 6, 8, 10, 12, 14],
    min_samples_split=[0.25, 0.5, 0.75, 1.0]
)
#fine_tune_model(model, params, X_train, y_train)

score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info('DT', model, score, stddev, elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=6,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=0.25,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best') 
Score: 79.48 (+/- 2.40) [  751 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.7s finished


### Discriminant Analysis

In [33]:
model = LinearDiscriminantAnalysis(solver='lsqr')

#solver=’svd’, shrinkage=None, priors=None,
#n_components=None, store_covariance=False, tol=0.0001

params = dict(
    solver=['svd', 'lsqr'] #, 'eigen']
)
#fine_tune_model(model, params, X_train, y_train)

score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info('LDA', model, score, stddev, elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='lsqr', store_covariance=False, tol=0.0001) 
Score: 80.12 (+/- 2.47) [  902 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.9s finished


### Naïve Bayes

In [34]:
model = GaussianNB(priors=None, var_smoothing=1e-8)

#priors=None, var_smoothing=1e-09

params = dict(
    priors=[None],
    var_smoothing=[1e-8, 1e-7, 1e-6, 1e-5, 1e-4]
)
#fine_tune_model(model, params, X_train, y_train)

score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info('NB', model, score, stddev, elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


GaussianNB(priors=None, var_smoothing=1e-08) 
Score: 71.94 (+/- 2.40) [  332 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished


### Nearest Neighbors

In [35]:
model = KNeighborsClassifier(n_neighbors=11, weights='uniform')

#n_neighbors=5, weights=’uniform’, algorithm=’auto’, leaf_size=30, p=2, metric=’minkowski’,
#metric_params=None, n_jobs=None

params = dict(
    n_neighbors=[1, 3, 5, 7, 9, 11, 13],
    weights=['uniform', 'distance']
)
#fine_tune_model(model, params, X_train, y_train)

score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info('KNN', model, score, stddev, elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                     weights='uniform') 
Score: 80.83 (+/- 2.34) [ 2785 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.8s finished


### Support Vector Machines

In [36]:
model = SVC(random_state=42, C=10, gamma=0.1, kernel='rbf')

#kernel=’rbf’, degree=3, gamma=’auto_deprecated’, coef0=0.0, tol=0.001, C=1.0, 
#epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1

params = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    gamma=[0.001, 0.01, 0.1, 1, 10, 100],
    kernel=['linear', 'rbf']
)
#fine_tune_model(model, params, X_train, y_train)

score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info('SVM', model, score, stddev, elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False) 
Score: 78.16 (+/- 2.08) [17290 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   17.3s finished


### Neural network models

In [37]:
model = MLPClassifier(random_state=42, solver='lbfgs', alpha=1, hidden_layer_sizes=(100,), activation='logistic')
                
#hidden_layer_sizes=(100, ), activation=’relu’, solver=’adam’, alpha=0.0001, batch_size=’auto’, 
#learning_rate=’constant’, learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, 
#random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, 
#early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10

params = dict(
    alpha=[1,0.1,0.01,0.001,0.0001,0],
    hidden_layer_sizes=[(100,), (50,), (50,2), (5,5,2), (10,5,2)],
    activation=['identity', 'logistic', 'tanh', 'relu'],
    solver=['lbfgs', 'sgd', 'adam']
)
#fine_tune_model(model, params, X_train, y_train)

score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info('MLP', model, score, stddev, elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


MLPClassifier(activation='logistic', alpha=1, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=42, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False) 
Score: 80.67 (+/- 2.23) [119693 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  2.0min finished


### Ensemble Methods

In [38]:
model = AdaBoostClassifier(DecisionTreeClassifier(random_state=42), random_state=42, n_estimators=50)

#base_estimator=None, n_estimators=50, learning_rate=1.0,
#algorithm=’SAMME.R’, random_state=None

params = dict(
    n_estimators=[10, 25, 50, 100]
)
#fine_tune_model(model, params, X_train, y_train)

score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info('ABDT', model, score, stddev, elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=None,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=42,
                            

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.6s finished


In [39]:
from sklearn.ensemble import BaggingClassifier

model = BaggingClassifier(random_state=42, n_estimators=100,
                          max_samples=0.25, max_features=0.8)

#base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0,
#bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False,
#n_jobs=None, random_state=None, verbose=0

params = dict(
    n_estimators=[10, 50, 100, 500],
    max_samples=[0.25, 0.5, 0.75, 1.0],
    max_features=[0.7, 0.8, 0.9, 1.0]
)
#fine_tune_model(model, params, X_train, y_train)

score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info('BC', model, score, stddev, elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=0.8, max_samples=0.25, n_estimators=100,
                  n_jobs=None, oob_score=False, random_state=42, verbose=0,
                  warm_start=False) 
Score: 84.99 (+/- 2.52) [27008 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   27.0s finished


In [40]:
model = ExtraTreesClassifier(random_state=42, n_estimators=100, max_depth=7, 
                             min_samples_split=0.25, max_features='auto')

#n_estimators=’warn’, criterion=’gini’, max_depth=None, min_samples_split=2,
#min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=’auto’, 
#max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, 
#bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0,
#warm_start=False, class_weight=None

params = dict(
    n_estimators=[10, 50, 100, 500],
    max_depth=[None, 3, 7, 11],
    min_samples_split=[0.25, 0.5],
    max_features=['auto', 0.7, 0.85, 1.0]
)
#fine_tune_model(model, params, X_train, y_train)

score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info('ET', model, score, stddev, elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=7, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=0.25,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=42, verbose=0,
                     warm_start=False) 
Score: 78.22 (+/- 2.03) [11415 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   11.4s finished


In [41]:
model = GradientBoostingClassifier(random_state=42, n_estimators=100, max_features=0.75,
                                   max_depth=4, learning_rate=0.1, subsample=0.6)

#loss=’ls’, learning_rate=0.1, n_estimators=100, subsample=1.0, criterion=’friedman_mse’, min_samples_split=2,
#min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, 
#min_impurity_split=None, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, 
#max_leaf_nodes=None, warm_start=False, presort=’auto’, validation_fraction=0.1, n_iter_no_change=None, 
#tol=0.0001

params = dict(
    n_estimators=[100, 250, 500],
    max_features=[0.75, 0.85, 1.0],
    max_depth=[4, 6, 8, 10],
    learning_rate=[0.05, 0.1, 0.15],
    subsample=[0.4, 0.6, 0.8]
)
#fine_tune_model(model, params, X_train, y_train)

score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info('GB', model, score, stddev, elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=4,
                           max_features=0.75, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=42, subsample=0.6, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False) 
Score: 84.47 (+/- 2.18) [22950 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   22.9s finished


In [42]:
model = RandomForestClassifier(random_state=42, n_estimators=100, max_features='auto', max_depth=5)

#n_estimators=’warn’, criterion=’mse’, max_depth=None, min_samples_split=2, min_samples_leaf=1, 
#min_weight_fraction_leaf=0.0, max_features=’auto’, max_leaf_nodes=None, min_impurity_decrease=0.0, 
#min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, 
#verbose=0, warm_start=False

params = dict(
    n_estimators=[10, 50, 100, 500],
    max_features=['auto', 'sqrt', 'log2'],
    max_depth=[None, 3, 5, 7, 9, 11, 13]
)
#fine_tune_model(model, params, X_train, y_train)

score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info('RF', model, score, stddev, elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False) 
Score: 82.73 (+/- 2.77) [21402 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   21.4s finished


### Outros algoritmos

#### XGBoost

- https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

In [43]:
model = XGBClassifier(max_depth=3, n_estimators=100)

#max_depth=3, learning_rate=0.1, n_estimators=100, verbosity=1, silent=None, objective='reg:squarederror',
#booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, 
#colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, 
#base_score=0.5, random_state=0, seed=None, missing=None, importance_type='gain'

params = dict(
    max_depth=[3, 5, 7, 9],
    n_estimators=[50, 75, 100, 200]
)
#fine_tune_model(model, params, X_train, y_train)

score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info('XGB', model, score, stddev, elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1) 
Score: 83.49 (+/- 2.99) [14708 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   14.7s finished


#### LightGBM

- https://github.com/microsoft/LightGBM
- https://lightgbm.readthedocs.io/en/latest/
- https://medium.com/@pushkarmandot/https-medium-com-pushkarmandot-what-is-lightgbm-how-to-implement-it-how-to-fine-tune-the-parameters-60347819b7fc

### Ensemble Learning Model

- https://towardsdatascience.com/automate-stacking-in-python-fc3e7834772e
- https://github.com/vecxoz/vecstack

In [44]:
estimators =  [
    ('RF', RandomForestClassifier(random_state=42, n_estimators=100, max_features='auto', max_depth=5)),
    ('BC', BaggingClassifier(random_state=42, n_estimators=100, max_samples=0.25, max_features=0.8)),
    ('GB', GradientBoostingClassifier(random_state=42, max_depth=4, max_features=0.75,
                                   n_estimators=100, learning_rate=0.1, subsample=0.6)),
#    ('XGB', XGBClassifier(max_depth=3, n_estimators=100)),
]
model = VotingClassifier(estimators, n_jobs=-1, weights=(2,1,1))

#estimators, weights=None, n_jobs=None

params = dict(
    weights=[(1,1,1), (2,1,1), (3,1,1), (3,2,1), (2,2,1), (2,1,2), (5,4,3), (1,2,1), (1,1,2), ]
)
#fine_tune_model(model, params, X_train, y_train)

score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info('VC', model, score, stddev, elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


VotingClassifier(estimators=[('RF',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=5,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=100,
                                                     n_jobs=None,
             

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.5min finished


## Avaliar importância dos atributos no modelo

In [45]:
model = RandomForestClassifier(random_state=42, max_features='auto', n_estimators=100)
model.fit(X_train, y_train)

importances = pd.DataFrame({'feature': X_train.columns,
                            'importance': np.round(model.feature_importances_, 3)})
importances = importances.sort_values('importance', ascending=False).set_index('feature')
importances.head(20)

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
alcohol,0.149
density,0.134
residual_sugar,0.09
chlorides,0.088
volatile_acidity,0.088
free_sulfur_dioxide,0.085
ph,0.084
total_sulfur_dioxide,0.079
sulphates,0.069
citric_acid,0.068


In [46]:
importances.plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0xa2bdcc0c>

## Comparação final entre os algoritmos

In [47]:
results = pd.DataFrame({'Algorithm': names, 'Score': scores, 'Std Dev': stddevs, 'Time (ms)': times})
results.sort_values(by='Score', ascending=False)

Unnamed: 0,Algorithm,Score,Std Dev,Time (ms)
9,BC,84.9933,2.5194,27008
11,GB,84.4725,2.1765,22950
13,XGB,83.492,2.989,14708
14,VC,82.9109,2.6722,92132
12,RF,82.727,2.7712,21402
8,ABDT,81.8687,1.3903,1599
5,KNN,80.8284,2.3355,2785
7,MLP,80.6748,2.2327,119693
3,LDA,80.1232,2.4697,902
2,DT,79.4797,2.3962,751


## Gerar arquivos com resultados

In [48]:
# criar diretório para os arquivos de envio
!test -d submissions || mkdir submissions

In [49]:
prefixo_arquivo = 'submissions/wine-submission'
sufixo_arquivo = '31ago'

for name, model in models:
    print(model, '\n')
    
    # treinar o modelo
    model.fit(X_train, y_train)
    
    # executar previsão usando o modelo
    y_pred = model.predict(X_test)
    vfunc = np.vectorize(lambda x: 'good' if x > 0 else 'bad')

    # gerar dados de envio (submissão)
    submission = pd.DataFrame({
      'wine': X_test.index,
      'quality': vfunc(y_pred)
    })
    submission.set_index('wine', inplace=True)

    # gerar arquivo CSV para o envio
    filename = '%s-p-%s-%s.csv' % (prefixo_arquivo, sufixo_arquivo, name.lower())
    submission.to_csv(filename)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False) 

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False) 

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=6,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=0.25,
                       min_weight_fraction_leaf=0.0, presor

In [50]:
!head submissions/*.csv

==> submissions/wine-submission-p-18ago-abdt.csv <==
wine,quality
1214,bad
727,bad
481,bad
1308,bad
4087,bad
619,bad
4669,bad
874,bad
2337,bad

==> submissions/wine-submission-p-18ago-bc.csv <==
wine,quality
1214,bad
727,bad
481,bad
1308,bad
4087,bad
619,bad
4669,bad
874,bad
2337,bad

==> submissions/wine-submission-p-18ago-dt.csv <==
wine,quality
1214,bad
727,bad
481,bad
1308,bad
4087,bad
619,bad
4669,bad
874,bad
2337,bad

==> submissions/wine-submission-p-18ago-et.csv <==
wine,quality
1214,bad
727,bad
481,bad
1308,bad
4087,bad
619,bad
4669,bad
874,bad
2337,bad

==> submissions/wine-submission-p-18ago-gb.csv <==
wine,quality
1214,bad
727,bad
481,bad
1308,bad
4087,bad
619,bad
4669,bad
874,bad
2337,bad

==> submissions/wine-submission-p-18ago-knn.csv <==
wine,quality
1214,bad
727,bad
481,bad
1308,bad
4087,bad
619,bad
4669,bad
874,bad
2337,bad

==> submissions/wine-submission-p-18ago-lda.csv <==
wine,quality
1214,bad
727,bad
481,bad
1308,bad
4087,bad
619,bad
4669,bad
874,bad
2337,bad

==