# INF1032 - Trabalho 1.2 - Classificação de vinhos
### Alunos: Leonardo Wajnsztok e Samuel Bastos

O objetivo deste trabalho é comparar diversos métodos de classificação para a base de dados de qualidade de vinhos disponível em https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv.

Vocês devem encontrar um bom modelo preditivo, variando:
* o número e conjunto de features (atributos) utilizados
* o método utilizado
* a configuração do algoritmo correspondente (e.g.: número k para nearest neighbors, profundidade para árvore de decisão)

Vocês devem listar algumas métricas de qualidade, tais como: precision, recall, accuracy e f1_score, e utilizar accuracy como base para a avaliação final, considerando a accuracy média de 10 iterações para cada configuração.

Para assegurar que eu obterei os mesmos resultados de vocês, vocês devem estabelecer a semente para a geração dos números aleatórios (utilizados para separar os conjuntos de treinamento e teste, por exemplo), utilizando os seguintes comandos no início do seu código (podem utilizar uma outra semente):
```
import random
random.seed(1001001)
```

In [87]:
import pandas as pd
import random

seed = 1032
random.seed(seed)

# Read dataset

In [88]:
dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
initial_df = pd.read_csv(dataset_url, sep=";")

# Dataset analysis

In [89]:
initial_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [90]:
initial_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
fixed acidity           1599 non-null float64
volatile acidity        1599 non-null float64
citric acid             1599 non-null float64
residual sugar          1599 non-null float64
chlorides               1599 non-null float64
free sulfur dioxide     1599 non-null float64
total sulfur dioxide    1599 non-null float64
density                 1599 non-null float64
pH                      1599 non-null float64
sulphates               1599 non-null float64
alcohol                 1599 non-null float64
quality                 1599 non-null int64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [91]:
initial_df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [92]:
label_column = 'quality'
feature_columns = set(initial_df.columns)
feature_columns.remove(label_column)
feature_columns = list(feature_columns)

print (label_column, feature_columns)

quality ['chlorides', 'volatile acidity', 'density', 'citric acid', 'alcohol', 'sulphates', 'total sulfur dioxide', 'pH', 'free sulfur dioxide', 'fixed acidity', 'residual sugar']


In [93]:
initial_df.groupby(label_column).size().reset_index(name="count")

Unnamed: 0,quality,count
0,3,10
1,4,53
2,5,681
3,6,638
4,7,199
5,8,18


- 1599 rows
- 6 labels
- Dataset is not balanced
- No missing values
- All features are numeric and they are in different scales

# Train, Test split

In [94]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

test_size = 0.2
n_folds = 3

X_train, X_test, y_train, y_test = train_test_split(initial_df[feature_columns], initial_df[label_column], test_size=test_size, random_state=seed)

# Validation pipeline

In [95]:
from sklearn.metrics import confusion_matrix, classification_report

def apply_pipeline(name, model, params):
    pipe = Pipeline([('clf', model)])
    grid = GridSearchCV(pipe, {}, n_jobs=1, cv=n_folds)
    grid = grid.fit(X_train, y_train)
    predictions = grid.predict(X_train)
    
    print ("Model: "+ name)
    print ("Best cross-validation score: " + str(grid.best_score_))
    print (grid.cv_results_)
    print ("Best parameters: " + str(grid.best_params_))
    print (confusion_matrix(y_train, predictions))
    print (classification_report(y_train, predictions))

# Models and parameters grids

In [100]:
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neural_network import MLPClassifier

models_params = [
    ('LinearSVC', LinearSVC(random_state=seed), {}),
    ('KNN', KNeighborsClassifier(), {}),
    ('Decision Tree', DecisionTreeClassifier(), {}),
    ('Gradient Boosting', GradientBoostingClassifier(), {}),
    ('Random Forest', RandomForestClassifier(), {}),
    ('Logistic Regression', LogisticRegression(), {}),
    ('Gaussian Naive Bayes', GaussianNB(), {}),
    ('Multinomial Naive Bayes', MultinomialNB(), {}),
    ('Multilayer Perceptron', MLPClassifier(), {})
]

In [101]:
for name, model, params in models_params:
    apply_pipeline(name, model, params)

Model: LinearSVC
Best cross-validation score: 0.47458952306489444
{'split1_test_score': array([0.43559719]), 'split2_test_score': array([0.44103774]), 'std_test_score': array([0.05120802]), 'mean_score_time': array([0.00060399]), 'split2_train_score': array([0.45146199]), 'split0_train_score': array([0.56051704]), 'std_train_score': array([0.05832679]), 'params': [{}], 'mean_test_score': array([0.47458952]), 'mean_fit_time': array([0.1981283]), 'split1_train_score': array([0.42605634]), 'rank_test_score': array([1], dtype=int32), 'std_fit_time': array([0.00175397]), 'mean_train_score': array([0.47934512]), 'std_score_time': array([3.56940209e-05]), 'split0_test_score': array([0.54672897])}
Best parameters: {}
[[  0   0   7   1   0   0]
 [  0   1  18  16   3   0]
 [  1   7 216 268  45   0]
 [  0   3  73 317 126   0]
 [  0   0   4  74  86   0]
 [  0   0   0   4   9   0]]
             precision    recall  f1-score   support

          3       0.00      0.00      0.00         8
          4

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Model: Gradient Boosting
Best cross-validation score: 0.6239249413604379
{'split1_test_score': array([0.61592506]), 'split2_test_score': array([0.64150943]), 'std_test_score': array([0.01239708]), 'mean_score_time': array([0.00427095]), 'split2_train_score': array([0.92982456]), 'split0_train_score': array([0.92361927]), 'std_train_score': array([0.00530923]), 'params': [{}], 'mean_test_score': array([0.62392494]), 'mean_fit_time': array([0.68796134]), 'split1_train_score': array([0.93661972]), 'rank_test_score': array([1], dtype=int32), 'std_fit_time': array([0.01391248]), 'mean_train_score': array([0.93002118]), 'std_score_time': array([7.45457119e-05]), 'split0_test_score': array([0.61448598])}
Best parameters: {}
[[  8   0   0   0   0   0]
 [  0  30   4   4   0   0]
 [  0   0 486  51   0   0]
 [  0   0  66 450   3   0]
 [  0   0   6  28 130   0]
 [  0   0   0   0   0  13]]
             precision    recall  f1-score   support

          3       1.00      1.00      1.00         8
   

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Model: Multilayer Perceptron
Best cross-validation score: 0.5043002345582487
{'split1_test_score': array([0.5175644]), 'split2_test_score': array([0.49056604]), 'std_test_score': array([0.01101436]), 'mean_score_time': array([0.00123866]), 'split2_train_score': array([0.54853801]), 'split0_train_score': array([0.50293772]), 'std_train_score': array([0.02017868]), 'params': [{}], 'mean_test_score': array([0.50430023]), 'mean_fit_time': array([0.18497459]), 'split1_train_score': array([0.54225352]), 'rank_test_score': array([1], dtype=int32), 'std_fit_time': array([0.04489684]), 'mean_train_score': array([0.53124308]), 'std_score_time': array([0.00023116]), 'split0_test_score': array([0.5046729])}
Best parameters: {}
[[  0   0   6   2   0   0]
 [  0   0  22  15   1   0]
 [  0   0 347 190   0   0]
 [  0   0 149 369   1   0]
 [  0   0  18 136  10   0]
 [  0   0   1  11   1   0]]
             precision    recall  f1-score   support

          3       0.00      0.00      0.00         8
     

  'precision', 'predicted', average, warn_for)
