## Importação dos pacotes

In [1]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [2]:
# configure pandas display options
# Use the fully qualified option name: the bare 'precision' shorthand relies on
# partial-name matching, which breaks as soon as pandas has more than one
# option whose name contains 'precision'.
pd.set_option('display.precision', 3)
pd.set_option('display.max_columns', 100)

## Carga dos dados

In [3]:
# location of the training data file
TRAIN_CSV = 'wine-train.csv'

# read the training set, using the 'wine' column as the row index
data = pd.read_csv(TRAIN_CSV, index_col='wine')

# preview the first few records
data.head()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1942,8.3,0.36,0.57,15.0,0.052,35.0,256.0,1.0,2.93,0.64,8.6,5.0
3847,6.4,0.32,0.23,16.2,0.055,36.0,176.0,0.999,3.26,0.54,9.1,5.0
3183,6.5,0.24,0.38,1.0,0.027,31.0,90.0,0.989,3.24,0.36,12.3,6.0
2745,6.7,0.44,0.22,4.3,0.032,19.0,99.0,0.99,3.26,0.53,12.8,7.0
2977,6.6,0.23,0.2,11.4,0.044,45.0,131.0,0.996,2.96,0.51,9.7,6.0


In [4]:
# how many rows and columns does the dataset have?
n_rows, n_cols = data.shape
(n_rows, n_cols)

(3265, 12)

In [5]:
# how many rows and columns does the dataset have?
# NOTE(review): this cell repeats the shape check of the cell directly above;
# consider removing one of the two.
data.shape

(3265, 12)

## Análise dos dados

In [6]:
# which columns exist and what are their data types?
# info() prints one line per column with its non-null count and dtype,
# followed by the overall memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3265 entries, 1942 to 4334
Data columns (total 12 columns):
fixed_acidity           3265 non-null float64
volatile_acidity        3265 non-null float64
citric_acid             3265 non-null float64
residual_sugar          3265 non-null float64
chlorides               3265 non-null float64
free_sulfur_dioxide     3265 non-null float64
total_sulfur_dioxide    3265 non-null float64
density                 3265 non-null float64
ph                      3265 non-null float64
sulphates               3265 non-null float64
alcohol                 3265 non-null float64
quality                 3265 non-null float64
dtypes: float64(12)
memory usage: 331.6 KB


In [7]:
# are there columns containing null values?
# step 1: flag every column that has at least one null entry
has_nulls = data.isnull().any()
# step 2: keep only those columns and count their nulls
columns_with_nulls = data.columns[has_nulls]
data[columns_with_nulls].isnull().sum()

Series([], dtype: float64)

In [8]:
# statistical summary (count, mean, std, min, quartiles, max) of the numeric features
summary_stats = data.describe()
summary_stats

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
count,3265.0,3265.0,3265.0,3265.0,3265.0,3265.0,3265.0,3265.0,3265.0,3265.0,3265.0,3265.0
mean,6.857,0.278,0.335,6.356,0.046,35.094,137.643,0.994,3.188,0.488,10.52,5.893
std,0.837,0.099,0.122,5.144,0.022,16.429,42.145,0.003,0.15,0.113,1.238,0.875
min,3.9,0.08,0.0,0.6,0.009,2.0,9.0,0.987,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,107.0,0.992,3.08,0.41,9.4,5.0
50%,6.8,0.26,0.32,5.1,0.043,34.0,134.0,0.994,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.85,0.05,45.0,167.0,0.996,3.28,0.55,11.4,6.0
max,14.2,0.965,1.66,65.8,0.301,138.5,313.0,1.039,3.82,1.08,14.0,9.0


In [9]:
# what are the pairwise correlations between the numeric features?
# (Pearson is the pandas default; spelled out here for the reader)
data.corr(method='pearson')

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
fixed_acidity,1.0,-0.031,0.276,0.087,0.031,-0.049,0.095,0.26,-0.419,-0.011,-0.123,-0.108
volatile_acidity,-0.031,1.0,-0.147,0.074,0.074,-0.099,0.09,0.037,-0.034,-0.046,0.062,-0.194
citric_acid,0.276,-0.147,1.0,0.09,0.143,0.107,0.129,0.142,-0.17,0.062,-0.073,-0.016
residual_sugar,0.087,0.074,0.09,1.0,0.091,0.314,0.406,0.845,-0.197,-0.022,-0.453,-0.101
chlorides,0.031,0.074,0.143,0.091,1.0,0.102,0.2,0.259,-0.102,0.014,-0.366,-0.208
free_sulfur_dioxide,-0.049,-0.099,0.107,0.314,0.102,1.0,0.625,0.307,0.002,0.055,-0.262,0.027
total_sulfur_dioxide,0.095,0.09,0.129,0.406,0.2,0.625,1.0,0.529,-0.011,0.124,-0.458,-0.168
density,0.26,0.037,0.142,0.845,0.259,0.307,0.529,1.0,-0.097,0.081,-0.772,-0.301
ph,-0.419,-0.034,-0.17,-0.197,-0.102,0.002,-0.011,-0.097,1.0,0.164,0.127,0.116
sulphates,-0.011,-0.046,0.062,-0.022,0.014,0.055,0.124,0.081,0.164,1.0,-0.026,0.061


In [10]:
# show pairs of distinct variables whose correlation is stronger than the
# threshold (positive or negative)
CORR_THRESHOLD = 0.7

corr = data.corr()

# Exclude the self-correlations by position rather than by value: comparing
# against 1 would also hide two different features that are perfectly
# correlated, and would let through a diagonal entry that is not exactly 1.0.
off_diagonal = ~np.eye(len(corr), dtype=bool)
strong_corr = corr.where((corr.abs() > CORR_THRESHOLD) & off_diagonal)

# drop the rows/columns that have no strong correlation at all
strong_corr.dropna(how='all', axis=1).dropna(how='all', axis=0)

Unnamed: 0,residual_sugar,density,alcohol
residual_sugar,,0.845,
density,0.845,,-0.772
alcohol,,-0.772,


In [11]:
# average value of every feature within each quality score
by_quality = data.groupby('quality')
by_quality.mean()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3.0,7.75,0.322,0.341,5.905,0.043,39.55,127.8,0.995,3.169,0.444,10.27
4.0,7.072,0.388,0.304,4.756,0.05,22.22,121.559,0.994,3.172,0.476,10.148
5.0,6.934,0.303,0.339,7.275,0.051,35.935,150.133,0.995,3.166,0.478,9.815
6.0,6.848,0.261,0.338,6.419,0.045,35.583,137.002,0.994,3.188,0.491,10.577
7.0,6.74,0.264,0.326,5.045,0.038,34.143,124.229,0.992,3.219,0.504,11.387
8.0,6.655,0.275,0.327,5.954,0.039,36.471,124.074,0.992,3.212,0.476,11.539
9.0,7.15,0.3,0.35,3.1,0.025,42.0,129.0,0.99,3.28,0.42,12.6
