## Importação dos pacotes

In [47]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [48]:
# definir parâmetros extras
pd.set_option('precision', 3)
pd.set_option('display.max_columns', 100)

## Carga dos dados

In [139]:
# carregar arquivo de dados de treino
data = pd.read_csv('wine-train.csv', index_col='wine')

# mostrar alguns exemplos de registros
data.head()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1942,8.3,0.36,0.57,15.0,0.052,35.0,256.0,1.0,2.93,0.64,8.6,5.0
3847,6.4,0.32,0.23,16.2,0.055,36.0,176.0,0.999,3.26,0.54,9.1,5.0
3183,6.5,0.24,0.38,1.0,0.027,31.0,90.0,0.989,3.24,0.36,12.3,6.0
2745,6.7,0.44,0.22,4.3,0.032,19.0,99.0,0.99,3.26,0.53,12.8,7.0
2977,6.6,0.23,0.2,11.4,0.044,45.0,131.0,0.996,2.96,0.51,9.7,6.0


In [140]:
# quantas linhas e colunas existem?
data.shape

(3265, 12)

## Análise dos dados

In [51]:
# quais são as colunas e respectivos tipos de dados?
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3265 entries, 1942 to 4334
Data columns (total 12 columns):
fixed_acidity           3265 non-null float64
volatile_acidity        3265 non-null float64
citric_acid             3265 non-null float64
residual_sugar          3265 non-null float64
chlorides               3265 non-null float64
free_sulfur_dioxide     3265 non-null float64
total_sulfur_dioxide    3265 non-null float64
density                 3265 non-null float64
ph                      3265 non-null float64
sulphates               3265 non-null float64
alcohol                 3265 non-null float64
quality                 3265 non-null float64
dtypes: float64(12)
memory usage: 331.6 KB


In [52]:
# existem colunas com dados nulos?
data[data.columns[data.isnull().any()]].isnull().sum()

Series([], dtype: float64)

In [53]:
# sumário estatístico das características numéricas
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed_acidity,3265.0,6.857,0.837,3.9,6.3,6.8,7.3,14.2
volatile_acidity,3265.0,0.278,0.099,0.08,0.21,0.26,0.32,0.965
citric_acid,3265.0,0.335,0.122,0.0,0.27,0.32,0.39,1.66
residual_sugar,3265.0,6.356,5.144,0.6,1.7,5.1,9.85,65.8
chlorides,3265.0,0.046,0.022,0.009,0.036,0.043,0.05,0.301
free_sulfur_dioxide,3265.0,35.094,16.429,2.0,23.0,34.0,45.0,138.5
total_sulfur_dioxide,3265.0,137.643,42.145,9.0,107.0,134.0,167.0,313.0
density,3265.0,0.994,0.003,0.987,0.992,0.994,0.996,1.039
ph,3265.0,3.188,0.15,2.72,3.08,3.18,3.28,3.82
sulphates,3265.0,0.488,0.113,0.22,0.41,0.47,0.55,1.08


In [54]:
# quais as correlações entre as características numéricas?
data.corr()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
fixed_acidity,1.0,-0.031,0.276,0.087,0.031,-0.049,0.095,0.26,-0.419,-0.011,-0.123,-0.108
volatile_acidity,-0.031,1.0,-0.147,0.074,0.074,-0.099,0.09,0.037,-0.034,-0.046,0.062,-0.194
citric_acid,0.276,-0.147,1.0,0.09,0.143,0.107,0.129,0.142,-0.17,0.062,-0.073,-0.016
residual_sugar,0.087,0.074,0.09,1.0,0.091,0.314,0.406,0.845,-0.197,-0.022,-0.453,-0.101
chlorides,0.031,0.074,0.143,0.091,1.0,0.102,0.2,0.259,-0.102,0.014,-0.366,-0.208
free_sulfur_dioxide,-0.049,-0.099,0.107,0.314,0.102,1.0,0.625,0.307,0.002,0.055,-0.262,0.027
total_sulfur_dioxide,0.095,0.09,0.129,0.406,0.2,0.625,1.0,0.529,-0.011,0.124,-0.458,-0.168
density,0.26,0.037,0.142,0.845,0.259,0.307,0.529,1.0,-0.097,0.081,-0.772,-0.301
ph,-0.419,-0.034,-0.17,-0.197,-0.102,0.002,-0.011,-0.097,1.0,0.164,0.127,0.116
sulphates,-0.011,-0.046,0.062,-0.022,0.014,0.055,0.124,0.081,0.164,1.0,-0.026,0.061


In [55]:
# show variable correlation which is more than 0.6 (positive or negative)
corr = data.corr()
corr[corr != 1][abs(corr) > 0.6].dropna(how='all', axis=1).dropna(how='all', axis=0)

Unnamed: 0,residual_sugar,free_sulfur_dioxide,total_sulfur_dioxide,density,alcohol
residual_sugar,,,,0.845,
free_sulfur_dioxide,,,0.625,,
total_sulfur_dioxide,,0.625,,,
density,0.845,,,,-0.772
alcohol,,,,-0.772,


In [141]:
data.groupby('quality').mean()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3.0,7.75,0.322,0.341,5.905,0.043,39.55,127.8,0.995,3.169,0.444,10.27
4.0,7.072,0.388,0.304,4.756,0.05,22.22,121.559,0.994,3.172,0.476,10.148
5.0,6.934,0.303,0.339,7.275,0.051,35.935,150.133,0.995,3.166,0.478,9.815
6.0,6.848,0.261,0.338,6.419,0.045,35.583,137.002,0.994,3.188,0.491,10.577
7.0,6.74,0.264,0.326,5.045,0.038,34.143,124.229,0.992,3.219,0.504,11.387
8.0,6.655,0.275,0.327,5.954,0.039,36.471,124.074,0.992,3.212,0.476,11.539
9.0,7.15,0.3,0.35,3.1,0.025,42.0,129.0,0.99,3.28,0.42,12.6


In [11]:
numeric_feats = data.dtypes[data.dtypes != "object"].index
numeric_feats

Index(['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
       'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
       'ph', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [12]:
from scipy.stats import skew
skewed_feats = data[numeric_feats].apply(lambda x: skew(x.dropna())) # compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
skewed_feats

Index(['volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides',
       'density', 'sulphates'],
      dtype='object')

In [13]:
data.head(10).T

wine,1942,3847,3183,2745,2977,1421,1709,4601,2326,2922
fixed_acidity,8.3,6.4,6.5,6.7,6.6,6.2,6.8,6.9,6.9,6.5
volatile_acidity,0.36,0.32,0.24,0.44,0.23,0.18,0.22,0.23,0.35,0.44
citric_acid,0.57,0.23,0.38,0.22,0.2,0.49,0.3,0.35,0.55,0.47
residual_sugar,15.0,16.2,1.0,4.3,11.4,4.5,13.6,6.9,11.95,5.45
chlorides,0.052,0.055,0.027,0.032,0.044,0.047,0.055,0.03,0.038,0.014
free_sulfur_dioxide,35.0,36.0,31.0,19.0,45.0,17.0,50.0,45.0,22.0,44.0
total_sulfur_dioxide,256.0,176.0,90.0,99.0,131.0,90.0,180.0,116.0,111.0,137.0
density,1.0,0.999,0.989,0.99,0.996,0.992,0.998,0.992,0.997,0.99
ph,2.93,3.26,3.24,3.26,2.96,3.27,3.44,2.8,3.11,3.13
sulphates,0.64,0.54,0.36,0.53,0.51,0.37,0.39,0.54,0.29,0.32


In [14]:
data.isna().sum()

fixed_acidity           0
volatile_acidity        0
citric_acid             0
residual_sugar          0
chlorides               0
free_sulfur_dioxide     0
total_sulfur_dioxide    0
density                 0
ph                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [27]:
# https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba

Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1

print(IQR)

'''
print('')
print(data.mean())
print('')
print(data.std())
'''

fixed_acidity           0.097
volatile_acidity        0.124
citric_acid             0.072
residual_sugar          0.125
chlorides               0.048
free_sulfur_dioxide     0.161
total_sulfur_dioxide    0.197
density                 0.085
ph                      0.182
sulphates               0.163
alcohol                 0.333
quality                 0.167
dtype: float64


"\nprint('')\nprint(data.mean())\nprint('')\nprint(data.std())\n"

In [32]:
# https://www.kdnuggets.com/2017/02/removing-outliers-standard-deviation-python.html

#TODO: remover outliers para cada valor de "quality" (de 3 a 9)

spread = 3

removed_ids = ((data < (Q1 - spread * IQR)) | (data > (Q3 + spread * IQR))).index.values
print('removed ids:', removed_ids)

data_out = data[~((data < (Q1 - spread * IQR)) | (data > (Q3 + spread * IQR))).any(axis=1)]
#data_out = data[~((data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))).any(axis=1)]

print('before:', data.shape, '=> after:', data_out.shape,
      ':: removed %d lines' % int(data.shape[0] - data_out.shape[0]))

removed ids: [1942 3847 3183 ...  895  563 4334]
before: (3265, 12) => after: (3130, 12) :: removed 135 lines


In [33]:
data_out.head(10).T

wine,1942,3847,3183,2745,2977,1421,1709,4601,2326,2922
fixed_acidity,0.427,0.243,0.252,0.272,0.262,0.223,0.282,0.291,0.291,0.252
volatile_acidity,0.316,0.271,0.181,0.407,0.169,0.113,0.158,0.169,0.305,0.407
citric_acid,0.343,0.139,0.229,0.133,0.12,0.295,0.181,0.211,0.331,0.283
residual_sugar,0.221,0.239,0.006,0.057,0.166,0.06,0.199,0.097,0.174,0.074
chlorides,0.147,0.158,0.062,0.079,0.12,0.13,0.158,0.072,0.099,0.017
free_sulfur_dioxide,0.242,0.249,0.212,0.125,0.315,0.11,0.352,0.315,0.147,0.308
total_sulfur_dioxide,0.812,0.549,0.266,0.296,0.401,0.266,0.562,0.352,0.336,0.421
density,0.25,0.222,0.041,0.059,0.172,0.092,0.218,0.103,0.188,0.053
ph,0.191,0.491,0.473,0.491,0.218,0.5,0.655,0.073,0.355,0.373
sulphates,0.488,0.372,0.163,0.36,0.337,0.174,0.198,0.372,0.081,0.116


In [168]:
spread = 0.5

data['outlier'] = False

for quality in np.arange(3, 10):
#if (True):

    #quality = 5
    dtqu = data[data.quality == quality]
    
    Q1 = dtqu.quantile(0.25)
    Q3 = dtqu.quantile(0.75)
    IQR = Q3 - Q1
    
    removed_ids = ~((dtqu < (Q1 - spread * IQR)) | (dtqu > (Q3 + spread * IQR))).any(axis=1)
    df_rids = pd.DataFrame(data=removed_ids, columns=['outlier'])
    
    data.update(df_rids)
    
    #print(quality, dtqu[dtqu.outlier == True].count())
    print(quality, data[data.outlier == False].shape[0])

3 3263
4 3240
5 3042
6 2754
7 2626
8 2592
9 2592


In [164]:
removed_ids.head()

wine
827    False
876    False
dtype: bool

In [165]:
data.head()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality,outlier
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1942,8.3,0.36,0.57,15.0,0.052,35.0,256.0,1.0,2.93,0.64,8.6,5.0,False
3847,6.4,0.32,0.23,16.2,0.055,36.0,176.0,0.999,3.26,0.54,9.1,5.0,True
3183,6.5,0.24,0.38,1.0,0.027,31.0,90.0,0.989,3.24,0.36,12.3,6.0,False
2745,6.7,0.44,0.22,4.3,0.032,19.0,99.0,0.99,3.26,0.53,12.8,7.0,False
2977,6.6,0.23,0.2,11.4,0.044,45.0,131.0,0.996,2.96,0.51,9.7,6.0,False


In [166]:
data[['ph','outlier']].groupby('outlier').count()

Unnamed: 0_level_0,ph
outlier,Unnamed: 1_level_1
False,2592
True,673


In [167]:
data[data.outlier == False].shape
#data_out.shape

(2592, 13)

In [169]:
data[data.outlier == False].to_csv('wine-train-without-outliers.csv')

In [39]:
data_out.head()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1


In [23]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
print(scaler.fit(data))

data_scaled = pd.DataFrame(scaler.transform(data), columns=data.columns, index=data.index)
data_scaled.head()

MinMaxScaler(copy=True, feature_range=(0, 1))


Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1942,0.427,0.316,0.343,0.221,0.147,0.242,0.812,0.25,0.191,0.488,0.1,0.333
3847,0.243,0.271,0.139,0.239,0.158,0.249,0.549,0.222,0.491,0.372,0.183,0.333
3183,0.252,0.181,0.229,0.006,0.062,0.212,0.266,0.041,0.473,0.163,0.717,0.5
2745,0.272,0.407,0.133,0.057,0.079,0.125,0.296,0.059,0.491,0.36,0.8,0.667
2977,0.262,0.169,0.12,0.166,0.12,0.315,0.401,0.172,0.218,0.337,0.283,0.5


In [24]:
data_scaled.to_csv('wine-train-scaled.csv')