## Importação dos pacotes

In [1]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [2]:
# definir parâmetros extras
pd.set_option('precision', 3)
pd.set_option('display.max_columns', 100)

## Carga dos dados

In [3]:
# carregar arquivo de dados de treino
data = pd.read_csv('wine-train.csv', index_col='wine')

# mostrar alguns exemplos de registros
data.head()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3472,6.7,0.16,0.36,2.0,0.045,24.0,131.0,0.993,3.3,0.59,10.5,bad
3455,7.3,0.23,0.24,0.9,0.031,29.0,86.0,0.989,2.9,0.38,12.2,bad
3322,5.7,0.26,0.3,1.8,0.039,30.0,105.0,0.99,3.48,0.52,12.5,good
4896,5.5,0.29,0.3,1.1,0.022,20.0,110.0,0.989,3.34,0.38,12.8,good
2123,6.8,0.25,0.27,10.7,0.076,47.0,154.0,0.997,3.05,0.38,9.0,bad


In [4]:
# quantas linhas e colunas existem?
data.shape

(3265, 12)

## Análise dos dados

In [5]:
# quais são as colunas e respectivos tipos de dados?
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3265 entries, 3472 to 3696
Data columns (total 12 columns):
fixed_acidity           3265 non-null float64
volatile_acidity        3265 non-null float64
citric_acid             3265 non-null float64
residual_sugar          3265 non-null float64
chlorides               3265 non-null float64
free_sulfur_dioxide     3265 non-null float64
total_sulfur_dioxide    3265 non-null float64
density                 3265 non-null float64
ph                      3265 non-null float64
sulphates               3265 non-null float64
alcohol                 3265 non-null float64
quality                 3265 non-null object
dtypes: float64(11), object(1)
memory usage: 318.8+ KB


In [6]:
# existem colunas com dados nulos?
data[data.columns[data.isnull().any()]].isnull().sum()

Series([], dtype: float64)

In [7]:
data.isna().sum()

fixed_acidity           0
volatile_acidity        0
citric_acid             0
residual_sugar          0
chlorides               0
free_sulfur_dioxide     0
total_sulfur_dioxide    0
density                 0
ph                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [8]:
# sumário estatístico das características numéricas
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed_acidity,3265.0,6.863,0.841,4.2,6.3,6.8,7.4,14.2
volatile_acidity,3265.0,0.278,0.1,0.08,0.21,0.26,0.32,1.1
citric_acid,3265.0,0.336,0.123,0.0,0.27,0.32,0.39,1.66
residual_sugar,3265.0,6.529,5.136,0.6,1.8,5.4,10.2,65.8
chlorides,3265.0,0.046,0.022,0.009,0.036,0.043,0.05,0.346
free_sulfur_dioxide,3265.0,35.549,17.194,2.0,23.5,34.0,46.0,289.0
total_sulfur_dioxide,3265.0,138.691,43.242,10.0,108.0,134.0,168.0,440.0
density,3265.0,0.994,0.003,0.987,0.992,0.994,0.996,1.039
ph,3265.0,3.184,0.149,2.77,3.08,3.17,3.27,3.82
sulphates,3265.0,0.49,0.113,0.23,0.41,0.47,0.55,1.08


In [9]:
# quais as correlações entre as características numéricas?
data.corr()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol
fixed_acidity,1.0,-0.009,0.277,0.099,0.014,-0.036,0.111,0.265,-0.421,-0.013,-0.119
volatile_acidity,-0.009,1.0,-0.139,0.067,0.071,-0.094,0.088,0.036,-0.039,-0.026,0.066
citric_acid,0.277,-0.139,1.0,0.104,0.113,0.103,0.139,0.158,-0.152,0.071,-0.081
residual_sugar,0.099,0.067,0.104,1.0,0.09,0.33,0.422,0.848,-0.185,-0.015,-0.465
chlorides,0.014,0.071,0.113,0.09,1.0,0.11,0.204,0.253,-0.094,0.012,-0.358
free_sulfur_dioxide,-0.036,-0.094,0.103,0.33,0.11,1.0,0.632,0.32,0.008,0.053,-0.262
total_sulfur_dioxide,0.111,0.088,0.139,0.422,0.204,0.632,1.0,0.542,0.007,0.122,-0.457
density,0.265,0.036,0.158,0.848,0.253,0.32,0.542,1.0,-0.088,0.072,-0.777
ph,-0.421,-0.039,-0.152,-0.185,-0.094,0.008,0.007,-0.088,1.0,0.14,0.118
sulphates,-0.013,-0.026,0.071,-0.015,0.012,0.053,0.122,0.072,0.14,1.0,-0.008


In [10]:
# show variable correlation which is more than 0.6 (positive or negative)
corr = data.corr()
corr[corr != 1][abs(corr) > 0.7].dropna(how='all', axis=1).dropna(how='all', axis=0)

Unnamed: 0,residual_sugar,density,alcohol
residual_sugar,,0.848,
density,0.848,,-0.777
alcohol,,-0.777,


In [11]:
data.groupby('quality').mean()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
bad,6.899,0.282,0.338,6.858,0.048,35.886,142.495,0.995,3.176,0.488,10.248
good,6.735,0.265,0.328,5.341,0.038,34.337,125.004,0.992,3.212,0.495,11.429


In [12]:
spread = 0.5

data['outlier'] = False

for quality in ['good', 'bad']:
#for quality in np.arange(3, 10):
#if (True):

    #quality = 5
    dtqu = data[data.quality == quality].drop(['quality'], axis=1)
    
    Q1 = dtqu.quantile(0.25)
    Q3 = dtqu.quantile(0.75)
    IQR = Q3 - Q1
    
    removed_ids = ~((dtqu < (Q1 - spread * IQR)) | (dtqu > (Q3 + spread * IQR))).any(axis=1)
    df_rids = pd.DataFrame(data=removed_ids, columns=['outlier'])
    
    data.update(df_rids)
    
    #print(quality, dtqu[dtqu.outlier == True].count())
    print(quality, data[data.outlier == False].shape[0])

good 3077
bad 2582


In [13]:
removed_ids.head()

wine
3472    False
3455    False
2123    False
4678    False
3159    False
dtype: bool

In [14]:
data.head()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality,outlier
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3472,6.7,0.16,0.36,2.0,0.045,24.0,131.0,0.993,3.3,0.59,10.5,bad,False
3455,7.3,0.23,0.24,0.9,0.031,29.0,86.0,0.989,2.9,0.38,12.2,bad,False
3322,5.7,0.26,0.3,1.8,0.039,30.0,105.0,0.99,3.48,0.52,12.5,good,False
4896,5.5,0.29,0.3,1.1,0.022,20.0,110.0,0.989,3.34,0.38,12.8,good,False
2123,6.8,0.25,0.27,10.7,0.076,47.0,154.0,0.997,3.05,0.38,9.0,bad,False


In [15]:
data[['ph','outlier']].groupby('outlier').count()

Unnamed: 0_level_0,ph
outlier,Unnamed: 1_level_1
False,2582
True,683


In [16]:
data_without_outliers = data[data.outlier == False].drop(['outlier'], axis=1)
data_without_outliers.shape

(2582, 12)

In [17]:
data_without_outliers.head()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3472,6.7,0.16,0.36,2.0,0.045,24.0,131.0,0.993,3.3,0.59,10.5,bad
3455,7.3,0.23,0.24,0.9,0.031,29.0,86.0,0.989,2.9,0.38,12.2,bad
3322,5.7,0.26,0.3,1.8,0.039,30.0,105.0,0.99,3.48,0.52,12.5,good
4896,5.5,0.29,0.3,1.1,0.022,20.0,110.0,0.989,3.34,0.38,12.8,good
2123,6.8,0.25,0.27,10.7,0.076,47.0,154.0,0.997,3.05,0.38,9.0,bad


In [18]:
data_without_outliers.to_csv('wine-train-without-outliers.csv')

In [19]:
data = data_without_outliers
data.quality = data.quality.map({'good': 1, 'bad': 0})
data.head()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3472,6.7,0.16,0.36,2.0,0.045,24.0,131.0,0.993,3.3,0.59,10.5,0
3455,7.3,0.23,0.24,0.9,0.031,29.0,86.0,0.989,2.9,0.38,12.2,0
3322,5.7,0.26,0.3,1.8,0.039,30.0,105.0,0.99,3.48,0.52,12.5,1
4896,5.5,0.29,0.3,1.1,0.022,20.0,110.0,0.989,3.34,0.38,12.8,1
2123,6.8,0.25,0.27,10.7,0.076,47.0,154.0,0.997,3.05,0.38,9.0,0


In [20]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
print(scaler.fit(data))

data_filtered_scaled = pd.DataFrame(scaler.transform(data), columns=data.columns, index=data.index)
data_filtered_scaled.head()

MinMaxScaler(copy=True, feature_range=(0, 1))


Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3472,0.25,0.078,0.217,0.021,0.107,0.077,0.281,0.11,0.505,0.424,0.403,0.0
3455,0.31,0.147,0.145,0.005,0.065,0.094,0.177,0.041,0.124,0.176,0.677,0.0
3322,0.15,0.176,0.181,0.018,0.089,0.098,0.221,0.054,0.676,0.341,0.726,1.0
4896,0.13,0.206,0.181,0.008,0.039,0.063,0.233,0.03,0.543,0.176,0.774,1.0
2123,0.26,0.167,0.163,0.155,0.199,0.157,0.335,0.185,0.267,0.176,0.161,0.0


In [21]:
data_filtered_scaled.to_csv('wine-train-filtered-scaled.csv')